antoniomae1234 committed on
Commit 2493d72
1 parent: 945170a

changes in flenema

This view is limited to 50 files because the commit contains too many changes. See the raw diff for the full change set.
Files changed (50)
  1. .cardboardlint.yml +5 -0
  2. .circleci/config.yml +53 -0
  3. .compute +17 -0
  4. .dockerignore +1 -0
  5. .github/ISSUE_TEMPLATE.md +19 -0
  6. .github/PR_TEMPLATE.md +18 -0
  7. .github/stale.yml +19 -0
  8. .gitignore +132 -0
  9. .pylintrc +586 -0
  10. CODE_OF_CONDUCT.md +19 -0
  11. CODE_OWNERS.rst +75 -0
  12. CONTRIBUTING.md +51 -0
  13. LICENSE.txt +373 -0
  14. MANIFEST.in +11 -0
  15. README.md +281 -3
  16. TTS/.models.json +77 -0
  17. TTS/__init__.py +0 -0
  18. TTS/bin/__init__.py +0 -0
  19. TTS/bin/compute_attention_masks.py +166 -0
  20. TTS/bin/compute_embeddings.py +130 -0
  21. TTS/bin/compute_statistics.py +90 -0
  22. TTS/bin/convert_melgan_tflite.py +32 -0
  23. TTS/bin/convert_melgan_torch_to_tf.py +116 -0
  24. TTS/bin/convert_tacotron2_tflite.py +37 -0
  25. TTS/bin/convert_tacotron2_torch_to_tf.py +213 -0
  26. TTS/bin/distribute.py +69 -0
  27. TTS/bin/synthesize.py +218 -0
  28. TTS/bin/train_encoder.py +274 -0
  29. TTS/bin/train_glow_tts.py +657 -0
  30. TTS/bin/train_speedy_speech.py +618 -0
  31. TTS/bin/train_tacotron.py +731 -0
  32. TTS/bin/train_vocoder_gan.py +664 -0
  33. TTS/bin/train_vocoder_wavegrad.py +511 -0
  34. TTS/bin/train_vocoder_wavernn.py +539 -0
  35. TTS/bin/tune_wavegrad.py +91 -0
  36. TTS/server/README.md +65 -0
  37. TTS/server/__init__.py +0 -0
  38. TTS/server/conf.json +12 -0
  39. TTS/server/server.py +116 -0
  40. TTS/server/static/TTS_circle.png +0 -0
  41. TTS/server/templates/details.html +131 -0
  42. TTS/server/templates/index.html +114 -0
  43. TTS/speaker_encoder/README.md +18 -0
  44. TTS/speaker_encoder/__init__.py +0 -0
  45. TTS/speaker_encoder/config.json +103 -0
  46. TTS/speaker_encoder/dataset.py +169 -0
  47. TTS/speaker_encoder/losses.py +160 -0
  48. TTS/speaker_encoder/model.py +112 -0
  49. TTS/speaker_encoder/requirements.txt +2 -0
  50. TTS/speaker_encoder/umap.png +0 -0
.cardboardlint.yml ADDED
@@ -0,0 +1,5 @@
+ linters:
+ - pylint:
+     # pylintrc: pylintrc
+     filefilter: ['- test_*.py', '+ *.py', '- *.npy']
+     # exclude:
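
The `filefilter` patterns above are applied in order: `- test_*.py` skips tests, `+ *.py` lints every other Python file, and `- *.npy` excludes data blobs. A minimal sketch of running the linter against this config, using the same commands the repository's CONTRIBUTING.md and PR template give below (cardboardlint reads `.cardboardlint.yml` from the working directory):

```bash
# Install the linter stack and lint only what differs from master,
# honoring the filefilter rules in .cardboardlint.yml.
pip install pylint cardboardlint
cardboardlinter --refspec master -n auto
```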
.circleci/config.yml ADDED
@@ -0,0 +1,53 @@
+ version: 2
+
+ workflows:
+   version: 2
+   test:
+     jobs:
+       - test-3.6
+       - test-3.7
+       - test-3.8
+
+ executor: ubuntu-latest
+
+ on:
+   push:
+   pull_request:
+     types: [opened, synchronize, reopened]
+
+ jobs:
+   test-3.6: &test-template
+     docker:
+       - image: circleci/python:3.6
+     resource_class: large
+     working_directory: ~/repo
+     steps:
+       - checkout
+       - run: |
+           sudo apt update
+           sudo apt install espeak git
+       - run: sudo pip install --upgrade pip
+       - run: sudo pip install -e .
+       - run: |
+           sudo pip install --quiet --upgrade cardboardlint pylint
+           cardboardlinter --refspec ${CIRCLE_BRANCH} -n auto
+       - run: nosetests tests --nocapture
+       - run: |
+           sudo ./tests/test_server_package.sh
+           sudo ./tests/test_glow-tts_train.sh
+           sudo ./tests/test_server_package.sh
+           sudo ./tests/test_tacotron_train.sh
+           sudo ./tests/test_vocoder_gan_train.sh
+           sudo ./tests/test_vocoder_wavegrad_train.sh
+           sudo ./tests/test_vocoder_wavernn_train.sh
+           sudo ./tests/test_speedy_speech_train.sh
+
+   test-3.7:
+     <<: *test-template
+     docker:
+       - image: circleci/python:3.7
+
+   test-3.8:
+     <<: *test-template
+     docker:
+       - image: circleci/python:3.8
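
Note how `test-3.6` defines a YAML anchor (`&test-template`) that `test-3.7` and `test-3.8` merge in via `<<: *test-template`, overriding only the Docker image. A quick way to catch indentation or anchor mistakes before pushing is the CircleCI CLI's validator (a sketch; assumes the `circleci` CLI is installed locally):

```bash
# Validate the workflow file offline; exits non-zero on schema errors.
circleci config validate .circleci/config.yml
```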
.compute ADDED
@@ -0,0 +1,17 @@
+ #!/bin/bash
+ yes | apt-get install sox
+ yes | apt-get install ffmpeg
+ yes | apt-get install espeak
+ yes | apt-get install tmux
+ yes | apt-get install zsh
+ sh -c "$(curl -fsSL https://raw.githubusercontent.com/robbyrussell/oh-my-zsh/master/tools/install.sh)"
+ pip3 install https://download.pytorch.org/whl/cu100/torch-1.3.0%2Bcu100-cp36-cp36m-linux_x86_64.whl
+ sudo sh install.sh
+ # pip install pytorch==1.7.0+cu100
+ # python3 setup.py develop
+ # python3 distribute.py --config_path config.json --data_path /data/ro/shared/data/keithito/LJSpeech-1.1/
+ # cp -R ${USER_DIR}/Mozilla_22050 ../tmp/
+ # python3 distribute.py --config_path config_tacotron_gst.json --data_path ../tmp/Mozilla_22050/
+ # python3 distribute.py --config_path config.json --data_path /data/rw/home/LibriTTS/train-clean-360
+ # python3 distribute.py --config_path config.json
+ while true; do sleep 1000000; done
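
The script pins a CUDA 10.0 build of PyTorch 1.3.0 and ends with an infinite `sleep` loop so the compute instance stays alive after provisioning. A quick sanity check after it runs (a sketch; assumes a CUDA 10.0-capable driver on the machine):

```bash
# Confirm the pinned wheel imports and can see the GPU.
python3 -c "import torch; print(torch.__version__, torch.cuda.is_available())"
```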
.dockerignore ADDED
@@ -0,0 +1 @@
+ .git/
.github/ISSUE_TEMPLATE.md ADDED
@@ -0,0 +1,19 @@
+ ---
+ name: 'TTS Discourse'
+ about: Please consider using the TTS Discourse page.
+ title: ''
+ labels: ''
+ assignees: ''
+
+ ---
+ <b>Questions</b> will not be answered here!!
+
+ Help is much more valuable if it's shared publicly, so that more people can benefit from it.
+
+ Please consider posting on the [TTS Discourse](https://discourse.mozilla.org/c/tts) page or the Matrix [chat room](https://matrix.to/#/!KTePhNahjgiVumkqca:matrix.org?via=matrix.org) if your issue is not directly related to TTS development (bugs, code updates, etc.).
+
+ You can also check https://github.com/mozilla/TTS/wiki/FAQ for common questions and answers.
+
+ Happy posting!
+
+ https://discourse.mozilla.org/c/tts
.github/PR_TEMPLATE.md ADDED
@@ -0,0 +1,18 @@
+ ---
+ name: 'Contribution Guideline'
+ about: Refer to the Contribution Guideline
+ title: ''
+ labels: ''
+ assignees: ''
+
+ ---
+ ### Contribution Guideline
+
+ Please send your PRs to the `dev` branch unless they are directly related to a specific branch.
+ Before making a Pull Request, check your changes for basic mistakes and style problems by using a linter.
+ We have cardboardlinter set up in this repository, so for example, if you've made some changes and would like to run the linter on just the changed code, you can use the following command:
+
+ ```bash
+ pip install pylint cardboardlint
+ cardboardlinter --refspec master
+ ```
.github/stale.yml ADDED
@@ -0,0 +1,19 @@
+ # Number of days of inactivity before an issue becomes stale
+ daysUntilStale: 60
+ # Number of days of inactivity before a stale issue is closed
+ daysUntilClose: 7
+ # Issues with these labels will never be considered stale
+ exemptLabels:
+   - pinned
+   - security
+ # Label to use when marking an issue as stale
+ staleLabel: wontfix
+ # Comment to post when marking an issue as stale. Set to `false` to disable
+ markComment: >
+   This issue has been automatically marked as stale because it has not had
+   recent activity. It will be closed if no further activity occurs. Thank you
+   for your contributions. You might also check our Discourse page for further help:
+   https://discourse.mozilla.org/c/tts
+ # Comment to post when closing a stale issue. Set to `false` to disable
+ closeComment: false
+
.gitignore ADDED
@@ -0,0 +1,132 @@
+ WadaSNR/
+ .idea/
+ *.pyc
+ .DS_Store
+ ./__init__.py
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ .hypothesis/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ .static_storage/
+ .media/
+ local_settings.py
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # pyenv
+ .python-version
+
+ # celery beat schedule file
+ celerybeat-schedule
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+
+ # vim
+ *.swp
+ *.swm
+ *.swn
+ *.swo
+
+ # pytorch models
+ *.pth.tar
+ result/
+
+ # setup.py
+ version.py
+
+ # jupyter dummy files
+ core
+
+ tests/outputs/*
+ TODO.txt
+ .vscode/*
+ data/*
+ notebooks/data/*
+ TTS/tts/layers/glow_tts/monotonic_align/core.c
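
When a pattern in this file doesn't behave as expected, `git check-ignore -v` reports which rule matched a given path (the path below is only an example):

```bash
# Print the .gitignore source file, line number, and pattern that
# causes a path to be ignored.
git check-ignore -v tests/outputs/example/checkpoint.pth.tar
```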
.pylintrc ADDED
@@ -0,0 +1,586 @@
+ [MASTER]
+
+ # A comma-separated list of package or module names from where C extensions may
+ # be loaded. Extensions are loading into the active Python interpreter and may
+ # run arbitrary code.
+ extension-pkg-whitelist=
+
+ # Add files or directories to the blacklist. They should be base names, not
+ # paths.
+ ignore=CVS
+
+ # Add files or directories matching the regex patterns to the blacklist. The
+ # regex matches against base names, not paths.
+ ignore-patterns=
+
+ # Python code to execute, usually for sys.path manipulation such as
+ # pygtk.require().
+ #init-hook=
+
+ # Use multiple processes to speed up Pylint. Specifying 0 will auto-detect the
+ # number of processors available to use.
+ jobs=1
+
+ # Control the amount of potential inferred values when inferring a single
+ # object. This can help the performance when dealing with large functions or
+ # complex, nested conditions.
+ limit-inference-results=100
+
+ # List of plugins (as comma separated values of python modules names) to load,
+ # usually to register additional checkers.
+ load-plugins=
+
+ # Pickle collected data for later comparisons.
+ persistent=yes
+
+ # Specify a configuration file.
+ #rcfile=
+
+ # When enabled, pylint would attempt to guess common misconfiguration and emit
+ # user-friendly hints instead of false-positive error messages.
+ suggestion-mode=yes
+
+ # Allow loading of arbitrary C extensions. Extensions are imported into the
+ # active Python interpreter and may run arbitrary code.
+ unsafe-load-any-extension=no
+
+
+ [MESSAGES CONTROL]
+
+ # Only show warnings with the listed confidence levels. Leave empty to show
+ # all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED.
+ confidence=
+
+ # Disable the message, report, category or checker with the given id(s). You
+ # can either give multiple identifiers separated by comma (,) or put this
+ # option multiple times (only on the command line, not in the configuration
+ # file where it should appear only once). You can also use "--disable=all" to
+ # disable everything first and then reenable specific checks. For example, if
+ # you want to run only the similarities checker, you can use "--disable=all
+ # --enable=similarities". If you want to run only the classes checker, but have
+ # no Warning level messages displayed, use "--disable=all --enable=classes
+ # --disable=W".
+ disable=missing-docstring,
+         line-too-long,
+         fixme,
+         wrong-import-order,
+         ungrouped-imports,
+         wrong-import-position,
+         import-error,
+         invalid-name,
+         too-many-instance-attributes,
+         arguments-differ,
+         no-name-in-module,
+         no-member,
+         unsubscriptable-object,
+         print-statement,
+         parameter-unpacking,
+         unpacking-in-except,
+         old-raise-syntax,
+         backtick,
+         long-suffix,
+         old-ne-operator,
+         old-octal-literal,
+         import-star-module-level,
+         non-ascii-bytes-literal,
+         raw-checker-failed,
+         bad-inline-option,
+         locally-disabled,
+         file-ignored,
+         suppressed-message,
+         useless-suppression,
+         deprecated-pragma,
+         use-symbolic-message-instead,
+         useless-object-inheritance,
+         too-few-public-methods,
+         too-many-branches,
+         too-many-arguments,
+         too-many-locals,
+         too-many-statements,
+         apply-builtin,
+         basestring-builtin,
+         buffer-builtin,
+         cmp-builtin,
+         coerce-builtin,
+         execfile-builtin,
+         file-builtin,
+         long-builtin,
+         raw_input-builtin,
+         reduce-builtin,
+         standarderror-builtin,
+         unicode-builtin,
+         xrange-builtin,
+         coerce-method,
+         delslice-method,
+         getslice-method,
+         setslice-method,
+         no-absolute-import,
+         old-division,
+         dict-iter-method,
+         dict-view-method,
+         next-method-called,
+         metaclass-assignment,
+         indexing-exception,
+         raising-string,
+         reload-builtin,
+         oct-method,
+         hex-method,
+         nonzero-method,
+         cmp-method,
+         input-builtin,
+         round-builtin,
+         intern-builtin,
+         unichr-builtin,
+         map-builtin-not-iterating,
+         zip-builtin-not-iterating,
+         range-builtin-not-iterating,
+         filter-builtin-not-iterating,
+         using-cmp-argument,
+         eq-without-hash,
+         div-method,
+         idiv-method,
+         rdiv-method,
+         exception-message-attribute,
+         invalid-str-codec,
+         sys-max-int,
+         bad-python3-import,
+         deprecated-string-function,
+         deprecated-str-translate-call,
+         deprecated-itertools-function,
+         deprecated-types-field,
+         next-method-defined,
+         dict-items-not-iterating,
+         dict-keys-not-iterating,
+         dict-values-not-iterating,
+         deprecated-operator-function,
+         deprecated-urllib-function,
+         xreadlines-attribute,
+         deprecated-sys-function,
+         exception-escape,
+         comprehension-escape,
+         duplicate-code
+
+ # Enable the message, report, category or checker with the given id(s). You can
+ # either give multiple identifier separated by comma (,) or put this option
+ # multiple time (only on the command line, not in the configuration file where
+ # it should appear only once). See also the "--disable" option for examples.
+ enable=c-extension-no-member
+
+
+ [REPORTS]
+
+ # Python expression which should return a note less than 10 (10 is the highest
+ # note). You have access to the variables errors warning, statement which
+ # respectively contain the number of errors / warnings messages and the total
+ # number of statements analyzed. This is used by the global evaluation report
+ # (RP0004).
+ evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)
+
+ # Template used to display messages. This is a python new-style format string
+ # used to format the message information. See doc for all details.
+ #msg-template=
+
+ # Set the output format. Available formats are text, parseable, colorized, json
+ # and msvs (visual studio). You can also give a reporter class, e.g.
+ # mypackage.mymodule.MyReporterClass.
+ output-format=text
+
+ # Tells whether to display a full report or only the messages.
+ reports=no
+
+ # Activate the evaluation score.
+ score=yes
+
+
+ [REFACTORING]
+
+ # Maximum number of nested blocks for function / method body
+ max-nested-blocks=5
+
+ # Complete name of functions that never returns. When checking for
+ # inconsistent-return-statements if a never returning function is called then
+ # it will be considered as an explicit return statement and no message will be
+ # printed.
+ never-returning-functions=sys.exit
+
+
+ [LOGGING]
+
+ # Format style used to check logging format string. `old` means using %
+ # formatting, while `new` is for `{}` formatting.
+ logging-format-style=old
+
+ # Logging modules to check that the string format arguments are in logging
+ # function parameter format.
+ logging-modules=logging
+
+
+ [SPELLING]
+
+ # Limits count of emitted suggestions for spelling mistakes.
+ max-spelling-suggestions=4
+
+ # Spelling dictionary name. Available dictionaries: none. To make it working
+ # install python-enchant package..
+ spelling-dict=
+
+ # List of comma separated words that should not be checked.
+ spelling-ignore-words=
+
+ # A path to a file that contains private dictionary; one word per line.
+ spelling-private-dict-file=
+
+ # Tells whether to store unknown words to indicated private dictionary in
+ # --spelling-private-dict-file option instead of raising a message.
+ spelling-store-unknown-words=no
+
+
+ [MISCELLANEOUS]
+
+ # List of note tags to take in consideration, separated by a comma.
+ notes=FIXME,
+       XXX,
+       TODO
+
+
+ [TYPECHECK]
+
+ # List of decorators that produce context managers, such as
+ # contextlib.contextmanager. Add to this list to register other decorators that
+ # produce valid context managers.
+ contextmanager-decorators=contextlib.contextmanager
+
+ # List of members which are set dynamically and missed by pylint inference
+ # system, and so shouldn't trigger E1101 when accessed. Python regular
+ # expressions are accepted.
+ generated-members=
+
+ # Tells whether missing members accessed in mixin class should be ignored. A
+ # mixin class is detected if its name ends with "mixin" (case insensitive).
+ ignore-mixin-members=yes
+
+ # Tells whether to warn about missing members when the owner of the attribute
+ # is inferred to be None.
+ ignore-none=yes
+
+ # This flag controls whether pylint should warn about no-member and similar
+ # checks whenever an opaque object is returned when inferring. The inference
+ # can return multiple potential results while evaluating a Python object, but
+ # some branches might not be evaluated, which results in partial inference. In
+ # that case, it might be useful to still emit no-member and other checks for
+ # the rest of the inferred objects.
+ ignore-on-opaque-inference=yes
+
+ # List of class names for which member attributes should not be checked (useful
+ # for classes with dynamically set attributes). This supports the use of
+ # qualified names.
+ ignored-classes=optparse.Values,thread._local,_thread._local
+
+ # List of module names for which member attributes should not be checked
+ # (useful for modules/projects where namespaces are manipulated during runtime
+ # and thus existing member attributes cannot be deduced by static analysis. It
+ # supports qualified module names, as well as Unix pattern matching.
+ ignored-modules=
+
+ # Show a hint with possible names when a member name was not found. The aspect
+ # of finding the hint is based on edit distance.
+ missing-member-hint=yes
+
+ # The minimum edit distance a name should have in order to be considered a
+ # similar match for a missing member name.
+ missing-member-hint-distance=1
+
+ # The total number of similar names that should be taken in consideration when
+ # showing a hint for a missing member.
+ missing-member-max-choices=1
+
+
+ [VARIABLES]
+
+ # List of additional names supposed to be defined in builtins. Remember that
+ # you should avoid defining new builtins when possible.
+ additional-builtins=
+
+ # Tells whether unused global variables should be treated as a violation.
+ allow-global-unused-variables=yes
+
+ # List of strings which can identify a callback function by name. A callback
+ # name must start or end with one of those strings.
+ callbacks=cb_,
+           _cb
+
+ # A regular expression matching the name of dummy variables (i.e. expected to
+ # not be used).
+ dummy-variables-rgx=_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_
+
+ # Argument names that match this expression will be ignored. Default to name
+ # with leading underscore.
+ ignored-argument-names=_.*|^ignored_|^unused_
+
+ # Tells whether we should check for unused import in __init__ files.
+ init-import=no
+
+ # List of qualified module names which can have objects that can redefine
+ # builtins.
+ redefining-builtins-modules=six.moves,past.builtins,future.builtins,builtins,io
+
+
+ [FORMAT]
+
+ # Expected format of line ending, e.g. empty (any line ending), LF or CRLF.
+ expected-line-ending-format=
+
+ # Regexp for a line that is allowed to be longer than the limit.
+ ignore-long-lines=^\s*(# )?<?https?://\S+>?$
+
+ # Number of spaces of indent required inside a hanging or continued line.
+ indent-after-paren=4
+
+ # String used as indentation unit. This is usually "    " (4 spaces) or "\t" (1
+ # tab).
+ indent-string='    '
+
+ # Maximum number of characters on a single line.
+ max-line-length=100
+
+ # Maximum number of lines in a module.
+ max-module-lines=1000
+
+ # List of optional constructs for which whitespace checking is disabled. `dict-
+ # separator` is used to allow tabulation in dicts, etc.: {1 : 1,\n222: 2}.
+ # `trailing-comma` allows a space between comma and closing bracket: (a, ).
+ # `empty-line` allows space-only lines.
+ no-space-check=trailing-comma,
+                dict-separator
+
+ # Allow the body of a class to be on the same line as the declaration if body
+ # contains single statement.
+ single-line-class-stmt=no
+
+ # Allow the body of an if to be on the same line as the test if there is no
+ # else.
+ single-line-if-stmt=no
+
+
+ [SIMILARITIES]
+
+ # Ignore comments when computing similarities.
+ ignore-comments=yes
+
+ # Ignore docstrings when computing similarities.
+ ignore-docstrings=yes
+
+ # Ignore imports when computing similarities.
+ ignore-imports=no
+
+ # Minimum lines number of a similarity.
+ min-similarity-lines=4
+
+
+ [BASIC]
+
+ # Naming style matching correct argument names.
+ argument-naming-style=snake_case
+
+ # Regular expression matching correct argument names. Overrides argument-
+ # naming-style.
+ argument-rgx=[a-z_][a-z0-9_]{0,30}$
+
+ # Naming style matching correct attribute names.
+ attr-naming-style=snake_case
+
+ # Regular expression matching correct attribute names. Overrides attr-naming-
+ # style.
+ #attr-rgx=
+
+ # Bad variable names which should always be refused, separated by a comma.
+ bad-names=
+
+ # Naming style matching correct class attribute names.
+ class-attribute-naming-style=any
+
+ # Regular expression matching correct class attribute names. Overrides class-
+ # attribute-naming-style.
+ #class-attribute-rgx=
+
+ # Naming style matching correct class names.
+ class-naming-style=PascalCase
+
+ # Regular expression matching correct class names. Overrides class-naming-
+ # style.
+ #class-rgx=
+
+ # Naming style matching correct constant names.
+ const-naming-style=UPPER_CASE
+
+ # Regular expression matching correct constant names. Overrides const-naming-
+ # style.
+ #const-rgx=
+
+ # Minimum line length for functions/classes that require docstrings, shorter
+ # ones are exempt.
+ docstring-min-length=-1
+
+ # Naming style matching correct function names.
+ function-naming-style=snake_case
+
+ # Regular expression matching correct function names. Overrides function-
+ # naming-style.
+ #function-rgx=
+
+ # Good variable names which should always be accepted, separated by a comma.
+ good-names=i,
+            j,
+            k,
+            x,
+            ex,
+            Run,
+            _
+
+ # Include a hint for the correct naming format with invalid-name.
+ include-naming-hint=no
+
+ # Naming style matching correct inline iteration names.
+ inlinevar-naming-style=any
+
+ # Regular expression matching correct inline iteration names. Overrides
+ # inlinevar-naming-style.
+ #inlinevar-rgx=
+
+ # Naming style matching correct method names.
+ method-naming-style=snake_case
+
+ # Regular expression matching correct method names. Overrides method-naming-
+ # style.
+ #method-rgx=
+
+ # Naming style matching correct module names.
+ module-naming-style=snake_case
+
+ # Regular expression matching correct module names. Overrides module-naming-
+ # style.
+ #module-rgx=
+
+ # Colon-delimited sets of names that determine each other's naming style when
+ # the name regexes allow several styles.
+ name-group=
+
+ # Regular expression which should only match function or class names that do
+ # not require a docstring.
+ no-docstring-rgx=^_
+
+ # List of decorators that produce properties, such as abc.abstractproperty. Add
+ # to this list to register other decorators that produce valid properties.
+ # These decorators are taken in consideration only for invalid-name.
+ property-classes=abc.abstractproperty
+
+ # Naming style matching correct variable names.
+ variable-naming-style=snake_case
+
+ # Regular expression matching correct variable names. Overrides variable-
+ # naming-style.
+ variable-rgx=[a-z_][a-z0-9_]{0,30}$
+
+
+ [STRING]
+
+ # This flag controls whether the implicit-str-concat-in-sequence should
+ # generate a warning on implicit string concatenation in sequences defined over
+ # several lines.
+ check-str-concat-over-line-jumps=no
+
+
+ [IMPORTS]
+
+ # Allow wildcard imports from modules that define __all__.
+ allow-wildcard-with-all=no
+
+ # Analyse import fallback blocks. This can be used to support both Python 2 and
+ # 3 compatible code, which means that the block might have code that exists
+ # only in one or another interpreter, leading to false positives when analysed.
+ analyse-fallback-blocks=no
+
+ # Deprecated modules which should not be used, separated by a comma.
+ deprecated-modules=optparse,tkinter.tix
+
+ # Create a graph of external dependencies in the given file (report RP0402 must
+ # not be disabled).
+ ext-import-graph=
+
+ # Create a graph of every (i.e. internal and external) dependencies in the
+ # given file (report RP0402 must not be disabled).
+ import-graph=
+
+ # Create a graph of internal dependencies in the given file (report RP0402 must
+ # not be disabled).
+ int-import-graph=
+
+ # Force import order to recognize a module as part of the standard
+ # compatibility libraries.
+ known-standard-library=
+
+ # Force import order to recognize a module as part of a third party library.
+ known-third-party=enchant
+
+
+ [CLASSES]
+
+ # List of method names used to declare (i.e. assign) instance attributes.
+ defining-attr-methods=__init__,
+                       __new__,
+                       setUp
+
+ # List of member names, which should be excluded from the protected access
+ # warning.
+ exclude-protected=_asdict,
+                   _fields,
+                   _replace,
+                   _source,
+                   _make
+
+ # List of valid names for the first argument in a class method.
+ valid-classmethod-first-arg=cls
+
+ # List of valid names for the first argument in a metaclass class method.
+ valid-metaclass-classmethod-first-arg=cls
+
+
+ [DESIGN]
+
+ # Maximum number of arguments for function / method.
+ max-args=5
+
+ # Maximum number of attributes for a class (see R0902).
+ max-attributes=7
+
+ # Maximum number of boolean expressions in an if statement.
+ max-bool-expr=5
+
+ # Maximum number of branch for function / method body.
+ max-branches=12
+
+ # Maximum number of locals for function / method body.
+ max-locals=15
+
+ # Maximum number of parents for a class (see R0901).
+ max-parents=7
+
+ # Maximum number of public methods for a class (see R0904).
+ max-public-methods=20
+
+ # Maximum number of return / yield for function / method body.
+ max-returns=6
+
+ # Maximum number of statements in function / method body.
+ max-statements=50
+
+ # Minimum number of public methods for a class (see R0903).
+ min-public-methods=2
+
+
+ [EXCEPTIONS]
+
+ # Exceptions that will emit a warning when being caught. Defaults to
+ # "BaseException, Exception".
+ overgeneral-exceptions=BaseException,
+                        Exception
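
CI runs pylint through cardboardlint, but the same configuration can be used standalone; a minimal sketch (pylint picks up `.pylintrc` from the current directory on its own, `--rcfile` just makes it explicit):

```bash
# Lint the whole package with the repository's pylint configuration.
pylint --rcfile=.pylintrc TTS/
```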
CODE_OF_CONDUCT.md ADDED
@@ -0,0 +1,19 @@
+ # Ethical Notice
+
+ Please consider possible consequences and be mindful of any adversarial use cases of this project. In this regard, please contact us if you have any concerns.
+
+ # Community Participation Guidelines
+
+ This repository is governed by Mozilla's code of conduct and etiquette guidelines.
+ For more details, please read the
+ [Mozilla Community Participation Guidelines](https://www.mozilla.org/about/governance/policies/participation/).
+
+ ## How to Report
+ For more information on how to report violations of the Community Participation Guidelines, please read our '[How to Report](https://www.mozilla.org/about/governance/policies/participation/reporting/)' page.
+
+ <!--
+ ## Project Specific Etiquette
+
+ In some cases, there will be additional project etiquette i.e.: (https://bugzilla.mozilla.org/page.cgi?id=etiquette.html).
+ Please update for your project.
+ -->
CODE_OWNERS.rst ADDED
@@ -0,0 +1,75 @@
+ TTS code owners / governance system
+ ==========================================
+
+ TTS is run under a governance system inspired by (and partially copied from) the `Mozilla module ownership system <https://www.mozilla.org/about/governance/policies/module-ownership/>`_. The project is roughly divided into modules, and each module has its owners, who are responsible for reviewing pull requests and deciding on the technical direction of their modules. Module ownership authority is given to people who have worked extensively on areas of the project.
+
+ Module owners also have the authority to name other module owners or appoint module peers, who are people with authority to review pull requests in that module. They can also sub-divide their module into sub-modules with their own owners.
+
+ Module owners are not tyrants. They are chartered to make decisions with input from the community and in the best interest of the community. Module owners are not required to make code changes or additions solely because the community wants them to do so. (Like anyone else, the module owners may write code because they want to, because their employers want them to, because the community wants them to, or for some other reason.) Module owners do need to pay attention to patches submitted to that module. However, "pay attention" does not mean agreeing to every patch. Some patches may not make sense for the TTS project; some may be poorly implemented. Module owners have the authority to decline a patch; this is a necessary part of the role. We ask the module owners to describe in the relevant issue their reasons for wanting changes to a patch, for declining it altogether, or for postponing review for some period. We don't ask or expect them to rewrite patches to make them acceptable. Similarly, module owners may need to delay review of a promising patch due to an upcoming deadline. For example, a patch may be of interest, but not for the next milestone. In such a case it may make sense for the module owner to postpone review of a patch until after matters needed for a milestone have been finalized. Again, we expect this to be described in the relevant issue. And of course, it shouldn't go on very often or for very long, or escalation and review are likely.
+
+ The work of the various module owners and peers is overseen by the global owners, who are responsible for making final decisions in case of conflict between owners, as well as setting the direction for the project as a whole.
+
+ This file describes the module owners who are active on the project and which parts of the code they have expertise on (and interest in). If you're making changes to the code and are wondering who's an appropriate person to talk to, this list will tell you who to ping.
+
+ There's overlap in the areas of expertise of each owner, and in particular when looking at which files are covered by each area, there is a lot of overlap. Don't worry about getting it exactly right when requesting review; any code owner will be happy to redirect the request to a more appropriate person.
+
+ Global owners
+ ----------------
+
+ These are people who have worked on the project extensively and are familiar with all or most parts of it. Their expertise and review guidance is trusted by other code owners to cover their own areas of expertise. In case of conflicting opinions from other owners, global owners will make a final decision.
+
+ - Eren Gölge (@erogol)
+ - Reuben Morais (@reuben)
+
+ Training, feeding
+ -----------------
+
+ - Eren Gölge (@erogol)
+
+ Model exporting
+ ---------------
+
+ - Eren Gölge (@erogol)
+
+ Multi-Speaker TTS
+ -----------------
+
+ - Eren Gölge (@erogol)
+ - Edresson Casanova (@edresson)
+
+ TTS
+ ---
+
+ - Eren Gölge (@erogol)
+
+ Vocoders
+ --------
+
+ - Eren Gölge (@erogol)
+
+ Speaker Encoder
+ ---------------
+
+ - Eren Gölge (@erogol)
+
+ Testing & CI
+ ------------
+
+ - Eren Gölge (@erogol)
+ - Reuben Morais (@reuben)
+
+ Python bindings
+ ---------------
+
+ - Eren Gölge (@erogol)
+ - Reuben Morais (@reuben)
+
+ Documentation
+ -------------
+
+ - Eren Gölge (@erogol)
+
+ Third party bindings
+ --------------------
+
+ Owned by the author.
CONTRIBUTING.md ADDED
@@ -0,0 +1,51 @@
+ # Contribution guidelines
+
+ This repository is governed by Mozilla's code of conduct and etiquette guidelines. For more details, please read the [Mozilla Community Participation Guidelines](https://www.mozilla.org/about/governance/policies/participation/).
+
+ Before making a Pull Request, check your changes for basic mistakes and style problems by using a linter. We have cardboardlinter set up in this repository, so for example, if you've made some changes and would like to run the linter on just the differences between your work and master, you can use the following command:
+
+ ```bash
+ pip install pylint cardboardlint
+ cardboardlinter --refspec master
+ ```
+
+ This will compare the code against master and run the linter on all the changes. To run it automatically as a git pre-commit hook, you can do the following:
+
+ ```bash
+ cat <<\EOF > .git/hooks/pre-commit
+ #!/bin/bash
+ if [ ! -x "$(command -v cardboardlinter)" ]; then
+     exit 0
+ fi
+
+ # First, stash index and work dir, keeping only the
+ # to-be-committed changes in the working directory.
+ echo "Stashing working tree changes..." 1>&2
+ old_stash=$(git rev-parse -q --verify refs/stash)
+ git stash save -q --keep-index
+ new_stash=$(git rev-parse -q --verify refs/stash)
+
+ # If there were no changes (e.g., `--amend` or `--allow-empty`)
+ # then nothing was stashed, and we should skip everything,
+ # including the tests themselves. (Presumably the tests passed
+ # on the previous commit, so there is no need to re-run them.)
+ if [ "$old_stash" = "$new_stash" ]; then
+     echo "No changes, skipping lint." 1>&2
+     exit 0
+ fi
+
+ # Run tests
+ cardboardlinter --refspec HEAD -n auto
+ status=$?
+
+ # Restore changes
+ echo "Restoring working tree changes..." 1>&2
+ git reset --hard -q && git stash apply --index -q && git stash drop -q
+
+ # Exit with status from test-run: nonzero prevents commit
+ exit $status
+ EOF
+ chmod +x .git/hooks/pre-commit
+ ```
+
+ This will run the linters on just the changes made in your commit.
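
Once installed, the hook fires on every `git commit`; it can also be exercised by hand to check the stash-and-restore logic without actually committing:

```bash
# Run the hook directly; it lints staged changes and restores the
# working tree exactly as a real commit would.
.git/hooks/pre-commit
```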
LICENSE.txt ADDED
@@ -0,0 +1,373 @@
+ Mozilla Public License Version 2.0
+ ==================================
+
+ 1. Definitions
+ --------------
+
+ 1.1. "Contributor"
+     means each individual or legal entity that creates, contributes to
+     the creation of, or owns Covered Software.
+
+ 1.2. "Contributor Version"
+     means the combination of the Contributions of others (if any) used
+     by a Contributor and that particular Contributor's Contribution.
+
+ 1.3. "Contribution"
+     means Covered Software of a particular Contributor.
+
+ 1.4. "Covered Software"
+     means Source Code Form to which the initial Contributor has attached
+     the notice in Exhibit A, the Executable Form of such Source Code
+     Form, and Modifications of such Source Code Form, in each case
+     including portions thereof.
+
+ 1.5. "Incompatible With Secondary Licenses"
+     means
+
+     (a) that the initial Contributor has attached the notice described
+         in Exhibit B to the Covered Software; or
+
+     (b) that the Covered Software was made available under the terms of
+         version 1.1 or earlier of the License, but not also under the
+         terms of a Secondary License.
+
+ 1.6. "Executable Form"
+     means any form of the work other than Source Code Form.
+
+ 1.7. "Larger Work"
+     means a work that combines Covered Software with other material, in
+     a separate file or files, that is not Covered Software.
+
+ 1.8. "License"
+     means this document.
+
+ 1.9. "Licensable"
+     means having the right to grant, to the maximum extent possible,
+     whether at the time of the initial grant or subsequently, any and
+     all of the rights conveyed by this License.
+
+ 1.10. "Modifications"
+     means any of the following:
+
+     (a) any file in Source Code Form that results from an addition to,
+         deletion from, or modification of the contents of Covered
+         Software; or
+
+     (b) any new file in Source Code Form that contains any Covered
+         Software.
+
+ 1.11. "Patent Claims" of a Contributor
+     means any patent claim(s), including without limitation, method,
+     process, and apparatus claims, in any patent Licensable by such
+     Contributor that would be infringed, but for the grant of the
+     License, by the making, using, selling, offering for sale, having
+     made, import, or transfer of either its Contributions or its
+     Contributor Version.
+
+ 1.12. "Secondary License"
+     means either the GNU General Public License, Version 2.0, the GNU
+     Lesser General Public License, Version 2.1, the GNU Affero General
+     Public License, Version 3.0, or any later versions of those
+     licenses.
+
+ 1.13. "Source Code Form"
+     means the form of the work preferred for making modifications.
+
+ 1.14. "You" (or "Your")
+     means an individual or a legal entity exercising rights under this
+     License. For legal entities, "You" includes any entity that
+     controls, is controlled by, or is under common control with You. For
+     purposes of this definition, "control" means (a) the power, direct
+     or indirect, to cause the direction or management of such entity,
+     whether by contract or otherwise, or (b) ownership of more than
+     fifty percent (50%) of the outstanding shares or beneficial
+     ownership of such entity.
+
+ 2. License Grants and Conditions
+ --------------------------------
+
+ 2.1. Grants
+
+ Each Contributor hereby grants You a world-wide, royalty-free,
+ non-exclusive license:
+
+ (a) under intellectual property rights (other than patent or trademark)
+     Licensable by such Contributor to use, reproduce, make available,
+     modify, display, perform, distribute, and otherwise exploit its
+     Contributions, either on an unmodified basis, with Modifications, or
+     as part of a Larger Work; and
+
+ (b) under Patent Claims of such Contributor to make, use, sell, offer
+     for sale, have made, import, and otherwise transfer either its
+     Contributions or its Contributor Version.
+
+ 2.2. Effective Date
+
+ The licenses granted in Section 2.1 with respect to any Contribution
+ become effective for each Contribution on the date the Contributor first
+ distributes such Contribution.
+
+ 2.3. Limitations on Grant Scope
+
+ The licenses granted in this Section 2 are the only rights granted under
+ this License. No additional rights or licenses will be implied from the
+ distribution or licensing of Covered Software under this License.
+ Notwithstanding Section 2.1(b) above, no patent license is granted by a
+ Contributor:
+
+ (a) for any code that a Contributor has removed from Covered Software;
+     or
+
+ (b) for infringements caused by: (i) Your and any other third party's
+     modifications of Covered Software, or (ii) the combination of its
+     Contributions with other software (except as part of its Contributor
+     Version); or
+
+ (c) under Patent Claims infringed by Covered Software in the absence of
+     its Contributions.
+
+ This License does not grant any rights in the trademarks, service marks,
+ or logos of any Contributor (except as may be necessary to comply with
+ the notice requirements in Section 3.4).
+
+ 2.4. Subsequent Licenses
+
+ No Contributor makes additional grants as a result of Your choice to
+ distribute the Covered Software under a subsequent version of this
+ License (see Section 10.2) or under the terms of a Secondary License (if
+ permitted under the terms of Section 3.3).
+
+ 2.5. Representation
+
+ Each Contributor represents that the Contributor believes its
+ Contributions are its original creation(s) or it has sufficient rights
+ to grant the rights to its Contributions conveyed by this License.
+
+ 2.6. Fair Use
+
+ This License is not intended to limit any rights You have under
+ applicable copyright doctrines of fair use, fair dealing, or other
+ equivalents.
+
+ 2.7. Conditions
+
+ Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted
+ in Section 2.1.
+
+ 3. Responsibilities
+ -------------------
+
+ 3.1. Distribution of Source Form
+
+ All distribution of Covered Software in Source Code Form, including any
+ Modifications that You create or to which You contribute, must be under
+ the terms of this License. You must inform recipients that the Source
+ Code Form of the Covered Software is governed by the terms of this
+ License, and how they can obtain a copy of this License. You may not
+ attempt to alter or restrict the recipients' rights in the Source Code
+ Form.
+
+ 3.2. Distribution of Executable Form
+
+ If You distribute Covered Software in Executable Form then:
+
+ (a) such Covered Software must also be made available in Source Code
+     Form, as described in Section 3.1, and You must inform recipients of
+     the Executable Form how they can obtain a copy of such Source Code
+     Form by reasonable means in a timely manner, at a charge no more
+     than the cost of distribution to the recipient; and
+
+ (b) You may distribute such Executable Form under the terms of this
+     License, or sublicense it under different terms, provided that the
+     license for the Executable Form does not attempt to limit or alter
+     the recipients' rights in the Source Code Form under this License.
+
+ 3.3. Distribution of a Larger Work
+
+ You may create and distribute a Larger Work under terms of Your choice,
+ provided that You also comply with the requirements of this License for
+ the Covered Software. If the Larger Work is a combination of Covered
+ Software with a work governed by one or more Secondary Licenses, and the
+ Covered Software is not Incompatible With Secondary Licenses, this
+ License permits You to additionally distribute such Covered Software
+ under the terms of such Secondary License(s), so that the recipient of
+ the Larger Work may, at their option, further distribute the Covered
+ Software under the terms of either this License or such Secondary
+ License(s).
+
+ 3.4. Notices
+
+ You may not remove or alter the substance of any license notices
+ (including copyright notices, patent notices, disclaimers of warranty,
+ or limitations of liability) contained within the Source Code Form of
+ the Covered Software, except that You may alter any license notices to
+ the extent required to remedy known factual inaccuracies.
+
+ 3.5. Application of Additional Terms
+
+ You may choose to offer, and to charge a fee for, warranty, support,
+ indemnity or liability obligations to one or more recipients of Covered
+ Software. However, You may do so only on Your own behalf, and not on
+ behalf of any Contributor. You must make it absolutely clear that any
+ such warranty, support, indemnity, or liability obligation is offered by
+ You alone, and You hereby agree to indemnify every Contributor for any
+ liability incurred by such Contributor as a result of warranty, support,
+ indemnity or liability terms You offer. You may include additional
+ disclaimers of warranty and limitations of liability specific to any
+ jurisdiction.
+
+ 4. Inability to Comply Due to Statute or Regulation
+ ---------------------------------------------------
+
+ If it is impossible for You to comply with any of the terms of this
+ License with respect to some or all of the Covered Software due to
+ statute, judicial order, or regulation then You must: (a) comply with
+ the terms of this License to the maximum extent possible; and (b)
+ describe the limitations and the code they affect. Such description must
+ be placed in a text file included with all distributions of the Covered
+ Software under this License. Except to the extent prohibited by statute
+ or regulation, such description must be sufficiently detailed for a
+ recipient of ordinary skill to be able to understand it.
+
+ 5. Termination
+ --------------
+
+ 5.1. The rights granted under this License will terminate automatically
+ if You fail to comply with any of its terms. However, if You become
+ compliant, then the rights granted under this License from a particular
+ Contributor are reinstated (a) provisionally, unless and until such
+ Contributor explicitly and finally terminates Your grants, and (b) on an
+ ongoing basis, if such Contributor fails to notify You of the
+ non-compliance by some reasonable means prior to 60 days after You have
+ come back into compliance. Moreover, Your grants from a particular
+ Contributor are reinstated on an ongoing basis if such Contributor
+ notifies You of the non-compliance by some reasonable means, this is the
+ first time You have received notice of non-compliance with this License
+ from such Contributor, and You become compliant prior to 30 days after
+ Your receipt of the notice.
+
+ 5.2. If You initiate litigation against any entity by asserting a patent
+ infringement claim (excluding declaratory judgment actions,
+ counter-claims, and cross-claims) alleging that a Contributor Version
+ directly or indirectly infringes any patent, then the rights granted to
+ You by any and all Contributors for the Covered Software under Section
+ 2.1 of this License shall terminate.
+
+ 5.3. In the event of termination under Sections 5.1 or 5.2 above, all
+ end user license agreements (excluding distributors and resellers) which
+ have been validly granted by You or Your distributors under this License
+ prior to termination shall survive termination.
+
+ ************************************************************************
+ *                                                                      *
+ *  6. Disclaimer of Warranty                                           *
+ *  -------------------------                                           *
+ *                                                                      *
+ *  Covered Software is provided under this License on an "as is"       *
+ *  basis, without warranty of any kind, either expressed, implied, or  *
+ *  statutory, including, without limitation, warranties that the       *
+ *  Covered Software is free of defects, merchantable, fit for a        *
+ *  particular purpose or non-infringing. The entire risk as to the     *
+ *  quality and performance of the Covered Software is with You.        *
+ *  Should any Covered Software prove defective in any respect, You     *
+ *  (not any Contributor) assume the cost of any necessary servicing,   *
+ *  repair, or correction. This disclaimer of warranty constitutes an   *
+ *  essential part of this License. No use of any Covered Software is   *
+ *  authorized under this License except under this disclaimer.         *
+ *                                                                      *
+ ************************************************************************
+
+ ************************************************************************
+ *                                                                      *
+ *  7. Limitation of Liability                                          *
+ *  --------------------------                                          *
+ *                                                                      *
+ *  Under no circumstances and under no legal theory, whether tort      *
+ *  (including negligence), contract, or otherwise, shall any           *
+ *  Contributor, or anyone who distributes Covered Software as          *
+ *  permitted above, be liable to You for any direct, indirect,         *
+ *  special, incidental, or consequential damages of any character      *
+ *  including, without limitation, damages for lost profits, loss of    *
+ *  goodwill, work stoppage, computer failure or malfunction, or any    *
+ *  and all other commercial damages or losses, even if such party      *
+ *  shall have been informed of the possibility of such damages. This   *
+ *  limitation of liability shall not apply to liability for death or   *
+ *  personal injury resulting from such party's negligence to the       *
+ *  extent applicable law prohibits such limitation. Some               *
+ *  jurisdictions do not allow the exclusion or limitation of           *
+ *  incidental or consequential damages, so this exclusion and          *
+ *  limitation may not apply to You.                                    *
+ *                                                                      *
+ ************************************************************************
+
+ 8. Litigation
+ -------------
+
+ Any litigation relating to this License may be brought only in the
+ courts of a jurisdiction where the defendant maintains its principal
+ place of business and such litigation shall be governed by laws of that
+ jurisdiction, without reference to its conflict-of-law provisions.
+ Nothing in this Section shall prevent a party's ability to bring
+ cross-claims or counter-claims.
+
+ 9. Miscellaneous
+ ----------------
+
+ This License represents the complete agreement concerning the subject
+ matter hereof. If any provision of this License is held to be
+ unenforceable, such provision shall be reformed only to the extent
+ necessary to make it enforceable. Any law or regulation which provides
+ that the language of a contract shall be construed against the drafter
+ shall not be used to construe this License against a Contributor.
+
+ 10. Versions of the License
+ ---------------------------
+
+ 10.1. New Versions
+
+ Mozilla Foundation is the license steward. Except as provided in Section
+ 10.3, no one other than the license steward has the right to modify or
+ publish new versions of this License. Each version will be given a
+ distinguishing version number.
+
+ 10.2. Effect of New Versions
+
+ You may distribute the Covered Software under the terms of the version
+ of the License under which You originally received the Covered Software,
+ or under the terms of any subsequent version published by the license
+ steward.
+
+ 10.3. Modified Versions
+
+ If you create software not governed by this License, and you want to
+ create a new license for such software, you may create and use a
+ modified version of this License if you rename the license and remove
+ any references to the name of the license steward (except to note that
+ such modified license differs from this License).
+
+ 10.4. Distributing Source Code Form that is Incompatible With Secondary
+ Licenses
+
+ If You choose to distribute Source Code Form that is Incompatible With
+ Secondary Licenses under the terms of this version of the License, the
+ notice described in Exhibit B of this License must be attached.
+
+ Exhibit A - Source Code Form License Notice
+ -------------------------------------------
+
+ This Source Code Form is subject to the terms of the Mozilla Public
+ License, v. 2.0. If a copy of the MPL was not distributed with this
+ file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+ If it is not possible or desirable to put the notice in a particular
+ file, then You may include the notice in a location (such as a LICENSE
+ file in a relevant directory) where a recipient would be likely to look
+ for such a notice.
+
+ You may add additional accurate notices of copyright ownership.
+
+ Exhibit B - "Incompatible With Secondary Licenses" Notice
+ ---------------------------------------------------------
+
+ This Source Code Form is "Incompatible With Secondary Licenses", as
+ defined by the Mozilla Public License, v. 2.0.
MANIFEST.in ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ include README.md
2
+ include LICENSE.txt
3
+ include requirements.txt
4
+ recursive-include TTS *.json
5
+ recursive-include TTS *.html
6
+ recursive-include TTS *.png
7
+ recursive-include TTS *.md
8
+ recursive-include TTS *.py
9
+ recursive-include TTS *.pyx
10
+ recursive-include images *.png
11
+
README.md CHANGED
@@ -1,3 +1,281 @@
1
- ---
2
- license: apache-2.0
3
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <img src="https://user-images.githubusercontent.com/1402048/104139991-3fd15e00-53af-11eb-8640-3a78a64641dd.png" data-canonical-src="![TTS banner](https://user-images.githubusercontent.com/1402048/104139991-3fd15e00-53af-11eb-8640-3a78a64641dd.png =250x250)
2
+ " width="256" height="256" align="right" />
3
+
4
+ # TTS: Text-to-Speech for all.
5
+
6
+ TTS is a library for advanced Text-to-Speech generation. It's built on the latest research and designed to achieve the best trade-off among ease of training, speed and quality.
7
+ TTS comes with [pretrained models](https://github.com/mozilla/TTS/wiki/Released-Models) and tools for measuring dataset quality, and is already used in **20+ languages** for products and research projects.
8
+
9
+ [![CircleCI](<https://circleci.com/gh/mozilla/TTS/tree/dev.svg?style=svg>)](https://circleci.com/gh/mozilla/TTS/tree/dev)
10
+ [![License](<https://img.shields.io/badge/License-MPL%202.0-brightgreen.svg>)](https://opensource.org/licenses/MPL-2.0)
11
+ [![PyPI version](https://badge.fury.io/py/TTS.svg)](https://badge.fury.io/py/TTS)
12
+
13
+ :loudspeaker: [English Voice Samples](https://erogol.github.io/ddc-samples/) and [SoundCloud playlist](https://soundcloud.com/user-565970875/pocket-article-wavernn-and-tacotron2)
14
+
15
+ :man_cook: [TTS training recipes](https://github.com/erogol/TTS_recipes)
16
+
17
+ :page_facing_up: [Text-to-Speech paper collection](https://github.com/erogol/TTS-papers)
18
+
19
+ ## 💬 Where to ask questions
20
+ Please use our dedicated channels for questions and discussion. Help is much more valuable if it's shared publicly, so that more people can benefit from it.
21
+
22
+ | Type | Platforms |
23
+ | ------------------------------- | --------------------------------------- |
24
+ | 🚨 **Bug Reports** | [GitHub Issue Tracker] |
25
+ | ❔ **FAQ** | [TTS/Wiki](https://github.com/mozilla/TTS/wiki/FAQ) |
26
+ | 🎁 **Feature Requests & Ideas** | [GitHub Issue Tracker] |
27
+ | 👩‍💻 **Usage Questions** | [Discourse Forum] |
28
+ | 🗯 **General Discussion** | [Discourse Forum] and [Matrix Channel] |
29
+
30
+ [github issue tracker]: https://github.com/mozilla/tts/issues
31
+ [discourse forum]: https://discourse.mozilla.org/c/tts/
32
+ [matrix channel]: https://matrix.to/#/!KTePhNahjgiVumkqca:matrix.org?via=matrix.org
33
+ [Tutorials and Examples]: https://github.com/mozilla/TTS/wiki/TTS-Notebooks-and-Tutorials
34
+
35
+
36
+ ## 🔗 Links and Resources
37
+ | Type | Links |
38
+ | ------------------------------- | --------------------------------------- |
39
+ | 💾 **Installation** | [TTS/README.md](https://github.com/mozilla/TTS/tree/dev#install-tts)|
40
+ | 👩🏾‍🏫 **Tutorials and Examples** | [TTS/Wiki](https://github.com/mozilla/TTS/wiki/TTS-Notebooks-and-Tutorials) |
41
+ | 🚀 **Released Models** | [TTS/Wiki](https://github.com/mozilla/TTS/wiki/Released-Models)|
42
+ | 💻 **Docker Image** | [Repository by @synesthesiam](https://github.com/synesthesiam/docker-mozillatts)|
43
+ | 🖥️ **Demo Server** | [TTS/server](https://github.com/mozilla/TTS/tree/master/TTS/server)|
44
+ | 🤖 **Running TTS on Terminal** | [TTS/README.md](https://github.com/mozilla/TTS#example-synthesizing-speech-on-terminal-using-the-released-models)|
45
+ | ✨ **How to contribute** |[TTS/README.md](#contribution-guidelines)|
46
+
47
+ ## 🥇 TTS Performance
48
+ <p align="center"><img src="https://discourse-prod-uploads-81679984178418.s3.dualstack.us-west-2.amazonaws.com/optimized/3X/6/4/6428f980e9ec751c248e591460895f7881aec0c6_2_1035x591.png" width="800" /></p>
49
+
50
+ "Mozilla*" and "Judy*" are our models.
51
+ [Details...](https://github.com/mozilla/TTS/wiki/Mean-Opinion-Score-Results)
52
+
53
+ ## Features
54
+ - High performance Deep Learning models for Text2Speech tasks.
55
+ - Text2Spec models (Tacotron, Tacotron2, Glow-TTS, SpeedySpeech).
56
+ - Speaker Encoder to compute speaker embeddings efficiently.
57
+ - Vocoder models (MelGAN, Multiband-MelGAN, GAN-TTS, ParallelWaveGAN, WaveGrad, WaveRNN)
58
+ - Fast and efficient model training.
59
+ - Detailed training logs on console and Tensorboard.
60
+ - Support for multi-speaker TTS.
61
+ - Efficient Multi-GPUs training.
62
+ - Ability to convert PyTorch models to Tensorflow 2.0 and TFLite for inference.
63
+ - Released models in PyTorch, Tensorflow and TFLite.
64
+ - Tools to curate Text2Speech datasets under ```dataset_analysis```.
65
+ - Demo server for model testing.
66
+ - Notebooks for extensive model benchmarking.
67
+ - Modular (but not too much) code base enabling easy testing for new ideas.
68
+
69
+ ## Implemented Models
70
+ ### Text-to-Spectrogram
71
+ - Tacotron: [paper](https://arxiv.org/abs/1703.10135)
72
+ - Tacotron2: [paper](https://arxiv.org/abs/1712.05884)
73
+ - Glow-TTS: [paper](https://arxiv.org/abs/2005.11129)
74
+ - Speedy-Speech: [paper](https://arxiv.org/abs/2008.03802)
75
+
76
+ ### Attention Methods
77
+ - Guided Attention: [paper](https://arxiv.org/abs/1710.08969)
78
+ - Forward Backward Decoding: [paper](https://arxiv.org/abs/1907.09006)
79
+ - Graves Attention: [paper](https://arxiv.org/abs/1907.09006)
80
+ - Double Decoder Consistency: [blog](https://erogol.com/solving-attention-problems-of-tts-models-with-double-decoder-consistency/)
81
+
82
+ ### Speaker Encoder
83
+ - GE2E: [paper](https://arxiv.org/abs/1710.10467)
84
+ - Angular Loss: [paper](https://arxiv.org/pdf/2003.11982.pdf)
85
+
86
+ ### Vocoders
87
+ - MelGAN: [paper](https://arxiv.org/abs/1910.06711)
88
+ - MultiBandMelGAN: [paper](https://arxiv.org/abs/2005.05106)
89
+ - ParallelWaveGAN: [paper](https://arxiv.org/abs/1910.11480)
90
+ - GAN-TTS discriminators: [paper](https://arxiv.org/abs/1909.11646)
91
+ - WaveRNN: [origin](https://github.com/fatchord/WaveRNN/)
92
+ - WaveGrad: [paper](https://arxiv.org/abs/2009.00713)
93
+
94
+ You can also help us implement more models. Some TTS related work can be found [here](https://github.com/erogol/TTS-papers).
95
+
96
+ ## Install TTS
97
+ TTS supports **python >= 3.6, <3.9**.
98
+
99
+ If you are only interested in [synthesizing speech](https://github.com/mozilla/TTS/tree/dev#example-synthesizing-speech-on-terminal-using-the-released-models) with the released TTS models, installing from PyPI is the easiest option.
100
+
101
+ ```bash
102
+ pip install TTS
103
+ ```
104
+
105
+ If you plan to code or train models, clone TTS and install it locally.
106
+
107
+ ```bash
108
+ git clone https://github.com/mozilla/TTS
109
+ pip install -e .
110
+ ```
111
+
112
+ ## Directory Structure
113
+ ```
114
+ |- notebooks/ (Jupyter Notebooks for model evaluation, parameter selection and data analysis.)
115
+ |- utils/ (common utilities.)
116
+ |- TTS
117
+ |- bin/ (folder for all the executables.)
118
+ |- train*.py (train your target model.)
119
+ |- distribute.py (train your TTS model using Multiple GPUs.)
120
+ |- compute_statistics.py (compute dataset statistics for normalization.)
121
+ |- convert*.py (convert target torch model to TF.)
122
+ |- tts/ (text to speech models)
123
+ |- layers/ (model layer definitions)
124
+ |- models/ (model definitions)
125
+ |- tf/ (Tensorflow 2 utilities and model implementations)
126
+ |- utils/ (model specific utilities.)
127
+ |- speaker_encoder/ (Speaker Encoder models.)
128
+ |- (same)
129
+ |- vocoder/ (Vocoder models.)
130
+ |- (same)
131
+ ```
132
+
133
+ ## Sample Model Output
134
+ Below you can see the Tacotron model state after 16K iterations with batch size 32 on the LJSpeech dataset.
135
+
136
+ > "Recent research at Harvard has shown meditating for as little as 8 weeks can actually increase the grey matter in the parts of the brain responsible for emotional regulation and learning."
137
+
138
+ Audio examples: [soundcloud](https://soundcloud.com/user-565970875/pocket-article-wavernn-and-tacotron2)
139
+
140
+ <img src="images/example_model_output.png?raw=true" alt="example_output" width="400"/>
141
+
142
+ ## Datasets and Data-Loading
143
+ TTS provides a generic data loader that is easy to use with your custom dataset.
144
+ You just need to write a simple function to format the dataset. Check ```datasets/preprocess.py``` to see some examples.
145
+ After that, you need to set ```dataset``` fields in ```config.json```.
146
+
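+ As an illustration, a formatter for a pipe-separated ```metadata.csv``` with ```file_id|transcript``` rows could look like the minimal sketch below. The function name is hypothetical, not the library API; the ```[text, wav_path, speaker_name]``` item layout follows the formatters in ```datasets/preprocess.py```.
+
+ ```python
+ import os
+
+ def my_dataset(root_path, meta_file):
+     """Return a list of [text, wav_path, speaker_name] items for the data loader."""
+     items = []
+     with open(os.path.join(root_path, meta_file), encoding='utf-8') as f:
+         for line in f:
+             file_id, text = line.strip().split('|')
+             wav_path = os.path.join(root_path, 'wavs', file_id + '.wav')
+             items.append([text, wav_path, 'my_speaker'])  # single-speaker placeholder name
+     return items
+ ```
+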
147
+ Some of the public datasets to which we have successfully applied TTS:
148
+
149
+ - [LJ Speech](https://keithito.com/LJ-Speech-Dataset/)
150
+ - [Nancy](http://www.cstr.ed.ac.uk/projects/blizzard/2011/lessac_blizzard2011/)
151
+ - [TWEB](https://www.kaggle.com/bryanpark/the-world-english-bible-speech-dataset)
152
+ - [M-AI-Labs](http://www.caito.de/2019/01/the-m-ailabs-speech-dataset/)
153
+ - [LibriTTS](https://openslr.org/60/)
154
+ - [Spanish](https://drive.google.com/file/d/1Sm_zyBo67XHkiFhcRSQ4YaHPYM0slO_e/view?usp=sharing) - thx! @carlfm01
155
+
156
+ ## Example: Synthesizing Speech on Terminal Using the Released Models.
157
+
158
+ After the installation, TTS provides a CLI interface for synthesizing speech using pre-trained models. You can either use your own model or the released models under the TTS project.
159
+
160
+ Listing released TTS models.
161
+ ```bash
162
+ tts --list_models
163
+ ```
164
+
165
+ Run a TTS and a vocoder model from the released model list. (Simply copy and paste the full model names from the list as arguments to the command below.)
166
+ ```bash
167
+ tts --text "Text for TTS" \
168
+ --model_name "<type>/<language>/<dataset>/<model_name>" \
169
+ --vocoder_name "<type>/<language>/<dataset>/<model_name>" \
170
+ --out_path folder/to/save/output/
171
+ ```
172
+
173
+ Run your own TTS model (Using Griffin-Lim Vocoder)
174
+ ```bash
175
+ tts --text "Text for TTS" \
176
+ --model_path path/to/model.pth.tar \
177
+ --config_path path/to/config.json \
178
+ --out_path output/path/speech.wav
179
+ ```
180
+
181
+ Run your own TTS and Vocoder models
182
+ ```bash
183
+ tts --text "Text for TTS" \
184
+ --model_path path/to/model.pth.tar \
185
+ --config_path path/to/config.json \
186
+ --out_path output/path/speech.wav \
187
+ --vocoder_path path/to/vocoder.pth.tar \
188
+ --vocoder_config_path path/to/vocoder_config.json
189
+ ```
190
+
191
+ **Note:** You can use ```./TTS/bin/synthesize.py``` if you prefer running ```tts``` from the TTS project folder.
192
+
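+ The same pipeline is also callable from Python through the ```Synthesizer``` class used by ```TTS/bin/synthesize.py```. A minimal sketch (paths are placeholders; pass ```None``` for the vocoder arguments to fall back to Griffin-Lim):
+
+ ```python
+ from TTS.utils.synthesizer import Synthesizer
+
+ # load a TTS model and, optionally, a vocoder model
+ synthesizer = Synthesizer("path/to/model.pth.tar", "path/to/config.json",
+                           "path/to/vocoder.pth.tar", "path/to/vocoder_config.json",
+                           use_cuda=False)
+ wav = synthesizer.tts("Text for TTS")
+ synthesizer.save_wav(wav, "output/path/speech.wav")
+ ```
+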
193
+ ## Example: Training and Fine-tuning LJ-Speech Dataset
194
+ Here you can find a [Colab](https://gist.github.com/erogol/97516ad65b44dbddb8cd694953187c5b) notebook for a hands-on example, training LJSpeech. Or you can manually follow the guideline below.
195
+
196
+ To start with, split ```metadata.csv``` into train and validation subsets, respectively ```metadata_train.csv``` and ```metadata_val.csv```. Note that for text-to-speech, validation performance can be misleading, since the loss value does not directly measure voice quality to the human ear, nor does it measure the attention module's performance. Therefore, running the model on new sentences and listening to the results is the best way to go.
197
+
198
+ ```
199
+ shuf metadata.csv > metadata_shuf.csv
200
+ head -n 12000 metadata_shuf.csv > metadata_train.csv
201
+ tail -n 1100 metadata_shuf.csv > metadata_val.csv
202
+ ```
203
+
204
+ To train a new model, you need to write your own ```config.json``` defining the model details, training configuration and more (check the examples). Then call the corresponding train script.
205
+
206
+ For instance, in order to train a Tacotron or Tacotron2 model on the LJSpeech dataset, follow these steps.
207
+
208
+ ```bash
209
+ python TTS/bin/train_tacotron.py --config_path TTS/tts/configs/config.json
210
+ ```
211
+
212
+ To fine-tune a model, use ```--restore_path```.
213
+
214
+ ```bash
215
+ python TTS/bin/train_tacotron.py --config_path TTS/tts/configs/config.json --restore_path /path/to/your/model.pth.tar
216
+ ```
217
+
218
+ To continue an old training run, use ```--continue_path```.
219
+
220
+ ```bash
221
+ python TTS/bin/train_tacotron.py --continue_path /path/to/your/run_folder/
222
+ ```
223
+
224
+ For multi-GPU training, call ```distribute.py```. It runs any provided train script in a multi-GPU setting.
225
+
226
+ ```bash
227
+ CUDA_VISIBLE_DEVICES="0,1,4" python TTS/bin/distribute.py --script train_tacotron.py --config_path TTS/tts/configs/config.json
228
+ ```
229
+
230
+ Each run creates a new output folder containing the used ```config.json```, model checkpoints and Tensorboard logs.
231
+
232
+ In case of an error or interrupted execution, if there is no checkpoint yet under the output folder, the whole folder is going to be removed.
233
+
234
+ You can also use Tensorboard by pointing its ```--logdir``` argument to the experiment folder.
235
+
236
+ ## Contribution Guidelines
237
+ This repository is governed by Mozilla's code of conduct and etiquette guidelines. For more details, please read the [Mozilla Community Participation Guidelines.](https://www.mozilla.org/about/governance/policies/participation/)
238
+
239
+ 1. Create a new branch.
240
+ 2. Implement your changes.
241
+ 3. (if applicable) Add [Google Style](https://google.github.io/styleguide/pyguide.html#381-docstrings) docstrings.
242
+ 4. (if applicable) Implement a test case under ```tests``` folder.
243
+ 5. (Optional but preferred) Run tests.
244
+ ```bash
245
+ ./run_tests.sh
246
+ ```
247
+ 6. Run the linter.
248
+ ```bash
249
+ pip install pylint cardboardlint
250
+ cardboardlinter --refspec master
251
+ ```
252
+ 7. Send a PR to the ```dev``` branch and explain what the change is about.
253
+ 8. Let us discuss until we make it perfect :).
254
+ 9. We merge it to the ```dev``` branch once things look good.
255
+
256
+ Feel free to ping us at any step you need help using our communication channels.
257
+
258
+ ## Collaborative Experimentation Guide
259
+ If you would like to use TTS to try a new idea and share your experiments with the community, we urge you to follow the guidelines below for better collaboration.
260
+ (If you have an idea for better collaboration, let us know)
261
+ - Create a new branch.
262
+ - Open an issue pointing your branch.
263
+ - Explain your idea and experiment.
264
+ - Share your results regularly. (Tensorboard log files, audio results, visuals etc.)
265
+
266
+ ## Major TODOs
267
+ - [x] Implement the model.
268
+ - [x] Generate human-like speech on LJSpeech dataset.
269
+ - [x] Generate human-like speech on a different dataset (Nancy) (TWEB).
270
+ - [x] Train TTS with r=1 successfully.
271
+ - [x] Enable process-based distributed training. Similar to (https://github.com/fastai/imagenet-fast/).
272
+ - [x] Adapting Neural Vocoder. TTS works with WaveRNN and ParallelWaveGAN (https://github.com/erogol/WaveRNN and https://github.com/erogol/ParallelWaveGAN)
273
+ - [x] Multi-speaker embedding.
274
+ - [x] Model optimization (model export, model pruning etc.)
275
+
276
+ ### Acknowledgement
277
+ - https://github.com/keithito/tacotron (Dataset pre-processing)
278
+ - https://github.com/r9y9/tacotron_pytorch (Initial Tacotron architecture)
279
+ - https://github.com/kan-bayashi/ParallelWaveGAN (vocoder library)
280
+ - https://github.com/jaywalnut310/glow-tts (Original Glow-TTS implementation)
281
+ - https://github.com/fatchord/WaveRNN/ (Original WaveRNN implementation)
TTS/.models.json ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "tts_models":{
3
+ "en":{
4
+ "ljspeech":{
5
+ "glow-tts":{
6
+ "description": "",
7
+ "model_file": "1NFsfhH8W8AgcfJ-BsL8CYAwQfZ5k4T-n",
8
+ "config_file": "1IAROF3yy9qTK43vG_-R67y3Py9yYbD6t",
9
+ "stats_file": null,
10
+ "commit": ""
11
+ },
12
+ "tacotron2-DCA": {
13
+ "description": "",
14
+ "model_file": "1CFoPDQBnhfBFu2Gc0TBSJn8o-TuNKQn7",
15
+ "config_file": "1lWSscNfKet1zZSJCNirOn7v9bigUZ8C1",
16
+ "stats_file": "1qevpGRVHPmzfiRBNuugLMX62x1k7B5vK",
17
+ "commit": ""
18
+ },
19
+ "speedy-speech-wn":{
20
+ "description": "Speedy Speech model with wavenet decoder.",
21
+ "model_file": "1VXAwiq6N-Viq3rsSXlf43bdoi0jSvMAJ",
22
+ "config_file": "1KvZilhsNP3EumVggDcD46yd834eO5hR3",
23
+ "stats_file": "1Ju7apZ5JlgsVECcETL-GEx3DRoNzWfkR",
24
+ "commit": "77b6145"
25
+ }
26
+ }
27
+ },
28
+ "es":{
29
+ "mai":{
30
+ "tacotron2-DDC":{
31
+ "model_file": "1jZ4HvYcAXI5ZClke2iGA7qFQQJBXIovw",
32
+ "config_file": "1s7g4n-B73ChCB48AQ88_DV_8oyLth8r0",
33
+ "stats_file": "13st0CZ743v6Br5R5Qw_lH1OPQOr3M-Jv",
34
+ "commit": ""
35
+ }
36
+ }
37
+ },
38
+ "fr":{
39
+ "mai":{
40
+ "tacotron2-DDC":{
41
+ "model_file": "1qyxrrCyoXUvBG2lqVd0KqAlHj-2nZCgS",
42
+ "config_file": "1yECKeP2LI7tNv4E8yVNx1yLmCfTCpkqG",
43
+ "stats_file": "13st0CZ743v6Br5R5Qw_lH1OPQOr3M-Jv",
44
+ "commit": ""
45
+ }
46
+ }
47
+ }
48
+ },
49
+ "vocoder_models":{
50
+ "universal":{
51
+ "libri-tts":{
52
+ "wavegrad":{
53
+ "model_file": "1r2g90JaZsfCj9dJkI9ioIU6JCFMPRqi6",
54
+ "config_file": "1POrrLf5YEpZyjvWyMccj1nGCVc94mR6s",
55
+ "stats_file": "1Vwbv4t-N1i3jXqI0bgKAhShAEO097sK0",
56
+ "commit": "ea976b0"
57
+ },
58
+ "fullband-melgan":{
59
+ "model_file": "1Ty5DZdOc0F7OTGj9oJThYbL5iVu_2G0K",
60
+ "config_file": "1Rd0R_nRCrbjEdpOwq6XwZAktvugiBvmu",
61
+ "stats_file": "11oY3Tv0kQtxK_JPgxrfesa99maVXHNxU",
62
+ "commit": "4132240"
63
+ }
64
+ }
65
+ },
66
+ "en": {
67
+ "ljspeech":{
68
+ "mulitband-melgan":{
69
+ "model_file": "1Ty5DZdOc0F7OTGj9oJThYbL5iVu_2G0K",
70
+ "config_file": "1Rd0R_nRCrbjEdpOwq6XwZAktvugiBvmu",
71
+ "stats_file": "11oY3Tv0kQtxK_JPgxrfesa99maVXHNxU",
72
+ "commit": "ea976b0"
73
+ }
74
+ }
75
+ }
76
+ }
77
+ }
TTS/__init__.py ADDED
File without changes
TTS/bin/__init__.py ADDED
File without changes
TTS/bin/compute_attention_masks.py ADDED
@@ -0,0 +1,166 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import importlib
3
+ import os
4
+
5
+ import numpy as np
6
+ import torch
7
+ from torch.utils.data import DataLoader
8
+ from tqdm import tqdm
9
+ from argparse import RawTextHelpFormatter
10
+ from TTS.tts.datasets.TTSDataset import MyDataset
11
+ from TTS.tts.utils.generic_utils import setup_model
12
+ from TTS.tts.utils.io import load_checkpoint
13
+ from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols
14
+ from TTS.utils.audio import AudioProcessor
15
+ from TTS.utils.io import load_config
16
+
17
+
18
+ if __name__ == '__main__':
19
+ parser = argparse.ArgumentParser(
20
+ description='''Extract attention masks from trained Tacotron/Tacotron2 models.
21
+ These masks can be used for different purposes including training a TTS model with a Duration Predictor.\n\n'''
22
+
23
+ '''Each attention mask is written to the same path as the input wav file with ".npy" file extension.
24
+ (e.g. path/bla.wav (wav file) --> path/bla.npy (attention mask))\n'''
25
+
26
+ '''
27
+ Example run:
28
+ CUDA_VISIBLE_DEVICE="0" python TTS/bin/compute_attention_masks.py
29
+ --model_path /data/rw/home/Models/ljspeech-dcattn-December-14-2020_11+10AM-9d0e8c7/checkpoint_200000.pth.tar
30
+ --config_path /data/rw/home/Models/ljspeech-dcattn-December-14-2020_11+10AM-9d0e8c7/config.json
31
+ --dataset_metafile /root/LJSpeech-1.1/metadata.csv
32
+ --data_path /root/LJSpeech-1.1/
33
+ --batch_size 32
34
+ --dataset ljspeech
35
+ --use_cuda True
36
+ ''',
37
+ formatter_class=RawTextHelpFormatter
38
+ )
39
+ parser.add_argument('--model_path',
40
+ type=str,
41
+ required=True,
42
+ help='Path to Tacotron/Tacotron2 model file ')
43
+ parser.add_argument(
44
+ '--config_path',
45
+ type=str,
46
+ required=True,
47
+ help='Path to Tacotron/Tacotron2 config file.',
48
+ )
49
+ parser.add_argument('--dataset',
50
+ type=str,
51
+ default='',
52
+ required=True,
53
+ help='Target dataset processor name from TTS.tts.dataset.preprocess.')
54
+
55
+ parser.add_argument(
56
+ '--dataset_metafile',
57
+ type=str,
58
+ default='',
59
+ required=True,
60
+ help='Dataset metafile including file paths with transcripts.')
61
+ parser.add_argument(
62
+ '--data_path',
63
+ type=str,
64
+ default='',
65
+ help='Defines the data path. It overwrites config.json.')
66
+ parser.add_argument('--use_cuda',
67
+ type=bool,
68
+ default=False,
69
+ help="enable/disable cuda.")
70
+
71
+ parser.add_argument(
72
+ '--batch_size',
73
+ default=16,
74
+ type=int,
75
+ help='Batch size for the model. Use batch_size=1 if you have no CUDA.')
76
+ args = parser.parse_args()
77
+
78
+ C = load_config(args.config_path)
79
+ ap = AudioProcessor(**C.audio)
80
+
81
+ # if the vocabulary was passed, replace the default
82
+ if 'characters' in C.keys():
83
+ symbols, phonemes = make_symbols(**C.characters)
84
+
85
+ # load the model
86
+ num_chars = len(phonemes) if C.use_phonemes else len(symbols)
87
+ # TODO: handle multi-speaker
88
+ model = setup_model(num_chars, num_speakers=0, c=C)
89
+ model, _ = load_checkpoint(model, args.model_path, None, args.use_cuda)
90
+ model.eval()
91
+
92
+ # data loader
93
+ preprocessor = importlib.import_module('TTS.tts.datasets.preprocess')
94
+ preprocessor = getattr(preprocessor, args.dataset)
95
+ meta_data = preprocessor(args.data_path, args.dataset_metafile)
96
+ dataset = MyDataset(model.decoder.r,
97
+ C.text_cleaner,
98
+ compute_linear_spec=False,
99
+ ap=ap,
100
+ meta_data=meta_data,
101
+ tp=C.characters if 'characters' in C.keys() else None,
102
+ add_blank=C['add_blank'] if 'add_blank' in C.keys() else False,
103
+ use_phonemes=C.use_phonemes,
104
+ phoneme_cache_path=C.phoneme_cache_path,
105
+ phoneme_language=C.phoneme_language,
106
+ enable_eos_bos=C.enable_eos_bos_chars)
107
+
108
+ dataset.sort_items()
109
+ loader = DataLoader(dataset,
110
+ batch_size=args.batch_size,
111
+ num_workers=4,
112
+ collate_fn=dataset.collate_fn,
113
+ shuffle=False,
114
+ drop_last=False)
115
+
116
+ # compute attentions
117
+ file_paths = []
118
+ with torch.no_grad():
119
+ for data in tqdm(loader):
120
+ # setup input data
121
+ text_input = data[0]
122
+ text_lengths = data[1]
123
+ linear_input = data[3]
124
+ mel_input = data[4]
125
+ mel_lengths = data[5]
126
+ stop_targets = data[6]
127
+ item_idxs = data[7]
128
+
129
+ # dispatch data to GPU
130
+ if args.use_cuda:
131
+ text_input = text_input.cuda()
132
+ text_lengths = text_lengths.cuda()
133
+ mel_input = mel_input.cuda()
134
+ mel_lengths = mel_lengths.cuda()
135
+
136
+ mel_outputs, postnet_outputs, alignments, stop_tokens = model.forward(
137
+ text_input, text_lengths, mel_input)
138
+
139
+ alignments = alignments.detach()
140
+ for idx, alignment in enumerate(alignments):
141
+ item_idx = item_idxs[idx]
142
+ # interpolate if r > 1
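+ # (the decoder emits r frames per step, so the alignment is upsampled along time by r to match the mel length)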
143
+ alignment = torch.nn.functional.interpolate(
144
+ alignment.transpose(0, 1).unsqueeze(0),
145
+ size=None,
146
+ scale_factor=model.decoder.r,
147
+ mode='nearest',
148
+ align_corners=None,
149
+ recompute_scale_factor=None).squeeze(0).transpose(0, 1)
150
+ # remove paddings
151
+ alignment = alignment[:mel_lengths[idx], :text_lengths[idx]].cpu().numpy()
152
+ # set file paths
153
+ wav_file_name = os.path.basename(item_idx)
154
+ align_file_name = os.path.splitext(wav_file_name)[0] + '.npy'
155
+ file_path = item_idx.replace(wav_file_name, align_file_name)
156
+ # save output
157
+ file_paths.append([item_idx, file_path])
158
+ np.save(file_path, alignment)
159
+
160
+ # output metafile
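+ # each line maps a wav file to its attention mask: <wav_path>|<alignment_npy_path>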
161
+ metafile = os.path.join(args.data_path, "metadata_attn_mask.txt")
162
+
163
+ with open(metafile, "w") as f:
164
+ for p in file_paths:
165
+ f.write(f"{p[0]}|{p[1]}\n")
166
+ print(f" >> Metafile created: {metafile}")
TTS/bin/compute_embeddings.py ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import glob
3
+ import os
4
+
5
+ import numpy as np
6
+ from tqdm import tqdm
7
+
8
+ import torch
9
+ from TTS.speaker_encoder.model import SpeakerEncoder
10
+ from TTS.utils.audio import AudioProcessor
11
+ from TTS.utils.io import load_config
12
+ from TTS.tts.utils.speakers import save_speaker_mapping
13
+ from TTS.tts.datasets.preprocess import load_meta_data
14
+
15
+ parser = argparse.ArgumentParser(
16
+ description='Compute embedding vectors for each wav file in a dataset. If "target_dataset" is defined, it generates "speakers.json" necessary for training a multi-speaker model.')
17
+ parser.add_argument(
18
+ 'model_path',
19
+ type=str,
20
+ help='Path to model outputs (checkpoint, tensorboard etc.).')
21
+ parser.add_argument(
22
+ 'config_path',
23
+ type=str,
24
+ help='Path to config file for training.',
25
+ )
26
+ parser.add_argument(
27
+ 'data_path',
28
+ type=str,
29
+ help='Data path for wav files - directory or CSV file')
30
+ parser.add_argument(
31
+ 'output_path',
32
+ type=str,
33
+ help='path for training outputs.')
34
+ parser.add_argument(
35
+ '--target_dataset',
36
+ type=str,
37
+ default='',
38
+ help='Target dataset to pick a processor from TTS.tts.dataset.preprocess. Necessary to create a speakers.json file.'
39
+ )
40
+ parser.add_argument(
41
+ '--use_cuda', type=bool, help='flag to set cuda.', default=False
42
+ )
43
+ parser.add_argument(
44
+ '--separator', type=str, help='Separator used in file if CSV is passed for data_path', default='|'
45
+ )
46
+ args = parser.parse_args()
47
+
48
+
49
+ c = load_config(args.config_path)
50
+ ap = AudioProcessor(**c['audio'])
51
+
52
+ data_path = args.data_path
53
+ split_ext = os.path.splitext(data_path)
54
+ sep = args.separator
55
+
56
+ if args.target_dataset != '':
57
+ # if target dataset is defined
58
+ dataset_config = [
59
+ {
60
+ "name": args.target_dataset,
61
+ "path": args.data_path,
62
+ "meta_file_train": None,
63
+ "meta_file_val": None
64
+ },
65
+ ]
66
+ wav_files, _ = load_meta_data(dataset_config, eval_split=False)
67
+ output_files = [wav_file[1].replace(data_path, args.output_path).replace(
68
+ '.wav', '.npy') for wav_file in wav_files]
69
+ else:
70
+ # if target dataset is not defined
71
+ if len(split_ext) > 0 and split_ext[1].lower() == '.csv':
72
+ # Parse CSV
73
+ print(f'CSV file: {data_path}')
74
+ with open(data_path) as f:
75
+ wav_path = os.path.join(os.path.dirname(data_path), 'wavs')
76
+ wav_files = []
77
+ print(f'Separator is: {sep}')
78
+ for line in f:
79
+ components = line.split(sep)
80
+ if len(components) != 2:
81
+ print("Invalid line")
82
+ continue
83
+ wav_file = os.path.join(wav_path, components[0] + '.wav')
84
+ #print(f'wav_file: {wav_file}')
85
+ if os.path.exists(wav_file):
86
+ wav_files.append(wav_file)
87
+ print(f'Count of wavs imported: {len(wav_files)}')
88
+ else:
89
+ # Parse all wav files in data_path
90
+ wav_files = glob.glob(data_path + '/**/*.wav', recursive=True)
91
+
92
+ output_files = [wav_file.replace(data_path, args.output_path).replace(
93
+ '.wav', '.npy') for wav_file in wav_files]
94
+
95
+ for output_file in output_files:
96
+ os.makedirs(os.path.dirname(output_file), exist_ok=True)
97
+
98
+ # define Encoder model
99
+ model = SpeakerEncoder(**c.model)
100
+ model.load_state_dict(torch.load(args.model_path)['model'])
101
+ model.eval()
102
+ if args.use_cuda:
103
+ model.cuda()
104
+
105
+ # compute speaker embeddings
106
+ speaker_mapping = {}
107
+ for idx, wav_file in enumerate(tqdm(wav_files)):
108
+ if isinstance(wav_file, list):
109
+ speaker_name = wav_file[2]
110
+ wav_file = wav_file[1]
111
+
112
+ mel_spec = ap.melspectrogram(ap.load_wav(wav_file, sr=ap.sample_rate)).T
113
+ mel_spec = torch.FloatTensor(mel_spec[None, :, :])
114
+ if args.use_cuda:
115
+ mel_spec = mel_spec.cuda()
116
+ embedd = model.compute_embedding(mel_spec)
117
+ embedd = embedd.detach().cpu().numpy()
118
+ np.save(output_files[idx], embedd)
119
+
120
+ if args.target_dataset != '':
121
+ # create speaker_mapping if target dataset is defined
122
+ wav_file_name = os.path.basename(wav_file)
123
+ speaker_mapping[wav_file_name] = {}
124
+ speaker_mapping[wav_file_name]['name'] = speaker_name
125
+ speaker_mapping[wav_file_name]['embedding'] = embedd.flatten().tolist()
126
+
127
+ if args.target_dataset != '':
128
+ # save speaker_mapping if target dataset is defined
129
+ mapping_file_path = os.path.join(args.output_path, 'speakers.json')
130
+ save_speaker_mapping(args.output_path, speaker_mapping)
TTS/bin/compute_statistics.py ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+ import os
5
+ import glob
6
+ import argparse
7
+
8
+ import numpy as np
9
+ from tqdm import tqdm
10
+
11
+ from TTS.tts.datasets.preprocess import load_meta_data
12
+ from TTS.utils.io import load_config
13
+ from TTS.utils.audio import AudioProcessor
14
+
15
+
16
+ def main():
17
+ """Run preprocessing process."""
18
+ parser = argparse.ArgumentParser(
19
+ description="Compute mean and variance of spectrogtram features.")
20
+ parser.add_argument("--config_path", type=str, required=True,
21
+ help="TTS config file path to define audio processin parameters.")
22
+ parser.add_argument("--out_path", default=None, type=str,
23
+ help="directory to save the output file.")
24
+ args = parser.parse_args()
25
+
26
+ # load config
27
+ CONFIG = load_config(args.config_path)
28
+ CONFIG.audio['signal_norm'] = False # do not apply earlier normalization
29
+ CONFIG.audio['stats_path'] = None # discard pre-defined stats
30
+
31
+ # load audio processor
32
+ ap = AudioProcessor(**CONFIG.audio)
33
+
34
+ # load the meta data of target dataset
35
+ if 'data_path' in CONFIG.keys():
36
+ dataset_items = glob.glob(os.path.join(CONFIG.data_path, '**', '*.wav'), recursive=True)
37
+ else:
38
+ dataset_items = load_meta_data(CONFIG.datasets)[0] # take only train data
39
+ print(f" > There are {len(dataset_items)} files.")
40
+
41
+ mel_sum = 0
42
+ mel_square_sum = 0
43
+ linear_sum = 0
44
+ linear_square_sum = 0
45
+ N = 0
46
+ for item in tqdm(dataset_items):
47
+ # compute features
48
+ wav = ap.load_wav(item if isinstance(item, str) else item[1])
49
+ linear = ap.spectrogram(wav)
50
+ mel = ap.melspectrogram(wav)
51
+
52
+ # compute stats
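+ # accumulate sufficient statistics per frequency bin: mean = sum(x)/N, std = sqrt(sum(x^2)/N - mean^2)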
53
+ N += mel.shape[1]
54
+ mel_sum += mel.sum(1)
55
+ linear_sum += linear.sum(1)
56
+ mel_square_sum += (mel ** 2).sum(axis=1)
57
+ linear_square_sum += (linear ** 2).sum(axis=1)
58
+
59
+ mel_mean = mel_sum / N
60
+ mel_scale = np.sqrt(mel_square_sum / N - mel_mean ** 2)
61
+ linear_mean = linear_sum / N
62
+ linear_scale = np.sqrt(linear_square_sum / N - linear_mean ** 2)
63
+
64
+ output_file_path = args.out_path
65
+ stats = {}
66
+ stats['mel_mean'] = mel_mean
67
+ stats['mel_std'] = mel_scale
68
+ stats['linear_mean'] = linear_mean
69
+ stats['linear_std'] = linear_scale
70
+
71
+ print(f' > Avg mel spec mean: {mel_mean.mean()}')
72
+ print(f' > Avg mel spec scale: {mel_scale.mean()}')
73
+ print(f' > Avg linear spec mean: {linear_mean.mean()}')
74
+ print(f' > Avg linear spec scale: {linear_scale.mean()}')
75
+
76
+ # set default config values for mean-var scaling
77
+ CONFIG.audio['stats_path'] = output_file_path
78
+ CONFIG.audio['signal_norm'] = True
79
+ # remove redundant values
80
+ del CONFIG.audio['max_norm']
81
+ del CONFIG.audio['min_level_db']
82
+ del CONFIG.audio['symmetric_norm']
83
+ del CONFIG.audio['clip_norm']
84
+ stats['audio_config'] = CONFIG.audio
85
+ np.save(output_file_path, stats, allow_pickle=True)
86
+ print(f' > stats saved to {output_file_path}')
87
+
88
+
89
+ if __name__ == "__main__":
90
+ main()
TTS/bin/convert_melgan_tflite.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Convert Tensorflow MelGAN model to TF-Lite binary
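+ # Usage: python convert_melgan_tflite.py --tf_model <tf_checkpoint> --config_path <config.json> --output_path <model.tflite>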
2
+
3
+ import argparse
4
+
5
+ from TTS.utils.io import load_config
6
+ from TTS.vocoder.tf.utils.generic_utils import setup_generator
7
+ from TTS.vocoder.tf.utils.io import load_checkpoint
8
+ from TTS.vocoder.tf.utils.tflite import convert_melgan_to_tflite
9
+
10
+
11
+ parser = argparse.ArgumentParser()
12
+ parser.add_argument('--tf_model',
13
+ type=str,
14
+ help='Path to target torch model to be converted to TF.')
15
+ parser.add_argument('--config_path',
16
+ type=str,
17
+ help='Path to config file of torch model.')
18
+ parser.add_argument('--output_path',
19
+ type=str,
20
+ help='path to tflite output binary.')
21
+ args = parser.parse_args()
22
+
23
+ # Set constants
24
+ CONFIG = load_config(args.config_path)
25
+
26
+ # load the model
27
+ model = setup_generator(CONFIG)
28
+ model.build_inference()
29
+ model = load_checkpoint(model, args.tf_model)
30
+
31
+ # create tflite model
32
+ tflite_model = convert_melgan_to_tflite(model, output_path=args.output_path)
TTS/bin/convert_melgan_torch_to_tf.py ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ from difflib import SequenceMatcher
3
+ import os
4
+
5
+ import numpy as np
6
+ import tensorflow as tf
7
+ import torch
8
+
9
+ from TTS.utils.io import load_config
10
+ from TTS.vocoder.tf.utils.convert_torch_to_tf_utils import (
11
+ compare_torch_tf, convert_tf_name, transfer_weights_torch_to_tf)
12
+ from TTS.vocoder.tf.utils.generic_utils import \
13
+ setup_generator as setup_tf_generator
14
+ from TTS.vocoder.tf.utils.io import save_checkpoint
15
+ from TTS.vocoder.utils.generic_utils import setup_generator
16
+
17
+ # prevent GPU use
18
+ os.environ['CUDA_VISIBLE_DEVICES'] = ''
19
+
20
+ # define args
21
+ parser = argparse.ArgumentParser()
22
+ parser.add_argument('--torch_model_path',
23
+ type=str,
24
+ help='Path to target torch model to be converted to TF.')
25
+ parser.add_argument('--config_path',
26
+ type=str,
27
+ help='Path to config file of torch model.')
28
+ parser.add_argument(
29
+ '--output_path',
30
+ type=str,
31
+ help='path to output file including file name to save TF model.')
32
+ args = parser.parse_args()
33
+
34
+ # load model config
35
+ config_path = args.config_path
36
+ c = load_config(config_path)
37
+ num_speakers = 0
38
+
39
+ # init torch model
40
+ model = setup_generator(c)
41
+ checkpoint = torch.load(args.torch_model_path,
42
+ map_location=torch.device('cpu'))
43
+ state_dict = checkpoint['model']
44
+ model.load_state_dict(state_dict)
45
+ model.remove_weight_norm()
46
+ state_dict = model.state_dict()
47
+
48
+ # init tf model
49
+ model_tf = setup_tf_generator(c)
50
+
51
+ common_sufix = '/.ATTRIBUTES/VARIABLE_VALUE'
52
+ # get tf_model graph by passing an input
53
+ # B x D x T
54
+ dummy_input = tf.random.uniform((7, 80, 64), dtype=tf.float32)
55
+ mel_pred = model_tf(dummy_input, training=False)
56
+
57
+ # get tf variables
58
+ tf_vars = model_tf.weights
59
+
60
+ # match variable names with fuzzy logic
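+ # greedily pair each TF variable with the closest-named remaining torch tensor (SequenceMatcher ratio)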
61
+ torch_var_names = list(state_dict.keys())
62
+ tf_var_names = [we.name for we in model_tf.weights]
63
+ var_map = []
64
+ for tf_name in tf_var_names:
65
+ # skip re-mapped layer names
66
+ if tf_name in [name[0] for name in var_map]:
67
+ continue
68
+ tf_name_edited = convert_tf_name(tf_name)
69
+ ratios = [
70
+ SequenceMatcher(None, torch_name, tf_name_edited).ratio()
71
+ for torch_name in torch_var_names
72
+ ]
73
+ max_idx = np.argmax(ratios)
74
+ matching_name = torch_var_names[max_idx]
75
+ del torch_var_names[max_idx]
76
+ var_map.append((tf_name, matching_name))
77
+
78
+ # pass weights
79
+ tf_vars = transfer_weights_torch_to_tf(tf_vars, dict(var_map), state_dict)
80
+
81
+ # Compare TF and TORCH models
82
+ # check embedding outputs
83
+ model.eval()
84
+ dummy_input_torch = torch.ones((1, 80, 10))
85
+ dummy_input_tf = tf.convert_to_tensor(dummy_input_torch.numpy())
86
+ dummy_input_tf = tf.transpose(dummy_input_tf, perm=[0, 2, 1])
87
+ dummy_input_tf = tf.expand_dims(dummy_input_tf, 2)
88
+
89
+ out_torch = model.layers[0](dummy_input_torch)
90
+ out_tf = model_tf.model_layers[0](dummy_input_tf)
91
+ out_tf_ = tf.transpose(out_tf, perm=[0, 3, 2, 1])[:, :, 0, :]
92
+
93
+ assert compare_torch_tf(out_torch, out_tf_) < 1e-5
94
+
95
+ for i in range(1, len(model.layers)):
96
+ print(f"{i} -> {model.layers[i]} vs {model_tf.model_layers[i]}")
97
+ out_torch = model.layers[i](out_torch)
98
+ out_tf = model_tf.model_layers[i](out_tf)
99
+ out_tf_ = tf.transpose(out_tf, perm=[0, 3, 2, 1])[:, :, 0, :]
100
+ diff = compare_torch_tf(out_torch, out_tf_)
101
+ assert diff < 1e-5, diff
102
+
103
+ torch.manual_seed(0)
104
+ dummy_input_torch = torch.rand((1, 80, 100))
105
+ dummy_input_tf = tf.convert_to_tensor(dummy_input_torch.numpy())
106
+ model.inference_padding = 0
107
+ model_tf.inference_padding = 0
108
+ output_torch = model.inference(dummy_input_torch)
109
+ output_tf = model_tf(dummy_input_tf, training=False)
110
+ assert compare_torch_tf(output_torch, output_tf) < 1e-5, compare_torch_tf(
111
+ output_torch, output_tf)
112
+
113
+ # save tf model
114
+ save_checkpoint(model_tf, checkpoint['step'], checkpoint['epoch'],
115
+ args.output_path)
116
+ print(' > Model conversion is successfully completed :).')
TTS/bin/convert_tacotron2_tflite.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Convert Tensorflow Tacotron2 model to TF-Lite binary
2
+
3
+ import argparse
4
+
5
+ from TTS.utils.io import load_config
6
+ from TTS.tts.utils.text.symbols import symbols, phonemes
7
+ from TTS.tts.tf.utils.generic_utils import setup_model
8
+ from TTS.tts.tf.utils.io import load_checkpoint
9
+ from TTS.tts.tf.utils.tflite import convert_tacotron2_to_tflite
10
+
11
+
12
+ parser = argparse.ArgumentParser()
13
+ parser.add_argument('--tf_model',
14
+ type=str,
15
+ help='Path to target torch model to be converted to TF.')
16
+ parser.add_argument('--config_path',
17
+ type=str,
18
+ help='Path to config file of torch model.')
19
+ parser.add_argument('--output_path',
20
+ type=str,
21
+ help='path to tflite output binary.')
22
+ args = parser.parse_args()
23
+
24
+ # Set constants
25
+ CONFIG = load_config(args.config_path)
26
+
27
+ # load the model
28
+ c = CONFIG
29
+ num_speakers = 0
30
+ num_chars = len(phonemes) if c.use_phonemes else len(symbols)
31
+ model = setup_model(num_chars, num_speakers, c, enable_tflite=True)
32
+ model.build_inference()
33
+ model = load_checkpoint(model, args.tf_model)
34
+ model.decoder.set_max_decoder_steps(1000)
35
+
36
+ # create tflite model
37
+ tflite_model = convert_tacotron2_to_tflite(model, output_path=args.output_path)
TTS/bin/convert_tacotron2_torch_to_tf.py ADDED
@@ -0,0 +1,213 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # %%
2
+ # %%
3
+ import argparse
4
+ from difflib import SequenceMatcher
5
+ import os
6
+ import sys
7
+ # %%
8
+ # print variable match
9
+ from pprint import pprint
10
+
11
+ import numpy as np
12
+ import tensorflow as tf
13
+ import torch
14
+ from TTS.tts.tf.models.tacotron2 import Tacotron2
15
+ from TTS.tts.tf.utils.convert_torch_to_tf_utils import (
16
+ compare_torch_tf, convert_tf_name, transfer_weights_torch_to_tf)
17
+ from TTS.tts.tf.utils.generic_utils import save_checkpoint
18
+ from TTS.tts.utils.generic_utils import setup_model
19
+ from TTS.tts.utils.text.symbols import phonemes, symbols
20
+ from TTS.utils.io import load_config
21
+
22
+ sys.path.append('/home/erogol/Projects')
23
+ os.environ['CUDA_VISIBLE_DEVICES'] = ''
24
+
25
+
26
+ parser = argparse.ArgumentParser()
27
+ parser.add_argument('--torch_model_path',
28
+ type=str,
29
+ help='Path to target torch model to be converted to TF.')
30
+ parser.add_argument('--config_path',
31
+ type=str,
32
+ help='Path to config file of torch model.')
33
+ parser.add_argument('--output_path',
34
+ type=str,
35
+ help='path to output file including file name to save TF model.')
36
+ args = parser.parse_args()
37
+
38
+ # load model config
39
+ config_path = args.config_path
40
+ c = load_config(config_path)
41
+ num_speakers = 0
42
+
43
+ # init torch model
44
+ num_chars = len(phonemes) if c.use_phonemes else len(symbols)
45
+ model = setup_model(num_chars, num_speakers, c)
46
+ checkpoint = torch.load(args.torch_model_path,
47
+ map_location=torch.device('cpu'))
48
+ state_dict = checkpoint['model']
49
+ model.load_state_dict(state_dict)
50
+
51
+ # init tf model
52
+ model_tf = Tacotron2(num_chars=num_chars,
53
+ num_speakers=num_speakers,
54
+ r=model.decoder.r,
55
+ postnet_output_dim=c.audio['num_mels'],
56
+ decoder_output_dim=c.audio['num_mels'],
57
+ attn_type=c.attention_type,
58
+ attn_win=c.windowing,
59
+ attn_norm=c.attention_norm,
60
+ prenet_type=c.prenet_type,
61
+ prenet_dropout=c.prenet_dropout,
62
+ forward_attn=c.use_forward_attn,
63
+ trans_agent=c.transition_agent,
64
+ forward_attn_mask=c.forward_attn_mask,
65
+ location_attn=c.location_attn,
66
+ attn_K=c.attention_heads,
67
+ separate_stopnet=c.separate_stopnet,
68
+ bidirectional_decoder=c.bidirectional_decoder)
69
+
70
+ # set initial layer mapping - these are not captured by the below heuristic approach
71
+ # TODO: set layer names so that we can remove these manual matching
72
+ common_sufix = '/.ATTRIBUTES/VARIABLE_VALUE'
73
+ var_map = [
74
+ ('embedding/embeddings:0', 'embedding.weight'),
75
+ ('encoder/lstm/forward_lstm/lstm_cell_1/kernel:0',
76
+ 'encoder.lstm.weight_ih_l0'),
77
+ ('encoder/lstm/forward_lstm/lstm_cell_1/recurrent_kernel:0',
78
+ 'encoder.lstm.weight_hh_l0'),
79
+ ('encoder/lstm/backward_lstm/lstm_cell_2/kernel:0',
80
+ 'encoder.lstm.weight_ih_l0_reverse'),
81
+ ('encoder/lstm/backward_lstm/lstm_cell_2/recurrent_kernel:0',
82
+ 'encoder.lstm.weight_hh_l0_reverse'),
83
+ ('encoder/lstm/forward_lstm/lstm_cell_1/bias:0',
84
+ ('encoder.lstm.bias_ih_l0', 'encoder.lstm.bias_hh_l0')),
85
+ ('encoder/lstm/backward_lstm/lstm_cell_2/bias:0',
86
+ ('encoder.lstm.bias_ih_l0_reverse', 'encoder.lstm.bias_hh_l0_reverse')),
87
+ ('attention/v/kernel:0', 'decoder.attention.v.linear_layer.weight'),
88
+ ('decoder/linear_projection/kernel:0',
89
+ 'decoder.linear_projection.linear_layer.weight'),
90
+ ('decoder/stopnet/kernel:0', 'decoder.stopnet.1.linear_layer.weight')
91
+ ]
92
+
93
+ # %%
94
+ # get tf_model graph
95
+ model_tf.build_inference()
96
+
97
+ # get tf variables
98
+ tf_vars = model_tf.weights
99
+
100
+ # match variable names with fuzzy logic
101
+ torch_var_names = list(state_dict.keys())
102
+ tf_var_names = [we.name for we in model_tf.weights]
103
+ for tf_name in tf_var_names:
104
+ # skip re-mapped layer names
105
+ if tf_name in [name[0] for name in var_map]:
106
+ continue
107
+ tf_name_edited = convert_tf_name(tf_name)
108
+ ratios = [
109
+ SequenceMatcher(None, torch_name, tf_name_edited).ratio()
110
+ for torch_name in torch_var_names
111
+ ]
112
+ max_idx = np.argmax(ratios)
113
+ matching_name = torch_var_names[max_idx]
114
+ del torch_var_names[max_idx]
115
+ var_map.append((tf_name, matching_name))
116
+
117
+ pprint(var_map)
118
+ pprint(torch_var_names)
119
+
120
+ # pass weights
121
+ tf_vars = transfer_weights_torch_to_tf(tf_vars, dict(var_map), state_dict)
122
+
123
+ # Compare TF and TORCH models
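+ # sanity-check each submodule output (embedding, encoder, attention, decoder) against the torch reference within small tolerances (1e-5 / 1e-4)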
124
+ # %%
125
+ # check embedding outputs
126
+ model.eval()
127
+ input_ids = torch.randint(0, 24, (1, 128)).long()
128
+
129
+ o_t = model.embedding(input_ids)
130
+ o_tf = model_tf.embedding(input_ids.detach().numpy())
131
+ assert abs(o_t.detach().numpy() -
132
+ o_tf.numpy()).sum() < 1e-5, abs(o_t.detach().numpy() -
133
+ o_tf.numpy()).sum()
134
+
135
+ # compare encoder outputs
136
+ oo_en = model.encoder.inference(o_t.transpose(1, 2))
137
+ ooo_en = model_tf.encoder(o_t.detach().numpy(), training=False)
138
+ assert compare_torch_tf(oo_en, ooo_en) < 1e-5
139
+
140
+ #pylint: disable=redefined-builtin
141
+ # compare decoder.attention_rnn
142
+ inp = torch.rand([1, 768])
143
+ inp_tf = inp.numpy()
144
+ model.decoder._init_states(oo_en, mask=None) #pylint: disable=protected-access
145
+ output, cell_state = model.decoder.attention_rnn(inp)
146
+ states = model_tf.decoder.build_decoder_initial_states(1, 512, 128)
147
+ output_tf, memory_state = model_tf.decoder.attention_rnn(inp_tf,
148
+ states[2],
149
+ training=False)
150
+ assert compare_torch_tf(output, output_tf).mean() < 1e-5
151
+
152
+ query = output
153
+ inputs = torch.rand([1, 128, 512])
154
+ query_tf = query.detach().numpy()
155
+ inputs_tf = inputs.numpy()
156
+
157
+ # compare decoder.attention
158
+ model.decoder.attention.init_states(inputs)
159
+ processes_inputs = model.decoder.attention.preprocess_inputs(inputs)
160
+ loc_attn, proc_query = model.decoder.attention.get_location_attention(
161
+ query, processes_inputs)
162
+ context = model.decoder.attention(query, inputs, processes_inputs, None)
163
+
164
+ attention_states = model_tf.decoder.build_decoder_initial_states(1, 512, 128)[-1]
165
+ model_tf.decoder.attention.process_values(tf.convert_to_tensor(inputs_tf))
166
+ loc_attn_tf, proc_query_tf = model_tf.decoder.attention.get_loc_attn(query_tf, attention_states)
167
+ context_tf, attention, attention_states = model_tf.decoder.attention(query_tf, attention_states, training=False)
168
+
169
+ assert compare_torch_tf(loc_attn, loc_attn_tf).mean() < 1e-5
170
+ assert compare_torch_tf(proc_query, proc_query_tf).mean() < 1e-5
171
+ assert compare_torch_tf(context, context_tf) < 1e-5
172
+
173
+ # compare decoder.decoder_rnn
174
+ input = torch.rand([1, 1536])
175
+ input_tf = input.numpy()
176
+ model.decoder._init_states(oo_en, mask=None) #pylint: disable=protected-access
177
+ output, cell_state = model.decoder.decoder_rnn(
178
+ input, [model.decoder.decoder_hidden, model.decoder.decoder_cell])
179
+ states = model_tf.decoder.build_decoder_initial_states(1, 512, 128)
180
+ output_tf, memory_state = model_tf.decoder.decoder_rnn(input_tf,
181
+ states[3],
182
+ training=False)
183
+ assert abs(input - input_tf).mean() < 1e-5
184
+ assert compare_torch_tf(output, output_tf).mean() < 1e-5
185
+
186
+ # compare decoder.linear_projection
187
+ input = torch.rand([1, 1536])
188
+ input_tf = input.numpy()
189
+ output = model.decoder.linear_projection(input)
190
+ output_tf = model_tf.decoder.linear_projection(input_tf, training=False)
191
+ assert compare_torch_tf(output, output_tf) < 1e-5
192
+
193
+ # compare decoder outputs
194
+ model.decoder.max_decoder_steps = 100
195
+ model_tf.decoder.set_max_decoder_steps(100)
196
+ output, align, stop = model.decoder.inference(oo_en)
197
+ states = model_tf.decoder.build_decoder_initial_states(1, 512, 128)
198
+ output_tf, align_tf, stop_tf = model_tf.decoder(ooo_en, states, training=False)
199
+ assert compare_torch_tf(output.transpose(1, 2), output_tf) < 1e-4
200
+
201
+ # compare the whole model output
202
+ outputs_torch = model.inference(input_ids)
203
+ outputs_tf = model_tf(tf.convert_to_tensor(input_ids.numpy()))
204
+ print(abs(outputs_torch[0].numpy()[:, 0] - outputs_tf[0].numpy()[:, 0]).mean())
205
+ assert compare_torch_tf(outputs_torch[2][:, 50, :],
206
+ outputs_tf[2][:, 50, :]) < 1e-5
207
+ assert compare_torch_tf(outputs_torch[0], outputs_tf[0]) < 1e-4
208
+
209
+ # %%
210
+ # save tf model
211
+ save_checkpoint(model_tf, None, checkpoint['step'], checkpoint['epoch'],
212
+ checkpoint['r'], args.output_path)
213
+ print(' > Model conversion is successfully completed :).')
TTS/bin/distribute.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+ import os
5
+ import sys
6
+ import pathlib
7
+ import time
8
+ import subprocess
9
+ import argparse
10
+ import torch
11
+
12
+
13
+ def main():
14
+ """
15
+ Run the given training script as a new process per GPU and pass through the command arguments
16
+ """
17
+ parser = argparse.ArgumentParser()
18
+ parser.add_argument(
19
+ '--script',
20
+ type=str,
21
+ help='Target training script to distribute.')
22
+ parser.add_argument(
23
+ '--continue_path',
24
+ type=str,
25
+ help='Training output folder to continue training. Use to continue a training. If it is used, "config_path" is ignored.',
26
+ default='',
27
+ required='--config_path' not in sys.argv)
28
+ parser.add_argument(
29
+ '--restore_path',
30
+ type=str,
31
+ help='Model file to be restored. Use to finetune a model.',
32
+ default='')
33
+ parser.add_argument(
34
+ '--config_path',
35
+ type=str,
36
+ help='Path to config file for training.',
37
+ required='--continue_path' not in sys.argv
38
+ )
39
+ args = parser.parse_args()
40
+
41
+ num_gpus = torch.cuda.device_count()
42
+ group_id = time.strftime("%Y_%m_%d-%H%M%S")
43
+
44
+ # set arguments for train.py
45
+ folder_path = pathlib.Path(__file__).parent.absolute()
46
+ command = [os.path.join(folder_path, args.script)]
47
+ command.append('--continue_path={}'.format(args.continue_path))
48
+ command.append('--restore_path={}'.format(args.restore_path))
49
+ command.append('--config_path={}'.format(args.config_path))
50
+ command.append('--group_id=group_{}'.format(group_id))
51
+ command.append('')
52
+
53
+ # run processes
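+ # spawn one training process per visible GPU; only rank 0 keeps stdout, the rest write to /dev/null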
54
+ processes = []
55
+ for i in range(num_gpus):
56
+ my_env = os.environ.copy()
57
+ my_env["PYTHON_EGG_CACHE"] = "/tmp/tmp{}".format(i)
58
+ command[-1] = '--rank={}'.format(i)
59
+ stdout = None if i == 0 else open(os.devnull, 'w')
60
+ p = subprocess.Popen(['python3'] + command, stdout=stdout, env=my_env)
61
+ processes.append(p)
62
+ print(command)
63
+
64
+ for p in processes:
65
+ p.wait()
66
+
67
+
68
+ if __name__ == '__main__':
69
+ main()
TTS/bin/synthesize.py ADDED
@@ -0,0 +1,218 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+ import argparse
5
+ import os
6
+ import sys
7
+ import string
8
+ from argparse import RawTextHelpFormatter
9
+ # pylint: disable=redefined-outer-name, unused-argument
10
+ from pathlib import Path
11
+
12
+ from TTS.utils.manage import ModelManager
13
+ from TTS.utils.synthesizer import Synthesizer
14
+
15
+
16
+ def str2bool(v):
17
+ if isinstance(v, bool):
18
+ return v
19
+ if v.lower() in ('yes', 'true', 't', 'y', '1'):
20
+ return True
21
+ if v.lower() in ('no', 'false', 'f', 'n', '0'):
22
+ return False
23
+ raise argparse.ArgumentTypeError('Boolean value expected.')
24
+
25
+
26
+ def main():
27
+ # pylint: disable=bad-continuation
28
+ parser = argparse.ArgumentParser(description='''Synthesize speech on command line.\n\n'''
29
+
30
+ '''You can either use your trained model or choose a model from the provided list.\n'''\
31
+
32
+ '''
33
+ Example runs:
34
+
35
+ # list provided models
36
+ ./TTS/bin/synthesize.py --list_models
37
+
38
+ # run a model from the list
39
+ ./TTS/bin/synthesize.py --text "Text for TTS" --model_name "<language>/<dataset>/<model_name>" --vocoder_name "<language>/<dataset>/<model_name>" --output_path
40
+
41
+ # run your own TTS model (Using Griffin-Lim Vocoder)
42
+ ./TTS/bin/synthesize.py --text "Text for TTS" --model_path path/to/model.pth.tar --config_path path/to/config.json --out_path output/path/speech.wav
43
+
44
+ # run your own TTS and Vocoder models
45
+ ./TTS/bin/synthesize.py --text "Text for TTS" --model_path path/to/config.json --config_path path/to/model.pth.tar --out_path output/path/speech.wav
46
+ --vocoder_path path/to/vocoder.pth.tar --vocoder_config_path path/to/vocoder_config.json
47
+
48
+ ''',
49
+ formatter_class=RawTextHelpFormatter)
50
+
51
+ parser.add_argument(
52
+ '--list_models',
53
+ type=str2bool,
54
+ nargs='?',
55
+ const=True,
56
+ default=False,
57
+ help='list available pre-trained tts and vocoder models.'
58
+ )
59
+ parser.add_argument(
60
+ '--text',
61
+ type=str,
62
+ default=None,
63
+ help='Text to generate speech.'
64
+ )
65
+
66
+ # Args for running pre-trained TTS models.
67
+ parser.add_argument(
68
+ '--model_name',
69
+ type=str,
70
+ default=None,
71
+ help=
72
+ 'Name of one of the pre-trained tts models in format <language>/<dataset>/<model_name>'
73
+ )
74
+ parser.add_argument(
75
+ '--vocoder_name',
76
+ type=str,
77
+ default=None,
78
+ help=
79
+ 'Name of one of the pre-trained vocoder models in format <language>/<dataset>/<model_name>'
80
+ )
81
+
82
+ # Args for running custom models
83
+ parser.add_argument(
84
+ '--config_path',
85
+ default=None,
86
+ type=str,
87
+ help='Path to model config file.'
88
+ )
89
+ parser.add_argument(
90
+ '--model_path',
91
+ type=str,
92
+ default=None,
93
+ help='Path to model file.',
94
+ )
95
+ parser.add_argument(
96
+ '--out_path',
97
+ type=str,
98
+ default=Path(__file__).resolve().parent,
99
+ help='Path to save the final wav file. The wav file will be named after the given text.',
100
+ )
101
+ parser.add_argument(
102
+ '--use_cuda',
103
+ type=bool,
104
+ help='Run model on CUDA.',
105
+ default=False
106
+ )
107
+ parser.add_argument(
108
+ '--vocoder_path',
109
+ type=str,
110
+ help=
111
+ 'Path to vocoder model file. If it is not defined, the model uses Griffin-Lim as the vocoder. Please make sure that you installed the vocoder library beforehand (e.g. WaveRNN).',
112
+ default=None,
113
+ )
114
+ parser.add_argument(
115
+ '--vocoder_config_path',
116
+ type=str,
117
+ help='Path to vocoder model config file.',
118
+ default=None)
119
+
120
+ # args for multi-speaker synthesis
121
+ parser.add_argument(
122
+ '--speakers_json',
123
+ type=str,
124
+ help="JSON file for multi-speaker model.",
125
+ default=None)
126
+ parser.add_argument(
127
+ '--speaker_idx',
128
+ type=str,
129
+ help="if the tts model is trained with x-vectors, then speaker_idx is a file present in speakers.json else speaker_idx is the speaker id corresponding to a speaker in the speaker embedding layer.",
130
+ default=None)
131
+ parser.add_argument(
132
+ '--gst_style',
133
+ help="Wav path file for GST stylereference.",
134
+ default=None)
135
+
136
+ # aux args
137
+ parser.add_argument(
138
+ '--save_spectogram',
139
+ type=bool,
140
+ help="If true save raw spectogram for further (vocoder) processing in out_path.",
141
+ default=False)
142
+
143
+ args = parser.parse_args()
144
+
145
+ # load model manager
146
+ path = Path(__file__).parent / "../.models.json"
147
+ manager = ModelManager(path)
148
+
149
+ model_path = None
150
+ config_path = None
151
+ vocoder_path = None
152
+ vocoder_config_path = None
153
+
154
+ # CASE1: list pre-trained TTS models
155
+ if args.list_models:
156
+ manager.list_models()
157
+ sys.exit()
158
+
159
+ # CASE2: load pre-trained models
160
+ if args.model_name is not None:
161
+ model_path, config_path = manager.download_model(args.model_name)
162
+
163
+ if args.vocoder_name is not None:
164
+ vocoder_path, vocoder_config_path = manager.download_model(args.vocoder_name)
165
+
166
+ # CASE3: load custom models
167
+ if args.model_path is not None:
168
+ model_path = args.model_path
169
+ config_path = args.config_path
170
+
171
+ if args.vocoder_path is not None:
172
+ vocoder_path = args.vocoder_path
173
+ vocoder_config_path = args.vocoder_config_path
174
+
175
+ # RUN THE SYNTHESIS
176
+ # load models
177
+ synthesizer = Synthesizer(model_path, config_path, vocoder_path, vocoder_config_path, args.use_cuda)
178
+
179
+ use_griffin_lim = vocoder_path is None
180
+ print(" > Text: {}".format(args.text))
181
+
182
+ # # handle multi-speaker setting
183
+ # if not model_config.use_external_speaker_embedding_file and args.speaker_idx is not None:
184
+ # if args.speaker_idx.isdigit():
185
+ # args.speaker_idx = int(args.speaker_idx)
186
+ # else:
187
+ # args.speaker_idx = None
188
+ # else:
189
+ # args.speaker_idx = None
190
+
191
+ # if args.gst_style is None:
192
+ # if 'gst' in model_config.keys() and model_config.gst['gst_style_input'] is not None:
193
+ # gst_style = model_config.gst['gst_style_input']
194
+ # else:
195
+ # gst_style = None
196
+ # else:
197
+ # # check if gst_style string is a dict, if is dict convert else use string
198
+ # try:
199
+ # gst_style = json.loads(args.gst_style)
200
+ # if max(map(int, gst_style.keys())) >= model_config.gst['gst_style_tokens']:
201
+ # raise RuntimeError("The highest value of the gst_style dictionary key must be less than the number of GST Tokens, \n Highest dictionary key value: {} \n Number of GST tokens: {}".format(max(map(int, gst_style.keys())), model_config.gst['gst_style_tokens']))
202
+ # except ValueError:
203
+ # gst_style = args.gst_style
204
+
205
+ # kick it
206
+ wav = synthesizer.tts(args.text)
207
+
208
+ # save the results
209
+ file_name = args.text.replace(" ", "_")[0:20]
210
+ file_name = file_name.translate(
211
+ str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'
212
+ out_path = os.path.join(args.out_path, file_name)
213
+ print(" > Saving output to {}".format(out_path))
214
+ synthesizer.save_wav(wav, out_path)
215
+
216
+
217
+ if __name__ == "__main__":
218
+ main()
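The output filename above is derived from the input text itself: the first 20 characters, with spaces turned into underscores and every punctuation character except the underscore stripped via str.translate. A self-contained sketch of the same transformation:

import string

def text_to_filename(text):
    # first 20 characters, spaces -> underscores
    name = text.replace(" ", "_")[:20]
    # drop every punctuation character except '_' and add the extension
    table = str.maketrans('', '', string.punctuation.replace('_', ''))
    return name.translate(table) + '.wav'

print(text_to_filename("Hello, world!"))  # -> Hello_world.wav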
TTS/bin/train_encoder.py ADDED
@@ -0,0 +1,274 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+ import argparse
5
+ import os
6
+ import sys
7
+ import time
8
+ import traceback
9
+
10
+ import torch
11
+ from torch.utils.data import DataLoader
12
+ from TTS.speaker_encoder.dataset import MyDataset
13
+ from TTS.speaker_encoder.losses import AngleProtoLoss, GE2ELoss
14
+ from TTS.speaker_encoder.model import SpeakerEncoder
15
+ from TTS.speaker_encoder.utils.generic_utils import \
16
+ check_config_speaker_encoder, save_best_model
17
+ from TTS.speaker_encoder.utils.visual import plot_embeddings
18
+ from TTS.tts.datasets.preprocess import load_meta_data
19
+ from TTS.utils.audio import AudioProcessor
20
+ from TTS.utils.generic_utils import (count_parameters,
21
+ create_experiment_folder, get_git_branch,
22
+ remove_experiment_folder, set_init_dict)
23
+ from TTS.utils.io import copy_model_files, load_config
24
+ from TTS.utils.radam import RAdam
25
+ from TTS.utils.tensorboard_logger import TensorboardLogger
26
+ from TTS.utils.training import NoamLR, check_update
27
+
28
+ torch.backends.cudnn.enabled = True
29
+ torch.backends.cudnn.benchmark = True
30
+ torch.manual_seed(54321)
31
+ use_cuda = torch.cuda.is_available()
32
+ num_gpus = torch.cuda.device_count()
33
+ print(" > Using CUDA: ", use_cuda)
34
+ print(" > Number of GPUs: ", num_gpus)
35
+
36
+
37
+ def setup_loader(ap: AudioProcessor, is_val: bool=False, verbose: bool=False):
38
+ if is_val:
39
+ loader = None
40
+ else:
41
+ dataset = MyDataset(ap,
42
+ meta_data_eval if is_val else meta_data_train,
43
+ voice_len=1.6,
44
+ num_utter_per_speaker=c.num_utters_per_speaker,
45
+ num_speakers_in_batch=c.num_speakers_in_batch,
46
+ skip_speakers=False,
47
+ storage_size=c.storage["storage_size"],
48
+ sample_from_storage_p=c.storage["sample_from_storage_p"],
49
+ additive_noise=c.storage["additive_noise"],
50
+ verbose=verbose)
51
+ # sampler = DistributedSampler(dataset) if num_gpus > 1 else None
52
+ loader = DataLoader(dataset,
53
+ batch_size=c.num_speakers_in_batch,
54
+ shuffle=False,
55
+ num_workers=c.num_loader_workers,
56
+ collate_fn=dataset.collate_fn)
57
+ return loader
58
+
59
+
60
+ def train(model, criterion, optimizer, scheduler, ap, global_step):
61
+ data_loader = setup_loader(ap, is_val=False, verbose=True)
62
+ model.train()
63
+ epoch_time = 0
64
+ best_loss = float('inf')
65
+ avg_loss = 0
66
+ avg_loader_time = 0
67
+ end_time = time.time()
68
+ for _, data in enumerate(data_loader):
69
+ start_time = time.time()
70
+
71
+ # setup input data
72
+ inputs = data[0]
73
+ loader_time = time.time() - end_time
74
+ global_step += 1
75
+
76
+ # setup lr
77
+ if c.lr_decay:
78
+ scheduler.step()
79
+ optimizer.zero_grad()
80
+
81
+ # dispatch data to GPU
82
+ if use_cuda:
83
+ inputs = inputs.cuda(non_blocking=True)
84
+ # labels = labels.cuda(non_blocking=True)
85
+
86
+ # forward pass model
87
+ outputs = model(inputs)
88
+
89
+ # loss computation
90
+ loss = criterion(
91
+ outputs.view(c.num_speakers_in_batch,
92
+ outputs.shape[0] // c.num_speakers_in_batch, -1))
93
+ loss.backward()
94
+ grad_norm, _ = check_update(model, c.grad_clip)
95
+ optimizer.step()
96
+
97
+ step_time = time.time() - start_time
98
+ epoch_time += step_time
99
+
100
+ # Averaged Loss and Averaged Loader Time
101
+ avg_loss = 0.01 * loss.item() \
102
+ + 0.99 * avg_loss if avg_loss != 0 else loss.item()
103
+ avg_loader_time = 1/c.num_loader_workers * loader_time + \
104
+ (c.num_loader_workers-1) / c.num_loader_workers * avg_loader_time if avg_loader_time != 0 else loader_time
105
+ current_lr = optimizer.param_groups[0]['lr']
106
+
107
+ if global_step % c.steps_plot_stats == 0:
108
+ # Plot Training Epoch Stats
109
+ train_stats = {
110
+ "loss": avg_loss,
111
+ "lr": current_lr,
112
+ "grad_norm": grad_norm,
113
+ "step_time": step_time,
114
+ "avg_loader_time": avg_loader_time
115
+ }
116
+ tb_logger.tb_train_epoch_stats(global_step, train_stats)
117
+ figures = {
118
+ # FIXME: not constant
119
+ "UMAP Plot": plot_embeddings(outputs.detach().cpu().numpy(),
120
+ 10),
121
+ }
122
+ tb_logger.tb_train_figures(global_step, figures)
123
+
124
+ if global_step % c.print_step == 0:
125
+ print(
126
+ " | > Step:{} Loss:{:.5f} AvgLoss:{:.5f} GradNorm:{:.5f} "
127
+ "StepTime:{:.2f} LoaderTime:{:.2f} AvGLoaderTime:{:.2f} LR:{:.6f}".format(
128
+ global_step, loss.item(), avg_loss, grad_norm, step_time,
129
+ loader_time, avg_loader_time, current_lr),
130
+ flush=True)
131
+
132
+ # save best model
133
+ best_loss = save_best_model(model, optimizer, avg_loss, best_loss,
134
+ OUT_PATH, global_step)
135
+
136
+ end_time = time.time()
137
+ return avg_loss, global_step
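# A minimal sketch (illustrative, not part of the script) of the running
# averages computed above: `avg_loss` is an exponential moving average with
# weight 0.01 on the newest loss, and `avg_loader_time` uses weight
# 1/num_loader_workers; the first observation initializes the average.

def ema_update(avg, new_value, alpha=0.01):
    # blend the new value into the running average
    return new_value if avg == 0 else alpha * new_value + (1 - alpha) * avg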
138
+
139
+
140
+ def main(args): # pylint: disable=redefined-outer-name
141
+ # pylint: disable=global-variable-undefined
142
+ global meta_data_train
143
+ global meta_data_eval
144
+
145
+ ap = AudioProcessor(**c.audio)
146
+ model = SpeakerEncoder(input_dim=c.model['input_dim'],
147
+ proj_dim=c.model['proj_dim'],
148
+ lstm_dim=c.model['lstm_dim'],
149
+ num_lstm_layers=c.model['num_lstm_layers'])
150
+ optimizer = RAdam(model.parameters(), lr=c.lr)
151
+
152
+ if c.loss == "ge2e":
153
+ criterion = GE2ELoss(loss_method='softmax')
154
+ elif c.loss == "angleproto":
155
+ criterion = AngleProtoLoss()
156
+ else:
157
+ raise Exception("%s is not a supported loss" % c.loss)
158
+
159
+ if args.restore_path:
160
+ checkpoint = torch.load(args.restore_path)
161
+ try:
162
+ # TODO: fix optimizer init, model.cuda() needs to be called before
163
+ # optimizer restore
164
+ # optimizer.load_state_dict(checkpoint['optimizer'])
165
+ if c.reinit_layers:
166
+ raise RuntimeError
167
+ model.load_state_dict(checkpoint['model'])
168
+ except KeyError:
169
+ print(" > Partial model initialization.")
170
+ model_dict = model.state_dict()
171
+ model_dict = set_init_dict(model_dict, checkpoint, c)
172
+ model.load_state_dict(model_dict)
173
+ del model_dict
174
+ for group in optimizer.param_groups:
175
+ group['lr'] = c.lr
176
+ print(" > Model restored from step %d" % checkpoint['step'],
177
+ flush=True)
178
+ args.restore_step = checkpoint['step']
179
+ else:
180
+ args.restore_step = 0
181
+
182
+ if use_cuda:
183
+ model = model.cuda()
184
+ criterion.cuda()
185
+
186
+ if c.lr_decay:
187
+ scheduler = NoamLR(optimizer,
188
+ warmup_steps=c.warmup_steps,
189
+ last_epoch=args.restore_step - 1)
190
+ else:
191
+ scheduler = None
192
+
193
+ num_params = count_parameters(model)
194
+ print("\n > Model has {} parameters".format(num_params), flush=True)
195
+
196
+ # pylint: disable=redefined-outer-name
197
+ meta_data_train, meta_data_eval = load_meta_data(c.datasets)
198
+
199
+ global_step = args.restore_step
200
+ _, global_step = train(model, criterion, optimizer, scheduler, ap,
201
+ global_step)
202
+
203
+
204
+ if __name__ == '__main__':
205
+ parser = argparse.ArgumentParser()
206
+ parser.add_argument(
207
+ '--restore_path',
208
+ type=str,
209
+ help='Path to model outputs (checkpoint, tensorboard etc.).',
210
+ default='')
211
+ parser.add_argument(
212
+ '--config_path',
213
+ type=str,
214
+ required=True,
215
+ help='Path to config file for training.',
216
+ )
217
+ parser.add_argument('--debug',
218
+ type=bool,
219
+ default=True,
220
+ help='Do not verify commit integrity to run training.')
221
+ parser.add_argument(
222
+ '--data_path',
223
+ type=str,
224
+ default='',
225
+ help='Defines the data path. It overwrites config.json.')
226
+ parser.add_argument('--output_path',
227
+ type=str,
228
+ help='path for training outputs.',
229
+ default='')
230
+ parser.add_argument('--output_folder',
231
+ type=str,
232
+ default='',
233
+ help='folder name for training outputs.')
234
+ args = parser.parse_args()
235
+
236
+ # setup output paths and read configs
237
+ c = load_config(args.config_path)
238
+ check_config_speaker_encoder(c)
239
+ _ = os.path.dirname(os.path.realpath(__file__))
240
+ if args.data_path != '':
241
+ c.data_path = args.data_path
242
+
243
+ if args.output_path == '':
244
+ OUT_PATH = os.path.join(_, c.output_path)
245
+ else:
246
+ OUT_PATH = args.output_path
247
+
248
+ if args.output_folder == '':
249
+ OUT_PATH = create_experiment_folder(OUT_PATH, c.run_name, args.debug)
250
+ else:
251
+ OUT_PATH = os.path.join(OUT_PATH, args.output_folder)
252
+
253
+ new_fields = {}
254
+ if args.restore_path:
255
+ new_fields["restore_path"] = args.restore_path
256
+ new_fields["github_branch"] = get_git_branch()
257
+ copy_model_files(c, args.config_path, OUT_PATH,
258
+ new_fields)
259
+
260
+ LOG_DIR = OUT_PATH
261
+ tb_logger = TensorboardLogger(LOG_DIR, model_name='Speaker_Encoder')
262
+
263
+ try:
264
+ main(args)
265
+ except KeyboardInterrupt:
266
+ remove_experiment_folder(OUT_PATH)
267
+ try:
268
+ sys.exit(0)
269
+ except SystemExit:
270
+ os._exit(0) # pylint: disable=protected-access
271
+ except Exception: # pylint: disable=broad-except
272
+ remove_experiment_folder(OUT_PATH)
273
+ traceback.print_exc()
274
+ sys.exit(1)
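Note how the training loop above reshapes the encoder outputs before the loss: the loader yields num_speakers_in_batch * num_utters_per_speaker utterances in one flat batch, and GE2E / angular prototypical losses expect them grouped per speaker. A sketch of that view, with hypothetical sizes standing in for the config values:

import torch

num_speakers = 4          # stands in for c.num_speakers_in_batch
utters_per_speaker = 10   # stands in for c.num_utters_per_speaker
proj_dim = 256

# flat batch of embeddings as produced by the speaker encoder
outputs = torch.randn(num_speakers * utters_per_speaker, proj_dim)

# group utterances by speaker, as done before calling the criterion
grouped = outputs.view(num_speakers, outputs.shape[0] // num_speakers, -1)
assert grouped.shape == (num_speakers, utters_per_speaker, proj_dim)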
TTS/bin/train_glow_tts.py ADDED
@@ -0,0 +1,657 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+ import argparse
5
+ import glob
6
+ import os
7
+ import sys
8
+ import time
9
+ import traceback
10
+ from random import randrange
11
+
12
+ import torch
13
+ # DISTRIBUTED
14
+ from torch.nn.parallel import DistributedDataParallel as DDP_th
15
+ from torch.utils.data import DataLoader
16
+ from torch.utils.data.distributed import DistributedSampler
17
+ from TTS.tts.datasets.preprocess import load_meta_data
18
+ from TTS.tts.datasets.TTSDataset import MyDataset
19
+ from TTS.tts.layers.losses import GlowTTSLoss
20
+ from TTS.tts.utils.generic_utils import check_config_tts, setup_model
21
+ from TTS.tts.utils.io import save_best_model, save_checkpoint
22
+ from TTS.tts.utils.measures import alignment_diagonal_score
23
+ from TTS.tts.utils.speakers import parse_speakers
24
+ from TTS.tts.utils.synthesis import synthesis
25
+ from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols
26
+ from TTS.tts.utils.visual import plot_alignment, plot_spectrogram
27
+ from TTS.utils.audio import AudioProcessor
28
+ from TTS.utils.console_logger import ConsoleLogger
29
+ from TTS.utils.distribute import init_distributed, reduce_tensor
30
+ from TTS.utils.generic_utils import (KeepAverage, count_parameters,
31
+ create_experiment_folder, get_git_branch,
32
+ remove_experiment_folder, set_init_dict)
33
+ from TTS.utils.io import copy_model_files, load_config
34
+ from TTS.utils.radam import RAdam
35
+ from TTS.utils.tensorboard_logger import TensorboardLogger
36
+ from TTS.utils.training import NoamLR, setup_torch_training_env
37
+
38
+ use_cuda, num_gpus = setup_torch_training_env(True, False)
39
+
40
+ def setup_loader(ap, r, is_val=False, verbose=False):
41
+ if is_val and not c.run_eval:
42
+ loader = None
43
+ else:
44
+ dataset = MyDataset(
45
+ r,
46
+ c.text_cleaner,
47
+ compute_linear_spec=False,
48
+ meta_data=meta_data_eval if is_val else meta_data_train,
49
+ ap=ap,
50
+ tp=c.characters if 'characters' in c.keys() else None,
51
+ add_blank=c['add_blank'] if 'add_blank' in c.keys() else False,
52
+ batch_group_size=0 if is_val else c.batch_group_size *
53
+ c.batch_size,
54
+ min_seq_len=c.min_seq_len,
55
+ max_seq_len=c.max_seq_len,
56
+ phoneme_cache_path=c.phoneme_cache_path,
57
+ use_phonemes=c.use_phonemes,
58
+ phoneme_language=c.phoneme_language,
59
+ enable_eos_bos=c.enable_eos_bos_chars,
60
+ use_noise_augment=c['use_noise_augment'] and not is_val,
61
+ verbose=verbose,
62
+ speaker_mapping=speaker_mapping if c.use_speaker_embedding and c.use_external_speaker_embedding_file else None)
63
+
64
+ if c.use_phonemes and c.compute_input_seq_cache:
65
+ # precompute phonemes to have a better estimate of sequence lengths.
66
+ dataset.compute_input_seq(c.num_loader_workers)
67
+ dataset.sort_items()
68
+
69
+ sampler = DistributedSampler(dataset) if num_gpus > 1 else None
70
+ loader = DataLoader(
71
+ dataset,
72
+ batch_size=c.eval_batch_size if is_val else c.batch_size,
73
+ shuffle=False,
74
+ collate_fn=dataset.collate_fn,
75
+ drop_last=False,
76
+ sampler=sampler,
77
+ num_workers=c.num_val_loader_workers
78
+ if is_val else c.num_loader_workers,
79
+ pin_memory=False)
80
+ return loader
81
+
82
+
83
+ def format_data(data):
84
+ # setup input data
85
+ text_input = data[0]
86
+ text_lengths = data[1]
87
+ speaker_names = data[2]
88
+ mel_input = data[4].permute(0, 2, 1) # B x D x T
89
+ mel_lengths = data[5]
90
+ item_idx = data[7]
91
+ attn_mask = data[9]
92
+ avg_text_length = torch.mean(text_lengths.float())
93
+ avg_spec_length = torch.mean(mel_lengths.float())
94
+
95
+ if c.use_speaker_embedding:
96
+ if c.use_external_speaker_embedding_file:
97
+ # return precomputed embedding vector
98
+ speaker_c = data[8]
99
+ else:
100
+ # return speaker_id to be used by an embedding layer
101
+ speaker_c = [
102
+ speaker_mapping[speaker_name] for speaker_name in speaker_names
103
+ ]
104
+ speaker_c = torch.LongTensor(speaker_c)
105
+ else:
106
+ speaker_c = None
107
+
108
+ # dispatch data to GPU
109
+ if use_cuda:
110
+ text_input = text_input.cuda(non_blocking=True)
111
+ text_lengths = text_lengths.cuda(non_blocking=True)
112
+ mel_input = mel_input.cuda(non_blocking=True)
113
+ mel_lengths = mel_lengths.cuda(non_blocking=True)
114
+ if speaker_c is not None:
115
+ speaker_c = speaker_c.cuda(non_blocking=True)
116
+ if attn_mask is not None:
117
+ attn_mask = attn_mask.cuda(non_blocking=True)
118
+ return text_input, text_lengths, mel_input, mel_lengths, speaker_c,\
119
+ avg_text_length, avg_spec_length, attn_mask, item_idx
120
+
121
+
122
+ def data_depended_init(data_loader, model, ap):
123
+ """Data depended initialization for activation normalization."""
124
+ if hasattr(model, 'module'):
125
+ for f in model.module.decoder.flows:
126
+ if getattr(f, "set_ddi", False):
127
+ f.set_ddi(True)
128
+ else:
129
+ for f in model.decoder.flows:
130
+ if getattr(f, "set_ddi", False):
131
+ f.set_ddi(True)
132
+
133
+ model.train()
134
+ print(" > Data depended initialization ... ")
135
+ num_iter = 0
136
+ with torch.no_grad():
137
+ for _, data in enumerate(data_loader):
138
+
139
+ # format data
140
+ text_input, text_lengths, mel_input, mel_lengths, speaker_embed,\
141
+ _, _, attn_mask, item_idx = format_data(data)
142
+
143
+ # forward pass model
144
+ _ = model.forward(
145
+ text_input, text_lengths, mel_input, mel_lengths, attn_mask, g=speaker_embed)
146
+ if num_iter == c.data_dep_init_iter:
147
+ break
148
+ num_iter += 1
149
+
150
+ if hasattr(model, 'module'):
151
+ for f in model.module.decoder.flows:
152
+ if getattr(f, "set_ddi", False):
153
+ f.set_ddi(False)
154
+ else:
155
+ for f in model.decoder.flows:
156
+ if getattr(f, "set_ddi", False):
157
+ f.set_ddi(False)
158
+ return model
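# What the set_ddi toggling above enables, sketched as a generic
# activation-normalization layer: during the first no-grad forward passes the
# layer sets its scale and bias from batch statistics instead of fixed
# constants. This is an assumption about the general technique, not the exact
# Glow-TTS flow layer.

import torch
import torch.nn as nn

class ActNormSketch(nn.Module):
    def __init__(self, channels):
        super().__init__()
        self.logs = nn.Parameter(torch.zeros(1, channels, 1))
        self.bias = nn.Parameter(torch.zeros(1, channels, 1))
        self.ddi = False

    def set_ddi(self, ddi):
        self.ddi = ddi

    def forward(self, x):  # x: (B, C, T)
        if self.ddi:
            with torch.no_grad():
                mean = x.mean(dim=(0, 2), keepdim=True)
                std = x.std(dim=(0, 2), keepdim=True) + 1e-6
                self.bias.data = -mean / std      # shift so output mean ~ 0
                self.logs.data = -torch.log(std)  # scale so output std ~ 1
            self.ddi = False  # initialize from data only once
        return x * torch.exp(self.logs) + self.bias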
159
+
160
+
161
+ def train(data_loader, model, criterion, optimizer, scheduler,
162
+ ap, global_step, epoch):
163
+
164
+ model.train()
165
+ epoch_time = 0
166
+ keep_avg = KeepAverage()
167
+ if use_cuda:
168
+ batch_n_iter = int(
169
+ len(data_loader.dataset) / (c.batch_size * num_gpus))
170
+ else:
171
+ batch_n_iter = int(len(data_loader.dataset) / c.batch_size)
172
+ end_time = time.time()
173
+ c_logger.print_train_start()
174
+ scaler = torch.cuda.amp.GradScaler() if c.mixed_precision else None
175
+ for num_iter, data in enumerate(data_loader):
176
+ start_time = time.time()
177
+
178
+ # format data
179
+ text_input, text_lengths, mel_input, mel_lengths, speaker_c,\
180
+ avg_text_length, avg_spec_length, attn_mask, item_idx = format_data(data)
181
+
182
+ loader_time = time.time() - end_time
183
+
184
+ global_step += 1
185
+ optimizer.zero_grad()
186
+
187
+ # forward pass model
188
+ with torch.cuda.amp.autocast(enabled=c.mixed_precision):
189
+ z, logdet, y_mean, y_log_scale, alignments, o_dur_log, o_total_dur = model.forward(
190
+ text_input, text_lengths, mel_input, mel_lengths, attn_mask, g=speaker_c)
191
+
192
+ # compute loss
193
+ loss_dict = criterion(z, y_mean, y_log_scale, logdet, mel_lengths,
194
+ o_dur_log, o_total_dur, text_lengths)
195
+
196
+ # backward pass with loss scaling
197
+ if c.mixed_precision:
198
+ scaler.scale(loss_dict['loss']).backward()
199
+ scaler.unscale_(optimizer)
200
+ grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(),
201
+ c.grad_clip)
202
+ scaler.step(optimizer)
203
+ scaler.update()
204
+ else:
205
+ loss_dict['loss'].backward()
206
+ grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(),
207
+ c.grad_clip)
208
+ optimizer.step()
209
+
210
+ # setup lr
211
+ if c.noam_schedule:
212
+ scheduler.step()
213
+
214
+ # current_lr
215
+ current_lr = optimizer.param_groups[0]['lr']
216
+
217
+ # compute alignment error (the lower, the better)
218
+ align_error = 1 - alignment_diagonal_score(alignments, binary=True)
219
+ loss_dict['align_error'] = align_error
220
+
221
+ step_time = time.time() - start_time
222
+ epoch_time += step_time
223
+
224
+ # aggregate losses from processes
225
+ if num_gpus > 1:
226
+ loss_dict['log_mle'] = reduce_tensor(loss_dict['log_mle'].data, num_gpus)
227
+ loss_dict['loss_dur'] = reduce_tensor(loss_dict['loss_dur'].data, num_gpus)
228
+ loss_dict['loss'] = reduce_tensor(loss_dict['loss'].data, num_gpus)
229
+
230
+ # detach loss values
231
+ loss_dict_new = dict()
232
+ for key, value in loss_dict.items():
233
+ if isinstance(value, (int, float)):
234
+ loss_dict_new[key] = value
235
+ else:
236
+ loss_dict_new[key] = value.item()
237
+ loss_dict = loss_dict_new
238
+
239
+ # update avg stats
240
+ update_train_values = dict()
241
+ for key, value in loss_dict.items():
242
+ update_train_values['avg_' + key] = value
243
+ update_train_values['avg_loader_time'] = loader_time
244
+ update_train_values['avg_step_time'] = step_time
245
+ keep_avg.update_values(update_train_values)
246
+
247
+ # print training progress
248
+ if global_step % c.print_step == 0:
249
+ log_dict = {
250
+ "avg_spec_length": [avg_spec_length, 1], # value, precision
251
+ "avg_text_length": [avg_text_length, 1],
252
+ "step_time": [step_time, 4],
253
+ "loader_time": [loader_time, 2],
254
+ "current_lr": current_lr,
255
+ }
256
+ c_logger.print_train_step(batch_n_iter, num_iter, global_step,
257
+ log_dict, loss_dict, keep_avg.avg_values)
258
+
259
+ if args.rank == 0:
260
+ # Plot Training Iter Stats
261
+ # reduce TB load
262
+ if global_step % c.tb_plot_step == 0:
263
+ iter_stats = {
264
+ "lr": current_lr,
265
+ "grad_norm": grad_norm,
266
+ "step_time": step_time
267
+ }
268
+ iter_stats.update(loss_dict)
269
+ tb_logger.tb_train_iter_stats(global_step, iter_stats)
270
+
271
+ if global_step % c.save_step == 0:
272
+ if c.checkpoint:
273
+ # save model
274
+ save_checkpoint(model, optimizer, global_step, epoch, 1, OUT_PATH,
275
+ model_loss=loss_dict['loss'])
276
+
277
+ # wait all kernels to be completed
278
+ torch.cuda.synchronize()
279
+
280
+ # Diagnostic visualizations
281
+ # direct pass on model for spec predictions
282
+ target_speaker = None if speaker_c is None else speaker_c[:1]
283
+
284
+ if hasattr(model, 'module'):
285
+ spec_pred, *_ = model.module.inference(text_input[:1], text_lengths[:1], g=target_speaker)
286
+ else:
287
+ spec_pred, *_ = model.inference(text_input[:1], text_lengths[:1], g=target_speaker)
288
+
289
+ spec_pred = spec_pred.permute(0, 2, 1)
290
+ gt_spec = mel_input.permute(0, 2, 1)
291
+ const_spec = spec_pred[0].data.cpu().numpy()
292
+ gt_spec = gt_spec[0].data.cpu().numpy()
293
+ align_img = alignments[0].data.cpu().numpy()
294
+
295
+ figures = {
296
+ "prediction": plot_spectrogram(const_spec, ap),
297
+ "ground_truth": plot_spectrogram(gt_spec, ap),
298
+ "alignment": plot_alignment(align_img),
299
+ }
300
+
301
+ tb_logger.tb_train_figures(global_step, figures)
302
+
303
+ # Sample audio
304
+ train_audio = ap.inv_melspectrogram(const_spec.T)
305
+ tb_logger.tb_train_audios(global_step,
306
+ {'TrainAudio': train_audio},
307
+ c.audio["sample_rate"])
308
+ end_time = time.time()
309
+
310
+ # print epoch stats
311
+ c_logger.print_train_epoch_end(global_step, epoch, epoch_time, keep_avg)
312
+
313
+ # Plot Epoch Stats
314
+ if args.rank == 0:
315
+ epoch_stats = {"epoch_time": epoch_time}
316
+ epoch_stats.update(keep_avg.avg_values)
317
+ tb_logger.tb_train_epoch_stats(global_step, epoch_stats)
318
+ if c.tb_model_param_stats:
319
+ tb_logger.tb_model_weights(model, global_step)
320
+ return keep_avg.avg_values, global_step
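# The mixed-precision branch above follows the standard torch.cuda.amp recipe:
# forward under autocast, backward on the scaled loss, unscale before gradient
# clipping, then scaler.step / scaler.update. A condensed, self-contained
# sketch of one step (assuming a criterion that takes the model output
# directly):

import torch

def amp_train_step(model, batch, criterion, optimizer, scaler, grad_clip):
    optimizer.zero_grad()
    with torch.cuda.amp.autocast():
        loss = criterion(model(batch))
    scaler.scale(loss).backward()      # backward on the scaled loss
    scaler.unscale_(optimizer)         # unscale so clipping sees true grads
    torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
    scaler.step(optimizer)             # skipped internally if grads overflowed
    scaler.update()
    return loss.item()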
321
+
322
+
323
+ @torch.no_grad()
324
+ def evaluate(data_loader, model, criterion, ap, global_step, epoch):
325
+ model.eval()
326
+ epoch_time = 0
327
+ keep_avg = KeepAverage()
328
+ c_logger.print_eval_start()
329
+ if data_loader is not None:
330
+ for num_iter, data in enumerate(data_loader):
331
+ start_time = time.time()
332
+
333
+ # format data
334
+ text_input, text_lengths, mel_input, mel_lengths, speaker_c,\
335
+ _, _, attn_mask, item_idx = format_data(data)
336
+
337
+ # forward pass model
338
+ z, logdet, y_mean, y_log_scale, alignments, o_dur_log, o_total_dur = model.forward(
339
+ text_input, text_lengths, mel_input, mel_lengths, attn_mask, g=speaker_c)
340
+
341
+ # compute loss
342
+ loss_dict = criterion(z, y_mean, y_log_scale, logdet, mel_lengths,
343
+ o_dur_log, o_total_dur, text_lengths)
344
+
345
+ # step time
346
+ step_time = time.time() - start_time
347
+ epoch_time += step_time
348
+
349
+ # compute alignment score
350
+ align_error = 1 - alignment_diagonal_score(alignments)
351
+ loss_dict['align_error'] = align_error
352
+
353
+ # aggregate losses from processes
354
+ if num_gpus > 1:
355
+ loss_dict['log_mle'] = reduce_tensor(loss_dict['log_mle'].data, num_gpus)
356
+ loss_dict['loss_dur'] = reduce_tensor(loss_dict['loss_dur'].data, num_gpus)
357
+ loss_dict['loss'] = reduce_tensor(loss_dict['loss'].data, num_gpus)
358
+
359
+ # detach loss values
360
+ loss_dict_new = dict()
361
+ for key, value in loss_dict.items():
362
+ if isinstance(value, (int, float)):
363
+ loss_dict_new[key] = value
364
+ else:
365
+ loss_dict_new[key] = value.item()
366
+ loss_dict = loss_dict_new
367
+
368
+ # update avg stats
369
+ update_train_values = dict()
370
+ for key, value in loss_dict.items():
371
+ update_train_values['avg_' + key] = value
372
+ keep_avg.update_values(update_train_values)
373
+
374
+ if c.print_eval:
375
+ c_logger.print_eval_step(num_iter, loss_dict, keep_avg.avg_values)
376
+
377
+ if args.rank == 0:
378
+ # Diagnostic visualizations
379
+ # direct pass on model for spec predictions
380
+ target_speaker = None if speaker_c is None else speaker_c[:1]
381
+ if hasattr(model, 'module'):
382
+ spec_pred, *_ = model.module.inference(text_input[:1], text_lengths[:1], g=target_speaker)
383
+ else:
384
+ spec_pred, *_ = model.inference(text_input[:1], text_lengths[:1], g=target_speaker)
385
+ spec_pred = spec_pred.permute(0, 2, 1)
386
+ gt_spec = mel_input.permute(0, 2, 1)
387
+
388
+ const_spec = spec_pred[0].data.cpu().numpy()
389
+ gt_spec = gt_spec[0].data.cpu().numpy()
390
+ align_img = alignments[0].data.cpu().numpy()
391
+
392
+ eval_figures = {
393
+ "prediction": plot_spectrogram(const_spec, ap),
394
+ "ground_truth": plot_spectrogram(gt_spec, ap),
395
+ "alignment": plot_alignment(align_img)
396
+ }
397
+
398
+ # Sample audio
399
+ eval_audio = ap.inv_melspectrogram(const_spec.T)
400
+ tb_logger.tb_eval_audios(global_step, {"ValAudio": eval_audio},
401
+ c.audio["sample_rate"])
402
+
403
+ # Plot Validation Stats
404
+ tb_logger.tb_eval_stats(global_step, keep_avg.avg_values)
405
+ tb_logger.tb_eval_figures(global_step, eval_figures)
406
+
407
+ if args.rank == 0 and epoch >= c.test_delay_epochs:
408
+ if c.test_sentences_file is None:
409
+ test_sentences = [
410
+ "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
411
+ "Be a voice, not an echo.",
412
+ "I'm sorry Dave. I'm afraid I can't do that.",
413
+ "This cake is great. It's so delicious and moist.",
414
+ "Prior to November 22, 1963."
415
+ ]
416
+ else:
417
+ with open(c.test_sentences_file, "r") as f:
418
+ test_sentences = [s.strip() for s in f.readlines()]
419
+
420
+ # test sentences
421
+ test_audios = {}
422
+ test_figures = {}
423
+ print(" | > Synthesizing test sentences")
424
+ if c.use_speaker_embedding:
425
+ if c.use_external_speaker_embedding_file:
426
+ speaker_embedding = speaker_mapping[list(speaker_mapping.keys())[randrange(len(speaker_mapping)-1)]]['embedding']
427
+ speaker_id = None
428
+ else:
429
+ speaker_id = 0
430
+ speaker_embedding = None
431
+ else:
432
+ speaker_id = None
433
+ speaker_embedding = None
434
+
435
+ style_wav = c.get("style_wav_for_test")
436
+ for idx, test_sentence in enumerate(test_sentences):
437
+ try:
438
+ wav, alignment, _, postnet_output, _, _ = synthesis(
439
+ model,
440
+ test_sentence,
441
+ c,
442
+ use_cuda,
443
+ ap,
444
+ speaker_id=speaker_id,
445
+ speaker_embedding=speaker_embedding,
446
+ style_wav=style_wav,
447
+ truncated=False,
448
+ enable_eos_bos_chars=c.enable_eos_bos_chars, #pylint: disable=unused-argument
449
+ use_griffin_lim=True,
450
+ do_trim_silence=False)
451
+
452
+ file_path = os.path.join(AUDIO_PATH, str(global_step))
453
+ os.makedirs(file_path, exist_ok=True)
454
+ file_path = os.path.join(file_path,
455
+ "TestSentence_{}.wav".format(idx))
456
+ ap.save_wav(wav, file_path)
457
+ test_audios['{}-audio'.format(idx)] = wav
458
+ test_figures['{}-prediction'.format(idx)] = plot_spectrogram(
459
+ postnet_output, ap)
460
+ test_figures['{}-alignment'.format(idx)] = plot_alignment(
461
+ alignment)
462
+ except: #pylint: disable=bare-except
463
+ print(" !! Error creating Test Sentence -", idx)
464
+ traceback.print_exc()
465
+ tb_logger.tb_test_audios(global_step, test_audios,
466
+ c.audio['sample_rate'])
467
+ tb_logger.tb_test_figures(global_step, test_figures)
468
+ return keep_avg.avg_values
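# `reduce_tensor` (imported from TTS.utils.distribute) averages a loss value
# across processes so multi-GPU logs agree. A plausible sketch of such a
# helper, assuming it wraps torch.distributed.all_reduce:

import torch.distributed as dist

def reduce_tensor_sketch(tensor, num_gpus):
    rt = tensor.clone()
    dist.all_reduce(rt, op=dist.ReduceOp.SUM)  # sum over all processes
    rt /= num_gpus                             # then average
    return rt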
469
+
470
+
471
+ # FIXME: move args definition/parsing inside of main?
472
+ def main(args): # pylint: disable=redefined-outer-name
473
+ # pylint: disable=global-variable-undefined
474
+ global meta_data_train, meta_data_eval, symbols, phonemes, speaker_mapping
475
+ # Audio processor
476
+ ap = AudioProcessor(**c.audio)
477
+ if 'characters' in c.keys():
478
+ symbols, phonemes = make_symbols(**c.characters)
479
+
480
+ # DISTRIBUTED
481
+ if num_gpus > 1:
482
+ init_distributed(args.rank, num_gpus, args.group_id,
483
+ c.distributed["backend"], c.distributed["url"])
484
+ num_chars = len(phonemes) if c.use_phonemes else len(symbols)
485
+
486
+ # load data instances
487
+ meta_data_train, meta_data_eval = load_meta_data(c.datasets)
488
+
489
+ # set the portion of the data used for training
490
+ if 'train_portion' in c.keys():
491
+ meta_data_train = meta_data_train[:int(len(meta_data_train) * c.train_portion)]
492
+ if 'eval_portion' in c.keys():
493
+ meta_data_eval = meta_data_eval[:int(len(meta_data_eval) * c.eval_portion)]
494
+
495
+ # parse speakers
496
+ num_speakers, speaker_embedding_dim, speaker_mapping = parse_speakers(c, args, meta_data_train, OUT_PATH)
497
+
498
+ # setup model
499
+ model = setup_model(num_chars, num_speakers, c, speaker_embedding_dim=speaker_embedding_dim)
500
+ optimizer = RAdam(model.parameters(), lr=c.lr, weight_decay=0, betas=(0.9, 0.98), eps=1e-9)
501
+ criterion = GlowTTSLoss()
502
+
503
+ if args.restore_path:
504
+ checkpoint = torch.load(args.restore_path, map_location='cpu')
505
+ try:
506
+ # TODO: fix optimizer init, model.cuda() needs to be called before
507
+ # optimizer restore
508
+ optimizer.load_state_dict(checkpoint['optimizer'])
509
+ if c.reinit_layers:
510
+ raise RuntimeError
511
+ model.load_state_dict(checkpoint['model'])
512
+ except: #pylint: disable=bare-except
513
+ print(" > Partial model initialization.")
514
+ model_dict = model.state_dict()
515
+ model_dict = set_init_dict(model_dict, checkpoint['model'], c)
516
+ model.load_state_dict(model_dict)
517
+ del model_dict
518
+
519
+ for group in optimizer.param_groups:
520
+ group['initial_lr'] = c.lr
521
+ print(" > Model restored from step %d" % checkpoint['step'],
522
+ flush=True)
523
+ args.restore_step = checkpoint['step']
524
+ else:
525
+ args.restore_step = 0
526
+
527
+ if use_cuda:
528
+ model.cuda()
529
+ criterion.cuda()
530
+
531
+ # DISTRIBUTED
532
+ if num_gpus > 1:
533
+ model = DDP_th(model, device_ids=[args.rank])
534
+
535
+ if c.noam_schedule:
536
+ scheduler = NoamLR(optimizer,
537
+ warmup_steps=c.warmup_steps,
538
+ last_epoch=args.restore_step - 1)
539
+ else:
540
+ scheduler = None
541
+
542
+ num_params = count_parameters(model)
543
+ print("\n > Model has {} parameters".format(num_params), flush=True)
544
+
545
+ if 'best_loss' not in locals():
546
+ best_loss = float('inf')
547
+
548
+ # define dataloaders
549
+ train_loader = setup_loader(ap, 1, is_val=False, verbose=True)
550
+ eval_loader = setup_loader(ap, 1, is_val=True, verbose=True)
551
+
552
+ global_step = args.restore_step
553
+ model = data_depended_init(train_loader, model, ap)
554
+ for epoch in range(0, c.epochs):
555
+ c_logger.print_epoch_start(epoch, c.epochs)
556
+ train_avg_loss_dict, global_step = train(train_loader, model, criterion, optimizer,
557
+ scheduler, ap, global_step,
558
+ epoch)
559
+ eval_avg_loss_dict = evaluate(eval_loader, model, criterion, ap, global_step, epoch)
560
+ c_logger.print_epoch_end(epoch, eval_avg_loss_dict)
561
+ target_loss = train_avg_loss_dict['avg_loss']
562
+ if c.run_eval:
563
+ target_loss = eval_avg_loss_dict['avg_loss']
564
+ best_loss = save_best_model(target_loss, best_loss, model, optimizer, global_step, epoch, c.r,
565
+ OUT_PATH)
566
+
567
+
568
+ if __name__ == '__main__':
569
+ parser = argparse.ArgumentParser()
570
+ parser.add_argument(
571
+ '--continue_path',
572
+ type=str,
573
+ help='Output folder of a previous run to continue training from. If set, "config_path" is ignored.',
574
+ default='',
575
+ required='--config_path' not in sys.argv)
576
+ parser.add_argument(
577
+ '--restore_path',
578
+ type=str,
579
+ help='Model file to be restored. Use to finetune a model.',
580
+ default='')
581
+ parser.add_argument(
582
+ '--config_path',
583
+ type=str,
584
+ help='Path to config file for training.',
585
+ required='--continue_path' not in sys.argv
586
+ )
587
+ parser.add_argument('--debug',
588
+ type=bool,
589
+ default=False,
590
+ help='Do not verify commit integrity to run training.')
591
+
592
+ # DISTRIBUTED
593
+ parser.add_argument(
594
+ '--rank',
595
+ type=int,
596
+ default=0,
597
+ help='DISTRIBUTED: process rank for distributed training.')
598
+ parser.add_argument('--group_id',
599
+ type=str,
600
+ default="",
601
+ help='DISTRIBUTED: process group id.')
602
+ args = parser.parse_args()
603
+
604
+ if args.continue_path != '':
605
+ args.output_path = args.continue_path
606
+ args.config_path = os.path.join(args.continue_path, 'config.json')
607
+ list_of_files = glob.glob(args.continue_path + "/*.pth.tar")  # pick up all checkpoint files in the folder
608
+ latest_model_file = max(list_of_files, key=os.path.getctime)
609
+ args.restore_path = latest_model_file
610
+ print(f" > Training continues for {args.restore_path}")
611
+
612
+ # setup output paths and read configs
613
+ c = load_config(args.config_path)
614
+ # check_config(c)
615
+ check_config_tts(c)
616
+ _ = os.path.dirname(os.path.realpath(__file__))
617
+
618
+ if c.mixed_precision:
619
+ print(" > Mixed precision enabled.")
620
+
621
+ OUT_PATH = args.continue_path
622
+ if args.continue_path == '':
623
+ OUT_PATH = create_experiment_folder(c.output_path, c.run_name, args.debug)
624
+
625
+ AUDIO_PATH = os.path.join(OUT_PATH, 'test_audios')
626
+
627
+ c_logger = ConsoleLogger()
628
+
629
+ if args.rank == 0:
630
+ os.makedirs(AUDIO_PATH, exist_ok=True)
631
+ new_fields = {}
632
+ if args.restore_path:
633
+ new_fields["restore_path"] = args.restore_path
634
+ new_fields["github_branch"] = get_git_branch()
635
+ copy_model_files(c, args.config_path,
636
+ OUT_PATH, new_fields)
637
+ os.chmod(AUDIO_PATH, 0o775)
638
+ os.chmod(OUT_PATH, 0o775)
639
+
640
+ LOG_DIR = OUT_PATH
641
+ tb_logger = TensorboardLogger(LOG_DIR, model_name='TTS')
642
+
643
+ # write model desc to tensorboard
644
+ tb_logger.tb_add_text('model-description', c['run_description'], 0)
645
+
646
+ try:
647
+ main(args)
648
+ except KeyboardInterrupt:
649
+ remove_experiment_folder(OUT_PATH)
650
+ try:
651
+ sys.exit(0)
652
+ except SystemExit:
653
+ os._exit(0) # pylint: disable=protected-access
654
+ except Exception: # pylint: disable=broad-except
655
+ remove_experiment_folder(OUT_PATH)
656
+ traceback.print_exc()
657
+ sys.exit(1)
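Both this script and the SpeedySpeech trainer below can wrap the optimizer in NoamLR when noam_schedule is enabled: the learning rate warms up linearly for warmup_steps and then decays with the inverse square root of the step. A minimal sketch of the multiplier, assuming the standard Noam formulation:

def noam_lr_factor(step, warmup_steps):
    # linear warmup, then ~ step**-0.5 decay; the factor peaks at 1.0
    # exactly when step == warmup_steps
    step = max(step, 1)
    return warmup_steps ** 0.5 * min(step * warmup_steps ** -1.5, step ** -0.5)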
TTS/bin/train_speedy_speech.py ADDED
@@ -0,0 +1,618 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+ import argparse
5
+ import glob
6
+ import os
7
+ import sys
8
+ import time
9
+ import traceback
10
+ import numpy as np
11
+ from random import randrange
12
+
13
+ import torch
14
+ # DISTRIBUTED
15
+ from torch.nn.parallel import DistributedDataParallel as DDP_th
16
+ from torch.utils.data import DataLoader
17
+ from torch.utils.data.distributed import DistributedSampler
18
+ from TTS.tts.datasets.preprocess import load_meta_data
19
+ from TTS.tts.datasets.TTSDataset import MyDataset
20
+ from TTS.tts.layers.losses import SpeedySpeechLoss
21
+ from TTS.tts.utils.generic_utils import check_config_tts, setup_model
22
+ from TTS.tts.utils.io import save_best_model, save_checkpoint
23
+ from TTS.tts.utils.measures import alignment_diagonal_score
24
+ from TTS.tts.utils.speakers import parse_speakers
25
+ from TTS.tts.utils.synthesis import synthesis
26
+ from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols
27
+ from TTS.tts.utils.visual import plot_alignment, plot_spectrogram
28
+ from TTS.utils.audio import AudioProcessor
29
+ from TTS.utils.console_logger import ConsoleLogger
30
+ from TTS.utils.distribute import init_distributed, reduce_tensor
31
+ from TTS.utils.generic_utils import (KeepAverage, count_parameters,
32
+ create_experiment_folder, get_git_branch,
33
+ remove_experiment_folder, set_init_dict)
34
+ from TTS.utils.io import copy_model_files, load_config
35
+ from TTS.utils.radam import RAdam
36
+ from TTS.utils.tensorboard_logger import TensorboardLogger
37
+ from TTS.utils.training import NoamLR, setup_torch_training_env
38
+
39
+ use_cuda, num_gpus = setup_torch_training_env(True, False)
40
+
41
+
42
+ def setup_loader(ap, r, is_val=False, verbose=False):
43
+ if is_val and not c.run_eval:
44
+ loader = None
45
+ else:
46
+ dataset = MyDataset(
47
+ r,
48
+ c.text_cleaner,
49
+ compute_linear_spec=False,
50
+ meta_data=meta_data_eval if is_val else meta_data_train,
51
+ ap=ap,
52
+ tp=c.characters if 'characters' in c.keys() else None,
53
+ add_blank=c['add_blank'] if 'add_blank' in c.keys() else False,
54
+ batch_group_size=0 if is_val else c.batch_group_size *
55
+ c.batch_size,
56
+ min_seq_len=c.min_seq_len,
57
+ max_seq_len=c.max_seq_len,
58
+ phoneme_cache_path=c.phoneme_cache_path,
59
+ use_phonemes=c.use_phonemes,
60
+ phoneme_language=c.phoneme_language,
61
+ enable_eos_bos=c.enable_eos_bos_chars,
62
+ use_noise_augment=not is_val,
63
+ verbose=verbose,
64
+ speaker_mapping=speaker_mapping if c.use_speaker_embedding and c.use_external_speaker_embedding_file else None)
65
+
66
+ if c.use_phonemes and c.compute_input_seq_cache:
67
+ # precompute phonemes to have a better estimate of sequence lengths.
68
+ dataset.compute_input_seq(c.num_loader_workers)
69
+ dataset.sort_items()
70
+
71
+ sampler = DistributedSampler(dataset) if num_gpus > 1 else None
72
+ loader = DataLoader(
73
+ dataset,
74
+ batch_size=c.eval_batch_size if is_val else c.batch_size,
75
+ shuffle=False,
76
+ collate_fn=dataset.collate_fn,
77
+ drop_last=False,
78
+ sampler=sampler,
79
+ num_workers=c.num_val_loader_workers
80
+ if is_val else c.num_loader_workers,
81
+ pin_memory=False)
82
+ return loader
83
+
84
+
85
+ def format_data(data):
86
+ # setup input data
87
+ text_input = data[0]
88
+ text_lengths = data[1]
89
+ speaker_names = data[2]
90
+ mel_input = data[4].permute(0, 2, 1) # B x D x T
91
+ mel_lengths = data[5]
92
+ item_idx = data[7]
93
+ attn_mask = data[9]
94
+ avg_text_length = torch.mean(text_lengths.float())
95
+ avg_spec_length = torch.mean(mel_lengths.float())
96
+
97
+ if c.use_speaker_embedding:
98
+ if c.use_external_speaker_embedding_file:
99
+ # return precomputed embedding vector
100
+ speaker_c = data[8]
101
+ else:
102
+ # return speaker_id to be used by an embedding layer
103
+ speaker_c = [
104
+ speaker_mapping[speaker_name] for speaker_name in speaker_names
105
+ ]
106
+ speaker_c = torch.LongTensor(speaker_c)
107
+ else:
108
+ speaker_c = None
109
+ # compute durations from attention mask
110
+ durations = torch.zeros(attn_mask.shape[0], attn_mask.shape[2])
111
+ for idx, am in enumerate(attn_mask):
112
+ # compute raw durations
113
+ c_idxs = am[:, :text_lengths[idx], :mel_lengths[idx]].max(1)[1]
114
+ # c_idxs, counts = torch.unique_consecutive(c_idxs, return_counts=True)
115
+ c_idxs, counts = torch.unique(c_idxs, return_counts=True)
116
+ dur = torch.ones([text_lengths[idx]]).to(counts.dtype)
117
+ dur[c_idxs] = counts
118
+ # smooth the durations and set any 0 duration to 1
119
+ # by cutting frames off the largest duration indices.
120
+ extra_frames = dur.sum() - mel_lengths[idx]
121
+ largest_idxs = torch.argsort(-dur)[:extra_frames]
122
+ dur[largest_idxs] -= 1
123
+ assert dur.sum() == mel_lengths[idx], f" [!] total duration {dur.sum()} vs spectrogram length {mel_lengths[idx]}"
124
+ durations[idx, :text_lengths[idx]] = dur
125
+ # dispatch data to GPU
126
+ if use_cuda:
127
+ text_input = text_input.cuda(non_blocking=True)
128
+ text_lengths = text_lengths.cuda(non_blocking=True)
129
+ mel_input = mel_input.cuda(non_blocking=True)
130
+ mel_lengths = mel_lengths.cuda(non_blocking=True)
131
+ if speaker_c is not None:
132
+ speaker_c = speaker_c.cuda(non_blocking=True)
133
+ attn_mask = attn_mask.cuda(non_blocking=True)
134
+ durations = durations.cuda(non_blocking=True)
135
+ return text_input, text_lengths, mel_input, mel_lengths, speaker_c,\
136
+ avg_text_length, avg_spec_length, attn_mask, durations, item_idx
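# The duration computation above, condensed for a single item (illustrative
# sketch only): pick the winning character for every spectrogram frame, count
# frames per character, give zero-duration characters one frame, then trim the
# longest characters so the total matches the spectrogram length.

import torch

def durations_from_attention(am, text_len, mel_len):
    # am: hard attention map of shape (1, max_text_len, max_mel_len);
    # text_len and mel_len are plain Python ints here
    c_idxs = am[:, :text_len, :mel_len].max(1)[1]       # char index per frame
    c_idxs, counts = torch.unique(c_idxs, return_counts=True)
    dur = torch.ones(text_len, dtype=counts.dtype)      # fill zeros with 1
    dur[c_idxs] = counts
    extra = int(dur.sum() - mel_len)                    # frames added above
    if extra > 0:
        dur[torch.argsort(-dur)[:extra]] -= 1           # trim longest chars
    assert dur.sum() == mel_len
    return dur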
137
+
138
+
139
+ def train(data_loader, model, criterion, optimizer, scheduler,
140
+ ap, global_step, epoch):
141
+
142
+ model.train()
143
+ epoch_time = 0
144
+ keep_avg = KeepAverage()
145
+ if use_cuda:
146
+ batch_n_iter = int(
147
+ len(data_loader.dataset) / (c.batch_size * num_gpus))
148
+ else:
149
+ batch_n_iter = int(len(data_loader.dataset) / c.batch_size)
150
+ end_time = time.time()
151
+ c_logger.print_train_start()
152
+ scaler = torch.cuda.amp.GradScaler() if c.mixed_precision else None
153
+ for num_iter, data in enumerate(data_loader):
154
+ start_time = time.time()
155
+
156
+ # format data
157
+ text_input, text_lengths, mel_targets, mel_lengths, speaker_c,\
158
+ avg_text_length, avg_spec_length, _, dur_target, _ = format_data(data)
159
+
160
+ loader_time = time.time() - end_time
161
+
162
+ global_step += 1
163
+ optimizer.zero_grad()
164
+
165
+ # forward pass model
166
+ with torch.cuda.amp.autocast(enabled=c.mixed_precision):
167
+ decoder_output, dur_output, alignments = model.forward(
168
+ text_input, text_lengths, mel_lengths, dur_target, g=speaker_c)
169
+
170
+ # compute loss
171
+ loss_dict = criterion(decoder_output, mel_targets, mel_lengths, dur_output, torch.log(1 + dur_target), text_lengths)
172
+
173
+ # backward pass with loss scaling
174
+ if c.mixed_precision:
175
+ scaler.scale(loss_dict['loss']).backward()
176
+ scaler.unscale_(optimizer)
177
+ grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(),
178
+ c.grad_clip)
179
+ scaler.step(optimizer)
180
+ scaler.update()
181
+ else:
182
+ loss_dict['loss'].backward()
183
+ grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(),
184
+ c.grad_clip)
185
+ optimizer.step()
186
+
187
+ # setup lr
188
+ if c.noam_schedule:
189
+ scheduler.step()
190
+
191
+ # current_lr
192
+ current_lr = optimizer.param_groups[0]['lr']
193
+
194
+ # compute alignment error (the lower, the better)
195
+ align_error = 1 - alignment_diagonal_score(alignments, binary=True)
196
+ loss_dict['align_error'] = align_error
197
+
198
+ step_time = time.time() - start_time
199
+ epoch_time += step_time
200
+
201
+ # aggregate losses from processes
202
+ if num_gpus > 1:
203
+ loss_dict['loss_l1'] = reduce_tensor(loss_dict['loss_l1'].data, num_gpus)
204
+ loss_dict['loss_ssim'] = reduce_tensor(loss_dict['loss_ssim'].data, num_gpus)
205
+ loss_dict['loss_dur'] = reduce_tensor(loss_dict['loss_dur'].data, num_gpus)
206
+ loss_dict['loss'] = reduce_tensor(loss_dict['loss'].data, num_gpus)
207
+
208
+ # detach loss values
209
+ loss_dict_new = dict()
210
+ for key, value in loss_dict.items():
211
+ if isinstance(value, (int, float)):
212
+ loss_dict_new[key] = value
213
+ else:
214
+ loss_dict_new[key] = value.item()
215
+ loss_dict = loss_dict_new
216
+
217
+ # update avg stats
218
+ update_train_values = dict()
219
+ for key, value in loss_dict.items():
220
+ update_train_values['avg_' + key] = value
221
+ update_train_values['avg_loader_time'] = loader_time
222
+ update_train_values['avg_step_time'] = step_time
223
+ keep_avg.update_values(update_train_values)
224
+
225
+ # print training progress
226
+ if global_step % c.print_step == 0:
227
+ log_dict = {
228
+
229
+ "avg_spec_length": [avg_spec_length, 1], # value, precision
230
+ "avg_text_length": [avg_text_length, 1],
231
+ "step_time": [step_time, 4],
232
+ "loader_time": [loader_time, 2],
233
+ "current_lr": current_lr,
234
+ }
235
+ c_logger.print_train_step(batch_n_iter, num_iter, global_step,
236
+ log_dict, loss_dict, keep_avg.avg_values)
237
+
238
+ if args.rank == 0:
239
+ # Plot Training Iter Stats
240
+ # reduce TB load
241
+ if global_step % c.tb_plot_step == 0:
242
+ iter_stats = {
243
+ "lr": current_lr,
244
+ "grad_norm": grad_norm,
245
+ "step_time": step_time
246
+ }
247
+ iter_stats.update(loss_dict)
248
+ tb_logger.tb_train_iter_stats(global_step, iter_stats)
249
+
250
+ if global_step % c.save_step == 0:
251
+ if c.checkpoint:
252
+ # save model
253
+ save_checkpoint(model, optimizer, global_step, epoch, 1, OUT_PATH,
254
+ model_loss=loss_dict['loss'])
255
+
256
+ # wait all kernels to be completed
257
+ torch.cuda.synchronize()
258
+
259
+ # Diagnostic visualizations
260
+ idx = np.random.randint(mel_targets.shape[0])
261
+ pred_spec = decoder_output[idx].detach().data.cpu().numpy().T
262
+ gt_spec = mel_targets[idx].data.cpu().numpy().T
263
+ align_img = alignments[idx].data.cpu()
264
+
265
+ figures = {
266
+ "prediction": plot_spectrogram(pred_spec, ap),
267
+ "ground_truth": plot_spectrogram(gt_spec, ap),
268
+ "alignment": plot_alignment(align_img),
269
+ }
270
+
271
+ tb_logger.tb_train_figures(global_step, figures)
272
+
273
+ # Sample audio
274
+ train_audio = ap.inv_melspectrogram(pred_spec.T)
275
+ tb_logger.tb_train_audios(global_step,
276
+ {'TrainAudio': train_audio},
277
+ c.audio["sample_rate"])
278
+ end_time = time.time()
279
+
280
+ # print epoch stats
281
+ c_logger.print_train_epoch_end(global_step, epoch, epoch_time, keep_avg)
282
+
283
+ # Plot Epoch Stats
284
+ if args.rank == 0:
285
+ epoch_stats = {"epoch_time": epoch_time}
286
+ epoch_stats.update(keep_avg.avg_values)
287
+ tb_logger.tb_train_epoch_stats(global_step, epoch_stats)
288
+ if c.tb_model_param_stats:
289
+ tb_logger.tb_model_weights(model, global_step)
290
+ return keep_avg.avg_values, global_step
291
+
292
+
293
+ @torch.no_grad()
294
+ def evaluate(data_loader, model, criterion, ap, global_step, epoch):
295
+ model.eval()
296
+ epoch_time = 0
297
+ keep_avg = KeepAverage()
298
+ c_logger.print_eval_start()
299
+ if data_loader is not None:
300
+ for num_iter, data in enumerate(data_loader):
301
+ start_time = time.time()
302
+
303
+ # format data
304
+ text_input, text_lengths, mel_targets, mel_lengths, speaker_c,\
305
+ _, _, _, dur_target, _ = format_data(data)
306
+
307
+ # forward pass model
308
+ with torch.cuda.amp.autocast(enabled=c.mixed_precision):
309
+ decoder_output, dur_output, alignments = model.forward(
310
+ text_input, text_lengths, mel_lengths, dur_target, g=speaker_c)
311
+
312
+ # compute loss
313
+ loss_dict = criterion(decoder_output, mel_targets, mel_lengths, dur_output, torch.log(1 + dur_target), text_lengths)
314
+
315
+ # step time
316
+ step_time = time.time() - start_time
317
+ epoch_time += step_time
318
+
319
+ # compute alignment score
320
+ align_error = 1 - alignment_diagonal_score(alignments, binary=True)
321
+ loss_dict['align_error'] = align_error
322
+
323
+ # aggregate losses from processes
324
+ if num_gpus > 1:
325
+ loss_dict['loss_l1'] = reduce_tensor(loss_dict['loss_l1'].data, num_gpus)
326
+ loss_dict['loss_ssim'] = reduce_tensor(loss_dict['loss_ssim'].data, num_gpus)
327
+ loss_dict['loss_dur'] = reduce_tensor(loss_dict['loss_dur'].data, num_gpus)
328
+ loss_dict['loss'] = reduce_tensor(loss_dict['loss'].data, num_gpus)
329
+
330
+ # detach loss values
331
+ loss_dict_new = dict()
332
+ for key, value in loss_dict.items():
333
+ if isinstance(value, (int, float)):
334
+ loss_dict_new[key] = value
335
+ else:
336
+ loss_dict_new[key] = value.item()
337
+ loss_dict = loss_dict_new
338
+
339
+ # update avg stats
340
+ update_train_values = dict()
341
+ for key, value in loss_dict.items():
342
+ update_train_values['avg_' + key] = value
343
+ keep_avg.update_values(update_train_values)
344
+
345
+ if c.print_eval:
346
+ c_logger.print_eval_step(num_iter, loss_dict, keep_avg.avg_values)
347
+
348
+ if args.rank == 0:
349
+ # Diagnostic visualizations
350
+ idx = np.random.randint(mel_targets.shape[0])
351
+ pred_spec = decoder_output[idx].detach().data.cpu().numpy().T
352
+ gt_spec = mel_targets[idx].data.cpu().numpy().T
353
+ align_img = alignments[idx].data.cpu()
354
+
355
+ eval_figures = {
356
+ "prediction": plot_spectrogram(pred_spec, ap, output_fig=False),
357
+ "ground_truth": plot_spectrogram(gt_spec, ap, output_fig=False),
358
+ "alignment": plot_alignment(align_img, output_fig=False)
359
+ }
360
+
361
+ # Sample audio
362
+ eval_audio = ap.inv_melspectrogram(pred_spec.T)
363
+ tb_logger.tb_eval_audios(global_step, {"ValAudio": eval_audio},
364
+ c.audio["sample_rate"])
365
+
366
+ # Plot Validation Stats
367
+ tb_logger.tb_eval_stats(global_step, keep_avg.avg_values)
368
+ tb_logger.tb_eval_figures(global_step, eval_figures)
369
+
370
+ if args.rank == 0 and epoch >= c.test_delay_epochs:
371
+ if c.test_sentences_file is None:
372
+ test_sentences = [
373
+ "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
374
+ "Be a voice, not an echo.",
375
+ "I'm sorry Dave. I'm afraid I can't do that.",
376
+ "This cake is great. It's so delicious and moist.",
377
+ "Prior to November 22, 1963."
378
+ ]
379
+ else:
380
+ with open(c.test_sentences_file, "r") as f:
381
+ test_sentences = [s.strip() for s in f.readlines()]
382
+
383
+ # test sentences
384
+ test_audios = {}
385
+ test_figures = {}
386
+ print(" | > Synthesizing test sentences")
387
+ if c.use_speaker_embedding:
388
+ if c.use_external_speaker_embedding_file:
389
+ speaker_embedding = speaker_mapping[list(speaker_mapping.keys())[randrange(len(speaker_mapping)-1)]]['embedding']
390
+ speaker_id = None
391
+ else:
392
+ speaker_id = 0
393
+ speaker_embedding = None
394
+ else:
395
+ speaker_id = None
396
+ speaker_embedding = None
397
+
398
+ style_wav = c.get("style_wav_for_test")
399
+ for idx, test_sentence in enumerate(test_sentences):
400
+ try:
401
+ wav, alignment, _, postnet_output, _, _ = synthesis(
402
+ model,
403
+ test_sentence,
404
+ c,
405
+ use_cuda,
406
+ ap,
407
+ speaker_id=speaker_id,
408
+ speaker_embedding=speaker_embedding,
409
+ style_wav=style_wav,
410
+ truncated=False,
411
+ enable_eos_bos_chars=c.enable_eos_bos_chars, #pylint: disable=unused-argument
412
+ use_griffin_lim=True,
413
+ do_trim_silence=False)
414
+
415
+ file_path = os.path.join(AUDIO_PATH, str(global_step))
416
+ os.makedirs(file_path, exist_ok=True)
417
+ file_path = os.path.join(file_path,
418
+ "TestSentence_{}.wav".format(idx))
419
+ ap.save_wav(wav, file_path)
420
+ test_audios['{}-audio'.format(idx)] = wav
421
+ test_figures['{}-prediction'.format(idx)] = plot_spectrogram(
422
+ postnet_output, ap)
423
+ test_figures['{}-alignment'.format(idx)] = plot_alignment(
424
+ alignment)
425
+ except: #pylint: disable=bare-except
426
+ print(" !! Error creating Test Sentence -", idx)
427
+ traceback.print_exc()
428
+ tb_logger.tb_test_audios(global_step, test_audios,
429
+ c.audio['sample_rate'])
430
+ tb_logger.tb_test_figures(global_step, test_figures)
431
+ return keep_avg.avg_values
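# Note that the criterion above compares the duration predictor against
# torch.log(1 + dur_target) rather than raw frame counts, which compresses the
# range of long durations. Inverting a predicted log-duration back to frames
# (a sketch; the actual inference-side decoding lives in the model code):

import torch

dur_target = torch.tensor([1., 4., 12.])   # frames per character
log_dur = torch.log(1 + dur_target)        # the training target
dur_back = torch.round(torch.exp(log_dur) - 1)
assert torch.equal(dur_back, dur_target)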
432
+
433
+
434
+ # FIXME: move args definition/parsing inside of main?
435
+ def main(args): # pylint: disable=redefined-outer-name
436
+ # pylint: disable=global-variable-undefined
437
+ global meta_data_train, meta_data_eval, symbols, phonemes, speaker_mapping
438
+ # Audio processor
439
+ ap = AudioProcessor(**c.audio)
440
+ if 'characters' in c.keys():
441
+ symbols, phonemes = make_symbols(**c.characters)
442
+
443
+ # DISTRIBUTED
444
+ if num_gpus > 1:
445
+ init_distributed(args.rank, num_gpus, args.group_id,
446
+ c.distributed["backend"], c.distributed["url"])
447
+ num_chars = len(phonemes) if c.use_phonemes else len(symbols)
448
+
449
+ # load data instances
450
+ meta_data_train, meta_data_eval = load_meta_data(c.datasets, eval_split=True)
451
+
452
+ # set the portion of the data used for training if set in config.json
453
+ if 'train_portion' in c.keys():
454
+ meta_data_train = meta_data_train[:int(len(meta_data_train) * c.train_portion)]
455
+ if 'eval_portion' in c.keys():
456
+ meta_data_eval = meta_data_eval[:int(len(meta_data_eval) * c.eval_portion)]
457
+
458
+ # parse speakers
459
+ num_speakers, speaker_embedding_dim, speaker_mapping = parse_speakers(c, args, meta_data_train, OUT_PATH)
460
+
461
+ # setup model
462
+ model = setup_model(num_chars, num_speakers, c, speaker_embedding_dim=speaker_embedding_dim)
463
+ optimizer = RAdam(model.parameters(), lr=c.lr, weight_decay=0, betas=(0.9, 0.98), eps=1e-9)
464
+ criterion = SpeedySpeechLoss(c)
465
+
466
+ if args.restore_path:
467
+ checkpoint = torch.load(args.restore_path, map_location='cpu')
468
+ try:
469
+ # TODO: fix optimizer init, model.cuda() needs to be called before
470
+ # optimizer restore
471
+ optimizer.load_state_dict(checkpoint['optimizer'])
472
+ if c.reinit_layers:
473
+ raise RuntimeError
474
+ model.load_state_dict(checkpoint['model'])
475
+ except: #pylint: disable=bare-except
476
+ print(" > Partial model initialization.")
477
+ model_dict = model.state_dict()
478
+ model_dict = set_init_dict(model_dict, checkpoint['model'], c)
479
+ model.load_state_dict(model_dict)
480
+ del model_dict
481
+
482
+ for group in optimizer.param_groups:
483
+ group['initial_lr'] = c.lr
484
+ print(" > Model restored from step %d" % checkpoint['step'],
485
+ flush=True)
486
+ args.restore_step = checkpoint['step']
487
+ else:
488
+ args.restore_step = 0
489
+
490
+ if use_cuda:
491
+ model.cuda()
492
+ criterion.cuda()
493
+
494
+ # DISTRIBUTED
495
+ if num_gpus > 1:
496
+ model = DDP_th(model, device_ids=[args.rank])
497
+
498
+ if c.noam_schedule:
499
+ scheduler = NoamLR(optimizer,
500
+ warmup_steps=c.warmup_steps,
501
+ last_epoch=args.restore_step - 1)
502
+ else:
503
+ scheduler = None
504
+
505
+ num_params = count_parameters(model)
506
+ print("\n > Model has {} parameters".format(num_params), flush=True)
507
+
508
+ if 'best_loss' not in locals():
509
+ best_loss = float('inf')
510
+
511
+ # define dataloaders
512
+ train_loader = setup_loader(ap, 1, is_val=False, verbose=True)
513
+ eval_loader = setup_loader(ap, 1, is_val=True, verbose=True)
514
+
515
+ global_step = args.restore_step
516
+ for epoch in range(0, c.epochs):
517
+ c_logger.print_epoch_start(epoch, c.epochs)
518
+ train_avg_loss_dict, global_step = train(train_loader, model, criterion, optimizer,
519
+ scheduler, ap, global_step,
520
+ epoch)
521
+ eval_avg_loss_dict = evaluate(eval_loader, model, criterion, ap, global_step, epoch)
522
+ c_logger.print_epoch_end(epoch, eval_avg_loss_dict)
523
+ target_loss = train_avg_loss_dict['avg_loss']
524
+ if c.run_eval:
525
+ target_loss = eval_avg_loss_dict['avg_loss']
526
+ best_loss = save_best_model(target_loss, best_loss, model, optimizer, global_step, epoch, c.r,
527
+ OUT_PATH)
528
+
529
+
530
+ if __name__ == '__main__':
531
+ parser = argparse.ArgumentParser()
532
+ parser.add_argument(
533
+ '--continue_path',
534
+ type=str,
535
+ help='Training output folder to continue a previous training run. If set, "config_path" is ignored.',
536
+ default='',
537
+ required='--config_path' not in sys.argv)
538
+ parser.add_argument(
539
+ '--restore_path',
540
+ type=str,
541
+ help='Model file to be restored. Use to finetune a model.',
542
+ default='')
543
+ parser.add_argument(
544
+ '--config_path',
545
+ type=str,
546
+ help='Path to config file for training.',
547
+ required='--continue_path' not in sys.argv
548
+ )
549
+ parser.add_argument('--debug',
550
+ type=bool,
551
+ default=False,
552
+ help='Run in debug mode; skip the commit integrity check before training.')
553
+
554
+ # DISTRIBUTED
555
+ parser.add_argument(
556
+ '--rank',
557
+ type=int,
558
+ default=0,
559
+ help='DISTRIBUTED: process rank for distributed training.')
560
+ parser.add_argument('--group_id',
561
+ type=str,
562
+ default="",
563
+ help='DISTRIBUTED: process group id.')
564
+ args = parser.parse_args()
565
+
566
+ if args.continue_path != '':
567
+ args.output_path = args.continue_path
568
+ args.config_path = os.path.join(args.continue_path, 'config.json')
569
+ list_of_files = glob.glob(args.continue_path + "/*.pth.tar") # pick up all checkpoint files in the output folder
570
+ latest_model_file = max(list_of_files, key=os.path.getctime)
571
+ args.restore_path = latest_model_file
572
+ print(f" > Training continues for {args.restore_path}")
573
+
574
+ # setup output paths and read configs
575
+ c = load_config(args.config_path)
576
+ # check_config(c)
577
+ check_config_tts(c)
578
+ _ = os.path.dirname(os.path.realpath(__file__))
579
+
580
+ if c.mixed_precision:
581
+ print(" > Mixed precision enabled.")
582
+
583
+ OUT_PATH = args.continue_path
584
+ if args.continue_path == '':
585
+ OUT_PATH = create_experiment_folder(c.output_path, c.run_name, args.debug)
586
+
587
+ AUDIO_PATH = os.path.join(OUT_PATH, 'test_audios')
588
+
589
+ c_logger = ConsoleLogger()
590
+
591
+ if args.rank == 0:
592
+ os.makedirs(AUDIO_PATH, exist_ok=True)
593
+ new_fields = {}
594
+ if args.restore_path:
595
+ new_fields["restore_path"] = args.restore_path
596
+ new_fields["github_branch"] = get_git_branch()
597
+ copy_model_files(c, args.config_path, OUT_PATH, new_fields)
598
+ os.chmod(AUDIO_PATH, 0o775)
599
+ os.chmod(OUT_PATH, 0o775)
600
+
601
+ LOG_DIR = OUT_PATH
602
+ tb_logger = TensorboardLogger(LOG_DIR, model_name='TTS')
603
+
604
+ # write model desc to tensorboard
605
+ tb_logger.tb_add_text('model-description', c['run_description'], 0)
606
+
607
+ try:
608
+ main(args)
609
+ except KeyboardInterrupt:
610
+ remove_experiment_folder(OUT_PATH)
611
+ try:
612
+ sys.exit(0)
613
+ except SystemExit:
614
+ os._exit(0) # pylint: disable=protected-access
615
+ except Exception: # pylint: disable=broad-except
616
+ remove_experiment_folder(OUT_PATH)
617
+ traceback.print_exc()
618
+ sys.exit(1)
TTS/bin/train_tacotron.py ADDED
@@ -0,0 +1,731 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+ import argparse
5
+ import glob
6
+ import os
7
+ import sys
8
+ import time
9
+ import traceback
10
+ from random import randrange
11
+
12
+ import numpy as np
13
+ import torch
14
+ from torch.utils.data import DataLoader
15
+ from TTS.tts.datasets.preprocess import load_meta_data
16
+ from TTS.tts.datasets.TTSDataset import MyDataset
17
+ from TTS.tts.layers.losses import TacotronLoss
18
+ from TTS.tts.utils.generic_utils import check_config_tts, setup_model
19
+ from TTS.tts.utils.io import save_best_model, save_checkpoint
20
+ from TTS.tts.utils.measures import alignment_diagonal_score
21
+ from TTS.tts.utils.speakers import parse_speakers
22
+ from TTS.tts.utils.synthesis import synthesis
23
+ from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols
24
+ from TTS.tts.utils.visual import plot_alignment, plot_spectrogram
25
+ from TTS.utils.audio import AudioProcessor
26
+ from TTS.utils.console_logger import ConsoleLogger
27
+ from TTS.utils.distribute import (DistributedSampler, apply_gradient_allreduce,
28
+ init_distributed, reduce_tensor)
29
+ from TTS.utils.generic_utils import (KeepAverage, count_parameters,
30
+ create_experiment_folder, get_git_branch,
31
+ remove_experiment_folder, set_init_dict)
32
+ from TTS.utils.io import copy_model_files, load_config
33
+ from TTS.utils.radam import RAdam
34
+ from TTS.utils.tensorboard_logger import TensorboardLogger
35
+ from TTS.utils.training import (NoamLR, adam_weight_decay, check_update,
36
+ gradual_training_scheduler, set_weight_decay,
37
+ setup_torch_training_env)
38
+
39
+ use_cuda, num_gpus = setup_torch_training_env(True, False)
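+ # flags are presumably (cudnn_enable, cudnn_benchmark); benchmarking is left off here since Tacotron batches vary in sequence length.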
40
+
41
+
42
+ def setup_loader(ap, r, is_val=False, verbose=False, dataset=None):
43
+ if is_val and not c.run_eval:
44
+ loader = None
45
+ else:
46
+ if dataset is None:
47
+ dataset = MyDataset(
48
+ r,
49
+ c.text_cleaner,
50
+ compute_linear_spec=c.model.lower() == 'tacotron',
51
+ meta_data=meta_data_eval if is_val else meta_data_train,
52
+ ap=ap,
53
+ tp=c.characters if 'characters' in c.keys() else None,
54
+ add_blank=c['add_blank'] if 'add_blank' in c.keys() else False,
55
+ batch_group_size=0 if is_val else c.batch_group_size *
56
+ c.batch_size,
57
+ min_seq_len=c.min_seq_len,
58
+ max_seq_len=c.max_seq_len,
59
+ phoneme_cache_path=c.phoneme_cache_path,
60
+ use_phonemes=c.use_phonemes,
61
+ phoneme_language=c.phoneme_language,
62
+ enable_eos_bos=c.enable_eos_bos_chars,
63
+ verbose=verbose,
64
+ speaker_mapping=speaker_mapping if c.use_speaker_embedding and c.use_external_speaker_embedding_file else None)
65
+
66
+ if c.use_phonemes and c.compute_input_seq_cache:
67
+ # precompute phonemes to have a better estimate of sequence lengths.
68
+ dataset.compute_input_seq(c.num_loader_workers)
69
+ dataset.sort_items()
70
+
71
+ sampler = DistributedSampler(dataset) if num_gpus > 1 else None
72
+ loader = DataLoader(
73
+ dataset,
74
+ batch_size=c.eval_batch_size if is_val else c.batch_size,
75
+ shuffle=False,
76
+ collate_fn=dataset.collate_fn,
77
+ drop_last=False,
78
+ sampler=sampler,
79
+ num_workers=c.num_val_loader_workers
80
+ if is_val else c.num_loader_workers,
81
+ pin_memory=False)
82
+ return loader
83
+
84
+ def format_data(data):
85
+ # setup input data
86
+ text_input = data[0]
87
+ text_lengths = data[1]
88
+ speaker_names = data[2]
89
+ linear_input = data[3] if c.model in ["Tacotron"] else None
90
+ mel_input = data[4]
91
+ mel_lengths = data[5]
92
+ stop_targets = data[6]
93
+ max_text_length = torch.max(text_lengths.float())
94
+ max_spec_length = torch.max(mel_lengths.float())
95
+
96
+ if c.use_speaker_embedding:
97
+ if c.use_external_speaker_embedding_file:
98
+ speaker_embeddings = data[8]
99
+ speaker_ids = None
100
+ else:
101
+ speaker_ids = [
102
+ speaker_mapping[speaker_name] for speaker_name in speaker_names
103
+ ]
104
+ speaker_ids = torch.LongTensor(speaker_ids)
105
+ speaker_embeddings = None
106
+ else:
107
+ speaker_embeddings = None
108
+ speaker_ids = None
109
+
110
+
111
+ # set stop targets view, we predict a single stop token per iteration.
112
+ stop_targets = stop_targets.view(text_input.shape[0],
113
+ stop_targets.size(1) // c.r, -1)
114
+ stop_targets = (stop_targets.sum(2) >
115
+ 0.0).unsqueeze(2).float().squeeze(2)
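+ # i.e. group stop frames into chunks of r and mark a chunk as "stop" if any of its frames is, giving one stop target per decoder step.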
116
+
117
+ # dispatch data to GPU
118
+ if use_cuda:
119
+ text_input = text_input.cuda(non_blocking=True)
120
+ text_lengths = text_lengths.cuda(non_blocking=True)
121
+ mel_input = mel_input.cuda(non_blocking=True)
122
+ mel_lengths = mel_lengths.cuda(non_blocking=True)
123
+ linear_input = linear_input.cuda(non_blocking=True) if c.model in ["Tacotron"] else None
124
+ stop_targets = stop_targets.cuda(non_blocking=True)
125
+ if speaker_ids is not None:
126
+ speaker_ids = speaker_ids.cuda(non_blocking=True)
127
+ if speaker_embeddings is not None:
128
+ speaker_embeddings = speaker_embeddings.cuda(non_blocking=True)
129
+
130
+ return text_input, text_lengths, mel_input, mel_lengths, linear_input, stop_targets, speaker_ids, speaker_embeddings, max_text_length, max_spec_length
131
+
132
+
133
+ def train(data_loader, model, criterion, optimizer, optimizer_st, scheduler,
134
+ ap, global_step, epoch, scaler, scaler_st):
135
+ model.train()
136
+ epoch_time = 0
137
+ keep_avg = KeepAverage()
138
+ if use_cuda:
139
+ batch_n_iter = int(
140
+ len(data_loader.dataset) / (c.batch_size * num_gpus))
141
+ else:
142
+ batch_n_iter = int(len(data_loader.dataset) / c.batch_size)
143
+ end_time = time.time()
144
+ c_logger.print_train_start()
145
+ for num_iter, data in enumerate(data_loader):
146
+ start_time = time.time()
147
+
148
+ # format data
149
+ text_input, text_lengths, mel_input, mel_lengths, linear_input, stop_targets, speaker_ids, speaker_embeddings, max_text_length, max_spec_length = format_data(data)
150
+ loader_time = time.time() - end_time
151
+
152
+ global_step += 1
153
+
154
+ # setup lr
155
+ if c.noam_schedule:
156
+ scheduler.step()
157
+
158
+ optimizer.zero_grad()
159
+ if optimizer_st:
160
+ optimizer_st.zero_grad()
161
+
162
+ with torch.cuda.amp.autocast(enabled=c.mixed_precision):
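+ # under autocast the forward pass runs in fp16 when c.mixed_precision is on; the GradScaler below takes care of loss scaling.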
163
+ # forward pass model
164
+ if c.bidirectional_decoder or c.double_decoder_consistency:
165
+ decoder_output, postnet_output, alignments, stop_tokens, decoder_backward_output, alignments_backward = model(
166
+ text_input, text_lengths, mel_input, mel_lengths, speaker_ids=speaker_ids, speaker_embeddings=speaker_embeddings)
167
+ else:
168
+ decoder_output, postnet_output, alignments, stop_tokens = model(
169
+ text_input, text_lengths, mel_input, mel_lengths, speaker_ids=speaker_ids, speaker_embeddings=speaker_embeddings)
170
+ decoder_backward_output = None
171
+ alignments_backward = None
172
+
173
+ # set the alignment lengths wrt reduction factor for guided attention
174
+ if mel_lengths.max() % model.decoder.r != 0:
175
+ alignment_lengths = (mel_lengths + (model.decoder.r - (mel_lengths.max() % model.decoder.r))) // model.decoder.r
176
+ else:
177
+ alignment_lengths = mel_lengths // model.decoder.r
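+ # i.e. pad every mel length by the amount needed to make the longest one a multiple of r, then convert to decoder steps.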
178
+
179
+ # compute loss
180
+ loss_dict = criterion(postnet_output, decoder_output, mel_input,
181
+ linear_input, stop_tokens, stop_targets,
182
+ mel_lengths, decoder_backward_output,
183
+ alignments, alignment_lengths, alignments_backward,
184
+ text_lengths)
185
+
186
+ # check nan loss
187
+ if torch.isnan(loss_dict['loss']).any():
188
+ raise RuntimeError(f'Detected NaN loss at step {global_step}.')
189
+
190
+ # optimizer step
191
+ if c.mixed_precision:
192
+ # model optimizer step in mixed precision mode
193
+ scaler.scale(loss_dict['loss']).backward()
194
+ scaler.unscale_(optimizer)
195
+ optimizer, current_lr = adam_weight_decay(optimizer)
196
+ grad_norm, _ = check_update(model, c.grad_clip, ignore_stopnet=True)
197
+ scaler.step(optimizer)
198
+ scaler.update()
199
+
200
+ # stopnet optimizer step
201
+ if c.separate_stopnet:
202
+ scaler_st.scale(loss_dict['stopnet_loss']).backward()
203
+ scaler_st.unscale_(optimizer_st)
204
+ optimizer_st, _ = adam_weight_decay(optimizer_st)
205
+ grad_norm_st, _ = check_update(model.decoder.stopnet, 1.0)
206
+ scaler_st.step(optimizer_st)
207
+ scaler_st.update()
208
+ else:
209
+ grad_norm_st = 0
210
+ else:
211
+ # main model optimizer step
212
+ loss_dict['loss'].backward()
213
+ optimizer, current_lr = adam_weight_decay(optimizer)
214
+ grad_norm, _ = check_update(model, c.grad_clip, ignore_stopnet=True)
215
+ optimizer.step()
216
+
217
+ # stopnet optimizer step
218
+ if c.separate_stopnet:
219
+ loss_dict['stopnet_loss'].backward()
220
+ optimizer_st, _ = adam_weight_decay(optimizer_st)
221
+ grad_norm_st, _ = check_update(model.decoder.stopnet, 1.0)
222
+ optimizer_st.step()
223
+ else:
224
+ grad_norm_st = 0
225
+
226
+ # compute alignment error (the lower the better)
227
+ align_error = 1 - alignment_diagonal_score(alignments)
228
+ loss_dict['align_error'] = align_error
229
+
230
+ step_time = time.time() - start_time
231
+ epoch_time += step_time
232
+
233
+ # aggregate losses from processes
234
+ if num_gpus > 1:
235
+ loss_dict['postnet_loss'] = reduce_tensor(loss_dict['postnet_loss'].data, num_gpus)
236
+ loss_dict['decoder_loss'] = reduce_tensor(loss_dict['decoder_loss'].data, num_gpus)
237
+ loss_dict['loss'] = reduce_tensor(loss_dict['loss'].data, num_gpus)
238
+ loss_dict['stopnet_loss'] = reduce_tensor(loss_dict['stopnet_loss'].data, num_gpus) if c.stopnet else loss_dict['stopnet_loss']
239
+
240
+ # detach loss values
241
+ loss_dict_new = dict()
242
+ for key, value in loss_dict.items():
243
+ if isinstance(value, (int, float)):
244
+ loss_dict_new[key] = value
245
+ else:
246
+ loss_dict_new[key] = value.item()
247
+ loss_dict = loss_dict_new
248
+
249
+ # update avg stats
250
+ update_train_values = dict()
251
+ for key, value in loss_dict.items():
252
+ update_train_values['avg_' + key] = value
253
+ update_train_values['avg_loader_time'] = loader_time
254
+ update_train_values['avg_step_time'] = step_time
255
+ keep_avg.update_values(update_train_values)
256
+
257
+ # print training progress
258
+ if global_step % c.print_step == 0:
259
+ log_dict = {
260
+ "max_spec_length": [max_spec_length, 1], # value, precision
261
+ "max_text_length": [max_text_length, 1],
262
+ "step_time": [step_time, 4],
263
+ "loader_time": [loader_time, 2],
264
+ "current_lr": current_lr,
265
+ }
266
+ c_logger.print_train_step(batch_n_iter, num_iter, global_step,
267
+ log_dict, loss_dict, keep_avg.avg_values)
268
+
269
+ if args.rank == 0:
270
+ # Plot Training Iter Stats
271
+ # reduce TB load
272
+ if global_step % c.tb_plot_step == 0:
273
+ iter_stats = {
274
+ "lr": current_lr,
275
+ "grad_norm": grad_norm,
276
+ "grad_norm_st": grad_norm_st,
277
+ "step_time": step_time
278
+ }
279
+ iter_stats.update(loss_dict)
280
+ tb_logger.tb_train_iter_stats(global_step, iter_stats)
281
+
282
+ if global_step % c.save_step == 0:
283
+ if c.checkpoint:
284
+ # save model
285
+ save_checkpoint(model, optimizer, global_step, epoch, model.decoder.r, OUT_PATH,
286
+ optimizer_st=optimizer_st,
287
+ model_loss=loss_dict['postnet_loss'],
288
+ scaler=scaler.state_dict() if c.mixed_precision else None)
289
+
290
+ # Diagnostic visualizations
291
+ const_spec = postnet_output[0].data.cpu().numpy()
292
+ gt_spec = linear_input[0].data.cpu().numpy() if c.model in [
293
+ "Tacotron", "TacotronGST"
294
+ ] else mel_input[0].data.cpu().numpy()
295
+ align_img = alignments[0].data.cpu().numpy()
296
+
297
+ figures = {
298
+ "prediction": plot_spectrogram(const_spec, ap, output_fig=False),
299
+ "ground_truth": plot_spectrogram(gt_spec, ap, output_fig=False),
300
+ "alignment": plot_alignment(align_img, output_fig=False),
301
+ }
302
+
303
+ if c.bidirectional_decoder or c.double_decoder_consistency:
304
+ figures["alignment_backward"] = plot_alignment(alignments_backward[0].data.cpu().numpy(), output_fig=False)
305
+
306
+ tb_logger.tb_train_figures(global_step, figures)
307
+
308
+ # Sample audio
309
+ if c.model in ["Tacotron", "TacotronGST"]:
310
+ train_audio = ap.inv_spectrogram(const_spec.T)
311
+ else:
312
+ train_audio = ap.inv_melspectrogram(const_spec.T)
313
+ tb_logger.tb_train_audios(global_step,
314
+ {'TrainAudio': train_audio},
315
+ c.audio["sample_rate"])
316
+ end_time = time.time()
317
+
318
+ # print epoch stats
319
+ c_logger.print_train_epoch_end(global_step, epoch, epoch_time, keep_avg)
320
+
321
+ # Plot Epoch Stats
322
+ if args.rank == 0:
323
+ epoch_stats = {"epoch_time": epoch_time}
324
+ epoch_stats.update(keep_avg.avg_values)
325
+ tb_logger.tb_train_epoch_stats(global_step, epoch_stats)
326
+ if c.tb_model_param_stats:
327
+ tb_logger.tb_model_weights(model, global_step)
328
+ return keep_avg.avg_values, global_step
329
+
330
+
331
+ @torch.no_grad()
332
+ def evaluate(data_loader, model, criterion, ap, global_step, epoch):
333
+ model.eval()
334
+ epoch_time = 0
335
+ keep_avg = KeepAverage()
336
+ c_logger.print_eval_start()
337
+ if data_loader is not None:
338
+ for num_iter, data in enumerate(data_loader):
339
+ start_time = time.time()
340
+
341
+ # format data
342
+ text_input, text_lengths, mel_input, mel_lengths, linear_input, stop_targets, speaker_ids, speaker_embeddings, _, _ = format_data(data)
343
+ assert mel_input.shape[1] % model.decoder.r == 0
344
+
345
+ # forward pass model
346
+ if c.bidirectional_decoder or c.double_decoder_consistency:
347
+ decoder_output, postnet_output, alignments, stop_tokens, decoder_backward_output, alignments_backward = model(
348
+ text_input, text_lengths, mel_input, speaker_ids=speaker_ids, speaker_embeddings=speaker_embeddings)
349
+ else:
350
+ decoder_output, postnet_output, alignments, stop_tokens = model(
351
+ text_input, text_lengths, mel_input, speaker_ids=speaker_ids, speaker_embeddings=speaker_embeddings)
352
+ decoder_backward_output = None
353
+ alignments_backward = None
354
+
355
+ # set the alignment lengths wrt reduction factor for guided attention
356
+ if mel_lengths.max() % model.decoder.r != 0:
357
+ alignment_lengths = (mel_lengths + (model.decoder.r - (mel_lengths.max() % model.decoder.r))) // model.decoder.r
358
+ else:
359
+ alignment_lengths = mel_lengths // model.decoder.r
360
+
361
+ # compute loss
362
+ loss_dict = criterion(postnet_output, decoder_output, mel_input,
363
+ linear_input, stop_tokens, stop_targets,
364
+ mel_lengths, decoder_backward_output,
365
+ alignments, alignment_lengths, alignments_backward,
366
+ text_lengths)
367
+
368
+ # step time
369
+ step_time = time.time() - start_time
370
+ epoch_time += step_time
371
+
372
+ # compute alignment score
373
+ align_error = 1 - alignment_diagonal_score(alignments)
374
+ loss_dict['align_error'] = align_error
375
+
376
+ # aggregate losses from processes
377
+ if num_gpus > 1:
378
+ loss_dict['postnet_loss'] = reduce_tensor(loss_dict['postnet_loss'].data, num_gpus)
379
+ loss_dict['decoder_loss'] = reduce_tensor(loss_dict['decoder_loss'].data, num_gpus)
380
+ if c.stopnet:
381
+ loss_dict['stopnet_loss'] = reduce_tensor(loss_dict['stopnet_loss'].data, num_gpus)
382
+
383
+ # detach loss values
384
+ loss_dict_new = dict()
385
+ for key, value in loss_dict.items():
386
+ if isinstance(value, (int, float)):
387
+ loss_dict_new[key] = value
388
+ else:
389
+ loss_dict_new[key] = value.item()
390
+ loss_dict = loss_dict_new
391
+
392
+ # update avg stats
393
+ update_train_values = dict()
394
+ for key, value in loss_dict.items():
395
+ update_train_values['avg_' + key] = value
396
+ keep_avg.update_values(update_train_values)
397
+
398
+ if c.print_eval:
399
+ c_logger.print_eval_step(num_iter, loss_dict, keep_avg.avg_values)
400
+
401
+ if args.rank == 0:
402
+ # Diagnostic visualizations
403
+ idx = np.random.randint(mel_input.shape[0])
404
+ const_spec = postnet_output[idx].data.cpu().numpy()
405
+ gt_spec = linear_input[idx].data.cpu().numpy() if c.model in [
406
+ "Tacotron", "TacotronGST"
407
+ ] else mel_input[idx].data.cpu().numpy()
408
+ align_img = alignments[idx].data.cpu().numpy()
409
+
410
+ eval_figures = {
411
+ "prediction": plot_spectrogram(const_spec, ap, output_fig=False),
412
+ "ground_truth": plot_spectrogram(gt_spec, ap, output_fig=False),
413
+ "alignment": plot_alignment(align_img, output_fig=False)
414
+ }
415
+
416
+ # Sample audio
417
+ if c.model in ["Tacotron", "TacotronGST"]:
418
+ eval_audio = ap.inv_spectrogram(const_spec.T)
419
+ else:
420
+ eval_audio = ap.inv_melspectrogram(const_spec.T)
421
+ tb_logger.tb_eval_audios(global_step, {"ValAudio": eval_audio},
422
+ c.audio["sample_rate"])
423
+
424
+ # Plot Validation Stats
425
+
426
+ if c.bidirectional_decoder or c.double_decoder_consistency:
427
+ align_b_img = alignments_backward[idx].data.cpu().numpy()
428
+ eval_figures['alignment2'] = plot_alignment(align_b_img, output_fig=False)
429
+ tb_logger.tb_eval_stats(global_step, keep_avg.avg_values)
430
+ tb_logger.tb_eval_figures(global_step, eval_figures)
431
+
432
+ if args.rank == 0 and epoch > c.test_delay_epochs:
433
+ if c.test_sentences_file is None:
434
+ test_sentences = [
435
+ "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
436
+ "Be a voice, not an echo.",
437
+ "I'm sorry Dave. I'm afraid I can't do that.",
438
+ "This cake is great. It's so delicious and moist.",
439
+ "Prior to November 22, 1963."
440
+ ]
441
+ else:
442
+ with open(c.test_sentences_file, "r") as f:
443
+ test_sentences = [s.strip() for s in f.readlines()]
444
+
445
+ # test sentences
446
+ test_audios = {}
447
+ test_figures = {}
448
+ print(" | > Synthesizing test sentences")
449
+ speaker_id = 0 if c.use_speaker_embedding else None
450
+ speaker_embedding = speaker_mapping[list(speaker_mapping.keys())[randrange(len(speaker_mapping))]]['embedding'] if c.use_external_speaker_embedding_file and c.use_speaker_embedding else None
451
+ style_wav = c.get("gst_style_input")
452
+ if style_wav is None and c.use_gst:
453
+ # initialize GST with a zero dict.
454
+ style_wav = {}
455
+ print("WARNING: No GST style wav provided, so a zero style dict is used instead!")
456
+ for i in range(c.gst['gst_style_tokens']):
457
+ style_wav[str(i)] = 0
459
+ for idx, test_sentence in enumerate(test_sentences):
460
+ try:
461
+ wav, alignment, decoder_output, postnet_output, stop_tokens, _ = synthesis(
462
+ model,
463
+ test_sentence,
464
+ c,
465
+ use_cuda,
466
+ ap,
467
+ speaker_id=speaker_id,
468
+ speaker_embedding=speaker_embedding,
469
+ style_wav=style_wav,
470
+ truncated=False,
471
+ enable_eos_bos_chars=c.enable_eos_bos_chars, #pylint: disable=unused-argument
472
+ use_griffin_lim=True,
473
+ do_trim_silence=False)
474
+
475
+ file_path = os.path.join(AUDIO_PATH, str(global_step))
476
+ os.makedirs(file_path, exist_ok=True)
477
+ file_path = os.path.join(file_path,
478
+ "TestSentence_{}.wav".format(idx))
479
+ ap.save_wav(wav, file_path)
480
+ test_audios['{}-audio'.format(idx)] = wav
481
+ test_figures['{}-prediction'.format(idx)] = plot_spectrogram(
482
+ postnet_output, ap, output_fig=False)
483
+ test_figures['{}-alignment'.format(idx)] = plot_alignment(
484
+ alignment, output_fig=False)
485
+ except: #pylint: disable=bare-except
486
+ print(" !! Error creating Test Sentence -", idx)
487
+ traceback.print_exc()
488
+ tb_logger.tb_test_audios(global_step, test_audios,
489
+ c.audio['sample_rate'])
490
+ tb_logger.tb_test_figures(global_step, test_figures)
491
+ return keep_avg.avg_values
492
+
493
+
494
+ # FIXME: move args definition/parsing inside of main?
495
+ def main(args): # pylint: disable=redefined-outer-name
496
+ # pylint: disable=global-variable-undefined
497
+ global meta_data_train, meta_data_eval, symbols, phonemes, speaker_mapping
498
+ # Audio processor
499
+ ap = AudioProcessor(**c.audio)
500
+ if 'characters' in c.keys():
501
+ symbols, phonemes = make_symbols(**c.characters)
502
+
503
+ # DISTRIBUTED
504
+ if num_gpus > 1:
505
+ init_distributed(args.rank, num_gpus, args.group_id,
506
+ c.distributed["backend"], c.distributed["url"])
507
+ num_chars = len(phonemes) if c.use_phonemes else len(symbols)
508
+
509
+ # load data instances
510
+ meta_data_train, meta_data_eval = load_meta_data(c.datasets)
511
+
512
+ # set the portion of the data used for training
513
+ if 'train_portion' in c.keys():
514
+ meta_data_train = meta_data_train[:int(len(meta_data_train) * c.train_portion)]
515
+ if 'eval_portion' in c.keys():
516
+ meta_data_eval = meta_data_eval[:int(len(meta_data_eval) * c.eval_portion)]
517
+
518
+ # parse speakers
519
+ num_speakers, speaker_embedding_dim, speaker_mapping = parse_speakers(c, args, meta_data_train, OUT_PATH)
520
+
521
+ model = setup_model(num_chars, num_speakers, c, speaker_embedding_dim)
522
+
523
+ # scalers for mixed precision training
524
+ scaler = torch.cuda.amp.GradScaler() if c.mixed_precision else None
525
+ scaler_st = torch.cuda.amp.GradScaler() if c.mixed_precision and c.separate_stopnet else None
526
+
527
+ params = set_weight_decay(model, c.wd)
528
+ optimizer = RAdam(params, lr=c.lr, weight_decay=0)
529
+ if c.stopnet and c.separate_stopnet:
530
+ optimizer_st = RAdam(model.decoder.stopnet.parameters(),
531
+ lr=c.lr,
532
+ weight_decay=0)
533
+ else:
534
+ optimizer_st = None
535
+
536
+ # setup criterion
537
+ criterion = TacotronLoss(c, stopnet_pos_weight=10.0, ga_sigma=0.4)
538
+
539
+ if args.restore_path:
540
+ checkpoint = torch.load(args.restore_path, map_location='cpu')
541
+ try:
542
+ print(" > Restoring Model.")
543
+ model.load_state_dict(checkpoint['model'])
544
+ # optimizer restore
545
+ print(" > Restoring Optimizer.")
546
+ optimizer.load_state_dict(checkpoint['optimizer'])
547
+ if "scaler" in checkpoint and c.mixed_precision:
548
+ print(" > Restoring AMP Scaler...")
549
+ scaler.load_state_dict(checkpoint["scaler"])
550
+ if c.reinit_layers:
551
+ raise RuntimeError
552
+ except (KeyError, RuntimeError):
553
+ print(" > Partial model initialization.")
554
+ model_dict = model.state_dict()
555
+ model_dict = set_init_dict(model_dict, checkpoint['model'], c)
556
+ # torch.save(model_dict, os.path.join(OUT_PATH, 'state_dict.pt'))
557
+ # print("State Dict saved for debug in: ", os.path.join(OUT_PATH, 'state_dict.pt'))
558
+ model.load_state_dict(model_dict)
559
+ del model_dict
560
+
561
+ for group in optimizer.param_groups:
562
+ group['lr'] = c.lr
563
+ print(" > Model restored from step %d" % checkpoint['step'],
564
+ flush=True)
565
+ args.restore_step = checkpoint['step']
566
+ else:
567
+ args.restore_step = 0
568
+
569
+ if use_cuda:
570
+ model.cuda()
571
+ criterion.cuda()
572
+
573
+ # DISTRIBUTED
574
+ if num_gpus > 1:
575
+ model = apply_gradient_allreduce(model)
576
+
577
+ if c.noam_schedule:
578
+ scheduler = NoamLR(optimizer,
579
+ warmup_steps=c.warmup_steps,
580
+ last_epoch=args.restore_step - 1)
581
+ else:
582
+ scheduler = None
583
+
584
+ num_params = count_parameters(model)
585
+ print("\n > Model has {} parameters".format(num_params), flush=True)
586
+
587
+ if 'best_loss' not in locals():
588
+ best_loss = float('inf')
589
+
590
+ # define data loaders
591
+ train_loader = setup_loader(ap,
592
+ model.decoder.r,
593
+ is_val=False,
594
+ verbose=True)
595
+ eval_loader = setup_loader(ap, model.decoder.r, is_val=True)
596
+
597
+ global_step = args.restore_step
598
+ for epoch in range(0, c.epochs):
599
+ c_logger.print_epoch_start(epoch, c.epochs)
600
+ # set gradual training
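+ # gradual training lowers the reduction factor r (and adapts the batch size) as global_step grows, following the (step, r, batch_size) entries in c.gradual_training.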
601
+ if c.gradual_training is not None:
602
+ r, c.batch_size = gradual_training_scheduler(global_step, c)
603
+ c.r = r
604
+ model.decoder.set_r(r)
605
+ if c.bidirectional_decoder:
606
+ model.decoder_backward.set_r(r)
607
+ train_loader.dataset.outputs_per_step = r
608
+ eval_loader.dataset.outputs_per_step = r
609
+ train_loader = setup_loader(ap,
610
+ model.decoder.r,
611
+ is_val=False,
612
+ dataset=train_loader.dataset)
613
+ eval_loader = setup_loader(ap,
614
+ model.decoder.r,
615
+ is_val=True,
616
+ dataset=eval_loader.dataset)
617
+ print("\n > Number of output frames:", model.decoder.r)
618
+ # train one epoch
619
+ train_avg_loss_dict, global_step = train(train_loader, model,
620
+ criterion, optimizer,
621
+ optimizer_st, scheduler, ap,
622
+ global_step, epoch, scaler,
623
+ scaler_st)
624
+ # eval one epoch
625
+ eval_avg_loss_dict = evaluate(eval_loader, model, criterion, ap,
626
+ global_step, epoch)
627
+ c_logger.print_epoch_end(epoch, eval_avg_loss_dict)
628
+ target_loss = train_avg_loss_dict['avg_postnet_loss']
629
+ if c.run_eval:
630
+ target_loss = eval_avg_loss_dict['avg_postnet_loss']
631
+ best_loss = save_best_model(
632
+ target_loss,
633
+ best_loss,
634
+ model,
635
+ optimizer,
636
+ global_step,
637
+ epoch,
638
+ c.r,
639
+ OUT_PATH,
640
+ scaler=scaler.state_dict() if c.mixed_precision else None)
641
+
642
+
643
+ if __name__ == '__main__':
644
+ parser = argparse.ArgumentParser()
645
+ parser.add_argument(
646
+ '--continue_path',
647
+ type=str,
648
+ help='Training output folder to continue a previous training run. If set, "config_path" is ignored.',
649
+ default='',
650
+ required='--config_path' not in sys.argv)
651
+ parser.add_argument(
652
+ '--restore_path',
653
+ type=str,
654
+ help='Model file to be restored. Use to finetune a model.',
655
+ default='')
656
+ parser.add_argument(
657
+ '--config_path',
658
+ type=str,
659
+ help='Path to config file for training.',
660
+ required='--continue_path' not in sys.argv
661
+ )
662
+ parser.add_argument('--debug',
663
+ type=bool,
664
+ default=False,
665
+ help='Run in debug mode; skip the commit integrity check before training.')
666
+
667
+ # DISTRIBUTED
668
+ parser.add_argument(
669
+ '--rank',
670
+ type=int,
671
+ default=0,
672
+ help='DISTRIBUTED: process rank for distributed training.')
673
+ parser.add_argument('--group_id',
674
+ type=str,
675
+ default="",
676
+ help='DISTRIBUTED: process group id.')
677
+ args = parser.parse_args()
678
+
679
+ if args.continue_path != '':
680
+ print(f" > Training continues for {args.continue_path}")
681
+ args.output_path = args.continue_path
682
+ args.config_path = os.path.join(args.continue_path, 'config.json')
683
+ list_of_files = glob.glob(args.continue_path + "/*.pth.tar") # pick up all checkpoint files in the output folder
684
+ latest_model_file = max(list_of_files, key=os.path.getctime)
685
+ args.restore_path = latest_model_file
686
+
687
+ # setup output paths and read configs
688
+ c = load_config(args.config_path)
689
+ check_config_tts(c)
690
+ _ = os.path.dirname(os.path.realpath(__file__))
691
+
692
+ if c.mixed_precision:
693
+ print(" > Mixed precision mode is ON")
694
+
695
+ OUT_PATH = args.continue_path
696
+ if args.continue_path == '':
697
+ OUT_PATH = create_experiment_folder(c.output_path, c.run_name, args.debug)
698
+
699
+ AUDIO_PATH = os.path.join(OUT_PATH, 'test_audios')
700
+
701
+ c_logger = ConsoleLogger()
702
+
703
+ if args.rank == 0:
704
+ os.makedirs(AUDIO_PATH, exist_ok=True)
705
+ new_fields = {}
706
+ if args.restore_path:
707
+ new_fields["restore_path"] = args.restore_path
708
+ new_fields["github_branch"] = get_git_branch()
709
+ copy_model_files(c, args.config_path,
710
+ OUT_PATH, new_fields)
711
+ os.chmod(AUDIO_PATH, 0o775)
712
+ os.chmod(OUT_PATH, 0o775)
713
+
714
+ LOG_DIR = OUT_PATH
715
+ tb_logger = TensorboardLogger(LOG_DIR, model_name='TTS')
716
+
717
+ # write model desc to tensorboard
718
+ tb_logger.tb_add_text('model-description', c['run_description'], 0)
719
+
720
+ try:
721
+ main(args)
722
+ except KeyboardInterrupt:
723
+ remove_experiment_folder(OUT_PATH)
724
+ try:
725
+ sys.exit(0)
726
+ except SystemExit:
727
+ os._exit(0) # pylint: disable=protected-access
728
+ except Exception: # pylint: disable=broad-except
729
+ remove_experiment_folder(OUT_PATH)
730
+ traceback.print_exc()
731
+ sys.exit(1)
TTS/bin/train_vocoder_gan.py ADDED
@@ -0,0 +1,664 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import glob
3
+ import os
4
+ import sys
5
+ import time
6
+ import traceback
7
+ from inspect import signature
8
+
9
+ import torch
10
+ from torch.utils.data import DataLoader
11
+ from TTS.utils.audio import AudioProcessor
12
+ from TTS.utils.console_logger import ConsoleLogger
13
+ from TTS.utils.generic_utils import (KeepAverage, count_parameters,
14
+ create_experiment_folder, get_git_branch,
15
+ remove_experiment_folder, set_init_dict)
16
+ from TTS.utils.io import copy_model_files, load_config
17
+ from TTS.utils.radam import RAdam
18
+ from TTS.utils.tensorboard_logger import TensorboardLogger
19
+ from TTS.utils.training import setup_torch_training_env
20
+ from TTS.vocoder.datasets.gan_dataset import GANDataset
21
+ from TTS.vocoder.datasets.preprocess import load_wav_data, load_wav_feat_data
22
+ from TTS.vocoder.layers.losses import DiscriminatorLoss, GeneratorLoss
23
+ from TTS.vocoder.utils.generic_utils import (plot_results, setup_discriminator,
24
+ setup_generator)
25
+ from TTS.vocoder.utils.io import save_best_model, save_checkpoint
26
+
27
+ # DISTRIBUTED
28
+ from torch.nn.parallel import DistributedDataParallel as DDP_th
29
+ from torch.utils.data.distributed import DistributedSampler
30
+ from TTS.utils.distribute import init_distributed
31
+
32
+ use_cuda, num_gpus = setup_torch_training_env(True, True)
33
+
34
+
35
+ def setup_loader(ap, is_val=False, verbose=False):
36
+ if is_val and not c.run_eval:
37
+ loader = None
38
+ else:
39
+ dataset = GANDataset(ap=ap,
40
+ items=eval_data if is_val else train_data,
41
+ seq_len=c.seq_len,
42
+ hop_len=ap.hop_length,
43
+ pad_short=c.pad_short,
44
+ conv_pad=c.conv_pad,
45
+ is_training=not is_val,
46
+ return_segments=not is_val,
47
+ use_noise_augment=c.use_noise_augment,
48
+ use_cache=c.use_cache,
49
+ verbose=verbose)
50
+ dataset.shuffle_mapping()
51
+ sampler = DistributedSampler(dataset, shuffle=True) if num_gpus > 1 else None
52
+ loader = DataLoader(dataset,
53
+ batch_size=1 if is_val else c.batch_size,
54
+ shuffle=False if num_gpus > 1 else True,
55
+ drop_last=False,
56
+ sampler=sampler,
57
+ num_workers=c.num_val_loader_workers
58
+ if is_val else c.num_loader_workers,
59
+ pin_memory=False)
60
+ return loader
61
+
62
+
63
+ def format_data(data):
64
+ if isinstance(data[0], list):
65
+ # setup input data
66
+ c_G, x_G = data[0]
67
+ c_D, x_D = data[1]
68
+
69
+ # dispatch data to GPU
70
+ if use_cuda:
71
+ c_G = c_G.cuda(non_blocking=True)
72
+ x_G = x_G.cuda(non_blocking=True)
73
+ c_D = c_D.cuda(non_blocking=True)
74
+ x_D = x_D.cuda(non_blocking=True)
75
+
76
+ return c_G, x_G, c_D, x_D
77
+
78
+ # return a whole audio segment
79
+ co, x = data
80
+ if use_cuda:
81
+ co = co.cuda(non_blocking=True)
82
+ x = x.cuda(non_blocking=True)
83
+ return co, x, None, None
84
+
85
+
86
+ def train(model_G, criterion_G, optimizer_G, model_D, criterion_D, optimizer_D,
87
+ scheduler_G, scheduler_D, ap, global_step, epoch):
88
+ data_loader = setup_loader(ap, is_val=False, verbose=(epoch == 0))
89
+ model_G.train()
90
+ model_D.train()
91
+ epoch_time = 0
92
+ keep_avg = KeepAverage()
93
+ if use_cuda:
94
+ batch_n_iter = int(
95
+ len(data_loader.dataset) / (c.batch_size * num_gpus))
96
+ else:
97
+ batch_n_iter = int(len(data_loader.dataset) / c.batch_size)
98
+ end_time = time.time()
99
+ c_logger.print_train_start()
100
+ for num_iter, data in enumerate(data_loader):
101
+ start_time = time.time()
102
+
103
+ # format data
104
+ c_G, y_G, c_D, y_D = format_data(data)
105
+ loader_time = time.time() - end_time
106
+
107
+ global_step += 1
108
+
109
+ ##############################
110
+ # GENERATOR
111
+ ##############################
112
+
113
+ # generator pass
114
+ y_hat = model_G(c_G)
115
+ y_hat_sub = None
116
+ y_G_sub = None
117
+ y_hat_vis = y_hat # for visualization
118
+
119
+ # PQMF formatting
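+ # a multi-band generator (e.g. multi-band MelGAN) emits one channel per subband; pqmf_synthesis folds the subbands back into a single full-band waveform.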
120
+ if y_hat.shape[1] > 1:
121
+ y_hat_sub = y_hat
122
+ y_hat = model_G.pqmf_synthesis(y_hat)
123
+ y_hat_vis = y_hat
124
+ y_G_sub = model_G.pqmf_analysis(y_G)
125
+
126
+ scores_fake, feats_fake, feats_real = None, None, None
127
+ if global_step > c.steps_to_start_discriminator:
128
+
129
+ # run D with or without cond. features
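+ # a 2-parameter forward signature marks a conditional discriminator (waveform plus mel features); otherwise it only sees the waveform.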
130
+ if len(signature(model_D.forward).parameters) == 2:
131
+ D_out_fake = model_D(y_hat, c_G)
132
+ else:
133
+ D_out_fake = model_D(y_hat)
134
+ D_out_real = None
135
+
136
+ if c.use_feat_match_loss:
137
+ with torch.no_grad():
138
+ D_out_real = model_D(y_G)
139
+
140
+ # format D outputs
141
+ if isinstance(D_out_fake, tuple):
142
+ scores_fake, feats_fake = D_out_fake
143
+ if D_out_real is None:
144
+ feats_real = None
145
+ else:
146
+ _, feats_real = D_out_real
147
+ else:
148
+ scores_fake = D_out_fake
149
+
150
+ # compute losses
151
+ loss_G_dict = criterion_G(y_hat, y_G, scores_fake, feats_fake,
152
+ feats_real, y_hat_sub, y_G_sub)
153
+ loss_G = loss_G_dict['G_loss']
154
+
155
+ # optimizer generator
156
+ optimizer_G.zero_grad()
157
+ loss_G.backward()
158
+ if c.gen_clip_grad > 0:
159
+ torch.nn.utils.clip_grad_norm_(model_G.parameters(),
160
+ c.gen_clip_grad)
161
+ optimizer_G.step()
162
+ if scheduler_G is not None:
163
+ scheduler_G.step()
164
+
165
+ loss_dict = dict()
166
+ for key, value in loss_G_dict.items():
167
+ if isinstance(value, (int, float)):
168
+ loss_dict[key] = value
169
+ else:
170
+ loss_dict[key] = value.item()
171
+
172
+ ##############################
173
+ # DISCRIMINATOR
174
+ ##############################
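+ # the discriminator only starts training after c.steps_to_start_discriminator generator warm-up steps.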
175
+ if global_step >= c.steps_to_start_discriminator:
176
+ # discriminator pass
177
+ with torch.no_grad():
178
+ y_hat = model_G(c_D)
179
+
180
+ # PQMF formatting
181
+ if y_hat.shape[1] > 1:
182
+ y_hat = model_G.pqmf_synthesis(y_hat)
183
+
184
+ # run D with or without cond. features
185
+ if len(signature(model_D.forward).parameters) == 2:
186
+ D_out_fake = model_D(y_hat.detach(), c_D)
187
+ D_out_real = model_D(y_D, c_D)
188
+ else:
189
+ D_out_fake = model_D(y_hat.detach())
190
+ D_out_real = model_D(y_D)
191
+
192
+ # format D outputs
193
+ if isinstance(D_out_fake, tuple):
194
+ scores_fake, feats_fake = D_out_fake
195
+ if D_out_real is None:
196
+ scores_real, feats_real = None, None
197
+ else:
198
+ scores_real, feats_real = D_out_real
199
+ else:
200
+ scores_fake = D_out_fake
201
+ scores_real = D_out_real
202
+
203
+ # compute losses
204
+ loss_D_dict = criterion_D(scores_fake, scores_real)
205
+ loss_D = loss_D_dict['D_loss']
206
+
207
+ # optimizer discriminator
208
+ optimizer_D.zero_grad()
209
+ loss_D.backward()
210
+ if c.disc_clip_grad > 0:
211
+ torch.nn.utils.clip_grad_norm_(model_D.parameters(),
212
+ c.disc_clip_grad)
213
+ optimizer_D.step()
214
+ if scheduler_D is not None:
215
+ scheduler_D.step()
216
+
217
+ for key, value in loss_D_dict.items():
218
+ if isinstance(value, (int, float)):
219
+ loss_dict[key] = value
220
+ else:
221
+ loss_dict[key] = value.item()
222
+
223
+ step_time = time.time() - start_time
224
+ epoch_time += step_time
225
+
226
+ # get current learning rates
227
+ current_lr_G = list(optimizer_G.param_groups)[0]['lr']
228
+ current_lr_D = list(optimizer_D.param_groups)[0]['lr']
229
+
230
+ # update avg stats
231
+ update_train_values = dict()
232
+ for key, value in loss_dict.items():
233
+ update_train_values['avg_' + key] = value
234
+ update_train_values['avg_loader_time'] = loader_time
235
+ update_train_values['avg_step_time'] = step_time
236
+ keep_avg.update_values(update_train_values)
237
+
238
+ # print training stats
239
+ if global_step % c.print_step == 0:
240
+ log_dict = {
241
+ 'step_time': [step_time, 2],
242
+ 'loader_time': [loader_time, 4],
243
+ "current_lr_G": current_lr_G,
244
+ "current_lr_D": current_lr_D
245
+ }
246
+ c_logger.print_train_step(batch_n_iter, num_iter, global_step,
247
+ log_dict, loss_dict, keep_avg.avg_values)
248
+
249
+ if args.rank == 0:
250
+ # plot step stats
251
+ if global_step % 10 == 0:
252
+ iter_stats = {
253
+ "lr_G": current_lr_G,
254
+ "lr_D": current_lr_D,
255
+ "step_time": step_time
256
+ }
257
+ iter_stats.update(loss_dict)
258
+ tb_logger.tb_train_iter_stats(global_step, iter_stats)
259
+
260
+ # save checkpoint
261
+ if global_step % c.save_step == 0:
262
+ if c.checkpoint:
263
+ # save model
264
+ save_checkpoint(model_G,
265
+ optimizer_G,
266
+ scheduler_G,
267
+ model_D,
268
+ optimizer_D,
269
+ scheduler_D,
270
+ global_step,
271
+ epoch,
272
+ OUT_PATH,
273
+ model_losses=loss_dict)
274
+
275
+ # compute spectrograms
276
+ figures = plot_results(y_hat_vis, y_G, ap, global_step,
277
+ 'train')
278
+ tb_logger.tb_train_figures(global_step, figures)
279
+
280
+ # Sample audio
281
+ sample_voice = y_hat_vis[0].squeeze(0).detach().cpu().numpy()
282
+ tb_logger.tb_train_audios(global_step,
283
+ {'train/audio': sample_voice},
284
+ c.audio["sample_rate"])
285
+ end_time = time.time()
286
+
287
+ # print epoch stats
288
+ c_logger.print_train_epoch_end(global_step, epoch, epoch_time, keep_avg)
289
+
290
+ # Plot Training Epoch Stats
291
+ epoch_stats = {"epoch_time": epoch_time}
292
+ epoch_stats.update(keep_avg.avg_values)
293
+ if args.rank == 0:
294
+ tb_logger.tb_train_epoch_stats(global_step, epoch_stats)
295
+ # TODO: plot model stats
296
+ # if c.tb_model_param_stats:
297
+ # tb_logger.tb_model_weights(model, global_step)
298
+ return keep_avg.avg_values, global_step
299
+
300
+
301
+ @torch.no_grad()
302
+ def evaluate(model_G, criterion_G, model_D, criterion_D, ap, global_step, epoch):
303
+ data_loader = setup_loader(ap, is_val=True, verbose=(epoch == 0))
304
+ model_G.eval()
305
+ model_D.eval()
306
+ epoch_time = 0
307
+ keep_avg = KeepAverage()
308
+ end_time = time.time()
309
+ c_logger.print_eval_start()
310
+ for num_iter, data in enumerate(data_loader):
311
+ start_time = time.time()
312
+
313
+ # format data
314
+ c_G, y_G, _, _ = format_data(data)
315
+ loader_time = time.time() - end_time
316
+
317
+ global_step += 1
318
+
319
+ ##############################
320
+ # GENERATOR
321
+ ##############################
322
+
323
+ # generator pass
324
+ y_hat = model_G(c_G)
325
+ y_hat_sub = None
326
+ y_G_sub = None
327
+
328
+ # PQMF formatting
329
+ if y_hat.shape[1] > 1:
330
+ y_hat_sub = y_hat
331
+ y_hat = model_G.pqmf_synthesis(y_hat)
332
+ y_G_sub = model_G.pqmf_analysis(y_G)
333
+
334
+ scores_fake, feats_fake, feats_real = None, None, None
335
+ if global_step > c.steps_to_start_discriminator:
336
+
337
+ if len(signature(model_D.forward).parameters) == 2:
338
+ D_out_fake = model_D(y_hat, c_G)
339
+ else:
340
+ D_out_fake = model_D(y_hat)
341
+ D_out_real = None
342
+
343
+ if c.use_feat_match_loss:
344
+ with torch.no_grad():
345
+ D_out_real = model_D(y_G)
346
+
347
+ # format D outputs
348
+ if isinstance(D_out_fake, tuple):
349
+ scores_fake, feats_fake = D_out_fake
350
+ if D_out_real is None:
351
+ feats_real = None
352
+ else:
353
+ _, feats_real = D_out_real
354
+ else:
355
+ scores_fake = D_out_fake
356
+ feats_fake, feats_real = None, None
357
+
358
+ # compute losses
359
+ loss_G_dict = criterion_G(y_hat, y_G, scores_fake, feats_fake,
360
+ feats_real, y_hat_sub, y_G_sub)
361
+
362
+ loss_dict = dict()
363
+ for key, value in loss_G_dict.items():
364
+ if isinstance(value, (int, float)):
365
+ loss_dict[key] = value
366
+ else:
367
+ loss_dict[key] = value.item()
368
+
369
+ ##############################
370
+ # DISCRIMINATOR
371
+ ##############################
372
+
373
+ if global_step >= c.steps_to_start_discriminator:
374
+ # discriminator pass
375
+ with torch.no_grad():
376
+ y_hat = model_G(c_G)
377
+
378
+ # PQMF formatting
379
+ if y_hat.shape[1] > 1:
380
+ y_hat = model_G.pqmf_synthesis(y_hat)
381
+
382
+ # run D with or without cond. features
383
+ if len(signature(model_D.forward).parameters) == 2:
384
+ D_out_fake = model_D(y_hat.detach(), c_G)
385
+ D_out_real = model_D(y_G, c_G)
386
+ else:
387
+ D_out_fake = model_D(y_hat.detach())
388
+ D_out_real = model_D(y_G)
389
+
390
+ # format D outputs
391
+ if isinstance(D_out_fake, tuple):
392
+ scores_fake, feats_fake = D_out_fake
393
+ if D_out_real is None:
394
+ scores_real, feats_real = None, None
395
+ else:
396
+ scores_real, feats_real = D_out_real
397
+ else:
398
+ scores_fake = D_out_fake
399
+ scores_real = D_out_real
400
+
401
+ # compute losses
402
+ loss_D_dict = criterion_D(scores_fake, scores_real)
403
+
404
+ for key, value in loss_D_dict.items():
405
+ if isinstance(value, (int, float)):
406
+ loss_dict[key] = value
407
+ else:
408
+ loss_dict[key] = value.item()
409
+
410
+ step_time = time.time() - start_time
411
+ epoch_time += step_time
412
+
413
+ # update avg stats
414
+ update_eval_values = dict()
415
+ for key, value in loss_dict.items():
416
+ update_eval_values['avg_' + key] = value
417
+ update_eval_values['avg_loader_time'] = loader_time
418
+ update_eval_values['avg_step_time'] = step_time
419
+ keep_avg.update_values(update_eval_values)
420
+
421
+ # print eval stats
422
+ if c.print_eval:
423
+ c_logger.print_eval_step(num_iter, loss_dict, keep_avg.avg_values)
424
+
425
+ if args.rank == 0:
426
+ # compute spectrograms
427
+ figures = plot_results(y_hat, y_G, ap, global_step, 'eval')
428
+ tb_logger.tb_eval_figures(global_step, figures)
429
+
430
+ # Sample audio
431
+ sample_voice = y_hat[0].squeeze(0).detach().cpu().numpy()
432
+ tb_logger.tb_eval_audios(global_step, {'eval/audio': sample_voice},
433
+ c.audio["sample_rate"])
434
+
435
+ tb_logger.tb_eval_stats(global_step, keep_avg.avg_values)
436
+
437
+ # synthesize a full voice
438
+ data_loader.return_segments = False
439
+
440
+ return keep_avg.avg_values
441
+
442
+
443
+ # FIXME: move args definition/parsing inside of main?
444
+ def main(args): # pylint: disable=redefined-outer-name
445
+ # pylint: disable=global-variable-undefined
446
+ global train_data, eval_data
447
+ print(f" > Loading wavs from: {c.data_path}")
448
+ if c.feature_path is not None:
449
+ print(f" > Loading features from: {c.feature_path}")
450
+ eval_data, train_data = load_wav_feat_data(
451
+ c.data_path, c.feature_path, c.eval_split_size)
452
+ else:
453
+ eval_data, train_data = load_wav_data(c.data_path, c.eval_split_size)
454
+
455
+ # setup audio processor
456
+ ap = AudioProcessor(**c.audio)
457
+
458
+ # DISTRIBUTED
459
+ if num_gpus > 1:
460
+ init_distributed(args.rank, num_gpus, args.group_id,
461
+ c.distributed["backend"], c.distributed["url"])
462
+
463
+ # setup models
464
+ model_gen = setup_generator(c)
465
+ model_disc = setup_discriminator(c)
466
+
467
+ # setup optimizers
468
+ optimizer_gen = RAdam(model_gen.parameters(), lr=c.lr_gen, weight_decay=0)
469
+ optimizer_disc = RAdam(model_disc.parameters(),
470
+ lr=c.lr_disc,
471
+ weight_decay=0)
472
+
473
+ # schedulers
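+ # scheduler classes are resolved by name from torch.optim.lr_scheduler and instantiated with the matching *_params dict from the config.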
474
+ scheduler_gen = None
475
+ scheduler_disc = None
476
+ if 'lr_scheduler_gen' in c:
477
+ scheduler_gen = getattr(torch.optim.lr_scheduler, c.lr_scheduler_gen)
478
+ scheduler_gen = scheduler_gen(
479
+ optimizer_gen, **c.lr_scheduler_gen_params)
480
+ if 'lr_scheduler_disc' in c:
481
+ scheduler_disc = getattr(torch.optim.lr_scheduler, c.lr_scheduler_disc)
482
+ scheduler_disc = scheduler_disc(
483
+ optimizer_disc, **c.lr_scheduler_disc_params)
484
+
485
+ # setup criterion
486
+ criterion_gen = GeneratorLoss(c)
487
+ criterion_disc = DiscriminatorLoss(c)
488
+
489
+ if args.restore_path:
490
+ checkpoint = torch.load(args.restore_path, map_location='cpu')
491
+ try:
492
+ print(" > Restoring Generator Model...")
493
+ model_gen.load_state_dict(checkpoint['model'])
494
+ print(" > Restoring Generator Optimizer...")
495
+ optimizer_gen.load_state_dict(checkpoint['optimizer'])
496
+ print(" > Restoring Discriminator Model...")
497
+ model_disc.load_state_dict(checkpoint['model_disc'])
498
+ print(" > Restoring Discriminator Optimizer...")
499
+ optimizer_disc.load_state_dict(checkpoint['optimizer_disc'])
500
+ if 'scheduler' in checkpoint:
501
+ print(" > Restoring Generator LR Scheduler...")
502
+ scheduler_gen.load_state_dict(checkpoint['scheduler'])
503
+ # NOTE: Not sure if necessary
504
+ scheduler_gen.optimizer = optimizer_gen
505
+ if 'scheduler_disc' in checkpoint:
506
+ print(" > Restoring Discriminator LR Scheduler...")
507
+ scheduler_disc.load_state_dict(checkpoint['scheduler_disc'])
508
+ scheduler_disc.optimizer = optimizer_disc
509
+ except RuntimeError:
510
+ # restore only matching layers.
511
+ print(" > Partial model initialization...")
512
+ model_dict = model_gen.state_dict()
513
+ model_dict = set_init_dict(model_dict, checkpoint['model'], c)
514
+ model_gen.load_state_dict(model_dict)
515
+
516
+ model_dict = model_disc.state_dict()
517
+ model_dict = set_init_dict(model_dict, checkpoint['model_disc'], c)
518
+ model_disc.load_state_dict(model_dict)
519
+ del model_dict
520
+
521
+ # reset lr if not continuing a previous training run.
522
+ for group in optimizer_gen.param_groups:
523
+ group['lr'] = c.lr_gen
524
+
525
+ for group in optimizer_disc.param_groups:
526
+ group['lr'] = c.lr_disc
527
+
528
+ print(" > Model restored from step %d" % checkpoint['step'],
529
+ flush=True)
530
+ args.restore_step = checkpoint['step']
531
+ else:
532
+ args.restore_step = 0
533
+
534
+ if use_cuda:
535
+ model_gen.cuda()
536
+ criterion_gen.cuda()
537
+ model_disc.cuda()
538
+ criterion_disc.cuda()
539
+
540
+ # DISTRIBUTED
541
+ if num_gpus > 1:
542
+ model_gen = DDP_th(model_gen, device_ids=[args.rank])
543
+ model_disc = DDP_th(model_disc, device_ids=[args.rank])
544
+
545
+ num_params = count_parameters(model_gen)
546
+ print(" > Generator has {} parameters".format(num_params), flush=True)
547
+ num_params = count_parameters(model_disc)
548
+ print(" > Discriminator has {} parameters".format(num_params), flush=True)
549
+
550
+ if 'best_loss' not in locals():
551
+ best_loss = float('inf')
552
+
553
+ global_step = args.restore_step
554
+ for epoch in range(0, c.epochs):
555
+ c_logger.print_epoch_start(epoch, c.epochs)
556
+ _, global_step = train(model_gen, criterion_gen, optimizer_gen,
557
+ model_disc, criterion_disc, optimizer_disc,
558
+ scheduler_gen, scheduler_disc, ap, global_step,
559
+ epoch)
560
+ eval_avg_loss_dict = evaluate(model_gen, criterion_gen, model_disc, criterion_disc, ap,
561
+ global_step, epoch)
562
+ c_logger.print_epoch_end(epoch, eval_avg_loss_dict)
563
+ target_loss = eval_avg_loss_dict[c.target_loss]
564
+ best_loss = save_best_model(target_loss,
565
+ best_loss,
566
+ model_gen,
567
+ optimizer_gen,
568
+ scheduler_gen,
569
+ model_disc,
570
+ optimizer_disc,
571
+ scheduler_disc,
572
+ global_step,
573
+ epoch,
574
+ OUT_PATH,
575
+ model_losses=eval_avg_loss_dict)
576
+
577
+
578
+ if __name__ == '__main__':
579
+ parser = argparse.ArgumentParser()
580
+ parser.add_argument(
581
+ '--continue_path',
582
+ type=str,
583
+ help='Training output folder to continue a previous training run. If set, "config_path" is ignored.',
584
+ default='',
585
+ required='--config_path' not in sys.argv)
586
+ parser.add_argument(
587
+ '--restore_path',
588
+ type=str,
589
+ help='Model file to be restored. Use to finetune a model.',
590
+ default='')
591
+ parser.add_argument('--config_path',
592
+ type=str,
593
+ help='Path to config file for training.',
594
+ required='--continue_path' not in sys.argv)
595
+ parser.add_argument('--debug',
596
+ type=bool,
597
+ default=False,
598
+ help='Run in debug mode; skip the commit integrity check before training.')
599
+
600
+ # DISTRIBUTED
601
+ parser.add_argument(
602
+ '--rank',
603
+ type=int,
604
+ default=0,
605
+ help='DISTRIBUTED: process rank for distributed training.')
606
+ parser.add_argument('--group_id',
607
+ type=str,
608
+ default="",
609
+ help='DISTRIBUTED: process group id.')
610
+ args = parser.parse_args()
611
+
612
+ if args.continue_path != '':
613
+ args.output_path = args.continue_path
614
+ args.config_path = os.path.join(args.continue_path, 'config.json')
615
+ list_of_files = glob.glob(
616
+ args.continue_path +
617
+ "/*.pth.tar") # * means all if need specific format then *.csv
618
+ latest_model_file = max(list_of_files, key=os.path.getctime)
619
+ args.restore_path = latest_model_file
620
+ print(f" > Training continues for {args.restore_path}")
621
+
622
+ # setup output paths and read configs
623
+ c = load_config(args.config_path)
624
+ # check_config(c)
625
+ _ = os.path.dirname(os.path.realpath(__file__))
626
+
627
+ OUT_PATH = args.continue_path
628
+ if args.continue_path == '':
629
+ OUT_PATH = create_experiment_folder(c.output_path, c.run_name,
630
+ args.debug)
631
+
632
+ AUDIO_PATH = os.path.join(OUT_PATH, 'test_audios')
633
+
634
+ c_logger = ConsoleLogger()
635
+
636
+ if args.rank == 0:
637
+ os.makedirs(AUDIO_PATH, exist_ok=True)
638
+ new_fields = {}
639
+ if args.restore_path:
640
+ new_fields["restore_path"] = args.restore_path
641
+ new_fields["github_branch"] = get_git_branch()
642
+ copy_model_files(c, args.config_path,
643
+ OUT_PATH, new_fields)
644
+ os.chmod(AUDIO_PATH, 0o775)
645
+ os.chmod(OUT_PATH, 0o775)
646
+
647
+ LOG_DIR = OUT_PATH
648
+ tb_logger = TensorboardLogger(LOG_DIR, model_name='VOCODER')
649
+
650
+ # write model desc to tensorboard
651
+ tb_logger.tb_add_text('model-description', c['run_description'], 0)
652
+
653
+ try:
654
+ main(args)
655
+ except KeyboardInterrupt:
656
+ remove_experiment_folder(OUT_PATH)
657
+ try:
658
+ sys.exit(0)
659
+ except SystemExit:
660
+ os._exit(0) # pylint: disable=protected-access
661
+ except Exception: # pylint: disable=broad-except
662
+ remove_experiment_folder(OUT_PATH)
663
+ traceback.print_exc()
664
+ sys.exit(1)
TTS/bin/train_vocoder_wavegrad.py ADDED
@@ -0,0 +1,511 @@
1
+ import argparse
2
+ import glob
3
+ import os
4
+ import sys
5
+ import time
6
+ import traceback
7
+ import numpy as np
8
+
9
+ import torch
10
+ # DISTRIBUTED
11
+ from torch.nn.parallel import DistributedDataParallel as DDP_th
12
+ from torch.optim import Adam
13
+ from torch.utils.data import DataLoader
14
+ from torch.utils.data.distributed import DistributedSampler
15
+ from TTS.utils.audio import AudioProcessor
16
+ from TTS.utils.console_logger import ConsoleLogger
17
+ from TTS.utils.distribute import init_distributed
18
+ from TTS.utils.generic_utils import (KeepAverage, count_parameters,
19
+ create_experiment_folder, get_git_branch,
20
+ remove_experiment_folder, set_init_dict)
21
+ from TTS.utils.io import copy_model_files, load_config
22
+ from TTS.utils.tensorboard_logger import TensorboardLogger
23
+ from TTS.utils.training import setup_torch_training_env
24
+ from TTS.vocoder.datasets.preprocess import load_wav_data, load_wav_feat_data
25
+ from TTS.vocoder.datasets.wavegrad_dataset import WaveGradDataset
26
+ from TTS.vocoder.utils.generic_utils import plot_results, setup_generator
27
+ from TTS.vocoder.utils.io import save_best_model, save_checkpoint
28
+
29
+ use_cuda, num_gpus = setup_torch_training_env(True, True)
30
+
31
+
32
+ def setup_loader(ap, is_val=False, verbose=False):
33
+ if is_val and not c.run_eval:
34
+ loader = None
35
+ else:
36
+ dataset = WaveGradDataset(ap=ap,
37
+ items=eval_data if is_val else train_data,
38
+ seq_len=c.seq_len,
39
+ hop_len=ap.hop_length,
40
+ pad_short=c.pad_short,
41
+ conv_pad=c.conv_pad,
42
+ is_training=not is_val,
43
+ return_segments=True,
44
+ use_noise_augment=False,
45
+ use_cache=c.use_cache,
46
+ verbose=verbose)
47
+ sampler = DistributedSampler(dataset) if num_gpus > 1 else None
48
+ loader = DataLoader(dataset,
49
+ batch_size=c.batch_size,
50
+ shuffle=num_gpus <= 1,
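+ # DataLoader-level shuffling must be disabled when a DistributedSampler is used; the sampler handles shuffling itself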
51
+ drop_last=False,
52
+ sampler=sampler,
53
+ num_workers=c.num_val_loader_workers
54
+ if is_val else c.num_loader_workers,
55
+ pin_memory=False)
56
+
57
+
58
+ return loader
59
+
60
+
61
+ def format_data(data):
62
+ # return a whole audio segment
63
+ m, x = data
64
+ x = x.unsqueeze(1)
65
+ if use_cuda:
66
+ m = m.cuda(non_blocking=True)
67
+ x = x.cuda(non_blocking=True)
68
+ return m, x
69
+
70
+
71
+ def format_test_data(data):
72
+ # return a whole audio segment
73
+ m, x = data
74
+ m = m[None, ...]
75
+ x = x[None, None, ...]
76
+ if use_cuda:
77
+ m = m.cuda(non_blocking=True)
78
+ x = x.cuda(non_blocking=True)
79
+ return m, x
80
+
81
+
82
+ def train(model, criterion, optimizer,
83
+ scheduler, scaler, ap, global_step, epoch):
84
+ data_loader = setup_loader(ap, is_val=False, verbose=(epoch == 0))
85
+ model.train()
86
+ epoch_time = 0
87
+ keep_avg = KeepAverage()
88
+ if use_cuda:
89
+ batch_n_iter = int(
90
+ len(data_loader.dataset) / (c.batch_size * num_gpus))
91
+ else:
92
+ batch_n_iter = int(len(data_loader.dataset) / c.batch_size)
93
+ end_time = time.time()
94
+ c_logger.print_train_start()
95
+ # setup noise schedule
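+ # WaveGrad trains against a continuous noise level: precomputing the beta
+ # schedule on the model lets compute_y_n() sample a noise scale and build
+ # noisy targets for each batch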
96
+ noise_schedule = c['train_noise_schedule']
97
+ betas = np.linspace(noise_schedule['min_val'], noise_schedule['max_val'], noise_schedule['num_steps'])
98
+ if hasattr(model, 'module'):
99
+ model.module.compute_noise_level(betas)
100
+ else:
101
+ model.compute_noise_level(betas)
102
+ for num_iter, data in enumerate(data_loader):
103
+ start_time = time.time()
104
+
105
+ # format data
106
+ m, x = format_data(data)
107
+ loader_time = time.time() - end_time
108
+
109
+ global_step += 1
110
+
111
+ with torch.cuda.amp.autocast(enabled=c.mixed_precision):
112
+ # compute noisy input
113
+ if hasattr(model, 'module'):
114
+ noise, x_noisy, noise_scale = model.module.compute_y_n(x)
115
+ else:
116
+ noise, x_noisy, noise_scale = model.compute_y_n(x)
117
+
118
+ # forward pass
119
+ noise_hat = model(x_noisy, m, noise_scale)
120
+
121
+ # compute losses
122
+ loss = criterion(noise, noise_hat)
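+ # the diffusion objective: distance between the injected noise and the
+ # model's estimate of it (L1 here, set up in main())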
123
+ loss_wavegrad_dict = {'wavegrad_loss':loss}
124
+
125
+ # check nan loss
126
+ if torch.isnan(loss).any():
127
+ raise RuntimeError(f'Detected NaN loss at step {global_step}.')
128
+
129
+ optimizer.zero_grad()
130
+
131
+ # backward pass with loss scaling
132
+ if c.mixed_precision:
133
+ scaler.scale(loss).backward()
134
+ scaler.unscale_(optimizer)
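+ # unscale before clipping so the threshold applies to true gradient magnitudes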
135
+ grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(),
136
+ c.clip_grad)
137
+ scaler.step(optimizer)
138
+ scaler.update()
139
+ else:
140
+ loss.backward()
141
+ grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(),
142
+ c.clip_grad)
143
+ optimizer.step()
144
+
145
+ # schedule update
146
+ if scheduler is not None:
147
+ scheduler.step()
148
+
149
+ # disconnect loss values
150
+ loss_dict = dict()
151
+ for key, value in loss_wavegrad_dict.items():
152
+ if isinstance(value, int):
153
+ loss_dict[key] = value
154
+ else:
155
+ loss_dict[key] = value.item()
156
+
157
+ # epoch/step timing
158
+ step_time = time.time() - start_time
159
+ epoch_time += step_time
160
+
161
+ # get current learning rates
162
+ current_lr = list(optimizer.param_groups)[0]['lr']
163
+
164
+ # update avg stats
165
+ update_train_values = dict()
166
+ for key, value in loss_dict.items():
167
+ update_train_values['avg_' + key] = value
168
+ update_train_values['avg_loader_time'] = loader_time
169
+ update_train_values['avg_step_time'] = step_time
170
+ keep_avg.update_values(update_train_values)
171
+
172
+ # print training stats
173
+ if global_step % c.print_step == 0:
174
+ log_dict = {
175
+ 'step_time': [step_time, 2],
176
+ 'loader_time': [loader_time, 4],
177
+ "current_lr": current_lr,
178
+ "grad_norm": grad_norm.item()
179
+ }
180
+ c_logger.print_train_step(batch_n_iter, num_iter, global_step,
181
+ log_dict, loss_dict, keep_avg.avg_values)
182
+
183
+ if args.rank == 0:
184
+ # plot step stats
185
+ if global_step % 10 == 0:
186
+ iter_stats = {
187
+ "lr": current_lr,
188
+ "grad_norm": grad_norm.item(),
189
+ "step_time": step_time
190
+ }
191
+ iter_stats.update(loss_dict)
192
+ tb_logger.tb_train_iter_stats(global_step, iter_stats)
193
+
194
+ # save checkpoint
195
+ if global_step % c.save_step == 0:
196
+ if c.checkpoint:
197
+ # save model
198
+ save_checkpoint(model,
199
+ optimizer,
200
+ scheduler,
201
+ None,
202
+ None,
203
+ None,
204
+ global_step,
205
+ epoch,
206
+ OUT_PATH,
207
+ model_losses=loss_dict,
208
+ scaler=scaler.state_dict() if c.mixed_precision else None)
209
+
210
+ end_time = time.time()
211
+
212
+ # print epoch stats
213
+ c_logger.print_train_epoch_end(global_step, epoch, epoch_time, keep_avg)
214
+
215
+ # Plot Training Epoch Stats
216
+ epoch_stats = {"epoch_time": epoch_time}
217
+ epoch_stats.update(keep_avg.avg_values)
218
+ if args.rank == 0:
219
+ tb_logger.tb_train_epoch_stats(global_step, epoch_stats)
220
+ # TODO: plot model stats
221
+ if c.tb_model_param_stats and args.rank == 0:
222
+ tb_logger.tb_model_weights(model, global_step)
223
+ return keep_avg.avg_values, global_step
224
+
225
+
226
+ @torch.no_grad()
227
+ def evaluate(model, criterion, ap, global_step, epoch):
228
+ data_loader = setup_loader(ap, is_val=True, verbose=(epoch == 0))
229
+ model.eval()
230
+ epoch_time = 0
231
+ keep_avg = KeepAverage()
232
+ end_time = time.time()
233
+ c_logger.print_eval_start()
234
+ for num_iter, data in enumerate(data_loader):
235
+ start_time = time.time()
236
+
237
+ # format data
238
+ m, x = format_data(data)
239
+ loader_time = time.time() - end_time
240
+
241
+ global_step += 1
242
+
243
+ # compute noisy input
244
+ if hasattr(model, 'module'):
245
+ noise, x_noisy, noise_scale = model.module.compute_y_n(x)
246
+ else:
247
+ noise, x_noisy, noise_scale = model.compute_y_n(x)
248
+
249
+
250
+ # forward pass
251
+ noise_hat = model(x_noisy, m, noise_scale)
252
+
253
+ # compute losses
254
+ loss = criterion(noise, noise_hat)
255
+ loss_wavegrad_dict = {'wavegrad_loss':loss}
256
+
257
+
258
+ loss_dict = dict()
259
+ for key, value in loss_wavegrad_dict.items():
260
+ if isinstance(value, (int, float)):
261
+ loss_dict[key] = value
262
+ else:
263
+ loss_dict[key] = value.item()
264
+
265
+ step_time = time.time() - start_time
266
+ epoch_time += step_time
267
+
268
+ # update avg stats
269
+ update_eval_values = dict()
270
+ for key, value in loss_dict.items():
271
+ update_eval_values['avg_' + key] = value
272
+ update_eval_values['avg_loader_time'] = loader_time
273
+ update_eval_values['avg_step_time'] = step_time
274
+ keep_avg.update_values(update_eval_values)
275
+
276
+ # print eval stats
277
+ if c.print_eval:
278
+ c_logger.print_eval_step(num_iter, loss_dict, keep_avg.avg_values)
279
+
280
+ if args.rank == 0:
281
+ data_loader.dataset.return_segments = False
282
+ samples = data_loader.dataset.load_test_samples(1)
283
+ m, x = format_test_data(samples[0])
284
+
285
+ # setup noise schedule and inference
286
+ noise_schedule = c['test_noise_schedule']
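+ # inference can use a much shorter schedule than training (typically tens of
+ # steps rather than ~1000) to trade a little quality for speed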
287
+ betas = np.linspace(noise_schedule['min_val'], noise_schedule['max_val'], noise_schedule['num_steps'])
288
+ if hasattr(model, 'module'):
289
+ model.module.compute_noise_level(betas)
290
+ # compute voice
291
+ x_pred = model.module.inference(m)
292
+ else:
293
+ model.compute_noise_level(betas)
294
+ # compute voice
295
+ x_pred = model.inference(m)
296
+
297
+ # compute spectrograms
298
+ figures = plot_results(x_pred, x, ap, global_step, 'eval')
299
+ tb_logger.tb_eval_figures(global_step, figures)
300
+
301
+ # Sample audio
302
+ sample_voice = x_pred[0].squeeze(0).detach().cpu().numpy()
303
+ tb_logger.tb_eval_audios(global_step, {'eval/audio': sample_voice},
304
+ c.audio["sample_rate"])
305
+
306
+ tb_logger.tb_eval_stats(global_step, keep_avg.avg_values)
307
+ data_loader.dataset.return_segments = True
308
+
309
+ return keep_avg.avg_values
310
+
311
+
312
+ def main(args): # pylint: disable=redefined-outer-name
313
+ # pylint: disable=global-variable-undefined
314
+ global train_data, eval_data
315
+ print(f" > Loading wavs from: {c.data_path}")
316
+ if c.feature_path is not None:
317
+ print(f" > Loading features from: {c.feature_path}")
318
+ eval_data, train_data = load_wav_feat_data(c.data_path, c.feature_path, c.eval_split_size)
319
+ else:
320
+ eval_data, train_data = load_wav_data(c.data_path, c.eval_split_size)
321
+
322
+ # setup audio processor
323
+ ap = AudioProcessor(**c.audio)
324
+
325
+ # DISTRIBUTED
326
+ if num_gpus > 1:
327
+ init_distributed(args.rank, num_gpus, args.group_id,
328
+ c.distributed["backend"], c.distributed["url"])
329
+
330
+ # setup models
331
+ model = setup_generator(c)
332
+
333
+ # scaler for mixed_precision
334
+ scaler = torch.cuda.amp.GradScaler() if c.mixed_precision else None
335
+
336
+ # setup optimizers
337
+ optimizer = Adam(model.parameters(), lr=c.lr, weight_decay=0)
338
+
339
+ # schedulers
340
+ scheduler = None
341
+ if 'lr_scheduler' in c:
342
+ scheduler = getattr(torch.optim.lr_scheduler, c.lr_scheduler)
343
+ scheduler = scheduler(optimizer, **c.lr_scheduler_params)
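+ # e.g. a config could set "lr_scheduler": "MultiStepLR" with
+ # "lr_scheduler_params": {"milestones": [100000, 200000], "gamma": 0.5}
+ # (illustrative values; any class name from torch.optim.lr_scheduler works here)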
344
+
345
+ # setup criterion
346
+ criterion = torch.nn.L1Loss().cuda()
347
+
348
+ if args.restore_path:
349
+ checkpoint = torch.load(args.restore_path, map_location='cpu')
350
+ try:
351
+ print(" > Restoring Model...")
352
+ model.load_state_dict(checkpoint['model'])
353
+ print(" > Restoring Optimizer...")
354
+ optimizer.load_state_dict(checkpoint['optimizer'])
355
+ if 'scheduler' in checkpoint:
356
+ print(" > Restoring LR Scheduler...")
357
+ scheduler.load_state_dict(checkpoint['scheduler'])
358
+ # NOTE: Not sure if necessary
359
+ scheduler.optimizer = optimizer
360
+ if "scaler" in checkpoint and c.mixed_precision:
361
+ print(" > Restoring AMP Scaler...")
362
+ scaler.load_state_dict(checkpoint["scaler"])
363
+ except RuntimeError:
364
+ # restore only matching layers.
365
+ print(" > Partial model initialization...")
366
+ model_dict = model.state_dict()
367
+ model_dict = set_init_dict(model_dict, checkpoint['model'], c)
368
+ model.load_state_dict(model_dict)
369
+ del model_dict
370
+
371
+ # reset lr if not continuing training.
372
+ for group in optimizer.param_groups:
373
+ group['lr'] = c.lr
374
+
375
+ print(" > Model restored from step %d" % checkpoint['step'],
376
+ flush=True)
377
+ args.restore_step = checkpoint['step']
378
+ else:
379
+ args.restore_step = 0
380
+
381
+ if use_cuda:
382
+ model.cuda()
383
+ criterion.cuda()
384
+
385
+ # DISTRIBUTED
386
+ if num_gpus > 1:
387
+ model = DDP_th(model, device_ids=[args.rank])
388
+
389
+ num_params = count_parameters(model)
390
+ print(" > WaveGrad has {} parameters".format(num_params), flush=True)
391
+
392
+ if 'best_loss' not in locals():
393
+ best_loss = float('inf')
394
+
395
+ global_step = args.restore_step
396
+ for epoch in range(0, c.epochs):
397
+ c_logger.print_epoch_start(epoch, c.epochs)
398
+ _, global_step = train(model, criterion, optimizer,
399
+ scheduler, scaler, ap, global_step,
400
+ epoch)
401
+ eval_avg_loss_dict = evaluate(model, criterion, ap,
402
+ global_step, epoch)
403
+ c_logger.print_epoch_end(epoch, eval_avg_loss_dict)
404
+ target_loss = eval_avg_loss_dict[c.target_loss]
405
+ best_loss = save_best_model(target_loss,
406
+ best_loss,
407
+ model,
408
+ optimizer,
409
+ scheduler,
410
+ None,
411
+ None,
412
+ None,
413
+ global_step,
414
+ epoch,
415
+ OUT_PATH,
416
+ model_losses=eval_avg_loss_dict,
417
+ scaler=scaler.state_dict() if c.mixed_precision else None)
418
+
419
+
420
+ if __name__ == '__main__':
421
+ parser = argparse.ArgumentParser()
422
+ parser.add_argument(
423
+ '--continue_path',
424
+ type=str,
425
+ help=
426
+ 'Training output folder to continue training. Use to continue a training. If it is used, "config_path" is ignored.',
427
+ default='',
428
+ required='--config_path' not in sys.argv)
429
+ parser.add_argument(
430
+ '--restore_path',
431
+ type=str,
432
+ help='Model file to be restored. Use to finetune a model.',
433
+ default='')
434
+ parser.add_argument('--config_path',
435
+ type=str,
436
+ help='Path to config file for training.',
437
+ required='--continue_path' not in sys.argv)
438
+ parser.add_argument('--debug',
439
+ type=bool,
440
+ default=False,
441
+ help='Do not verify commit integrity to run training.')
442
+
443
+ # DISTRIBUTED
444
+ parser.add_argument(
445
+ '--rank',
446
+ type=int,
447
+ default=0,
448
+ help='DISTRIBUTED: process rank for distributed training.')
449
+ parser.add_argument('--group_id',
450
+ type=str,
451
+ default="",
452
+ help='DISTRIBUTED: process group id.')
453
+ args = parser.parse_args()
454
+
455
+ if args.continue_path != '':
456
+ args.output_path = args.continue_path
457
+ args.config_path = os.path.join(args.continue_path, 'config.json')
458
+ list_of_files = glob.glob(
459
+ args.continue_path +
460
+ "/*.pth.tar") # * means all if need specific format then *.csv
461
+ latest_model_file = max(list_of_files, key=os.path.getctime)
462
+ args.restore_path = latest_model_file
463
+ print(f" > Training continues for {args.restore_path}")
464
+
465
+ # setup output paths and read configs
466
+ c = load_config(args.config_path)
467
+ # check_config(c)
468
+ _ = os.path.dirname(os.path.realpath(__file__))
469
+
470
+ # MIXED PRECISION
471
+ if c.mixed_precision:
472
+ print(" > Mixed precision is enabled")
473
+
474
+ OUT_PATH = args.continue_path
475
+ if args.continue_path == '':
476
+ OUT_PATH = create_experiment_folder(c.output_path, c.run_name,
477
+ args.debug)
478
+
479
+ AUDIO_PATH = os.path.join(OUT_PATH, 'test_audios')
480
+
481
+ c_logger = ConsoleLogger()
482
+
483
+ if args.rank == 0:
484
+ os.makedirs(AUDIO_PATH, exist_ok=True)
485
+ new_fields = {}
486
+ if args.restore_path:
487
+ new_fields["restore_path"] = args.restore_path
488
+ new_fields["github_branch"] = get_git_branch()
489
+ copy_model_files(c, args.config_path,
490
+ OUT_PATH, new_fields)
491
+ os.chmod(AUDIO_PATH, 0o775)
492
+ os.chmod(OUT_PATH, 0o775)
493
+
494
+ LOG_DIR = OUT_PATH
495
+ tb_logger = TensorboardLogger(LOG_DIR, model_name='VOCODER')
496
+
497
+ # write model desc to tensorboard
498
+ tb_logger.tb_add_text('model-description', c['run_description'], 0)
499
+
500
+ try:
501
+ main(args)
502
+ except KeyboardInterrupt:
503
+ remove_experiment_folder(OUT_PATH)
504
+ try:
505
+ sys.exit(0)
506
+ except SystemExit:
507
+ os._exit(0) # pylint: disable=protected-access
508
+ except Exception: # pylint: disable=broad-except
509
+ remove_experiment_folder(OUT_PATH)
510
+ traceback.print_exc()
511
+ sys.exit(1)
TTS/bin/train_vocoder_wavernn.py ADDED
@@ -0,0 +1,539 @@
1
+ import argparse
2
+ import os
3
+ import sys
4
+ import traceback
5
+ import time
6
+ import glob
7
+ import random
8
+
9
+ import torch
10
+ from torch.utils.data import DataLoader
11
+
12
+ # from torch.utils.data.distributed import DistributedSampler
13
+
14
+ from TTS.tts.utils.visual import plot_spectrogram
15
+ from TTS.utils.audio import AudioProcessor
16
+ from TTS.utils.radam import RAdam
17
+ from TTS.utils.io import copy_model_files, load_config
18
+ from TTS.utils.training import setup_torch_training_env
19
+ from TTS.utils.console_logger import ConsoleLogger
20
+ from TTS.utils.tensorboard_logger import TensorboardLogger
21
+ from TTS.utils.generic_utils import (
22
+ KeepAverage,
23
+ count_parameters,
24
+ create_experiment_folder,
25
+ get_git_branch,
26
+ remove_experiment_folder,
27
+ set_init_dict,
28
+ )
29
+ from TTS.vocoder.datasets.wavernn_dataset import WaveRNNDataset
30
+ from TTS.vocoder.datasets.preprocess import (
31
+ load_wav_data,
32
+ load_wav_feat_data
33
+ )
34
+ from TTS.vocoder.utils.distribution import discretized_mix_logistic_loss, gaussian_loss
35
+ from TTS.vocoder.utils.generic_utils import setup_wavernn
36
+ from TTS.vocoder.utils.io import save_best_model, save_checkpoint
37
+
38
+
39
+ use_cuda, num_gpus = setup_torch_training_env(True, True)
40
+
41
+
42
+ def setup_loader(ap, is_val=False, verbose=False):
43
+ if is_val and not c.run_eval:
44
+ loader = None
45
+ else:
46
+ dataset = WaveRNNDataset(ap=ap,
47
+ items=eval_data if is_val else train_data,
48
+ seq_len=c.seq_len,
49
+ hop_len=ap.hop_length,
50
+ pad=c.padding,
51
+ mode=c.mode,
52
+ mulaw=c.mulaw,
53
+ is_training=not is_val,
54
+ verbose=verbose,
55
+ )
56
+ # sampler = DistributedSampler(dataset) if num_gpus > 1 else None
57
+ loader = DataLoader(dataset,
58
+ shuffle=True,
59
+ collate_fn=dataset.collate,
60
+ batch_size=c.batch_size,
61
+ num_workers=c.num_val_loader_workers
62
+ if is_val
63
+ else c.num_loader_workers,
64
+ pin_memory=True,
65
+ )
66
+ return loader
67
+
68
+
69
+ def format_data(data):
70
+ # setup input data
71
+ x_input = data[0]
72
+ mels = data[1]
73
+ y_coarse = data[2]
74
+
75
+ # dispatch data to GPU
76
+ if use_cuda:
77
+ x_input = x_input.cuda(non_blocking=True)
78
+ mels = mels.cuda(non_blocking=True)
79
+ y_coarse = y_coarse.cuda(non_blocking=True)
80
+
81
+ return x_input, mels, y_coarse
82
+
83
+
84
+ def train(model, optimizer, criterion, scheduler, scaler, ap, global_step, epoch):
85
+ # create train loader
86
+ data_loader = setup_loader(ap, is_val=False, verbose=(epoch == 0))
87
+ model.train()
88
+ epoch_time = 0
89
+ keep_avg = KeepAverage()
90
+ if use_cuda:
91
+ batch_n_iter = int(len(data_loader.dataset) /
92
+ (c.batch_size * num_gpus))
93
+ else:
94
+ batch_n_iter = int(len(data_loader.dataset) / c.batch_size)
95
+ end_time = time.time()
96
+ c_logger.print_train_start()
97
+ # train loop
98
+ for num_iter, data in enumerate(data_loader):
99
+ start_time = time.time()
100
+ x_input, mels, y_coarse = format_data(data)
101
+ loader_time = time.time() - end_time
102
+ global_step += 1
103
+
104
+ optimizer.zero_grad()
105
+
106
+ if c.mixed_precision:
107
+ # mixed precision training
108
+ with torch.cuda.amp.autocast():
109
+ y_hat = model(x_input, mels)
110
+ if isinstance(model.mode, int):
111
+ y_hat = y_hat.transpose(1, 2).unsqueeze(-1)
112
+ else:
113
+ y_coarse = y_coarse.float()
114
+ y_coarse = y_coarse.unsqueeze(-1)
115
+ # compute losses
116
+ loss = criterion(y_hat, y_coarse)
117
+ scaler.scale(loss).backward()
118
+ scaler.unscale_(optimizer)
119
+ if c.grad_clip > 0:
120
+ torch.nn.utils.clip_grad_norm_(
121
+ model.parameters(), c.grad_clip)
122
+ scaler.step(optimizer)
123
+ scaler.update()
124
+ else:
125
+ # full precision training
126
+ y_hat = model(x_input, mels)
127
+ if isinstance(model.mode, int):
128
+ y_hat = y_hat.transpose(1, 2).unsqueeze(-1)
129
+ else:
130
+ y_coarse = y_coarse.float()
131
+ y_coarse = y_coarse.unsqueeze(-1)
132
+ # compute losses
133
+ loss = criterion(y_hat, y_coarse)
134
+ if torch.isnan(loss).any():
135
+ raise RuntimeError(" [!] NaN loss. Exiting ...")
136
+ loss.backward()
137
+ if c.grad_clip > 0:
138
+ torch.nn.utils.clip_grad_norm_(
139
+ model.parameters(), c.grad_clip)
140
+ optimizer.step()
141
+
142
+ if scheduler is not None:
143
+ scheduler.step()
144
+
145
+ # get the current learning rate
146
+ cur_lr = list(optimizer.param_groups)[0]["lr"]
147
+
148
+ step_time = time.time() - start_time
149
+ epoch_time += step_time
150
+
151
+ update_train_values = dict()
152
+ loss_dict = dict()
153
+ loss_dict["model_loss"] = loss.item()
154
+ for key, value in loss_dict.items():
155
+ update_train_values["avg_" + key] = value
156
+ update_train_values["avg_loader_time"] = loader_time
157
+ update_train_values["avg_step_time"] = step_time
158
+ keep_avg.update_values(update_train_values)
159
+
160
+ # print training stats
161
+ if global_step % c.print_step == 0:
162
+ log_dict = {"step_time": [step_time, 2],
163
+ "loader_time": [loader_time, 4],
164
+ "current_lr": cur_lr,
165
+ }
166
+ c_logger.print_train_step(batch_n_iter,
167
+ num_iter,
168
+ global_step,
169
+ log_dict,
170
+ loss_dict,
171
+ keep_avg.avg_values,
172
+ )
173
+
174
+ # plot step stats
175
+ if global_step % 10 == 0:
176
+ iter_stats = {"lr": cur_lr, "step_time": step_time}
177
+ iter_stats.update(loss_dict)
178
+ tb_logger.tb_train_iter_stats(global_step, iter_stats)
179
+
180
+ # save checkpoint
181
+ if global_step % c.save_step == 0:
182
+ if c.checkpoint:
183
+ # save model
184
+ save_checkpoint(model,
185
+ optimizer,
186
+ scheduler,
187
+ None,
188
+ None,
189
+ None,
190
+ global_step,
191
+ epoch,
192
+ OUT_PATH,
193
+ model_losses=loss_dict,
194
+ scaler=scaler.state_dict() if c.mixed_precision else None
195
+ )
196
+
197
+ # synthesize a full voice
198
+ rand_idx = random.randrange(0, len(train_data))
199
+ wav_path = train_data[rand_idx] if not isinstance(
200
+ train_data[rand_idx], (tuple, list)) else train_data[rand_idx][0]
201
+ wav = ap.load_wav(wav_path)
202
+ ground_mel = ap.melspectrogram(wav)
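+ # batched generation splits the conditioning mel into overlapping folds
+ # (target_samples each, overlap_samples cross-faded at the seams) so the
+ # autoregressive loop can process several folds in parallel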
203
+ sample_wav = model.generate(ground_mel,
204
+ c.batched,
205
+ c.target_samples,
206
+ c.overlap_samples,
207
+ use_cuda
208
+ )
209
+ predict_mel = ap.melspectrogram(sample_wav)
210
+
211
+ # compute spectrograms
212
+ figures = {"train/ground_truth": plot_spectrogram(ground_mel.T),
213
+ "train/prediction": plot_spectrogram(predict_mel.T)
214
+ }
215
+ tb_logger.tb_train_figures(global_step, figures)
216
+
217
+ # Sample audio
218
+ tb_logger.tb_train_audios(
219
+ global_step, {
220
+ "train/audio": sample_wav}, c.audio["sample_rate"]
221
+ )
222
+ end_time = time.time()
223
+
224
+ # print epoch stats
225
+ c_logger.print_train_epoch_end(global_step, epoch, epoch_time, keep_avg)
226
+
227
+ # Plot Training Epoch Stats
228
+ epoch_stats = {"epoch_time": epoch_time}
229
+ epoch_stats.update(keep_avg.avg_values)
230
+ tb_logger.tb_train_epoch_stats(global_step, epoch_stats)
231
+ # TODO: plot model stats
232
+ # if c.tb_model_param_stats:
233
+ # tb_logger.tb_model_weights(model, global_step)
234
+ return keep_avg.avg_values, global_step
235
+
236
+
237
+ @torch.no_grad()
238
+ def evaluate(model, criterion, ap, global_step, epoch):
239
+ # create train loader
240
+ data_loader = setup_loader(ap, is_val=True, verbose=(epoch == 0))
241
+ model.eval()
242
+ epoch_time = 0
243
+ keep_avg = KeepAverage()
244
+ end_time = time.time()
245
+ c_logger.print_eval_start()
246
+ with torch.no_grad():
247
+ for num_iter, data in enumerate(data_loader):
248
+ start_time = time.time()
249
+ # format data
250
+ x_input, mels, y_coarse = format_data(data)
251
+ loader_time = time.time() - end_time
252
+ global_step += 1
253
+
254
+ y_hat = model(x_input, mels)
255
+ if isinstance(model.mode, int):
256
+ y_hat = y_hat.transpose(1, 2).unsqueeze(-1)
257
+ else:
258
+ y_coarse = y_coarse.float()
259
+ y_coarse = y_coarse.unsqueeze(-1)
260
+ loss = criterion(y_hat, y_coarse)
261
+ # Compute avg loss
262
+ # if num_gpus > 1:
263
+ # loss = reduce_tensor(loss.data, num_gpus)
264
+ loss_dict = dict()
265
+ loss_dict["model_loss"] = loss.item()
266
+
267
+ step_time = time.time() - start_time
268
+ epoch_time += step_time
269
+
270
+ # update avg stats
271
+ update_eval_values = dict()
272
+ for key, value in loss_dict.items():
273
+ update_eval_values["avg_" + key] = value
274
+ update_eval_values["avg_loader_time"] = loader_time
275
+ update_eval_values["avg_step_time"] = step_time
276
+ keep_avg.update_values(update_eval_values)
277
+
278
+ # print eval stats
279
+ if c.print_eval:
280
+ c_logger.print_eval_step(
281
+ num_iter, loss_dict, keep_avg.avg_values)
282
+
283
+ if epoch % c.test_every_epochs == 0 and epoch != 0:
284
+ # synthesize a full voice
285
+ rand_idx = random.randrange(0, len(eval_data))
286
+ wav_path = eval_data[rand_idx] if not isinstance(
287
+ eval_data[rand_idx], (tuple, list)) else eval_data[rand_idx][0]
288
+ wav = ap.load_wav(wav_path)
289
+ ground_mel = ap.melspectrogram(wav)
290
+ sample_wav = model.generate(ground_mel,
291
+ c.batched,
292
+ c.target_samples,
293
+ c.overlap_samples,
294
+ use_cuda
295
+ )
296
+ predict_mel = ap.melspectrogram(sample_wav)
297
+
298
+ # Sample audio
299
+ tb_logger.tb_eval_audios(
300
+ global_step, {
301
+ "eval/audio": sample_wav}, c.audio["sample_rate"]
302
+ )
303
+
304
+ # compute spectrograms
305
+ figures = {"eval/ground_truth": plot_spectrogram(ground_mel.T),
306
+ "eval/prediction": plot_spectrogram(predict_mel.T)
307
+ }
308
+ tb_logger.tb_eval_figures(global_step, figures)
309
+
310
+ tb_logger.tb_eval_stats(global_step, keep_avg.avg_values)
311
+ return keep_avg.avg_values
312
+
313
+
314
+ # FIXME: move args definition/parsing inside of main?
315
+ def main(args): # pylint: disable=redefined-outer-name
316
+ # pylint: disable=global-variable-undefined
317
+ global train_data, eval_data
318
+
319
+ # setup audio processor
320
+ ap = AudioProcessor(**c.audio)
321
+
322
+ # print(f" > Loading wavs from: {c.data_path}")
323
+ # if c.feature_path is not None:
324
+ # print(f" > Loading features from: {c.feature_path}")
325
+ # eval_data, train_data = load_wav_feat_data(
326
+ # c.data_path, c.feature_path, c.eval_split_size
327
+ # )
328
+ # else:
329
+ # mel_feat_path = os.path.join(OUT_PATH, "mel")
330
+ # feat_data = find_feat_files(mel_feat_path)
331
+ # if feat_data:
332
+ # print(f" > Loading features from: {mel_feat_path}")
333
+ # eval_data, train_data = load_wav_feat_data(
334
+ # c.data_path, mel_feat_path, c.eval_split_size
335
+ # )
336
+ # else:
337
+ # print(" > No feature data found. Preprocessing...")
338
+ # # preprocessing feature data from given wav files
339
+ # preprocess_wav_files(OUT_PATH, CONFIG, ap)
340
+ # eval_data, train_data = load_wav_feat_data(
341
+ # c.data_path, mel_feat_path, c.eval_split_size
342
+ # )
343
+
344
+ print(f" > Loading wavs from: {c.data_path}")
345
+ if c.feature_path is not None:
346
+ print(f" > Loading features from: {c.feature_path}")
347
+ eval_data, train_data = load_wav_feat_data(
348
+ c.data_path, c.feature_path, c.eval_split_size)
349
+ else:
350
+ eval_data, train_data = load_wav_data(
351
+ c.data_path, c.eval_split_size)
352
+ # setup model
353
+ model_wavernn = setup_wavernn(c)
354
+
355
+ # setup amp scaler
356
+ scaler = torch.cuda.amp.GradScaler() if c.mixed_precision else None
357
+
358
+ # define train functions
359
+ if c.mode == "mold":
360
+ criterion = discretized_mix_logistic_loss
361
+ elif c.mode == "gauss":
362
+ criterion = gaussian_loss
363
+ elif isinstance(c.mode, int):
364
+ criterion = torch.nn.CrossEntropyLoss()
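+ # 'mold' fits a discretized mixture of logistics, 'gauss' a single Gaussian,
+ # and an integer mode treats the output as a categorical distribution over
+ # the quantized amplitude classes, hence cross-entropy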
365
+
366
+ if use_cuda:
367
+ model_wavernn.cuda()
368
+ if isinstance(c.mode, int):
369
+ criterion.cuda()
370
+
371
+ optimizer = RAdam(model_wavernn.parameters(), lr=c.lr, weight_decay=0)
372
+
373
+ scheduler = None
374
+ if "lr_scheduler" in c:
375
+ scheduler = getattr(torch.optim.lr_scheduler, c.lr_scheduler)
376
+ scheduler = scheduler(optimizer, **c.lr_scheduler_params)
377
+ # slow start for the first 5 epochs
378
+ # lr_lambda = lambda epoch: min(epoch / c.warmup_steps, 1)
379
+ # scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)
380
+
381
+ # restore any checkpoint
382
+ if args.restore_path:
383
+ checkpoint = torch.load(args.restore_path, map_location="cpu")
384
+ try:
385
+ print(" > Restoring Model...")
386
+ model_wavernn.load_state_dict(checkpoint["model"])
387
+ print(" > Restoring Optimizer...")
388
+ optimizer.load_state_dict(checkpoint["optimizer"])
389
+ if "scheduler" in checkpoint:
390
+ print(" > Restoring Generator LR Scheduler...")
391
+ scheduler.load_state_dict(checkpoint["scheduler"])
392
+ scheduler.optimizer = optimizer
393
+ if "scaler" in checkpoint and c.mixed_precision:
394
+ print(" > Restoring AMP Scaler...")
395
+ scaler.load_state_dict(checkpoint["scaler"])
396
+ except RuntimeError:
397
+ # restore only matching layers.
398
+ print(" > Partial model initialization...")
399
+ model_dict = model_wavernn.state_dict()
400
+ model_dict = set_init_dict(model_dict, checkpoint["model"], c)
401
+ model_wavernn.load_state_dict(model_dict)
402
+
403
+ print(" > Model restored from step %d" %
404
+ checkpoint["step"], flush=True)
405
+ args.restore_step = checkpoint["step"]
406
+ else:
407
+ args.restore_step = 0
408
+
409
+ # DISTRIBUTED
410
+ # if num_gpus > 1:
411
+ # model = apply_gradient_allreduce(model)
412
+
413
+ num_parameters = count_parameters(model_wavernn)
414
+ print(" > Model has {} parameters".format(num_parameters), flush=True)
415
+
416
+ if "best_loss" not in locals():
417
+ best_loss = float("inf")
418
+
419
+ global_step = args.restore_step
420
+ for epoch in range(0, c.epochs):
421
+ c_logger.print_epoch_start(epoch, c.epochs)
422
+ _, global_step = train(model_wavernn, optimizer,
423
+ criterion, scheduler, scaler, ap, global_step, epoch)
424
+ eval_avg_loss_dict = evaluate(
425
+ model_wavernn, criterion, ap, global_step, epoch)
426
+ c_logger.print_epoch_end(epoch, eval_avg_loss_dict)
427
+ target_loss = eval_avg_loss_dict["avg_model_loss"]
428
+ best_loss = save_best_model(
429
+ target_loss,
430
+ best_loss,
431
+ model_wavernn,
432
+ optimizer,
433
+ scheduler,
434
+ None,
435
+ None,
436
+ None,
437
+ global_step,
438
+ epoch,
439
+ OUT_PATH,
440
+ model_losses=eval_avg_loss_dict,
441
+ scaler=scaler.state_dict() if c.mixed_precision else None
442
+ )
443
+
444
+
445
+ if __name__ == "__main__":
446
+ parser = argparse.ArgumentParser()
447
+ parser.add_argument(
448
+ "--continue_path",
449
+ type=str,
450
+ help='Training output folder to continue training. Use to continue a training. If it is used, "config_path" is ignored.',
451
+ default="",
452
+ required="--config_path" not in sys.argv,
453
+ )
454
+ parser.add_argument(
455
+ "--restore_path",
456
+ type=str,
457
+ help="Model file to be restored. Use to finetune a model.",
458
+ default="",
459
+ )
460
+ parser.add_argument(
461
+ "--config_path",
462
+ type=str,
463
+ help="Path to config file for training.",
464
+ required="--continue_path" not in sys.argv,
465
+ )
466
+ parser.add_argument(
467
+ "--debug",
468
+ type=bool,
469
+ default=False,
470
+ help="Do not verify commit integrity to run training.",
471
+ )
472
+
473
+ # DISTRIBUTED
474
+ parser.add_argument(
475
+ "--rank",
476
+ type=int,
477
+ default=0,
478
+ help="DISTRIBUTED: process rank for distributed training.",
479
+ )
480
+ parser.add_argument(
481
+ "--group_id", type=str, default="", help="DISTRIBUTED: process group id."
482
+ )
483
+ args = parser.parse_args()
484
+
485
+ if args.continue_path != "":
486
+ args.output_path = args.continue_path
487
+ args.config_path = os.path.join(args.continue_path, "config.json")
488
+ list_of_files = glob.glob(
489
+ args.continue_path + "/*.pth.tar"
490
+ ) # pick up all checkpoint files in the folder
491
+ latest_model_file = max(list_of_files, key=os.path.getctime)
492
+ args.restore_path = latest_model_file
493
+ print(f" > Training continues for {args.restore_path}")
494
+
495
+ # setup output paths and read configs
496
+ c = load_config(args.config_path)
497
+ # check_config(c)
498
+ _ = os.path.dirname(os.path.realpath(__file__))
499
+
500
+ OUT_PATH = args.continue_path
501
+ if args.continue_path == "":
502
+ OUT_PATH = create_experiment_folder(
503
+ c.output_path, c.run_name, args.debug
504
+ )
505
+
506
+ AUDIO_PATH = os.path.join(OUT_PATH, "test_audios")
507
+
508
+ c_logger = ConsoleLogger()
509
+
510
+ if args.rank == 0:
511
+ os.makedirs(AUDIO_PATH, exist_ok=True)
512
+ new_fields = {}
513
+ if args.restore_path:
514
+ new_fields["restore_path"] = args.restore_path
515
+ new_fields["github_branch"] = get_git_branch()
516
+ copy_model_files(
517
+ c, args.config_path, OUT_PATH, new_fields
518
+ )
519
+ os.chmod(AUDIO_PATH, 0o775)
520
+ os.chmod(OUT_PATH, 0o775)
521
+
522
+ LOG_DIR = OUT_PATH
523
+ tb_logger = TensorboardLogger(LOG_DIR, model_name="VOCODER")
524
+
525
+ # write model desc to tensorboard
526
+ tb_logger.tb_add_text("model-description", c["run_description"], 0)
527
+
528
+ try:
529
+ main(args)
530
+ except KeyboardInterrupt:
531
+ remove_experiment_folder(OUT_PATH)
532
+ try:
533
+ sys.exit(0)
534
+ except SystemExit:
535
+ os._exit(0) # pylint: disable=protected-access
536
+ except Exception: # pylint: disable=broad-except
537
+ remove_experiment_folder(OUT_PATH)
538
+ traceback.print_exc()
539
+ sys.exit(1)
TTS/bin/tune_wavegrad.py ADDED
@@ -0,0 +1,91 @@
1
+ """Search a good noise schedule for WaveGrad for a given number of inferece iterations"""
2
+ import argparse
3
+ from itertools import product as cartesian_product
4
+
5
+ import numpy as np
6
+ import torch
7
+ from torch.utils.data import DataLoader
8
+ from tqdm import tqdm
9
+ from TTS.utils.audio import AudioProcessor
10
+ from TTS.utils.io import load_config
11
+ from TTS.vocoder.datasets.preprocess import load_wav_data
12
+ from TTS.vocoder.datasets.wavegrad_dataset import WaveGradDataset
13
+ from TTS.vocoder.utils.generic_utils import setup_generator
14
+
15
+ parser = argparse.ArgumentParser()
16
+ parser.add_argument('--model_path', type=str, help='Path to model checkpoint.')
17
+ parser.add_argument('--config_path', type=str, help='Path to model config file.')
18
+ parser.add_argument('--data_path', type=str, help='Path to data directory.')
19
+ parser.add_argument('--output_path', type=str, help='Path to the output file, including file name and extension.')
20
+ parser.add_argument('--num_iter', type=int, help='Number of inference iterations to optimize the noise schedule for.')
21
+ parser.add_argument('--use_cuda', type=bool, help='enable/disable CUDA.')
22
+ parser.add_argument('--num_samples', type=int, default=1, help='Number of data samples used for inference.')
23
+ parser.add_argument('--search_depth', type=int, default=3, help='Search granularity. Increasing this increases the run-time exponentially.')
24
+
25
+ # load config
26
+ args = parser.parse_args()
27
+ config = load_config(args.config_path)
28
+
29
+ # setup audio processor
30
+ ap = AudioProcessor(**config.audio)
31
+
32
+ # load dataset
33
+ _, train_data = load_wav_data(args.data_path, 0)
34
+ train_data = train_data[:args.num_samples]
35
+ dataset = WaveGradDataset(ap=ap,
36
+ items=train_data,
37
+ seq_len=-1,
38
+ hop_len=ap.hop_length,
39
+ pad_short=config.pad_short,
40
+ conv_pad=config.conv_pad,
41
+ is_training=True,
42
+ return_segments=False,
43
+ use_noise_augment=False,
44
+ use_cache=False,
45
+ verbose=True)
46
+ loader = DataLoader(
47
+ dataset,
48
+ batch_size=1,
49
+ shuffle=False,
50
+ collate_fn=dataset.collate_full_clips,
51
+ drop_last=False,
52
+ num_workers=config.num_loader_workers,
53
+ pin_memory=False)
54
+
55
+ # setup the model
56
+ model = setup_generator(config)
57
+ if args.use_cuda:
58
+ model.cuda()
59
+
60
+ # setup optimization parameters
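+ # each candidate schedule is beta_i = base_i * 10^e_i, with exponents e_i fixed
+ # on a log grid over [-6, -1]; the grid search below tries every combination of
+ # base values (len(base_values) ** num_iter schedules in total)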
61
+ base_values = sorted(10 * np.random.uniform(size=args.search_depth))
62
+ print(f" > Base values: {base_values}")
63
+ exponents = 10 ** np.linspace(-6, -1, num=args.num_iter)
64
+ best_error = float('inf')
65
+ best_schedule = None
66
+ total_search_iter = len(base_values)**args.num_iter
67
+ for base in tqdm(cartesian_product(base_values, repeat=args.num_iter), total=total_search_iter):
68
+ beta = exponents * base
69
+ model.compute_noise_level(beta)
70
+ for data in loader:
71
+ mel, audio = data
72
+ y_hat = model.inference(mel.cuda() if args.use_cuda else mel)
73
+
74
+ if args.use_cuda:
75
+ y_hat = y_hat.cpu()
76
+ y_hat = y_hat.numpy()
77
+
78
+ mel_hat = []
79
+ for i in range(y_hat.shape[0]):
80
+ m = ap.melspectrogram(y_hat[i, 0])[:, :-1]
81
+ mel_hat.append(torch.from_numpy(m))
82
+
83
+ mel_hat = torch.stack(mel_hat)
84
+ mse = torch.sum((mel - mel_hat) ** 2).mean()
85
+ if mse.item() < best_error:
86
+ best_error = mse.item()
87
+ best_schedule = {'beta': beta}
88
+ print(f" > Found a better schedule. - MSE: {mse.item()}")
89
+ np.save(args.output_path, best_schedule)
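+ # np.save pickles the dict, so load it back with
+ # np.load(path, allow_pickle=True).item()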
90
+
91
+
TTS/server/README.md ADDED
@@ -0,0 +1,65 @@
1
+ ## TTS example web-server
2
+
3
+ You'll need a model package (a zip file that includes the TTS Python wheel, model files, server configuration, and optional nginx/uwsgi configs). Publicly available models are listed [here](https://github.com/mozilla/TTS/wiki/Released-Models#simple-packaging---self-contained-package-that-runs-an-http-api-for-a-pre-trained-tts-model).
4
+
5
+ The instructions below are based on an Ubuntu 18.04 machine, but it should be simple to adapt the package names to other distros if needed. Python 3.6 is recommended, as some of the dependencies' versions predate Python 3.7 and would force building from source, which requires extra dependencies and is not guaranteed to work.
6
+
7
+ #### Development server:
8
+
9
+ ##### Using server.py
10
+ If you already have a working TTS environment, you can call ```server.py``` directly.
11
+
12
+ **Note:** After installing TTS as a package you can use ```tts-server``` to call the commands below.
13
+
14
+ Example runs:
15
+
16
+ List officially released models.
17
+ ```python TTS/server/server.py --list_models ```
18
+
19
+ Run the server with the official models.
20
+ ```python TTS/server/server.py --model_name tts_models/en/ljspeech/tacotron2-DCA --vocoder_name vocoder_models/en/ljspeech/mulitband-melgan```
21
+
22
+ Run the server with the official models on a GPU.
23
+ ```CUDA_VISIBLE_DEVICES="0" python TTS/server/server.py --model_name tts_models/en/ljspeech/tacotron2-DCA --vocoder_name vocoder_models/en/ljspeech/mulitband-melgan --use_cuda True```
24
+
25
+ Run the server with custom models.
26
+ ```python TTS/server/server.py --tts_checkpoint /path/to/tts/model.pth.tar --tts_config /path/to/tts/config.json --vocoder_checkpoint /path/to/vocoder/model.pth.tar --vocoder_config /path/to/vocoder/config.json```
27
+
28
+ ##### Using .whl
29
+ 1. apt-get install -y espeak libsndfile1 python3-venv
30
+ 2. python3 -m venv /tmp/venv
31
+ 3. source /tmp/venv/bin/activate
32
+ 4. pip install -U pip setuptools wheel
33
+ 5. pip install -U https://example.com/url/to/python/package.whl
34
+ 6. python -m TTS.server.server
35
+
36
+ You can now open http://localhost:5002 in a browser
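+ 
+ As a quick check you can also call the HTTP API directly. Below is a minimal Python sketch using the `requests` package (an extra dependency, not shipped with the server), assuming the server is running locally on the default port:
+ 
+ ```python
+ import requests
+ 
+ # /api/tts accepts a GET request with the input text as a query parameter
+ resp = requests.get("http://localhost:5002/api/tts", params={"text": "Hello, world!"})
+ resp.raise_for_status()
+ 
+ # the response body is a WAV file
+ with open("hello.wav", "wb") as f:
+     f.write(resp.content)
+ ```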
37
+
38
+ #### Running with nginx/uwsgi:
39
+
40
+ **Note:** This method uses an old TTS model, so quality might be low.
41
+
42
+ 1. apt-get install -y uwsgi uwsgi-plugin-python3 nginx espeak libsndfile1 python3-venv
43
+ 2. python3 -m venv /tmp/venv
44
+ 3. source /tmp/venv/bin/activate
45
+ 4. pip install -U pip setuptools wheel
46
+ 5. pip install -U https://example.com/url/to/python/package.whl
47
+ 6. curl -LO https://github.com/reuben/TTS/releases/download/t2-ljspeech-mold/t2-ljspeech-mold-nginx-uwsgi.zip
48
+ 7. unzip *-nginx-uwsgi.zip
49
+ 8. cp tts_site_nginx /etc/nginx/sites-enabled/default
50
+ 9. service nginx restart
51
+ 10. uwsgi --ini uwsgi.ini
52
+
53
+ You can now open http://localhost:80 in a browser (edit the port in /etc/nginx/sites-enabled/tts_site_nginx).
54
+ Configure the number of workers (the number of requests processed in parallel) by editing the `uwsgi.ini` file, specifically the `processes` setting.
55
+
56
+ #### Creating a server package with an embedded model
57
+
58
+ [setup.py](../setup.py) was extended with two new parameters when running the `bdist_wheel` command:
59
+
60
+ - `--checkpoint <path to checkpoint file>` - path to model checkpoint file you want to embed in the package
61
+ - `--model_config <path to config.json file>` - path to corresponding config.json file for the checkpoint
62
+
63
+ To create a package, run `python setup.py bdist_wheel --checkpoint /path/to/checkpoint --model_config /path/to/config.json`.
64
+
65
+ A Python `.whl` file will be created in the `dist/` folder with the checkpoint and config embedded in it.
TTS/server/__init__.py ADDED
File without changes
TTS/server/conf.json ADDED
@@ -0,0 +1,12 @@
1
+ {
2
+ "tts_path":"/media/erogol/data_ssd/Models/libri_tts/5049/", // tts model root folder
3
+ "tts_file":"best_model.pth.tar", // tts checkpoint file
4
+ "tts_config":"config.json", // tts config.json file
5
+ "tts_speakers": null, // json file listing speaker ids. null if no speaker embedding.
6
+ "vocoder_config":null,
7
+ "vocoder_file": null,
8
+ "is_wavernn_batched":true,
9
+ "port": 5002,
10
+ "use_cuda": true,
11
+ "debug": true
12
+ }
TTS/server/server.py ADDED
@@ -0,0 +1,116 @@
1
+ #!flask/bin/python
2
+ import argparse
3
+ import os
4
+ import sys
5
+ import io
6
+ from pathlib import Path
7
+
8
+ from flask import Flask, render_template, request, send_file
9
+ from TTS.utils.synthesizer import Synthesizer
10
+ from TTS.utils.manage import ModelManager
11
+ from TTS.utils.io import load_config
12
+
13
+
14
+ def create_argparser():
15
+ def convert_boolean(x):
16
+ return x.lower() in ['true', '1', 'yes']
17
+
18
+ parser = argparse.ArgumentParser()
19
+ parser.add_argument('--list_models', type=convert_boolean, nargs='?', const=True, default=False, help='list available pre-trained tts and vocoder models.')
20
+ parser.add_argument('--model_name', type=str, help='name of one of the released tts models.')
21
+ parser.add_argument('--vocoder_name', type=str, help='name of one of the released vocoder models.')
22
+ parser.add_argument('--tts_checkpoint', type=str, help='path to custom tts checkpoint file')
23
+ parser.add_argument('--tts_config', type=str, help='path to custom tts config.json file')
24
+ parser.add_argument('--tts_speakers', type=str, help='path to JSON file containing speaker ids, if speaker ids are used in the model')
25
+ parser.add_argument('--vocoder_config', type=str, default=None, help='path to vocoder config file.')
26
+ parser.add_argument('--vocoder_checkpoint', type=str, default=None, help='path to vocoder checkpoint file.')
27
+ parser.add_argument('--port', type=int, default=5002, help='port to listen on.')
28
+ parser.add_argument('--use_cuda', type=convert_boolean, default=False, help='true to use CUDA.')
29
+ parser.add_argument('--debug', type=convert_boolean, default=False, help='true to enable Flask debug mode.')
30
+ parser.add_argument('--show_details', type=convert_boolean, default=False, help='Generate model detail page.')
31
+ return parser
32
+
33
+ synthesizer = None
34
+
35
+ embedded_models_folder = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'model')
36
+
37
+ embedded_tts_folder = os.path.join(embedded_models_folder, 'tts')
38
+ tts_checkpoint_file = os.path.join(embedded_tts_folder, 'checkpoint.pth.tar')
39
+ tts_config_file = os.path.join(embedded_tts_folder, 'config.json')
40
+
41
+ embedded_vocoder_folder = os.path.join(embedded_models_folder, 'vocoder')
42
+ vocoder_checkpoint_file = os.path.join(embedded_vocoder_folder, 'checkpoint.pth.tar')
43
+ vocoder_config_file = os.path.join(embedded_vocoder_folder, 'config.json')
44
+
45
+ # These models are soon to be deprecated
46
+ embedded_wavernn_folder = os.path.join(embedded_models_folder, 'wavernn')
47
+ wavernn_checkpoint_file = os.path.join(embedded_wavernn_folder, 'checkpoint.pth.tar')
48
+ wavernn_config_file = os.path.join(embedded_wavernn_folder, 'config.json')
49
+
50
+ args = create_argparser().parse_args()
51
+
52
+ path = Path(__file__).parent / "../.models.json"
53
+ manager = ModelManager(path)
54
+
55
+ if args.list_models:
56
+ manager.list_models()
57
+ sys.exit()
58
+
59
+ # set models by the released models
60
+ if args.model_name is not None:
61
+ tts_checkpoint_file, tts_config_file = manager.download_model(args.model_name)
62
+
63
+ if args.vocoder_name is not None:
64
+ vocoder_checkpoint_file, vocoder_config_file = manager.download_model(args.vocoder_name)
65
+
66
+ # If these were not specified in the CLI args, use default values with embedded model files
67
+ if not args.tts_checkpoint and os.path.isfile(tts_checkpoint_file):
68
+ args.tts_checkpoint = tts_checkpoint_file
69
+ if not args.tts_config and os.path.isfile(tts_config_file):
70
+ args.tts_config = tts_config_file
71
+
72
+ if not args.vocoder_checkpoint and os.path.isfile(vocoder_checkpoint_file):
73
+ args.vocoder_checkpoint = vocoder_checkpoint_file
74
+ if not args.vocoder_config and os.path.isfile(vocoder_config_file):
75
+ args.vocoder_config = vocoder_config_file
76
+
77
+ synthesizer = Synthesizer(args.tts_checkpoint, args.tts_config, args.vocoder_checkpoint, args.vocoder_config, args.use_cuda)
78
+
79
+ app = Flask(__name__)
80
+
81
+
82
+ @app.route('/')
83
+ def index():
84
+ return render_template('index.html', show_details=args.show_details)
85
+
86
+ @app.route('/details')
87
+ def details():
88
+ model_config = load_config(args.tts_config)
89
+ if args.vocoder_config is not None and os.path.isfile(args.vocoder_config):
90
+ vocoder_config = load_config(args.vocoder_config)
91
+ else:
92
+ vocoder_config = None
93
+
94
+ return render_template('details.html',
95
+ show_details=args.show_details,
96
+ model_config=model_config,
97
+ vocoder_config=vocoder_config,
98
+ args=args.__dict__
99
+ )
100
+
101
+ @app.route('/api/tts', methods=['GET'])
102
+ def tts():
103
+ text = request.args.get('text')
104
+ print(" > Model input: {}".format(text))
105
+ wavs = synthesizer.tts(text)
106
+ out = io.BytesIO()
107
+ synthesizer.save_wav(wavs, out)
108
+ return send_file(out, mimetype='audio/wav')
109
+
110
+
111
+ def main():
112
+ app.run(debug=args.debug, host='0.0.0.0', port=args.port)
113
+
114
+
115
+ if __name__ == '__main__':
116
+ main()
TTS/server/static/TTS_circle.png ADDED
TTS/server/templates/details.html ADDED
@@ -0,0 +1,131 @@
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+
4
+ <head>
5
+
6
+ <meta charset="utf-8">
7
+ <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
8
+ <meta name="description" content="">
9
+ <meta name="author" content="">
10
+
11
+ <title>TTS engine</title>
12
+
13
+ <!-- Bootstrap core CSS -->
14
+ <link href="https://stackpath.bootstrapcdn.com/bootstrap/4.1.1/css/bootstrap.min.css"
15
+ integrity="sha384-WskhaSGFgHYWDcbwN70/dfYBj47jz9qbsMId/iRN3ewGhXQFZCSftd1LZCfmhktB" crossorigin="anonymous"
16
+ rel="stylesheet">
17
+
18
+ <!-- Custom styles for this template -->
19
+ <style>
20
+ body {
21
+ padding-top: 54px;
22
+ }
23
+
24
+ @media (min-width: 992px) {
25
+ body {
26
+ padding-top: 56px;
27
+ }
28
+ }
29
+ </style>
30
+ </head>
31
+
32
+ <body>
33
+ <a href="https://github.com/mozilla/TTS"><img style="position: absolute; z-index:1000; top: 0; left: 0; border: 0;"
34
+ src="https://s3.amazonaws.com/github/ribbons/forkme_left_darkblue_121621.png" alt="Fork me on GitHub"></a>
35
+
36
+ {% if show_details == true %}
37
+
38
+ <div class="container">
39
+ <b>Model details</b>
40
+ </div>
41
+
42
+ <div class="container">
43
+ <details>
44
+ <summary>CLI arguments:</summary>
45
+ <table border="1" align="center" width="75%">
46
+ <tr>
47
+ <td> CLI key </td>
48
+ <td> Value </td>
49
+ </tr>
50
+
51
+ {% for key, value in args.items() %}
52
+
53
+ <tr>
54
+ <td>{{ key }}</td>
55
+ <td>{{ value }}</td>
56
+ </tr>
57
+
58
+ {% endfor %}
59
+ </table>
60
+ </details>
61
+ </div><br/>
62
+
63
+ <div class="container">
64
+
65
+ {% if model_config != None %}
66
+
67
+ <details>
68
+ <summary>Model config:</summary>
69
+
70
+ <table border="1" align="center" width="75%">
71
+ <tr>
72
+ <td> Key </td>
73
+ <td> Value </td>
74
+ </tr>
75
+
76
+
77
+ {% for key, value in model_config.items() %}
78
+
79
+ <tr>
80
+ <td>{{ key }}</td>
81
+ <td>{{ value }}</td>
82
+ </tr>
83
+
84
+ {% endfor %}
85
+
86
+ </table>
87
+ </details>
88
+
89
+ {% endif %}
90
+
91
+ </div><br/>
92
+
93
+
94
+
95
+ <div class="container">
96
+ {% if vocoder_config != None %}
97
+ <details>
98
+ <summary>Vocoder model config:</summary>
99
+
100
+ <table border="1" align="center" width="75%">
101
+ <tr>
102
+ <td> Key </td>
103
+ <td> Value </td>
104
+ </tr>
105
+
106
+
107
+ {% for key, value in vocoder_config.items() %}
108
+
109
+ <tr>
110
+ <td>{{ key }}</td>
111
+ <td>{{ value }}</td>
112
+ </tr>
113
+
114
+ {% endfor %}
115
+
116
+
117
+ </table>
118
+ </details>
119
+ {% endif %}
120
+ </div><br/>
121
+
122
+ {% else %}
123
+ <div class="container">
124
+ <b>Please start the server with --show_details=true to see details.</b>
125
+ </div>
126
+
127
+ {% endif %}
128
+
129
+ </body>
130
+
131
+ </html>
TTS/server/templates/index.html ADDED
@@ -0,0 +1,114 @@
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+
+ <head>
+
+     <meta charset="utf-8">
+     <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
+     <meta name="description" content="">
+     <meta name="author" content="">
+
+     <title>TTS engine</title>
+
+     <!-- Bootstrap core CSS -->
+     <link href="https://stackpath.bootstrapcdn.com/bootstrap/4.1.1/css/bootstrap.min.css"
+         integrity="sha384-WskhaSGFgHYWDcbwN70/dfYBj47jz9qbsMId/iRN3ewGhXQFZCSftd1LZCfmhktB" crossorigin="anonymous" rel="stylesheet">
+
+     <!-- Custom styles for this template -->
+     <style>
+         body {
+             padding-top: 54px;
+         }
+         @media (min-width: 992px) {
+             body {
+                 padding-top: 56px;
+             }
+         }
+
+     </style>
+ </head>
+
+ <body>
+     <a href="https://github.com/mozilla/TTS"><img style="position: absolute; z-index:1000; top: 0; left: 0; border: 0;" src="https://s3.amazonaws.com/github/ribbons/forkme_left_darkblue_121621.png" alt="Fork me on GitHub"></a>
+
+     <!-- Navigation -->
+     <!--
+     <nav class="navbar navbar-expand-lg navbar-dark bg-dark fixed-top">
+         <div class="container">
+             <a class="navbar-brand" href="#">Mozilla TTS</a>
+             <button class="navbar-toggler" type="button" data-toggle="collapse" data-target="#navbarResponsive" aria-controls="navbarResponsive" aria-expanded="false" aria-label="Toggle navigation">
+                 <span class="navbar-toggler-icon"></span>
+             </button>
+             <div class="collapse navbar-collapse" id="navbarResponsive">
+                 <ul class="navbar-nav ml-auto">
+                     <li class="nav-item active">
+                         <a class="nav-link" href="#">Home
+                             <span class="sr-only">(current)</span>
+                         </a>
+                     </li>
+                 </ul>
+             </div>
+         </div>
+     </nav>
+     -->
+
+     <!-- Page Content -->
+     <div class="container">
+         <div class="row">
+             <div class="col-lg-12 text-center">
+                 <img class="mt-5" src="{{url_for('static', filename='TTS_circle.png')}}" align="middle" />
+
+                 <ul class="list-unstyled">
+                 </ul>
+                 <input id="text" placeholder="Type here..." size="45" type="text" name="text">
+                 <button id="speak-button" name="speak">Speak</button><br/><br/>
+                 {%if show_details%}
+                 <button id="details-button" onclick="location.href = 'details'" name="model-details">Model Details</button><br/><br/>
+                 {%endif%}
+                 <audio id="audio" controls autoplay hidden></audio>
+                 <p id="message"></p>
+             </div>
+         </div>
+     </div>
+
+     <!-- Bootstrap core JavaScript -->
+     <script>
+         function q(selector) {return document.querySelector(selector)}
+         q('#text').focus()
+         function do_tts(e) {
+             var text = q('#text').value
+             if (text) {
+                 q('#message').textContent = 'Synthesizing...'
+                 q('#speak-button').disabled = true
+                 q('#audio').hidden = true
+                 synthesize(text)
+             }
+             e.preventDefault()
+             return false
+         }
+         q('#speak-button').addEventListener('click', do_tts)
+         q('#text').addEventListener('keyup', function(e) {
+             if (e.keyCode == 13) { // the Enter key triggers synthesis
+                 do_tts(e)
+             }
+         })
+         function synthesize(text) {
+             fetch('/api/tts?text=' + encodeURIComponent(text), {cache: 'no-cache'})
+                 .then(function(res) {
+                     if (!res.ok) throw Error(res.statusText)
+                     return res.blob()
+                 }).then(function(blob) {
+                     q('#message').textContent = ''
+                     q('#speak-button').disabled = false
+                     q('#audio').src = URL.createObjectURL(blob)
+                     q('#audio').hidden = false
+                 }).catch(function(err) {
+                     q('#message').textContent = 'Error: ' + err.message
+                     q('#speak-button').disabled = false
+                 })
+         }
+     </script>
+
+ </body>
+
+ </html>
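
The `synthesize()` function above drives the server's `/api/tts` endpoint, which returns the rendered speech as a wav blob. Below is a minimal sketch of calling the same endpoint outside the browser; it assumes the demo server from `TTS/server/server.py` is running locally, and the host/port and output path are placeholders to adjust for your setup.

```python
# A sketch, not part of the repo: query the demo server's /api/tts endpoint.
import urllib.parse
import urllib.request

def tts_request(text, host="http://localhost:5002"):  # host/port are assumptions
    """Fetch synthesized speech for `text` and return the raw wav bytes."""
    url = host + "/api/tts?text=" + urllib.parse.quote(text)
    with urllib.request.urlopen(url) as response:
        return response.read()

if __name__ == "__main__":
    wav_bytes = tts_request("Hello from the TTS server.")
    with open("out.wav", "wb") as f:  # placeholder output path
        f.write(wav_bytes)
```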
TTS/speaker_encoder/README.md ADDED
@@ -0,0 +1,18 @@
+ ### Speaker Encoder
+
+ This is an implementation of https://arxiv.org/abs/1710.10467. The model computes speaker embeddings (d-vectors).
+
+ With the code here you can generate d-vectors for both multi-speaker and single-speaker TTS datasets, then visualise and explore them along with the associated audio files in an interactive chart.
+
+ Below is an example showing embedding results for various speakers. You can generate the same plot with the provided notebook, as demonstrated in [this video](https://youtu.be/KW3oO7JVa7Q).
+
+ ![](umap.png)
+
+ Download a pretrained model from the [Released Models](https://github.com/mozilla/TTS/wiki/Released-Models) page.
+
+ To run the code, follow the same flow as in TTS:
+
+ - Define 'config.json' for your needs. Note that the audio parameters should match those of your TTS model.
+ - Example training call: ```python speaker_encoder/train.py --config_path speaker_encoder/config.json --data_path ~/Data/Libri-TTS/train-clean-360```
+ - Generate embedding vectors: ```python speaker_encoder/compute_embeddings.py --use_cuda true /model/path/best_model.pth.tar model/config/path/config.json dataset/path/ output_path```. This parses all .wav files under the given dataset path and recreates the same folder structure under the output path with the generated embedding files.
+ - Watch training on Tensorboard as in TTS.
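
For completeness, here is a minimal sketch of computing a single d-vector in Python rather than through `compute_embeddings.py`. The checkpoint and file paths are placeholders, and the `load_config`/`AudioProcessor` import paths and the `"model"` checkpoint key are assumptions based on how the rest of this repo loads models.

```python
# A sketch, not the official API: compute one d-vector for a wav file.
import torch

from TTS.speaker_encoder.model import SpeakerEncoder
from TTS.utils.audio import AudioProcessor  # assumed module path
from TTS.utils.io import load_config        # assumed module path

config = load_config("speaker_encoder/config.json")  # placeholder path
ap = AudioProcessor(**config.audio)

model = SpeakerEncoder(**config.model)
checkpoint = torch.load("best_model.pth.tar", map_location="cpu")  # placeholder
model.load_state_dict(checkpoint["model"])  # assumed checkpoint layout
model.eval()

wav = ap.load_wav("sample.wav", sr=ap.sample_rate)  # placeholder path
mel = ap.melspectrogram(wav)                        # (num_mels, T)
mel = torch.FloatTensor(mel.T).unsqueeze(0)         # (1, T, num_mels)
embedding = model.compute_embedding(mel)            # (1, proj_dim) d-vector
```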
TTS/speaker_encoder/__init__.py ADDED
File without changes
TTS/speaker_encoder/config.json ADDED
@@ -0,0 +1,103 @@
+
+ {
+     "run_name": "mueller91",
+     "run_description": "train speaker encoder with voxceleb1, voxceleb2 and libriSpeech",
+     "audio":{
+         // Audio processing parameters
+         "num_mels": 40,          // size of the mel spec frame.
+         "fft_size": 400,         // number of stft frequency levels. Size of the linear spectrogram frame.
+         "sample_rate": 16000,    // DATASET-RELATED: wav sample rate. If different from the original data, it is resampled.
+         "win_length": 400,       // stft window length in samples (25 ms at 16 kHz).
+         "hop_length": 160,       // stft window hop length in samples (10 ms at 16 kHz).
+         "frame_length_ms": null, // stft window length in ms. If null, 'win_length' is used.
+         "frame_shift_ms": null,  // stft window hop length in ms. If null, 'hop_length' is used.
+         "preemphasis": 0.98,     // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no pre-emphasis.
+         "min_level_db": -100,    // normalization range
+         "ref_level_db": 20,      // reference level db; theoretically 20 db is the sound of air.
+         "power": 1.5,            // value to sharpen wav signals after the GL algorithm.
+         "griffin_lim_iters": 60, // number of griffin-lim iterations. 30-60 is a good range. The larger the value, the slower the generation.
+         // Normalization parameters
+         "signal_norm": true,     // normalize the spec values to the range [0, 1]
+         "symmetric_norm": true,  // move normalization to the range [-1, 1]
+         "max_norm": 4.0,         // scale normalization to the range [-max_norm, max_norm] or [0, max_norm]
+         "clip_norm": true,       // clip normalized values into the range.
+         "mel_fmin": 0.0,         // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for your dataset!!
+         "mel_fmax": 8000.0,      // maximum freq level for mel-spec. Tune for your dataset!!
+         "do_trim_silence": true, // enable trimming of silence as audio is loaded. LJSpeech (false), TWEB (false), Nancy (true)
+         "trim_db": 60            // threshold for trimming silence. Set this according to your dataset.
+     },
+     "reinit_layers": [],
+     "loss": "angleproto",          // "ge2e" to use Generalized End-to-End loss, "angleproto" to use Angular Prototypical loss (new SOTA)
+     "grad_clip": 3.0,              // upper limit for gradient clipping.
+     "epochs": 1000,                // total number of epochs to train.
+     "lr": 0.0001,                  // initial learning rate. If Noam decay is active, maximum learning rate.
+     "lr_decay": false,             // if true, Noam learning rate decay is applied through training.
+     "warmup_steps": 4000,          // Noam decay steps to increase the learning rate from 0 to "lr"
+     "tb_model_param_stats": false, // if true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
+     "steps_plot_stats": 10,        // number of steps between embedding plots.
+     "num_speakers_in_batch": 64,   // batch size for training: the number of speakers per batch.
+     "num_utters_per_speaker": 10,  // number of utterances sampled per speaker.
+     "num_loader_workers": 8,       // number of training data loader processes. Don't set it too big. 4-8 are good values.
+     "wd": 0.000001,                // weight decay weight.
+     "checkpoint": true,            // if true, saves checkpoints every "save_step"
+     "save_step": 1000,             // number of training steps between saving training stats and checkpoints.
+     "print_step": 20,              // number of steps between logging training on the console.
+     "output_path": "../../MozillaTTSOutput/checkpoints/voxceleb_librispeech/speaker_encoder/", // DATASET-RELATED: output path for all training outputs.
+     "model": {
+         "input_dim": 40,
+         "proj_dim": 256,
+         "lstm_dim": 768,
+         "num_lstm_layers": 3,
+         "use_lstm_with_projection": true
+     },
+     "storage": {
+         "sample_from_storage_p": 0.66, // the probability of sampling a batch from the dataset's in-memory storage
+         "storage_size": 15,            // the size of the in-memory storage with respect to a single batch
+         "additive_noise": 1e-5         // add very small gaussian noise to the data in order to increase robustness
+     },
+     "datasets":
+         [
+             {
+                 "name": "vctk_slim",
+                 "path": "../../../audio-datasets/en/VCTK-Corpus/",
+                 "meta_file_train": null,
+                 "meta_file_val": null
+             },
+             {
+                 "name": "libri_tts",
+                 "path": "../../../audio-datasets/en/LibriTTS/train-clean-100",
+                 "meta_file_train": null,
+                 "meta_file_val": null
+             },
+             {
+                 "name": "libri_tts",
+                 "path": "../../../audio-datasets/en/LibriTTS/train-clean-360",
+                 "meta_file_train": null,
+                 "meta_file_val": null
+             },
+             {
+                 "name": "libri_tts",
+                 "path": "../../../audio-datasets/en/LibriTTS/train-other-500",
+                 "meta_file_train": null,
+                 "meta_file_val": null
+             },
+             {
+                 "name": "voxceleb1",
+                 "path": "../../../audio-datasets/en/voxceleb1/",
+                 "meta_file_train": null,
+                 "meta_file_val": null
+             },
+             {
+                 "name": "voxceleb2",
+                 "path": "../../../audio-datasets/en/voxceleb2/",
+                 "meta_file_train": null,
+                 "meta_file_val": null
+             },
+             {
+                 "name": "common_voice",
+                 "path": "../../../audio-datasets/en/MozillaCommonVoice",
+                 "meta_file_train": "train.tsv",
+                 "meta_file_val": "test.tsv"
+             }
+         ]
+ }
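
Note that `win_length` and `hop_length` above are given in samples, not milliseconds. At the configured 16 kHz sample rate they give the standard 25 ms window / 10 ms hop for speaker-verification features, and the 10 ms hop is also what makes `num_frames=160` in `model.py` correspond to the 1.6 s `voice_len` used by the dataset. A quick check of the arithmetic:

```python
# Frame timing implied by the audio settings above.
sample_rate = 16000  # Hz
win_length = 400     # samples
hop_length = 160     # samples

print(1000 * win_length / sample_rate)       # 25.0 ms analysis window
print(1000 * hop_length / sample_rate)       # 10.0 ms hop
print(sample_rate / hop_length)              # 100 mel frames per second
print(int(1.6 * sample_rate / hop_length))   # 160 frames ~= one 1.6 s segment
```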
TTS/speaker_encoder/dataset.py ADDED
@@ -0,0 +1,169 @@
+ import queue
+ import random
+
+ import numpy as np
+ import torch
+ from torch.utils.data import Dataset
+
+
+ class MyDataset(Dataset):
+     def __init__(self, ap, meta_data, voice_len=1.6, num_speakers_in_batch=64,
+                  storage_size=1, sample_from_storage_p=0.5, additive_noise=0,
+                  num_utter_per_speaker=10, skip_speakers=False, verbose=False):
+         """
+         Args:
+             ap (TTS.utils.audio.AudioProcessor): audio processor object.
+             meta_data (list): list of dataset instances.
+             voice_len (float): voice segment length in seconds.
+             verbose (bool): print diagnostic information.
+         """
+         self.items = meta_data
+         self.sample_rate = ap.sample_rate
+         self.voice_len = voice_len
+         self.seq_len = int(voice_len * self.sample_rate)
+         self.num_speakers_in_batch = num_speakers_in_batch
+         self.num_utter_per_speaker = num_utter_per_speaker
+         self.skip_speakers = skip_speakers
+         self.ap = ap
+         self.verbose = verbose
+         self.__parse_items()
+         self.storage = queue.Queue(maxsize=storage_size * num_speakers_in_batch)
+         self.sample_from_storage_p = float(sample_from_storage_p)
+         self.additive_noise = float(additive_noise)
+         if self.verbose:
+             print("\n > DataLoader initialization")
+             print(f" | > Speakers per Batch: {num_speakers_in_batch}")
+             print(f" | > Storage Size: {self.storage.maxsize} speakers, each with {num_utter_per_speaker} utters")
+             print(f" | > Sample_from_storage_p : {self.sample_from_storage_p}")
+             print(f" | > Noise added : {self.additive_noise}")
+             print(f" | > Number of instances : {len(self.items)}")
+             print(f" | > Sequence length: {self.seq_len}")
+             print(f" | > Num speakers: {len(self.speakers)}")
+
+     def load_wav(self, filename):
+         audio = self.ap.load_wav(filename, sr=self.ap.sample_rate)
+         return audio
+
+     def load_data(self, idx):
+         text, wav_file, speaker_name = self.items[idx]
+         wav = np.asarray(self.load_wav(wav_file), dtype=np.float32)
+         mel = self.ap.melspectrogram(wav).astype("float32")
+         # sample seq_len
+
+         assert len(text) > 0, self.items[idx][1]
+         assert wav.size > 0, self.items[idx][1]
+
+         sample = {
+             "mel": mel,
+             "item_idx": self.items[idx][1],
+             "speaker_name": speaker_name,
+         }
+         return sample
+
+     def __parse_items(self):
+         self.speaker_to_utters = {}
+         for i in self.items:
+             path_ = i[1]
+             speaker_ = i[2]
+             if speaker_ in self.speaker_to_utters.keys():
+                 self.speaker_to_utters[speaker_].append(path_)
+             else:
+                 self.speaker_to_utters[speaker_] = [path_, ]
+
+         if self.skip_speakers:
+             self.speaker_to_utters = {k: v for (k, v) in self.speaker_to_utters.items()
+                                       if len(v) >= self.num_utter_per_speaker}
+
+         self.speakers = [k for (k, v) in self.speaker_to_utters.items()]
+
+     # def __parse_items(self):
+     #     """
+     #     Find unique speaker ids and create a dict mapping utterances from speaker id
+     #     """
+     #     speakers = list({item[-1] for item in self.items})
+     #     self.speaker_to_utters = {}
+     #     self.speakers = []
+     #     for speaker in speakers:
+     #         speaker_utters = [item[1] for item in self.items if item[2] == speaker]
+     #         if len(speaker_utters) < self.num_utter_per_speaker and self.skip_speakers:
+     #             print(
+     #                 f" [!] Skipped speaker {speaker}. Not enough utterances {self.num_utter_per_speaker} vs {len(speaker_utters)}."
+     #             )
+     #         else:
+     #             self.speakers.append(speaker)
+     #             self.speaker_to_utters[speaker] = speaker_utters
+
+     def __len__(self):
+         return int(1e10)
+
+     def __sample_speaker(self):
+         speaker = random.sample(self.speakers, 1)[0]
+         if self.num_utter_per_speaker > len(self.speaker_to_utters[speaker]):
+             utters = random.choices(
+                 self.speaker_to_utters[speaker], k=self.num_utter_per_speaker
+             )
+         else:
+             utters = random.sample(
+                 self.speaker_to_utters[speaker], self.num_utter_per_speaker
+             )
+         return speaker, utters
+
+     def __sample_speaker_utterances(self, speaker):
+         """
+         Sample all M utterances for the given speaker.
+         """
+         wavs = []
+         labels = []
+         for _ in range(self.num_utter_per_speaker):
+             # TODO: dummy but works
+             while True:
+                 if len(self.speaker_to_utters[speaker]) > 0:
+                     utter = random.sample(self.speaker_to_utters[speaker], 1)[0]
+                 else:
+                     self.speakers.remove(speaker)
+                     speaker, _ = self.__sample_speaker()
+                     continue
+                 wav = self.load_wav(utter)
+                 if wav.shape[0] - self.seq_len > 0:
+                     break
+                 self.speaker_to_utters[speaker].remove(utter)
+
+             wavs.append(wav)
+             labels.append(speaker)
+         return wavs, labels
+
+     def __getitem__(self, idx):
+         speaker, _ = self.__sample_speaker()
+         return speaker
+
+     def collate_fn(self, batch):
+         labels = []
+         feats = []
+         for speaker in batch:
+             if random.random() < self.sample_from_storage_p and self.storage.full():
+                 # sample from storage (if full), ignoring the speaker
+                 wavs_, labels_ = random.choice(self.storage.queue)
+             else:
+                 # don't sample from storage, but from disk
+                 wavs_, labels_ = self.__sample_speaker_utterances(speaker)
+                 # if storage is full, remove an item
+                 if self.storage.full():
+                     _ = self.storage.get_nowait()
+                 # put the newly loaded item into storage
+                 self.storage.put_nowait((wavs_, labels_))
+
+             # add random gaussian noise
+             if self.additive_noise > 0:
+                 noises_ = [np.random.normal(0, self.additive_noise, size=len(w)) for w in wavs_]
+                 wavs_ = [wavs_[i] + noises_[i] for i in range(len(wavs_))]
+
+             # take a random seq_len crop of each wav and convert to a mel spectrogram.
+             offsets_ = [random.randint(0, wav.shape[0] - self.seq_len) for wav in wavs_]
+             mels_ = [self.ap.melspectrogram(wavs_[i][offsets_[i]: offsets_[i] + self.seq_len]) for i in range(len(wavs_))]
+             feats_ = [torch.FloatTensor(mel) for mel in mels_]
+
+             labels.append(labels_)
+             feats.extend(feats_)
+         feats = torch.stack(feats)
+         return feats.transpose(1, 2), labels
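
Because `__getitem__` only draws a speaker id and all loading, storage sampling, and batching happen in `collate_fn`, the dataset has to be wired to a `DataLoader` with its own collate function. A minimal sketch, assuming `ap` (the audio processor) and `meta_data` (a list of `[text, wav_path, speaker]` items) are built as elsewhere in this repo:

```python
# A sketch of wiring MyDataset into a PyTorch DataLoader.
# `ap` and `meta_data` are assumed to come from the repo's loaders,
# as in TTS/bin/train_encoder.py.
from torch.utils.data import DataLoader

dataset = MyDataset(
    ap,
    meta_data,
    voice_len=1.6,                 # 1.6 s segments
    num_speakers_in_batch=64,      # N speakers per batch
    num_utter_per_speaker=10,      # M utterances per speaker
    skip_speakers=True,            # drop speakers with fewer than M utterances
    storage_size=15,
    sample_from_storage_p=0.66,
    additive_noise=1e-5,
    verbose=True,
)
loader = DataLoader(
    dataset,
    batch_size=64,                  # one "item" per speaker slot
    shuffle=False,                  # sampling is already random inside the dataset
    num_workers=8,
    collate_fn=dataset.collate_fn,  # builds the (N*M, T, num_mels) batch
)
feats, labels = next(iter(loader))  # feats: (N*M, T, num_mels)
```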
TTS/speaker_encoder/losses.py ADDED
@@ -0,0 +1,160 @@
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ import numpy as np
+
+
+ # adapted from https://github.com/cvqluu/GE2E-Loss
+ class GE2ELoss(nn.Module):
+     def __init__(self, init_w=10.0, init_b=-5.0, loss_method="softmax"):
+         """
+         Implementation of the Generalized End-to-End loss defined in https://arxiv.org/abs/1710.10467 [1]
+         Accepts an input of size (N, M, D)
+             where N is the number of speakers in the batch,
+             M is the number of utterances per speaker,
+             and D is the dimensionality of the embedding vector (e.g. d-vector)
+         Args:
+             - init_w (float): defines the initial value of w in Equation (5) of [1]
+             - init_b (float): defines the initial value of b in Equation (5) of [1]
+         """
+         super(GE2ELoss, self).__init__()
+         # pylint: disable=E1102
+         self.w = nn.Parameter(torch.tensor(init_w))
+         # pylint: disable=E1102
+         self.b = nn.Parameter(torch.tensor(init_b))
+         self.loss_method = loss_method
+
+         print(' > Initialised Generalized End-to-End loss')
+
+         assert self.loss_method in ["softmax", "contrast"]
+
+         if self.loss_method == "softmax":
+             self.embed_loss = self.embed_loss_softmax
+         if self.loss_method == "contrast":
+             self.embed_loss = self.embed_loss_contrast
+
+     # pylint: disable=R0201
+     def calc_new_centroids(self, dvecs, centroids, spkr, utt):
+         """
+         Calculates the new centroids excluding the reference utterance
+         """
+         excl = torch.cat((dvecs[spkr, :utt], dvecs[spkr, utt + 1:]))
+         excl = torch.mean(excl, 0)
+         new_centroids = []
+         for i, centroid in enumerate(centroids):
+             if i == spkr:
+                 new_centroids.append(excl)
+             else:
+                 new_centroids.append(centroid)
+         return torch.stack(new_centroids)
+
+     def calc_cosine_sim(self, dvecs, centroids):
+         """
+         Make the cosine similarity matrix with dims (N, M, N)
+         """
+         cos_sim_matrix = []
+         for spkr_idx, speaker in enumerate(dvecs):
+             cs_row = []
+             for utt_idx, utterance in enumerate(speaker):
+                 new_centroids = self.calc_new_centroids(
+                     dvecs, centroids, spkr_idx, utt_idx
+                 )
+                 # vector based cosine similarity for speed
+                 cs_row.append(
+                     torch.clamp(
+                         torch.mm(
+                             utterance.unsqueeze(1).transpose(0, 1),
+                             new_centroids.transpose(0, 1),
+                         )
+                         / (torch.norm(utterance) * torch.norm(new_centroids, dim=1)),
+                         1e-6,
+                     )
+                 )
+             cs_row = torch.cat(cs_row, dim=0)
+             cos_sim_matrix.append(cs_row)
+         return torch.stack(cos_sim_matrix)
+
+     # pylint: disable=R0201
+     def embed_loss_softmax(self, dvecs, cos_sim_matrix):
+         """
+         Calculates the loss on each embedding $L(e_{ji})$ by taking softmax
+         """
+         N, M, _ = dvecs.shape
+         L = []
+         for j in range(N):
+             L_row = []
+             for i in range(M):
+                 L_row.append(-F.log_softmax(cos_sim_matrix[j, i], 0)[j])
+             L_row = torch.stack(L_row)
+             L.append(L_row)
+         return torch.stack(L)
+
+     # pylint: disable=R0201
+     def embed_loss_contrast(self, dvecs, cos_sim_matrix):
+         """
+         Calculates the loss on each embedding $L(e_{ji})$ by contrast loss with closest centroid
+         """
+         N, M, _ = dvecs.shape
+         L = []
+         for j in range(N):
+             L_row = []
+             for i in range(M):
+                 centroids_sigmoids = torch.sigmoid(cos_sim_matrix[j, i])
+                 excl_centroids_sigmoids = torch.cat(
+                     (centroids_sigmoids[:j], centroids_sigmoids[j + 1:])
+                 )
+                 L_row.append(
+                     1.0
+                     - torch.sigmoid(cos_sim_matrix[j, i, j])
+                     + torch.max(excl_centroids_sigmoids)
+                 )
+             L_row = torch.stack(L_row)
+             L.append(L_row)
+         return torch.stack(L)
+
+     def forward(self, dvecs):
+         """
+         Calculates the GE2E loss for an input of dimensions (num_speakers, num_utts_per_speaker, dvec_feats)
+         """
+         centroids = torch.mean(dvecs, 1)
+         cos_sim_matrix = self.calc_cosine_sim(dvecs, centroids)
+         # clamp w in place to keep the similarity scale positive;
+         # a bare `torch.clamp(self.w, 1e-6)` discards its result and is a no-op
+         with torch.no_grad():
+             self.w.clamp_(min=1e-6)
+         cos_sim_matrix = self.w * cos_sim_matrix + self.b
+         L = self.embed_loss(dvecs, cos_sim_matrix)
+         return L.mean()
+
+
+ # adapted from https://github.com/clovaai/voxceleb_trainer/blob/master/loss/angleproto.py
+ class AngleProtoLoss(nn.Module):
+     """
+     Implementation of the Angular Prototypical loss defined in https://arxiv.org/abs/2003.11982
+     Accepts an input of size (N, M, D)
+         where N is the number of speakers in the batch,
+         M is the number of utterances per speaker,
+         and D is the dimensionality of the embedding vector
+     Args:
+         - init_w (float): defines the initial value of w
+         - init_b (float): defines the initial value of b
+     """
+     def __init__(self, init_w=10.0, init_b=-5.0):
+         super(AngleProtoLoss, self).__init__()
+         # pylint: disable=E1102
+         self.w = nn.Parameter(torch.tensor(init_w))
+         # pylint: disable=E1102
+         self.b = nn.Parameter(torch.tensor(init_b))
+         self.criterion = torch.nn.CrossEntropyLoss()
+
+         print(' > Initialised Angular Prototypical loss')
+
+     def forward(self, x):
+         """
+         Calculates the AngleProto loss for an input of dimensions (num_speakers, num_utts_per_speaker, dvec_feats)
+         """
+         out_anchor = torch.mean(x[:, 1:, :], 1)
+         out_positive = x[:, 0, :]
+         num_speakers = out_anchor.size()[0]
+
+         cos_sim_matrix = F.cosine_similarity(
+             out_positive.unsqueeze(-1).expand(-1, -1, num_speakers),
+             out_anchor.unsqueeze(-1).expand(-1, -1, num_speakers).transpose(0, 2))
+         # clamp w in place to keep the similarity scale positive (see GE2ELoss.forward)
+         with torch.no_grad():
+             self.w.clamp_(min=1e-6)
+         cos_sim_matrix = cos_sim_matrix * self.w + self.b
+         label = torch.from_numpy(np.asarray(range(0, num_speakers))).to(cos_sim_matrix.device)
+         L = self.criterion(cos_sim_matrix, label)
+         return L
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from torch import nn
3
+
4
+
5
+ class LSTMWithProjection(nn.Module):
6
+ def __init__(self, input_size, hidden_size, proj_size):
7
+ super().__init__()
8
+ self.input_size = input_size
9
+ self.hidden_size = hidden_size
10
+ self.proj_size = proj_size
11
+ self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
12
+ self.linear = nn.Linear(hidden_size, proj_size, bias=False)
13
+
14
+ def forward(self, x):
15
+ self.lstm.flatten_parameters()
16
+ o, (_, _) = self.lstm(x)
17
+ return self.linear(o)
18
+
19
+ class LSTMWithoutProjection(nn.Module):
20
+ def __init__(self, input_dim, lstm_dim, proj_dim, num_lstm_layers):
21
+ super().__init__()
22
+ self.lstm = nn.LSTM(input_size=input_dim,
23
+ hidden_size=lstm_dim,
24
+ num_layers=num_lstm_layers,
25
+ batch_first=True)
26
+ self.linear = nn.Linear(lstm_dim, proj_dim, bias=True)
27
+ self.relu = nn.ReLU()
28
+ def forward(self, x):
29
+ _, (hidden, _) = self.lstm(x)
30
+ return self.relu(self.linear(hidden[-1]))
31
+
32
+ class SpeakerEncoder(nn.Module):
33
+ def __init__(self, input_dim, proj_dim=256, lstm_dim=768, num_lstm_layers=3, use_lstm_with_projection=True):
34
+ super().__init__()
35
+ self.use_lstm_with_projection = use_lstm_with_projection
36
+ layers = []
37
+ # choise LSTM layer
38
+ if use_lstm_with_projection:
39
+ layers.append(LSTMWithProjection(input_dim, lstm_dim, proj_dim))
40
+ for _ in range(num_lstm_layers - 1):
41
+ layers.append(LSTMWithProjection(proj_dim, lstm_dim, proj_dim))
42
+ self.layers = nn.Sequential(*layers)
43
+ else:
44
+ self.layers = LSTMWithoutProjection(input_dim, lstm_dim, proj_dim, num_lstm_layers)
45
+
46
+ self._init_layers()
47
+
48
+ def _init_layers(self):
49
+ for name, param in self.layers.named_parameters():
50
+ if "bias" in name:
51
+ nn.init.constant_(param, 0.0)
52
+ elif "weight" in name:
53
+ nn.init.xavier_normal_(param)
54
+
55
+ def forward(self, x):
56
+ # TODO: implement state passing for lstms
57
+ d = self.layers(x)
58
+ if self.use_lstm_with_projection:
59
+ d = torch.nn.functional.normalize(d[:, -1], p=2, dim=1)
60
+ else:
61
+ d = torch.nn.functional.normalize(d, p=2, dim=1)
62
+ return d
63
+
64
+ @torch.no_grad()
65
+ def inference(self, x):
66
+ d = self.layers.forward(x)
67
+ if self.use_lstm_with_projection:
68
+ d = torch.nn.functional.normalize(d[:, -1], p=2, dim=1)
69
+ else:
70
+ d = torch.nn.functional.normalize(d, p=2, dim=1)
71
+ return d
72
+
73
+ def compute_embedding(self, x, num_frames=160, overlap=0.5):
74
+ """
75
+ Generate embeddings for a batch of utterances
76
+ x: 1xTxD
77
+ """
78
+ num_overlap = int(num_frames * overlap)
79
+ max_len = x.shape[1]
80
+ embed = None
81
+ cur_iter = 0
82
+ for offset in range(0, max_len, num_frames - num_overlap):
83
+ cur_iter += 1
84
+ end_offset = min(x.shape[1], offset + num_frames)
85
+ frames = x[:, offset:end_offset]
86
+ if embed is None:
87
+ embed = self.inference(frames)
88
+ else:
89
+ embed += self.inference(frames)
90
+ return embed / cur_iter
91
+
92
+ def batch_compute_embedding(self, x, seq_lens, num_frames=160, overlap=0.5):
93
+ """
94
+ Generate embeddings for a batch of utterances
95
+ x: BxTxD
96
+ """
97
+ num_overlap = num_frames * overlap
98
+ max_len = x.shape[1]
99
+ embed = None
100
+ num_iters = seq_lens / (num_frames - num_overlap)
101
+ cur_iter = 0
102
+ for offset in range(0, max_len, num_frames - num_overlap):
103
+ cur_iter += 1
104
+ end_offset = min(x.shape[1], offset + num_frames)
105
+ frames = x[:, offset:end_offset]
106
+ if embed is None:
107
+ embed = self.inference(frames)
108
+ else:
109
+ embed[cur_iter <= num_iters, :] += self.inference(
110
+ frames[cur_iter <= num_iters, :, :]
111
+ )
112
+ return embed / num_iters
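
`compute_embedding` averages the d-vectors of 50 %-overlapping windows over the utterance; with the defaults (`num_frames=160`, `overlap=0.5`) that is a 1.6 s window advanced by 0.8 s at the config's 10 ms hop. A quick sketch of the window layout it produces (the 5 s utterance length here is just an illustrative value):

```python
# Window offsets as produced by SpeakerEncoder.compute_embedding defaults.
num_frames, overlap = 160, 0.5
step = num_frames - int(num_frames * overlap)  # 80 frames = 0.8 s at a 10 ms hop

max_len = 500  # e.g. a 5 s utterance -> 500 mel frames
windows = [(off, min(max_len, off + num_frames))
           for off in range(0, max_len, step)]
print(windows[:3])   # [(0, 160), (80, 240), (160, 320)]
print(len(windows))  # 7 windows are embedded and averaged
# note: tail windows shorter than num_frames are still embedded as-is
```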
TTS/speaker_encoder/requirements.txt ADDED
@@ -0,0 +1,2 @@
+ umap-learn
+ numpy>=1.17.0
TTS/speaker_encoder/umap.png ADDED