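"""Command line entry point for the Montreal Forced Aligner.

Builds the ``mfa`` argument parser and dispatches subcommands, e.g. (paths are illustrative)::

    mfa align /path/to/corpus /path/to/lexicon.dict english /path/to/output
"""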
import atexit
import sys
import os
import time
import argparse
from datetime import datetime
import multiprocessing as mp

from montreal_forced_aligner import __version__

from montreal_forced_aligner.utils import get_available_acoustic_languages, get_available_g2p_languages, \
    get_available_dict_languages, get_available_lm_languages, get_available_ivector_languages
from montreal_forced_aligner.command_line.align import run_align_corpus

from mfa_usr.adapt import run_adapt_model
from montreal_forced_aligner.command_line.train_and_align import run_train_corpus
from montreal_forced_aligner.command_line.g2p import run_g2p
from montreal_forced_aligner.command_line.train_g2p import run_train_g2p
from montreal_forced_aligner.command_line.validate import run_validate_corpus
from montreal_forced_aligner.command_line.download import run_download
from montreal_forced_aligner.command_line.train_lm import run_train_lm
from montreal_forced_aligner.command_line.thirdparty import run_thirdparty
from montreal_forced_aligner.command_line.train_ivector_extractor import run_train_ivector_extractor
from montreal_forced_aligner.command_line.classify_speakers import run_classify_speakers
from montreal_forced_aligner.command_line.transcribe import run_transcribe_corpus
from montreal_forced_aligner.command_line.train_dictionary import run_train_dictionary
from montreal_forced_aligner.command_line.create_segments import run_create_segments
from montreal_forced_aligner.exceptions import MFAError
from montreal_forced_aligner.config import update_global_config, load_global_config, update_command_history, \
    load_command_history


class ExitHooks(object):
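    """Capture the process exit code or uncaught exception so the atexit history handler can record how the command ended."""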
    def __init__(self):
        self.exit_code = None
        self.exception = None

    def hook(self):
        self._orig_exit = sys.exit
        sys.exit = self.exit
        sys.excepthook = self.exc_handler

    def exit(self, code=0):
        self.exit_code = code
        self._orig_exit(code)

    def exc_handler(self, exc_type, exc, *args):
        self.exception = exc


hooks = ExitHooks()
hooks.hook()

BEGIN = time.time()
BEGIN_DATE = datetime.now()


def history_save_handler():
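    """Record the command line, runtime, date, MFA version, and exit status of this run in the command history."""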
    history_data = {
        'command': ' '.join(sys.argv),
        'execution_time': time.time() - BEGIN,
        'date': BEGIN_DATE,
        'version': __version__
    }

    if hooks.exit_code is not None:
        history_data['exit_code'] = hooks.exit_code
        history_data['exception'] = ''
    elif hooks.exception is not None:
        history_data['exit_code'] = 1
        history_data['exception'] = hooks.exception
    else:
        history_data['exception'] = ''
        history_data['exit_code'] = 0
    update_command_history(history_data)


atexit.register(history_save_handler)


def fix_path():
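    """Prepend MFA's downloaded thirdparty binary directory to PATH (and LD_LIBRARY_PATH on non-Windows platforms)."""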
    from montreal_forced_aligner.config import TEMP_DIR
    thirdparty_dir = os.path.join(TEMP_DIR, 'thirdparty', 'bin')
    old_path = os.environ.get('PATH', '')
    if sys.platform == 'win32':
        os.environ['PATH'] = thirdparty_dir + ';' + old_path
    else:
        os.environ['PATH'] = thirdparty_dir + ':' + old_path
        os.environ['LD_LIBRARY_PATH'] = thirdparty_dir + ':' + os.environ.get('LD_LIBRARY_PATH', '')


def unfix_path():
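    """Undo fix_path() by stripping the thirdparty directory that was prepended to PATH (and LD_LIBRARY_PATH)."""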
    if sys.platform == 'win32':
        sep = ';'
        os.environ['PATH'] = sep.join(os.environ['PATH'].split(sep)[1:])
    else:
        sep = ':'
        os.environ['PATH'] = sep.join(os.environ['PATH'].split(sep)[1:])
        os.environ['LD_LIBRARY_PATH'] = sep.join(os.environ.get('LD_LIBRARY_PATH', '').split(sep)[1:])


acoustic_languages = get_available_acoustic_languages()
ivector_languages = get_available_ivector_languages()
lm_languages = get_available_lm_languages()
g2p_languages = get_available_g2p_languages()
dict_languages = get_available_dict_languages()


def create_parser():
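    """Construct the top-level ``mfa`` argument parser with one subparser per subcommand."""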
    GLOBAL_CONFIG = load_global_config()

    def add_global_options(subparser, textgrid_output=False):
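        """Add the options shared across subcommands (temp directory, multiprocessing, job count, verbosity, cleanup, overwrite)."""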
        subparser.add_argument('-t', '--temp_directory', type=str, default=GLOBAL_CONFIG['temp_directory'],
                               help=f"Temporary directory root to store MFA created files, default is {GLOBAL_CONFIG['temp_directory']}")
        subparser.add_argument('--disable_mp',
                               help=f"Disable any multiprocessing during alignment (not recommended), default is {not GLOBAL_CONFIG['use_mp']}",
                               action='store_true',
                               default=not GLOBAL_CONFIG['use_mp'])
        subparser.add_argument('-j', '--num_jobs', type=int, default=GLOBAL_CONFIG['num_jobs'],
                               help=f"Number of data splits (and cores to use if multiprocessing is enabled), defaults "
                                    f"is {GLOBAL_CONFIG['num_jobs']}")
        subparser.add_argument('-v', '--verbose', help=f"Output debug messages, default is {GLOBAL_CONFIG['verbose']}",
                               action='store_true',
                               default=GLOBAL_CONFIG['verbose'])
        subparser.add_argument('--clean', help=f"Remove files from previous runs, default is {GLOBAL_CONFIG['clean']}",
                               action='store_true',
                               default=GLOBAL_CONFIG['clean'])
        subparser.add_argument('--overwrite',
                               help=f"Overwrite output files when they exist, default is {GLOBAL_CONFIG['overwrite']}",
                               action='store_true',
                               default=GLOBAL_CONFIG['overwrite'])
        subparser.add_argument('--debug',
                               help=f"Run extra steps for debugging issues, default is {GLOBAL_CONFIG['debug']}",
                               action='store_true',
                               default=GLOBAL_CONFIG['debug'])
        if textgrid_output:
            subparser.add_argument('--disable_textgrid_cleanup',
                                   help=f"Disable extra clean up steps on TextGrid output, default is {not GLOBAL_CONFIG['cleanup_textgrids']}",
                                   action='store_true',
                                   default=not GLOBAL_CONFIG['cleanup_textgrids'])

    parser = argparse.ArgumentParser()

    subparsers = parser.add_subparsers(dest="subcommand")
    subparsers.required = True

    version_parser = subparsers.add_parser('version')

    align_parser = subparsers.add_parser('align')
    align_parser.add_argument('corpus_directory', help="Full path to the directory to align")
    align_parser.add_argument('dictionary_path', help="Full path to the pronunciation dictionary to use")
    align_parser.add_argument('acoustic_model_path',
                              help=f"Full path to the archive containing pre-trained model or language ({', '.join(acoustic_languages)})")
    align_parser.add_argument('output_directory',
                              help="Full path to output directory, will be created if it doesn't exist")
    align_parser.add_argument('--config_path', type=str, default='',
                              help="Path to config file to use for alignment")
    align_parser.add_argument('-s', '--speaker_characters', type=str, default='0',
                              help="Number of characters of file names to use for determining speaker, "
                                   'default is to use directory names')
    align_parser.add_argument('-a', '--audio_directory', type=str, default='',
                              help="Audio directory root to use for finding audio files")
    add_global_options(align_parser, textgrid_output=True)

    adapt_parser = subparsers.add_parser('adapt')
    adapt_parser.add_argument('corpus_directory', help="Full path to the directory to align")
    adapt_parser.add_argument('dictionary_path', help="Full path to the pronunciation dictionary to use")
    adapt_parser.add_argument('acoustic_model_path',
                              help=f"Full path to the archive containing pre-trained model or language ({', '.join(acoustic_languages)})")
    adapt_parser.add_argument('output_model_path',
                              help="Full path to save adapted_model")
    adapt_parser.add_argument('output_directory',
                              help="Full path to output directory, will be created if it doesn't exist")
    adapt_parser.add_argument('--config_path', type=str, default='',
                              help="Path to config file to use for alignment")
    adapt_parser.add_argument('-s', '--speaker_characters', type=str, default='0',
                              help="Number of characters of file names to use for determining speaker, "
                                   'default is to use directory names')
    adapt_parser.add_argument('-a', '--audio_directory', type=str, default='',
                              help="Audio directory root to use for finding audio files")
    add_global_options(adapt_parser, textgrid_output=True)

    train_parser = subparsers.add_parser('train')
    train_parser.add_argument('corpus_directory', help="Full path to the source directory to align")
    train_parser.add_argument('dictionary_path', help="Full path to the pronunciation dictionary to use",
                              default='')
    train_parser.add_argument('output_directory',
                              help="Full path to output directory, will be created if it doesn't exist")
    train_parser.add_argument('--config_path', type=str, default='',
                              help="Path to config file to use for training and alignment")
    train_parser.add_argument('-o', '--output_model_path', type=str, default='',
                              help="Full path to save resulting acoustic and dictionary model")
    train_parser.add_argument('-s', '--speaker_characters', type=str, default='0',
                              help="Number of characters of filenames to use for determining speaker, "
                                   'default is to use directory names')
    train_parser.add_argument('-a', '--audio_directory', type=str, default='',
                              help="Audio directory root to use for finding audio files")
    train_parser.add_argument('-m', '--acoustic_model_path', type=str, default='',
                              help="Full path to save adapted_model")

    add_global_options(train_parser, textgrid_output=True)

    validate_parser = subparsers.add_parser('validate')
    validate_parser.add_argument('corpus_directory', help="Full path to the source directory to align")
    validate_parser.add_argument('dictionary_path', help="Full path to the pronunciation dictionary to use",
                                 default='')
    validate_parser.add_argument('acoustic_model_path', nargs='?', default='',
                                 help=f"Full path to the archive containing pre-trained model or language ({', '.join(acoustic_languages)})")
    validate_parser.add_argument('-s', '--speaker_characters', type=str, default='0',
                                 help="Number of characters of file names to use for determining speaker, "
                                      'default is to use directory names')
    validate_parser.add_argument('--test_transcriptions', help="Test accuracy of transcriptions", action='store_true')
    validate_parser.add_argument('--ignore_acoustics',
                                 help="Skip acoustic feature generation and associated validation",
                                 action='store_true')
    add_global_options(validate_parser)

    g2p_model_help_message = f'''Full path to the archive containing pre-trained model or language ({', '.join(g2p_languages)})
    If not specified, then orthographic transcription is split into pronunciations.'''
    g2p_parser = subparsers.add_parser('g2p')
    g2p_parser.add_argument("g2p_model_path", help=g2p_model_help_message, nargs='?')

    g2p_parser.add_argument("input_path",
                            help="Corpus to base word list on or a text file of words to generate pronunciations")
    g2p_parser.add_argument("output_path", help="Path to save output dictionary")
    g2p_parser.add_argument('--include_bracketed', help="Include words enclosed by brackets, i.e. [...], (...), <...>",
                            action='store_true')
    g2p_parser.add_argument('--config_path', type=str, default='',
                            help="Path to config file to use for G2P")
    add_global_options(g2p_parser)

    train_g2p_parser = subparsers.add_parser('train_g2p')
    train_g2p_parser.add_argument("dictionary_path", help="Location of existing dictionary")

    train_g2p_parser.add_argument("output_model_path", help="Desired location of generated model")
    train_g2p_parser.add_argument('--config_path', type=str, default='',
                                  help="Path to config file to use for G2P")
    train_g2p_parser.add_argument("--validate", action='store_true',
                                  help="Perform an analysis of accuracy training on "
                                       "most of the data and validating on an unseen subset")
    add_global_options(train_g2p_parser)

    download_parser = subparsers.add_parser('download')
    download_parser.add_argument("model_type",
                                 help="Type of model to download, one of 'acoustic', 'g2p', or 'dictionary'")
    download_parser.add_argument("language", help="Name of language code to download, if not specified, "
                                                  "will list all available languages", nargs='?')

    train_lm_parser = subparsers.add_parser('train_lm')
    train_lm_parser.add_argument('source_path', help="Full path to the source directory to train from, alternatively "
                                                     'an ARPA format language model to convert for MFA use')
    train_lm_parser.add_argument('output_model_path', type=str,
                                 help="Full path to save resulting language model")
    train_lm_parser.add_argument('-m', '--model_path', type=str,
                                 help="Full path to existing language model to merge probabilities")
    train_lm_parser.add_argument('-w', '--model_weight', type=float, default=1.0,
                                 help="Weight factor for supplemental language model, defaults to 1.0")
    train_lm_parser.add_argument('--dictionary_path', help="Full path to the pronunciation dictionary to use",
                                 default='')
    train_lm_parser.add_argument('--config_path', type=str, default='',
                                 help="Path to config file to use for training and alignment")
    add_global_options(train_lm_parser)

    train_dictionary_parser = subparsers.add_parser('train_dictionary')
    train_dictionary_parser.add_argument('corpus_directory', help="Full path to the directory to align")
    train_dictionary_parser.add_argument('dictionary_path', help="Full path to the pronunciation dictionary to use")
    train_dictionary_parser.add_argument('acoustic_model_path',
                                         help=f"Full path to the archive containing pre-trained model or language ({', '.join(acoustic_languages)})")
    train_dictionary_parser.add_argument('output_directory',
                                         help="Full path to output directory, will be created if it doesn't exist")
    train_dictionary_parser.add_argument('--config_path', type=str, default='',
                                         help="Path to config file to use for alignment")
    train_dictionary_parser.add_argument('-s', '--speaker_characters', type=str, default='0',
                                         help="Number of characters of file names to use for determining speaker, "
                                              'default is to use directory names')
    add_global_options(train_dictionary_parser)

    train_ivector_parser = subparsers.add_parser('train_ivector')
    train_ivector_parser.add_argument('corpus_directory', help="Full path to the source directory to "
                                                               'train the ivector extractor')
    train_ivector_parser.add_argument('dictionary_path', help="Full path to the pronunciation dictionary to use")
    train_ivector_parser.add_argument('acoustic_model_path', type=str, default='',
                                      help="Full path to acoustic model for alignment")
    train_ivector_parser.add_argument('output_model_path', type=str, default='',
                                      help="Full path to save resulting ivector extractor")
    train_ivector_parser.add_argument('-s', '--speaker_characters', type=str, default='0',
                                      help="Number of characters of filenames to use for determining speaker, "
                                           'default is to use directory names')
    train_ivector_parser.add_argument('--config_path', type=str, default='',
                                      help="Path to config file to use for training")
    add_global_options(train_ivector_parser)

    classify_speakers_parser = subparsers.add_parser('classify_speakers')
    classify_speakers_parser.add_argument('corpus_directory', help="Full path to the source directory to "
                                                                   'run speaker classification')
    classify_speakers_parser.add_argument('ivector_extractor_path', type=str, default='',
                                          help="Full path to ivector extractor model")
    classify_speakers_parser.add_argument('output_directory',
                                          help="Full path to output directory, will be created if it doesn't exist")

    classify_speakers_parser.add_argument('-s', '--num_speakers', type=int, default=0,
                                          help="Number of speakers if known")
    classify_speakers_parser.add_argument('--cluster', help="Use clustering instead of classification",
                                          action='store_true')
    classify_speakers_parser.add_argument('--config_path', type=str, default='',
                                          help="Path to config file to use for ivector extraction")
    add_global_options(classify_speakers_parser)

    create_segments_parser = subparsers.add_parser('create_segments')
    create_segments_parser.add_argument('corpus_directory', help="Full path to the source directory to "
                                                                 'run VAD segmentation')
    create_segments_parser.add_argument('output_directory',
                                        help="Full path to output directory, will be created if it doesn't exist")
    create_segments_parser.add_argument('--config_path', type=str, default='',
                                        help="Path to config file to use for segmentation")
    add_global_options(create_segments_parser)

    transcribe_parser = subparsers.add_parser('transcribe')
    transcribe_parser.add_argument('corpus_directory', help="Full path to the directory to transcribe")
    transcribe_parser.add_argument('dictionary_path', help="Full path to the pronunciation dictionary to use")
    transcribe_parser.add_argument('acoustic_model_path',
                                   help=f"Full path to the archive containing pre-trained model or language ({', '.join(acoustic_languages)})")
    transcribe_parser.add_argument('language_model_path',
                                   help=f"Full path to the archive containing pre-trained model or language ({', '.join(lm_languages)})")
    transcribe_parser.add_argument('output_directory',
                                   help="Full path to output directory, will be created if it doesn't exist")
    transcribe_parser.add_argument('--config_path', type=str, default='',
                                   help="Path to config file to use for transcription")
    transcribe_parser.add_argument('-s', '--speaker_characters', type=str, default='0',
                                   help="Number of characters of file names to use for determining speaker, "
                                        'default is to use directory names')
    transcribe_parser.add_argument('-a', '--audio_directory', type=str, default='',
                                   help="Audio directory root to use for finding audio files")
    transcribe_parser.add_argument('-e', '--evaluate', help="Evaluate the transcription "
                                                            "against gold-standard transcripts", action='store_true')
    add_global_options(transcribe_parser)

    config_parser = subparsers.add_parser('configure',
                                          help="The configure command is used to set global defaults for MFA so "
                                               "you don't have to set them every time you call an MFA command.")
    config_parser.add_argument('-t', '--temp_directory', type=str, default='',
                               help=f"Set the default temporary directory, default is {GLOBAL_CONFIG['temp_directory']}")
    config_parser.add_argument('-j', '--num_jobs', type=int,
                               help=f"Set the number of processes to use by default, defaults to {GLOBAL_CONFIG['num_jobs']}")
    config_parser.add_argument('--always_clean', help="Always remove files from previous runs by default",
                               action='store_true')
    config_parser.add_argument('--never_clean', help="Don't remove files from previous runs by default",
                               action='store_true')
    config_parser.add_argument('--always_verbose', help="Default to verbose output", action='store_true')
    config_parser.add_argument('--never_verbose', help="Default to non-verbose output", action='store_true')
    config_parser.add_argument('--always_debug', help="Default to running debugging steps", action='store_true')
    config_parser.add_argument('--never_debug', help="Default to not running debugging steps", action='store_true')
    config_parser.add_argument('--always_overwrite', help="Always overwrite output files", action='store_true')
    config_parser.add_argument('--never_overwrite', help="Never overwrite output files (if file already exists, "
                                                         "the output will be saved in the temp directory)",
                               action='store_true')
    config_parser.add_argument('--disable_mp', help="Disable all multiprocessing (not recommended as it will usually "
                                                    "increase processing times)", action='store_true')
    config_parser.add_argument('--enable_mp', help="Enable multiprocessing (recommended and enabled by default)",
                               action='store_true')
    config_parser.add_argument('--disable_textgrid_cleanup', help="Disable postprocessing of TextGrids that cleans up "
                                                                  "silences and recombines compound words and clitics",
                               action='store_true')
    config_parser.add_argument('--enable_textgrid_cleanup', help="Enable postprocessing of TextGrids that cleans up "
                                                                 "silences and recombines compound words and clitics",
                               action='store_true')

    history_parser = subparsers.add_parser('history')

    history_parser.add_argument('depth', help='Number of commands to list', type=int, nargs='?', default=10)
    history_parser.add_argument('--verbose', help="Flag for whether to output additional information",
                                action='store_true')

    annotator_parser = subparsers.add_parser('annotator')
    anchor_parser = subparsers.add_parser('anchor')

    thirdparty_parser = subparsers.add_parser('thirdparty')

    thirdparty_parser.add_argument("command",
                                   help="One of 'download', 'validate', or 'kaldi'")
    thirdparty_parser.add_argument('local_directory',
                                   help="Full path to the built executables to collect", nargs="?",
                                   default='')
    return parser


parser = create_parser()


def main():
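    """Parse command line arguments, check for required thirdparty binaries, and dispatch to the requested subcommand."""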
    parser = create_parser()
    mp.freeze_support()
    args, unknown = parser.parse_known_args()
    for short in ['-c', '-d']:
        if short in unknown:
            print(f'Due to the number of options that `{short}` could refer to, it is not accepted. '
                  'Please specify the full argument')
            sys.exit(1)
    try:
        fix_path()
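        # Verify the required Kaldi binaries are available before doing any heavy processing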
        if args.subcommand in ['align', 'train', 'train_ivector']:
            from montreal_forced_aligner.thirdparty.kaldi import validate_alignment_binaries
            if not validate_alignment_binaries():
                print("There was an issue validating Kaldi binaries, please ensure you've downloaded them via the "
                      "'mfa thirdparty download' command.  See 'mfa thirdparty validate' for more detailed information "
                      "on why this check failed.")
                sys.exit(1)
        elif args.subcommand in ['transcribe']:
            from montreal_forced_aligner.thirdparty.kaldi import validate_transcribe_binaries
            if not validate_transcribe_binaries():
                print("There was an issue validating Kaldi binaries, please ensure you've downloaded them via the "
                      "'mfa thirdparty download' command.  See 'mfa thirdparty validate' for more detailed information "
                      "on why this check failed.  If you are on MacOS, please note that the thirdparty binaries available "
                      "via the download command do not contain the transcription ones.  To get this functionality working "
                      "for the time being, please build kaldi locally and follow the instructions for running the "
                      "'mfa thirdparty kaldi' command.")
                sys.exit(1)
        elif args.subcommand in ['train_dictionary']:
            from montreal_forced_aligner.thirdparty.kaldi import validate_train_dictionary_binaries
            if not validate_train_dictionary_binaries():
                print("There was an issue validating Kaldi binaries, please ensure you've downloaded them via the "
                      "'mfa thirdparty download' command.  See 'mfa thirdparty validate' for more detailed information "
                      "on why this check failed.  If you are on MacOS, please note that the thirdparty binaries available "
                      "via the download command do not contain the train_dictionary ones.  To get this functionality working "
                      "for the time being, please build kaldi locally and follow the instructions for running the "
                      "'mfa thirdparty kaldi' command.")
                sys.exit(1)
        elif args.subcommand in ['g2p', 'train_g2p']:
            try:
                import pynini
            except ImportError:
                print("There was an issue importing Pynini, please ensure that it is installed. If you are on Windows, "
                      "please use the Windows Subsystem for Linux to use g2p functionality.")
                sys.exit(1)
        if args.subcommand == 'align':
            run_align_corpus(args, unknown, acoustic_languages)
        elif args.subcommand == 'adapt':
            run_adapt_model(args, unknown, acoustic_languages)
        elif args.subcommand == 'train':
            run_train_corpus(args, unknown)
        elif args.subcommand == 'g2p':
            run_g2p(args, unknown, g2p_languages)
        elif args.subcommand == 'train_g2p':
            run_train_g2p(args, unknown)
        elif args.subcommand == 'validate':
            run_validate_corpus(args, unknown)
        elif args.subcommand == 'download':
            run_download(args)
        elif args.subcommand == 'train_lm':
            run_train_lm(args, unknown)
        elif args.subcommand == 'train_dictionary':
            run_train_dictionary(args, unknown)
        elif args.subcommand == 'train_ivector':
            run_train_ivector_extractor(args, unknown)
        elif args.subcommand == 'classify_speakers':
            run_classify_speakers(args, unknown)
        elif args.subcommand in ['annotator', 'anchor']:
            from montreal_forced_aligner.command_line.anchor import run_anchor
            run_anchor(args)
        elif args.subcommand == 'thirdparty':
            run_thirdparty(args)
        elif args.subcommand == 'transcribe':
            run_transcribe_corpus(args, unknown)
        elif args.subcommand == 'create_segments':
            run_create_segments(args, unknown)
        elif args.subcommand == 'configure':
            update_global_config(args)
            global GLOBAL_CONFIG
            GLOBAL_CONFIG = load_global_config()
        elif args.subcommand == 'history':
            depth = args.depth
            history = load_command_history()[-depth:]
            if args.verbose:
                print('command\tDate\tExecution time\tVersion\tExit code\tException')
                for h in history:
                    execution_time = time.strftime('%H:%M:%S', time.gmtime(h['execution_time']))
                    d = h['date'].isoformat()
                    print(
                        f"{h['command']}\t{d}\t{execution_time}\t{h['version']}\t{h['exit_code']}\t{h['exception']}")
            else:
                for h in history:
                    print(h['command'])

        elif args.subcommand == 'version':
            print(__version__)
    except MFAError as e:
        if getattr(args, 'debug', False):
            raise
        print(e)
        sys.exit(1)
    finally:
        unfix_path()


if __name__ == '__main__':
    main()