| | from pathlib import Path |
| |
|
| | import sqlalchemy |
| | from montreal_forced_aligner import config |
| |
|
# Global Montreal Forced Aligner settings used by this script.
# NOTE(review): these are assigned before the corpus/db imports that follow,
# presumably so those modules see the values at import time -- confirm before
# moving this section.
config.USE_POSTGRES = False
config.CLEAN = True
config.QUIET = True
# Working files go in a "temp" directory next to this script.
config.TEMPORARY_DIRECTORY = Path(__file__).parent / "temp"
| |
|
| | from montreal_forced_aligner.corpus.acoustic_corpus import AcousticCorpus |
| | from montreal_forced_aligner.db import Utterance |
| |
|
# Directory that is scanned for per-language sub-directories.
root_dir = Path(r"D:\Data\speech\model_training_corpora")

# Names of the language directories that can be reported on. Only directories
# under ``root_dir`` whose name appears in ``languages`` are processed.
languages = [
    "korean",
    "bulgarian",
    "vietnamese",
    "serbocroatian",
    "hausa",
    "ukrainian",
    "thai",
    "swahili",
    "turkish",
    "spanish",
    "swedish",
    "portuguese",
    "polish",
    "french",
    "czech",
    "japanese",
    "russian",
    "german",
    "mandarin",
    "tamil",
    "hindi-urdu",
]

# Override: restrict the current run to a single language. Remove or edit
# this line to report on the full list above.
languages = ["czech"]
| |
|
# For every selected language directory under ``root_dir``, load each corpus
# sub-directory and print its utterance, file and speaker counts together with
# the summed duration of all utterances that have non-empty text.
for language_directory in root_dir.iterdir():
    # Skip stray files and any language not listed in ``languages``.
    if not language_directory.is_dir() or language_directory.name not in languages:
        continue
    print(language_directory.name)
    for corpus_directory in language_directory.iterdir():
        if not corpus_directory.is_dir():
            continue
        corpus_name = corpus_directory.name
        print(corpus_name)
        print("=" * len(corpus_name))
        c = AcousticCorpus(corpus_directory=corpus_directory)
        c._load_corpus()
        print("Num utterances:", c.num_utterances)
        print("Num files:", c.num_files)
        print("Num speakers:", c.num_speakers)
        with c.session() as session:
            # SUM() over zero matching rows yields NULL (``None`` in Python),
            # which previously made the division below raise TypeError for a
            # corpus with no non-empty-text utterances. Fall back to 0.
            duration_sum = (
                session.query(sqlalchemy.func.sum(Utterance.duration))
                .filter(Utterance.text != "")
                .first()[0]
            ) or 0
        # Divide by 3600 to report hours (assumes durations are stored in
        # seconds -- TODO confirm against the Utterance model).
        total_duration = duration_sum / 3600
        print("Num hours:", total_duration)
        print()
| |
|