|
|
|
|
|
|
|
|
|
|
|
|
|
""" |
|
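Script to create the mExpresso Eng-XXX S2T dataset.

It downsamples the English Expresso read-speech audio to 16 kHz mono and joins
it with the mExpresso translated text, writing one manifest per dev/test
subset and target language.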
|
""" |
|
|
|
import argparse |
|
import logging |
|
import multiprocessing as mp |
|
import os |
|
import pandas as pd |
|
import pathlib |
|
import re |
|
import seamless_communication |
|
import torchaudio |
|
|
|
from pathlib import Path |
|
from tqdm import tqdm |
|
from typing import List, Optional, Tuple |
|
|
|
from fairseq2.assets import asset_store, download_manager |
|
|
|
# Root logging setup: INFO and above, each record prefixed with its
# timestamp and level name.
logging.basicConfig(
    format="%(asctime)s %(levelname)s: %(message)s",
    level=logging.INFO,
)

# Module-level logger used by the functions in this script.
logger = logging.getLogger(__name__)
|
|
|
|
|
def multiprocess_map(
    a_list: list,
    func: callable,
    n_workers: Optional[int] = None,
    chunksize: int = 1,
    desc: Optional[str] = None,
) -> list:
    """Apply ``func`` to every item of ``a_list`` using a pool of processes.

    Workers are created with the "spawn" start method, so ``func`` and the
    items must be picklable (``func`` should be a module-level function) and
    the calling script must protect its entry point with
    ``if __name__ == "__main__":``.

    Args:
        a_list: Items to process; must support ``len()``.
        func: Callable taking a single item and returning its result.
        n_workers: Upper bound on the number of worker processes. Defaults
            to the CPU count; it is never allowed to exceed the CPU count
            or the number of items.
        chunksize: Number of items handed to a worker at a time.
        desc: Optional label shown on the tqdm progress bar.

    Returns:
        The results in the same order as ``a_list``; an empty list when
        ``a_list`` is empty.

    Raises:
        ValueError: Raised by the pool when ``n_workers`` is smaller than 1.
    """
    n_items = len(a_list)
    if n_items == 0:
        # Nothing to do: skip the start-up cost of spawning worker processes.
        return []

    if n_workers is None:
        n_workers = mp.cpu_count()
    # Workers beyond the CPU count or the number of items would only idle.
    n_workers = min(n_workers, mp.cpu_count(), n_items)

    with mp.get_context("spawn").Pool(processes=n_workers) as pool:
        # imap yields results in input order, which lets tqdm report progress
        # while the output stays aligned with a_list.
        results = list(
            tqdm(
                pool.imap(func, a_list, chunksize=chunksize),
                total=n_items,
                desc=desc,
            )
        )
    return results
|
|
|
|
|
def convert_to_16khz_wav(config: Tuple[str, str]) -> str:
    """Resample one audio file to 16 kHz mono and save it as signed 16-bit PCM.

    The two paths arrive packed in a single tuple so the function can be
    used directly as the worker of a one-argument map.

    Args:
        config: ``(input_audio, output_audio)`` pair of file paths.

    Returns:
        The ``output_audio`` path that was written.
    """
    src_path, dst_path = config
    waveform, source_rate = torchaudio.load(src_path)

    # sox effect chain: resample to 16 kHz, then mix down to one channel.
    sox_chain = [["rate", "16000"], ["channels", "1"]]
    resampled, _ = torchaudio.sox_effects.apply_effects_tensor(
        waveform, source_rate, effects=sox_chain
    )

    # Create the destination directory on demand before writing.
    Path(dst_path).parent.mkdir(parents=True, exist_ok=True)
    torchaudio.save(
        dst_path,
        resampled,
        sample_rate=16000,
        encoding="PCM_S",
        bits_per_sample=16,
    )
    return dst_path
|
|
|
|
|
def _parse_read_transcriptions(oss_root: Path) -> List[dict]:
    """Parse ``read_transcriptions.txt`` into manifest rows for whitelisted styles."""
    whitelist_styles = frozenset(
        {
            "default",
            "default_emphasis",
            "default_essentials",
            "confused",
            "happy",
            "sad",
            "enunciated",
            "whisper",
            "laughing",
        }
    )
    # Inline ``<...>`` tags are dropped together with one adjacent space.
    tag_after_space = re.compile(r" <.*?>")
    tag_before_space = re.compile(r"<.*?> ")

    transcription_path = oss_root / "read_transcriptions.txt"
    rows = []
    with open(transcription_path, encoding="utf-8") as fin:
        for line_no, raw_line in enumerate(fin, start=1):
            line = raw_line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            fields = line.split("\t")
            if len(fields) != 2:
                raise ValueError(
                    f"Malformed entry at line {line_no} of {transcription_path}: "
                    f"expected 'id<TAB>text', got {len(fields)} field(s)"
                )
            uid, text = fields

            # uid layout: <speaker>_<style, may contain '_'>_<utterance index>
            uid_parts = uid.split("_")
            oss_speaker = uid_parts[0]
            style = "_".join(uid_parts[1:-1])
            if style not in whitelist_styles:
                continue
            # Audio folders are keyed by the first token of the style,
            # e.g. "default_emphasis" lives under "default".
            base_style = style.split("_")[0]

            text = tag_after_space.sub("", text)
            text = tag_before_space.sub("", text)
            rows.append(
                {
                    "id": uid,
                    "speaker": oss_speaker,
                    "text": text,
                    "orig_audio": (
                        oss_root
                        / "audio_48khz"
                        / "read"
                        / oss_speaker
                        / base_style
                        / "base"
                        / f"{uid}.wav"
                    ).as_posix(),
                    "label": style,
                }
            )
    return rows


def build_en_manifest_from_oss(oss_root: Path, output_folder: Path) -> pd.DataFrame:
    """Build the English Expresso manifest and its 16 kHz mono audio copies.

    Reads ``read_transcriptions.txt`` under ``oss_root``, keeps the utterances
    whose style is whitelisted, converts each source audio file to
    ``output_folder/audio_16khz_wav/<speaker>/<id>.wav`` and writes the
    resulting table to ``output_folder/en_manifest.tsv``.

    Args:
        oss_root: Expresso root containing ``read_transcriptions.txt`` and
            ``audio_48khz``.
        output_folder: Destination for the converted audio and the manifest.

    Returns:
        DataFrame with columns ``id``, ``speaker``, ``text``, ``orig_audio``,
        ``label`` and ``audio`` (path of the converted file).

    Raises:
        FileNotFoundError: If the transcription file, a source audio file or
            a converted audio file is missing.
        ValueError: If a transcription line is not ``id<TAB>text``.
    """
    # Explicit columns keep the frame well-formed even when no row is kept.
    df = pd.DataFrame(
        _parse_read_transcriptions(oss_root),
        columns=["id", "speaker", "text", "orig_audio", "label"],
    )

    # Explicit raises instead of asserts: the checks must survive ``python -O``.
    missing_sources = [path for path in df["orig_audio"] if not os.path.isfile(path)]
    if missing_sources:
        raise FileNotFoundError(
            f"{len(missing_sources)} Expresso source audio file(s) not found, "
            f"e.g. {missing_sources[0]}"
        )

    target_audio_root = output_folder / "audio_16khz_wav"
    os.makedirs(target_audio_root, exist_ok=True)
    input_output_audios = [
        (orig_audio, (target_audio_root / speaker / f"{uid}.wav").as_posix())
        for orig_audio, speaker, uid in zip(df["orig_audio"], df["speaker"], df["id"])
    ]
    logger.info("converting from 48khz to mono 16khz")
    multiprocess_map(input_output_audios, convert_to_16khz_wav, chunksize=50)
    df["audio"] = [output_audio for _, output_audio in input_output_audios]

    missing_outputs = [path for path in df["audio"] if not os.path.isfile(path)]
    if missing_outputs:
        raise FileNotFoundError(
            f"{len(missing_outputs)} converted audio file(s) were not written, "
            f"e.g. {missing_outputs[0]}"
        )

    output_manifest = f"{output_folder}/en_manifest.tsv"
    # quoting=3 is csv.QUOTE_NONE: text is written verbatim, without quotes.
    df.to_csv(output_manifest, sep="\t", quoting=3, index=False)
    logger.info(f"Output {len(df)} rows to {output_manifest}")
    return df
|
|
|
|
|
def main() -> None:
    """Command-line entry point: build the mExpresso Eng-XXX S2T manifests.

    Steps:
      1. Download the mExpresso text through the ``mexpresso_text`` asset card.
      2. Locate English Expresso: either the folder given with
         ``--existing-expresso-root`` or a download through the ``expresso``
         asset card.
      3. Build the English manifest and converted audio under
         ``<output_folder>/En_Expresso``.
      4. For every subset (dev, test) and target language, join the released
         mExpresso rows with the English manifest on ``id`` and write
         ``<output_folder>/<subset>_mexpresso_eng_<lang>.tsv``.

    Raises:
        RuntimeError: If joining with the English manifest changes the number
            of released rows for a subset/language pair.
        FileNotFoundError: If a source audio file referenced by a manifest
            does not exist.
    """
    parser = argparse.ArgumentParser(
        description="Prepare mExpresso Eng-XXX S2T manifest"
    )
    parser.add_argument(
        "output_folder",
        type=lambda p: pathlib.Path(p).resolve(),
        help="Output folder for the downsampled Expresso En audios and combined manifest. "
        "The output folder path will be expanded to absolute path.",
    )
    parser.add_argument(
        "--existing-expresso-root",
        type=str,
        help="Existing root folder if you have downloaded Expresso dataset. "
        "The folder path should include 'read_transcriptions.txt' and 'audio_48khz'",
    )
    args = parser.parse_args()

    mexpresso_card = asset_store.retrieve_card("mexpresso_text")
    mexpresso_root_path = download_manager.download_dataset(
        mexpresso_card.field("uri").as_uri(),
        "mExpresso_text",
    )
    logger.info(f"The mExpresso dataset is downloaded to {mexpresso_root_path}")
    mexpresso_path = mexpresso_root_path / "mexpresso_text"

    if args.existing_expresso_root is not None:
        logger.info(
            f"Re-use user manually downloaded Expresso from {args.existing_expresso_root}"
        )
        en_expresso_path = Path(args.existing_expresso_root)
    else:
        en_expresso_card = asset_store.retrieve_card("expresso")
        en_expresso_root_path = download_manager.download_dataset(
            en_expresso_card.field("uri").as_uri(),
            "Expresso",
        )
        logger.info(
            f"The English Expresso dataset is downloaded to {en_expresso_root_path}"
        )
        en_expresso_path = en_expresso_root_path / "expresso"

    en_expresso_folder = args.output_folder / "En_Expresso"
    en_expresso_df = build_en_manifest_from_oss(
        Path(en_expresso_path), en_expresso_folder
    )

    # The source-side view of the English manifest is identical for every
    # subset/language pair, so build it once outside the loops.
    src_df = en_expresso_df.rename(
        columns={
            "text": "src_text",
            "audio": "src_audio",
            "speaker": "src_speaker",
        }
    )
    manifest_columns = [
        "id",
        "src_audio",
        "src_speaker",
        "src_text",
        "src_lang",
        "tgt_text",
        "tgt_lang",
        "label",
    ]

    for subset in ["dev", "test"]:
        for lang in ["spa", "fra", "ita", "cmn", "deu"]:
            df = pd.read_csv(
                f"{mexpresso_path}/{subset}_mexpresso_{lang}.tsv", sep="\t", quoting=3
            ).rename(columns={"text": "tgt_text"})
            num_released_items = len(df)
            df = df.merge(src_df, on="id", how="inner")
            # Explicit raise instead of assert so the check survives ``python -O``.
            if len(df) != num_released_items:
                raise RuntimeError(
                    f"{subset}/{lang}: {num_released_items} released items but "
                    f"{len(df)} rows after joining with the downloaded En Expresso"
                )
            df["src_lang"] = "eng"
            df["tgt_lang"] = lang

            missing_audio = [
                audio for audio in df["src_audio"] if not os.path.isfile(audio)
            ]
            if missing_audio:
                raise FileNotFoundError(
                    f"{subset}/{lang}: {len(missing_audio)} source audio file(s) "
                    f"not found, e.g. {missing_audio[0]}"
                )

            output_manifest_path = (
                args.output_folder / f"{subset}_mexpresso_eng_{lang}.tsv"
            )
            # quoting=3 is csv.QUOTE_NONE: text is written verbatim.
            df[manifest_columns].to_csv(
                output_manifest_path, sep="\t", quoting=3, index=False
            )
            logger.info(f"Output {len(df)} rows to {output_manifest_path}")
|
|
|
|
|
# Run only when executed as a script. The guard is also what keeps the
# "spawn" worker processes started by multiprocess_map from re-running
# main() when they import this module.
if __name__ == "__main__":
    main()
|
|