victan committed on
Commit
a95f284
1 Parent(s): d4f8426

Upload seamless_communication/cli/expressivity/data/prepare_mexpresso.py with huggingface_hub

Browse files
seamless_communication/cli/expressivity/data/prepare_mexpresso.py ADDED
@@ -0,0 +1,234 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # MIT_LICENSE file in the root directory of this source tree.
6
+
7
+ """
8
+ Script to create mExpresso Eng-XXX S2T dataset.
9
+ """
10
+
11
+ import argparse
12
+ import logging
13
+ import multiprocessing as mp
14
+ import os
15
+ import pandas as pd
16
+ import pathlib
17
+ import re
18
+ import seamless_communication # need this to load dataset cards
19
+ import torchaudio
20
+
21
+ from pathlib import Path
22
+ from tqdm import tqdm
23
+ from typing import List, Optional, Tuple
24
+
25
+ from fairseq2.assets import asset_store, download_manager
26
+
27
# Configure the root logger at import time: INFO level, with a timestamp and
# the level name in front of every message.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s: %(message)s",
)

# Module-level logger used by the functions below.
logger = logging.getLogger(__name__)
33
+
34
+
35
def multiprocess_map(
    a_list: list,
    func: callable,
    n_workers: Optional[int] = None,
    chunksize: int = 1,
    desc=None,
):
    """Apply ``func`` to every item of ``a_list`` in a pool of worker processes.

    Workers are created with the "spawn" start method, so ``func`` and the
    items have to be picklable. Progress is reported through a tqdm bar.

    Args:
        a_list: Items to process.
        func: Function called once per item.
        n_workers: Requested number of worker processes. ``None`` means one
            per CPU. The value is clamped to at least 1 and at most the CPU
            count, and never exceeds the number of items.
        chunksize: Number of items handed to a worker at a time.
        desc: Optional label for the progress bar.

    Returns:
        A list with the result of ``func`` for each item, in input order.
    """
    if len(a_list) == 0:
        # Nothing to do: skip the cost of spawning worker processes.
        return []
    if n_workers is None:
        n_workers = mp.cpu_count()
    # Pool() rejects a process count below 1, and workers beyond the number
    # of items would only be spawned to sit idle.
    n_workers = max(1, min(n_workers, mp.cpu_count(), len(a_list)))
    with mp.get_context("spawn").Pool(processes=n_workers) as pool:
        results = list(
            tqdm(
                pool.imap(func, a_list, chunksize=chunksize),
                total=len(a_list),
                desc=desc,
            )
        )
    return results
54
+
55
+
56
def convert_to_16khz_wav(config: Tuple[str, str]) -> str:
    """Resample one audio file to 16 kHz mono and save it as signed 16-bit PCM.

    The argument is a single tuple so the function can be mapped directly
    over a list of jobs.

    Args:
        config: ``(input_audio, output_audio)`` pair of file paths.

    Returns:
        The ``output_audio`` path that was written.
    """
    src_path, dst_path = config
    waveform, source_rate = torchaudio.load(src_path)

    # sox effect chain: resample to 16 kHz, then mix down to one channel.
    sox_chain = [["rate", "16000"], ["channels", "1"]]
    converted, _ = torchaudio.sox_effects.apply_effects_tensor(
        waveform, source_rate, effects=sox_chain
    )

    # The destination folder may not exist yet.
    Path(dst_path).parent.mkdir(parents=True, exist_ok=True)
    torchaudio.save(
        dst_path,
        converted,
        sample_rate=16000,
        encoding="PCM_S",
        bits_per_sample=16,
    )
    return dst_path
71
+
72
+
73
def build_en_manifest_from_oss(oss_root: Path, output_folder: Path) -> pd.DataFrame:
    """Build the English Expresso manifest and convert its audio to 16 kHz mono.

    Reads ``read_transcriptions.txt`` under ``oss_root``, keeps the utterances
    whose style is whitelisted, strips ``<...>`` tags from the text, converts
    every referenced 48 kHz file to
    ``<output_folder>/audio_16khz_wav/<speaker>/<id>.wav`` and writes
    ``<output_folder>/en_manifest.tsv``.

    Args:
        oss_root: Expresso root folder containing ``read_transcriptions.txt``
            and ``audio_48khz``.
        output_folder: Folder receiving the converted audio and the manifest.

    Returns:
        DataFrame with the columns ``id``, ``speaker``, ``text``,
        ``orig_audio``, ``label`` and ``audio``.

    Raises:
        ValueError: If a non-empty transcription line is not
            ``<id>\\t<text>``, or no utterance has a whitelisted style.
        FileNotFoundError: If a source audio file, or a converted audio file,
            does not exist.
    """
    # We only open source the following styles
    WHITELIST_STYLE = frozenset(
        {
            "default",
            "default_emphasis",
            "default_essentials",
            "confused",
            "happy",
            "sad",
            "enunciated",
            "whisper",
            "laughing",
        }
    )

    transcription_path = oss_root / "read_transcriptions.txt"
    results = []
    # Explicit encoding so parsing does not depend on the platform locale.
    with open(transcription_path, encoding="utf-8") as fin:
        for line_no, line in enumerate(fin, start=1):
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            fields = line.split("\t")
            if len(fields) != 2:
                raise ValueError(
                    f"{transcription_path}:{line_no}: expected '<id>\\t<text>', "
                    f"got {len(fields)} tab-separated field(s)"
                )
            uid, text = fields
            # The id is split as <speaker>_<style...>_<last part>; the style
            # itself may contain underscores (e.g. "default_emphasis").
            sps = uid.split("_")
            oss_speaker = sps[0]
            style = "_".join(sps[1:-1])
            # The first component of the style is used as the audio sub-folder.
            base_style = style.split("_")[0]
            if style not in WHITELIST_STYLE:
                continue
            # Normalize the text to remove <laugh> and <breath> etc
            text = re.sub(r" <.*?>", "", text)
            text = re.sub(r"<.*?> ", "", text)
            results.append(
                {
                    "id": uid,
                    "speaker": oss_speaker,
                    "text": text,
                    "orig_audio": (
                        oss_root
                        / "audio_48khz"
                        / "read"
                        / oss_speaker
                        / base_style
                        / "base"
                        / f"{uid}.wav"
                    ).as_posix(),
                    "label": style,
                }
            )

    if not results:
        # An empty DataFrame has no columns; fail here with a clear message
        # instead of a KeyError further down.
        raise ValueError(
            f"No utterance with a whitelisted style found in {transcription_path}"
        )
    df = pd.DataFrame(results)

    # Sanity check: every source audio file exists. Raised explicitly (not
    # asserted) so the check also runs under ``python -O``.
    missing_orig = df.loc[~df["orig_audio"].map(os.path.isfile), "orig_audio"]
    if not missing_orig.empty:
        raise FileNotFoundError(
            f"{len(missing_orig)} source audio file(s) missing, "
            f"e.g. {missing_orig.iloc[0]}"
        )

    # Convert 48kHz -> 16kHz
    target_audio_root = output_folder / "audio_16khz_wav"
    os.makedirs(target_audio_root, exist_ok=True)
    input_output_audios = [
        (orig_audio, (target_audio_root / speaker / (uid + ".wav")).as_posix())
        for orig_audio, speaker, uid in zip(df["orig_audio"], df["speaker"], df["id"])
    ]
    logger.info("converting from 48khz to mono 16khz")
    multiprocess_map(input_output_audios, convert_to_16khz_wav, chunksize=50)
    df["audio"] = [output_audio for _, output_audio in input_output_audios]

    # Sanity check: every converted file was actually written.
    missing_converted = df.loc[~df["audio"].map(os.path.isfile), "audio"]
    if not missing_converted.empty:
        raise FileNotFoundError(
            f"{len(missing_converted)} converted audio file(s) missing, "
            f"e.g. {missing_converted.iloc[0]}"
        )

    output_manifest = f"{output_folder}/en_manifest.tsv"
    # quoting=3 is csv.QUOTE_NONE: fields are written verbatim.
    df.to_csv(output_manifest, sep="\t", quoting=3, index=None)
    logger.info(f"Output {len(df)} rows to {output_manifest}")
    return df
144
+
145
+
146
def main() -> None:
    """Command-line entry point: build the mExpresso Eng-XXX S2T manifests.

    Steps:
      1. Download the mExpresso text through the ``mexpresso_text`` asset card.
      2. Locate English Expresso: use ``--existing-expresso-root`` when given,
         otherwise download it through the ``expresso`` asset card.
      3. Convert the English audio and build the English manifest under
         ``<output_folder>/En_Expresso``.
      4. For each of dev/test and each target language, join the released
         target text with the English manifest on ``id`` and write
         ``<output_folder>/<subset>_mexpresso_eng_<lang>.tsv``.

    Raises:
        RuntimeError: If the join does not yield exactly one row per released
            item.
        FileNotFoundError: If a joined row points to a missing source audio.
    """
    parser = argparse.ArgumentParser(
        description="Prepare mExpresso Eng-XXX S2T manifest"
    )
    parser.add_argument(
        "output_folder",
        type=lambda p: pathlib.Path(p).resolve(),  # always convert to absolute path
        help="Output folder for the downsampled Expresso En audios and combined manifest. "
        "The output folder path will be expanded to absolute path.",
    )
    parser.add_argument(
        "--existing-expresso-root",
        type=str,
        help="Existing root folder if you have downloaded Expresso dataset. "
        "The folder path should include 'read_transcriptions.txt' and 'audio_48khz'",
    )
    args = parser.parse_args()

    mexpresso_card = asset_store.retrieve_card("mexpresso_text")
    mexpresso_root_path = download_manager.download_dataset(
        mexpresso_card.field("uri").as_uri(),
        "mExpresso_text",
    )
    logger.info(f"The mExpresso dataset is downloaded to {mexpresso_root_path}")
    mexpresso_path = mexpresso_root_path / "mexpresso_text"

    # downsample all English speech
    if args.existing_expresso_root is not None:
        logger.info(
            f"Re-use user manually downloaded Expresso from {args.existing_expresso_root}"
        )
        en_expresso_path = Path(args.existing_expresso_root)
    else:
        en_expresso_card = asset_store.retrieve_card("expresso")
        en_expresso_root_path = download_manager.download_dataset(
            en_expresso_card.field("uri").as_uri(),
            "Expresso",
        )
        logger.info(
            f"The English Expresso dataset is downloaded to {en_expresso_root_path}"
        )
        en_expresso_path = en_expresso_root_path / "expresso"
    en_expresso_folder = args.output_folder / "En_Expresso"
    en_expresso_df = build_en_manifest_from_oss(
        Path(en_expresso_path), en_expresso_folder
    )

    # The source-side column names do not depend on subset or language, so
    # rename once instead of on every loop iteration.
    en_source_df = en_expresso_df.rename(
        columns={
            "text": "src_text",
            "audio": "src_audio",
            "speaker": "src_speaker",
        }
    )

    for subset in ["dev", "test"]:
        for lang in ["spa", "fra", "ita", "cmn", "deu"]:
            # quoting=3 is csv.QUOTE_NONE: fields are read verbatim.
            released_df = pd.read_csv(
                f"{mexpresso_path}/{subset}_mexpresso_{lang}.tsv", sep="\t", quoting=3
            ).rename(columns={"text": "tgt_text"})
            num_released_items = len(released_df)
            df = released_df.merge(en_source_df, on="id", how="inner")
            # Raised explicitly (not asserted) so the check also runs under
            # ``python -O`` and the message says which manifest is affected.
            if len(df) != num_released_items:
                unmatched = released_df.loc[
                    ~released_df["id"].isin(df["id"]), "id"
                ].tolist()
                example = f", e.g. missing id {unmatched[0]}" if unmatched else ""
                raise RuntimeError(
                    "Missing items from downloaded En Expresso "
                    f"({subset}/{lang}: expected {num_released_items} rows, "
                    f"got {len(df)}{example})"
                )
            df["src_lang"] = "eng"
            df["tgt_lang"] = lang
            # Check all the audio files exist
            missing_audio = [
                audio for audio in df["src_audio"].tolist() if not os.path.isfile(audio)
            ]
            if missing_audio:
                raise FileNotFoundError(
                    f"{len(missing_audio)} source audio file(s) missing for "
                    f"{subset}/{lang}, e.g. {missing_audio[0]}"
                )
            output_manifest_path = args.output_folder / f"{subset}_mexpresso_eng_{lang}.tsv"
            df[
                [
                    "id",
                    "src_audio",  # converted 16kHz audio path
                    "src_speaker",  # source speaker
                    "src_text",  # source text
                    "src_lang",  # source language id
                    "tgt_text",  # target text
                    "tgt_lang",  # target language id
                    "label",  # style of utterance
                ]
            ].to_csv(output_manifest_path, sep="\t", quoting=3, index=None)
            logger.info(f"Output {len(df)} rows to {output_manifest_path}")
231
+
232
+
233
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()