"""Download and preprocess LibriSpeech dataset for DeepSpeech model.""" |

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import codecs
import fnmatch
import os
import sys
import tarfile
import tempfile
import unicodedata

from absl import app as absl_app
from absl import flags as absl_flags
import pandas
from six.moves import urllib
from sox import Transformer
import tensorflow as tf


LIBRI_SPEECH_URLS = {
    "train-clean-100":
        "http://www.openslr.org/resources/12/train-clean-100.tar.gz",
    "train-clean-360":
        "http://www.openslr.org/resources/12/train-clean-360.tar.gz",
    "train-other-500":
        "http://www.openslr.org/resources/12/train-other-500.tar.gz",
    "dev-clean":
        "http://www.openslr.org/resources/12/dev-clean.tar.gz",
    "dev-other":
        "http://www.openslr.org/resources/12/dev-other.tar.gz",
    "test-clean":
        "http://www.openslr.org/resources/12/test-clean.tar.gz",
    "test-other":
        "http://www.openslr.org/resources/12/test-other.tar.gz"
}
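
# The numbers in the training split names (100, 360, 500) refer to the hours
# of audio in each split.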


def download_and_extract(directory, url):
  """Download and extract the given dataset split.

  Args:
    directory: the directory into which the tarball is extracted.
    url: the URL from which to download the data file.
  """
  if not tf.gfile.Exists(directory):
    tf.gfile.MakeDirs(directory)

  _, tar_filepath = tempfile.mkstemp(suffix=".tar.gz")

  try:
    tf.logging.info("Downloading %s to %s", url, tar_filepath)

    def _progress(count, block_size, total_size):
      sys.stdout.write("\r>> Downloading {} {:.1f}%".format(
          tar_filepath, 100.0 * count * block_size / total_size))
      sys.stdout.flush()

    urllib.request.urlretrieve(url, tar_filepath, _progress)
    print()
    statinfo = os.stat(tar_filepath)
    tf.logging.info("Successfully downloaded %s, size(bytes): %d", url,
                    statinfo.st_size)
    with tarfile.open(tar_filepath, "r") as tar:
      tar.extractall(directory)
  finally:
    # Remove the temporary tarball whether or not extraction succeeded.
    tf.gfile.Remove(tar_filepath)
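
# Illustrative usage (paths assume the default --data_dir):
#   download_and_extract("/tmp/librispeech_data/dev-clean",
#                        LIBRI_SPEECH_URLS["dev-clean"])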


def convert_audio_and_split_transcript(input_dir, source_name, target_name,
                                       output_dir, output_file):
  """Convert FLAC audio to WAV and split the transcript file.

  Audio files are converted from FLAC to WAV with the sox.Transformer library.
  In the transcript file, each line contains a sequence id and the
  corresponding transcript, separated by a space:
    Input data format: seq-id transcript_of_seq-id
    For example:
      1-2-0 transcript_of_1-2-0.flac
      1-2-1 transcript_of_1-2-1.flac
      ...

  Each sequence id has a corresponding .flac file. This function parses the
  transcript file and generates a new csv file with three columns:
    "wav_filename": the absolute path to a wav file.
    "wav_filesize": the size of the corresponding wav file.
    "transcript": the transcript for this audio segment.

  Args:
    input_dir: the directory which holds the input dataset.
    source_name: the name of the specified dataset. e.g. test-clean
    target_name: the directory name for the newly generated audio files.
      e.g. test-clean-wav
    output_dir: the directory to place the newly generated csv files.
    output_file: the name of the newly generated csv file. e.g. test-clean.csv
  """
  tf.logging.info("Preprocessing audio and transcript for %s", source_name)
  source_dir = os.path.join(input_dir, source_name)
  target_dir = os.path.join(input_dir, target_name)

  if not tf.gfile.Exists(target_dir):
    tf.gfile.MakeDirs(target_dir)

  files = []
  tfm = Transformer()
  # Locate every transcript file in the split, then convert the audio it
  # references and collect (wav path, size, transcript) rows.
  for root, _, filenames in tf.gfile.Walk(source_dir):
    for filename in fnmatch.filter(filenames, "*.trans.txt"):
      trans_file = os.path.join(root, filename)
      with codecs.open(trans_file, "r", "utf-8") as fin:
        for line in fin:
          seqid, transcript = line.split(" ", 1)
          # Normalize the transcript to lower-case ASCII.
          transcript = unicodedata.normalize("NFKD", transcript).encode(
              "ascii", "ignore").decode("ascii", "ignore").strip().lower()

          # Convert FLAC to WAV, skipping files converted on a previous run.
          flac_file = os.path.join(root, seqid + ".flac")
          wav_file = os.path.join(target_dir, seqid + ".wav")
          if not tf.gfile.Exists(wav_file):
            tfm.build(flac_file, wav_file)
          wav_filesize = os.path.getsize(wav_file)

          files.append((os.path.abspath(wav_file), wav_filesize, transcript))

  # Write the collected rows to a tab-separated csv file with columns
  # "wav_filename", "wav_filesize", "transcript".
  csv_file_path = os.path.join(output_dir, output_file)
  df = pandas.DataFrame(
      data=files, columns=["wav_filename", "wav_filesize", "transcript"])
  df.to_csv(csv_file_path, index=False, sep="\t")
  tf.logging.info("Successfully generated csv file {}".format(csv_file_path))


def download_and_process_datasets(directory, datasets):
  """Download and pre-process the specified list of LibriSpeech datasets.

  Args:
    directory: the directory to put all the downloaded and preprocessed data.
    datasets: list of dataset names that will be downloaded and processed.
  """
  tf.logging.info("Preparing LibriSpeech dataset: {}".format(
      ",".join(datasets)))
  for dataset in datasets:
    tf.logging.info("Preparing dataset %s", dataset)
    dataset_dir = os.path.join(directory, dataset)
    download_and_extract(dataset_dir, LIBRI_SPEECH_URLS[dataset])
    convert_audio_and_split_transcript(
        dataset_dir + "/LibriSpeech", dataset, dataset + "-wav",
        dataset_dir + "/LibriSpeech", dataset + ".csv")


def define_data_download_flags():
  """Define flags for data downloading."""
  absl_flags.DEFINE_string(
      "data_dir", "/tmp/librispeech_data",
      "Directory to download data and extract the tarball")
  absl_flags.DEFINE_bool("train_only", False,
                         "If true, only download the training set")
  absl_flags.DEFINE_bool("dev_only", False,
                         "If true, only download the dev set")
  absl_flags.DEFINE_bool("test_only", False,
                         "If true, only download the test set")


def main(_):
  if not tf.gfile.Exists(FLAGS.data_dir):
    tf.gfile.MakeDirs(FLAGS.data_dir)

  if FLAGS.train_only:
    download_and_process_datasets(
        FLAGS.data_dir,
        ["train-clean-100", "train-clean-360", "train-other-500"])
  elif FLAGS.dev_only:
    download_and_process_datasets(FLAGS.data_dir, ["dev-clean", "dev-other"])
  elif FLAGS.test_only:
    download_and_process_datasets(FLAGS.data_dir, ["test-clean", "test-other"])
  else:
    # By default, download and process all of the splits.
    download_and_process_datasets(FLAGS.data_dir, LIBRI_SPEECH_URLS.keys())


if __name__ == "__main__":
  tf.logging.set_verbosity(tf.logging.INFO)
  define_data_download_flags()
  FLAGS = absl_flags.FLAGS
  absl_app.run(main)