# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Generate tf.data.Dataset object for deep speech training/evaluation.""" | |
from __future__ import absolute_import | |
from __future__ import division | |
from __future__ import print_function | |
import math | |
import random | |
# pylint: disable=g-bad-import-order | |
import numpy as np | |
from six.moves import xrange # pylint: disable=redefined-builtin | |
import soundfile | |
import tensorflow as tf | |
# pylint: enable=g-bad-import-order | |
import data.featurizer as featurizer # pylint: disable=g-bad-import-order | |


class AudioConfig(object):
  """Configs for spectrogram extraction from audio."""

  def __init__(self,
               sample_rate,
               window_ms,
               stride_ms,
               normalize=False):
    """Initialize the AudioConfig class.

    Args:
      sample_rate: an integer denoting the sample rate of the input waveform.
      window_ms: an integer for the length of a spectrogram frame, in ms.
      stride_ms: an integer for the frame stride, in ms.
      normalize: a boolean for whether to apply normalization on the audio
        feature.
    """
    self.sample_rate = sample_rate
    self.window_ms = window_ms
    self.stride_ms = stride_ms
    self.normalize = normalize


class DatasetConfig(object):
  """Config class for generating the DeepSpeechDataset."""

  def __init__(self, audio_config, data_path, vocab_file_path, sortagrad):
    """Initialize the configs for deep speech dataset.

    Args:
      audio_config: AudioConfig object specifying the audio-related configs.
      data_path: a string denoting the full path of a manifest file.
      vocab_file_path: a string specifying the vocabulary file path.
      sortagrad: a boolean, if set to true, audio sequences will be fed by
        increasing length in the first training epoch, which will
        expedite network convergence.

    Raises:
      AssertionError: raised if the data or vocabulary file path does not
        exist.
    """
    self.audio_config = audio_config
    assert tf.gfile.Exists(data_path)
    assert tf.gfile.Exists(vocab_file_path)
    self.data_path = data_path
    self.vocab_file_path = vocab_file_path
    self.sortagrad = sortagrad


def _normalize_audio_feature(audio_feature):
  """Perform mean and variance normalization on the spectrogram feature.

  Args:
    audio_feature: a numpy array for the spectrogram feature.

  Returns:
    a numpy array of the normalized spectrogram.
  """
  # Normalize each feature bin to zero mean and unit variance across time;
  # the small epsilon guards against division by zero for silent bins.
  mean = np.mean(audio_feature, axis=0)
  var = np.var(audio_feature, axis=0)
  normalized = (audio_feature - mean) / (np.sqrt(var) + 1e-6)
  return normalized


def _preprocess_audio(audio_file_path, audio_featurizer, normalize):
  """Load the audio file and compute spectrogram feature."""
  data, _ = soundfile.read(audio_file_path)
  feature = featurizer.compute_spectrogram_feature(
      data, audio_featurizer.sample_rate, audio_featurizer.stride_ms,
      audio_featurizer.window_ms)
  # Feature normalization
  if normalize:
    feature = _normalize_audio_feature(feature)
  # Adding Channel dimension for conv2D input.
  feature = np.expand_dims(feature, axis=2)
  return feature


def _preprocess_data(file_path):
  """Generate a list of tuples (wav_filename, wav_filesize, transcript).

  Each dataset file contains three columns: "wav_filename", "wav_filesize",
  and "transcript". This function parses the tab-separated csv file and sorts
  the examples by increasing audio length (indicated by wav_filesize).
  As the waveforms are ordered by increasing length, audio samples in a
  mini-batch have similar length.

  Args:
    file_path: a string specifying the csv file path for a dataset.

  Returns:
    A list of tuples (wav_filename, wav_filesize, transcript) sorted by
    file_size.
  """
  tf.logging.info("Loading data set {}".format(file_path))
  with tf.gfile.Open(file_path, "r") as f:
    lines = f.read().splitlines()
  # Skip the csv header in lines[0].
  lines = lines[1:]
  # The metadata file is tab separated.
  lines = [line.split("\t", 2) for line in lines]
  # Sort input data by the length of audio sequence.
  lines.sort(key=lambda item: int(item[1]))
  return [tuple(line) for line in lines]
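

# A made-up illustration of the manifest layout _preprocess_data expects
# (tab-separated, one header row); the file names, sizes, and transcripts
# below are placeholders, not files shipped with this code:
#
#   wav_filename<TAB>wav_filesize<TAB>transcript
#   train/sample-0001.wav<TAB>52301<TAB>hello world
#   train/sample-0002.wav<TAB>61852<TAB>how are you doing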


class DeepSpeechDataset(object):
  """Dataset class for training/evaluation of DeepSpeech model."""

  def __init__(self, dataset_config):
    """Initialize the DeepSpeechDataset class.

    Args:
      dataset_config: DatasetConfig object.
    """
    self.config = dataset_config
    # Instantiate audio feature extractor.
    self.audio_featurizer = featurizer.AudioFeaturizer(
        sample_rate=self.config.audio_config.sample_rate,
        window_ms=self.config.audio_config.window_ms,
        stride_ms=self.config.audio_config.stride_ms)
    # Instantiate text feature extractor.
    self.text_featurizer = featurizer.TextFeaturizer(
        vocab_file=self.config.vocab_file_path)

    self.speech_labels = self.text_featurizer.speech_labels
    self.entries = _preprocess_data(self.config.data_path)
    # The generated spectrogram will have 161 feature bins.
    self.num_feature_bins = 161


def batch_wise_dataset_shuffle(entries, epoch_index, sortagrad, batch_size):
  """Batch-wise shuffling of the data entries.

  Each data entry is in the format of (audio_file, file_size, transcript).
  If epoch_index is 0 and sortagrad is true, we don't perform shuffling and
  return entries in sorted file_size order. Otherwise, we perform batch-wise
  shuffling.

  Args:
    entries: a list of data entries.
    epoch_index: an integer denoting the epoch index.
    sortagrad: a boolean to control whether to sort the audio in the first
      training epoch.
    batch_size: an integer for the batch size.

  Returns:
    The shuffled data entries.
  """
  shuffled_entries = []
  if epoch_index == 0 and sortagrad:
    # No need to shuffle.
    shuffled_entries = entries
  else:
    # Shuffle entries batch-wise.
    max_buckets = int(math.floor(len(entries) / batch_size))
    total_buckets = [i for i in xrange(max_buckets)]
    random.shuffle(total_buckets)
    shuffled_entries = []
    for i in total_buckets:
      shuffled_entries.extend(entries[i * batch_size : (i + 1) * batch_size])
    # If the last batch doesn't contain enough batch_size examples,
    # just append it to the shuffled_entries.
    shuffled_entries.extend(entries[max_buckets * batch_size:])
  return shuffled_entries
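

# A minimal sketch of calling batch_wise_dataset_shuffle on toy entries; the
# entries below are made up and only illustrate that whole mini-batch buckets
# are permuted while each bucket keeps samples of similar length:
#
#   toy_entries = [("a.wav", "10", "x"), ("b.wav", "20", "y"),
#                  ("c.wav", "30", "z"), ("d.wav", "40", "w")]
#   batch_wise_dataset_shuffle(toy_entries, epoch_index=1, sortagrad=True,
#                              batch_size=2)
#   # May return, e.g.:
#   # [("c.wav", "30", "z"), ("d.wav", "40", "w"),
#   #  ("a.wav", "10", "x"), ("b.wav", "20", "y")]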


def input_fn(batch_size, deep_speech_dataset, repeat=1):
  """Input function for model training and evaluation.

  Args:
    batch_size: an integer denoting the size of a batch.
    deep_speech_dataset: DeepSpeechDataset object.
    repeat: an integer for how many times to repeat the dataset.

  Returns:
    a tf.data.Dataset object for the model to consume.
  """
  # Dataset properties
  data_entries = deep_speech_dataset.entries
  num_feature_bins = deep_speech_dataset.num_feature_bins
  audio_featurizer = deep_speech_dataset.audio_featurizer
  feature_normalize = deep_speech_dataset.config.audio_config.normalize
  text_featurizer = deep_speech_dataset.text_featurizer

  def _gen_data():
    """Dataset generator function."""
    for audio_file, _, transcript in data_entries:
      features = _preprocess_audio(
          audio_file, audio_featurizer, feature_normalize)
      labels = featurizer.compute_label_feature(
          transcript, text_featurizer.token_to_index)
      input_length = [features.shape[0]]
      label_length = [len(labels)]
      # Yield a tuple of (features, labels) where features is a dict containing
      # all info about the actual data features.
      yield (
          {
              "features": features,
              "input_length": input_length,
              "label_length": label_length
          },
          labels)

  dataset = tf.data.Dataset.from_generator(
      _gen_data,
      output_types=(
          {
              "features": tf.float32,
              "input_length": tf.int32,
              "label_length": tf.int32
          },
          tf.int32),
      output_shapes=(
          {
              "features": tf.TensorShape([None, num_feature_bins, 1]),
              "input_length": tf.TensorShape([1]),
              "label_length": tf.TensorShape([1])
          },
          tf.TensorShape([None]))
  )

  # Repeat and batch the dataset.
  dataset = dataset.repeat(repeat)

  # Pad each feature to the max length within the batch.
  dataset = dataset.padded_batch(
      batch_size=batch_size,
      padded_shapes=(
          {
              "features": tf.TensorShape([None, num_feature_bins, 1]),
              "input_length": tf.TensorShape([1]),
              "label_length": tf.TensorShape([1])
          },
          tf.TensorShape([None]))
  )

  # Prefetch to improve the speed of the input pipeline.
  dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
  return dataset
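

# A minimal end-to-end usage sketch, assuming the sibling data.featurizer
# module, a manifest csv, and a vocabulary file are available; the paths and
# hyperparameter values below are placeholders, not values from this file:
#
#   audio_conf = AudioConfig(
#       sample_rate=16000, window_ms=20, stride_ms=10, normalize=True)
#   dataset_conf = DatasetConfig(
#       audio_config=audio_conf,
#       data_path="/path/to/train.csv",             # hypothetical manifest
#       vocab_file_path="/path/to/vocabulary.txt",  # hypothetical vocab file
#       sortagrad=True)
#   speech_dataset = DeepSpeechDataset(dataset_conf)
#   train_dataset = input_fn(
#       batch_size=32, deep_speech_dataset=speech_dataset, repeat=1)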