# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Generate tf.data.Dataset object for deep speech training/evaluation.""" | |
from __future__ import absolute_import | |
from __future__ import division | |
from __future__ import print_function | |
import math | |
import random | |
# pylint: disable=g-bad-import-order | |
import numpy as np | |
from six.moves import xrange # pylint: disable=redefined-builtin | |
import soundfile | |
import tensorflow as tf | |
# pylint: enable=g-bad-import-order | |
import data.featurizer as featurizer # pylint: disable=g-bad-import-order | |


class AudioConfig(object):
  """Configs for spectrogram extraction from audio."""

  def __init__(self,
               sample_rate,
               window_ms,
               stride_ms,
               normalize=False):
    """Initialize the AudioConfig class.

    Args:
      sample_rate: an integer denoting the sample rate of the input waveform.
      window_ms: an integer for the length of a spectrogram frame, in ms.
      stride_ms: an integer for the frame stride, in ms.
      normalize: a boolean for whether to apply normalization on the audio
        feature.
    """
    self.sample_rate = sample_rate
    self.window_ms = window_ms
    self.stride_ms = stride_ms
    self.normalize = normalize


class DatasetConfig(object):
  """Config class for generating the DeepSpeechDataset."""

  def __init__(self, audio_config, data_path, vocab_file_path, sortagrad):
    """Initialize the configs for deep speech dataset.

    Args:
      audio_config: AudioConfig object specifying the audio-related configs.
      data_path: a string denoting the full path of a manifest file.
      vocab_file_path: a string specifying the vocabulary file path.
      sortagrad: a boolean, if set to true, audio sequences will be fed by
        increasing length in the first training epoch, which will
        expedite network convergence.

    Raises:
      AssertionError: raised if the data or vocabulary file path does not
        exist.
    """
    self.audio_config = audio_config
    assert tf.gfile.Exists(data_path)
    assert tf.gfile.Exists(vocab_file_path)
    self.data_path = data_path
    self.vocab_file_path = vocab_file_path
    self.sortagrad = sortagrad


def _normalize_audio_feature(audio_feature):
  """Perform mean and variance normalization on the spectrogram feature.

  Args:
    audio_feature: a numpy array for the spectrogram feature.

  Returns:
    a numpy array of the normalized spectrogram.
  """
  # Normalize each feature bin to zero mean and unit variance across time;
  # the small epsilon guards against division by zero for silent bins.
  mean = np.mean(audio_feature, axis=0)
  var = np.var(audio_feature, axis=0)
  normalized = (audio_feature - mean) / (np.sqrt(var) + 1e-6)
  return normalized


def _preprocess_audio(audio_file_path, audio_featurizer, normalize):
  """Load the audio file and compute spectrogram feature."""
  data, _ = soundfile.read(audio_file_path)
  feature = featurizer.compute_spectrogram_feature(
      data, audio_featurizer.sample_rate, audio_featurizer.stride_ms,
      audio_featurizer.window_ms)
  # Feature normalization
  if normalize:
    feature = _normalize_audio_feature(feature)
  # Adding Channel dimension for conv2D input.
  feature = np.expand_dims(feature, axis=2)
  return feature


def _preprocess_data(file_path):
  """Generate a list of tuples (wav_filename, wav_filesize, transcript).

  Each dataset file contains three columns: "wav_filename", "wav_filesize",
  and "transcript". This function parses the tab-separated csv file and sorts
  the examples by increasing audio length (indicated by wav_filesize).
  As the waveforms are ordered by increasing length, audio samples in a
  mini-batch have similar length.

  Args:
    file_path: a string specifying the csv file path for a dataset.

  Returns:
    A list of tuples (wav_filename, wav_filesize, transcript) sorted by
    file_size.
  """
  tf.logging.info("Loading data set {}".format(file_path))
  with tf.gfile.Open(file_path, "r") as f:
    lines = f.read().splitlines()
  # Skip the csv header in lines[0].
  lines = lines[1:]
  # The metadata file is tab separated.
  lines = [line.split("\t", 2) for line in lines]
  # Sort input data by the length of audio sequence.
  lines.sort(key=lambda item: int(item[1]))
  return [tuple(line) for line in lines]
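

# A made-up illustration of the manifest layout _preprocess_data expects
# (tab-separated, one header row); the file names, sizes, and transcripts
# below are placeholders, not files shipped with this code:
#
#   wav_filename<TAB>wav_filesize<TAB>transcript
#   train/sample-0001.wav<TAB>52301<TAB>hello world
#   train/sample-0002.wav<TAB>61852<TAB>how are you doing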


class DeepSpeechDataset(object):
  """Dataset class for training/evaluation of DeepSpeech model."""

  def __init__(self, dataset_config):
    """Initialize the DeepSpeechDataset class.

    Args:
      dataset_config: DatasetConfig object.
    """
    self.config = dataset_config
    # Instantiate audio feature extractor.
    self.audio_featurizer = featurizer.AudioFeaturizer(
        sample_rate=self.config.audio_config.sample_rate,
        window_ms=self.config.audio_config.window_ms,
        stride_ms=self.config.audio_config.stride_ms)
    # Instantiate text feature extractor.
    self.text_featurizer = featurizer.TextFeaturizer(
        vocab_file=self.config.vocab_file_path)

    self.speech_labels = self.text_featurizer.speech_labels
    self.entries = _preprocess_data(self.config.data_path)
    # The generated spectrogram will have 161 feature bins.
    self.num_feature_bins = 161


def batch_wise_dataset_shuffle(entries, epoch_index, sortagrad, batch_size):
  """Batch-wise shuffling of the data entries.

  Each data entry is in the format of (audio_file, file_size, transcript).
  If epoch_index is 0 and sortagrad is true, we don't perform shuffling and
  return entries in sorted file_size order. Otherwise, we perform batch-wise
  shuffling.

  Args:
    entries: a list of data entries.
    epoch_index: an integer denoting the epoch index.
    sortagrad: a boolean to control whether to sort the audio in the first
      training epoch.
    batch_size: an integer for the batch size.

  Returns:
    The shuffled data entries.
  """
  shuffled_entries = []
  if epoch_index == 0 and sortagrad:
    # No need to shuffle.
    shuffled_entries = entries
  else:
    # Shuffle entries batch-wise.
    max_buckets = int(math.floor(len(entries) / batch_size))
    total_buckets = [i for i in xrange(max_buckets)]
    random.shuffle(total_buckets)
    shuffled_entries = []
    for i in total_buckets:
      shuffled_entries.extend(entries[i * batch_size : (i + 1) * batch_size])
    # If the last batch doesn't contain enough batch_size examples,
    # just append it to the shuffled_entries.
    shuffled_entries.extend(entries[max_buckets * batch_size:])
  return shuffled_entries
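

# A minimal sketch of calling batch_wise_dataset_shuffle on toy entries; the
# entries below are made up and only illustrate that whole mini-batch buckets
# are permuted while each bucket keeps samples of similar length:
#
#   toy_entries = [("a.wav", "10", "x"), ("b.wav", "20", "y"),
#                  ("c.wav", "30", "z"), ("d.wav", "40", "w")]
#   batch_wise_dataset_shuffle(toy_entries, epoch_index=1, sortagrad=True,
#                              batch_size=2)
#   # May return, e.g.:
#   # [("c.wav", "30", "z"), ("d.wav", "40", "w"),
#   #  ("a.wav", "10", "x"), ("b.wav", "20", "y")]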


def input_fn(batch_size, deep_speech_dataset, repeat=1):
  """Input function for model training and evaluation.

  Args:
    batch_size: an integer denoting the size of a batch.
    deep_speech_dataset: DeepSpeechDataset object.
    repeat: an integer for how many times to repeat the dataset.

  Returns:
    a tf.data.Dataset object for the model to consume.
  """
  # Dataset properties
  data_entries = deep_speech_dataset.entries
  num_feature_bins = deep_speech_dataset.num_feature_bins
  audio_featurizer = deep_speech_dataset.audio_featurizer
  feature_normalize = deep_speech_dataset.config.audio_config.normalize
  text_featurizer = deep_speech_dataset.text_featurizer

  def _gen_data():
    """Dataset generator function."""
    for audio_file, _, transcript in data_entries:
      features = _preprocess_audio(
          audio_file, audio_featurizer, feature_normalize)
      labels = featurizer.compute_label_feature(
          transcript, text_featurizer.token_to_index)
      input_length = [features.shape[0]]
      label_length = [len(labels)]
      # Yield a tuple of (features, labels) where features is a dict containing
      # all info about the actual data features.
      yield (
          {
              "features": features,
              "input_length": input_length,
              "label_length": label_length
          },
          labels)

  dataset = tf.data.Dataset.from_generator(
      _gen_data,
      output_types=(
          {
              "features": tf.float32,
              "input_length": tf.int32,
              "label_length": tf.int32
          },
          tf.int32),
      output_shapes=(
          {
              "features": tf.TensorShape([None, num_feature_bins, 1]),
              "input_length": tf.TensorShape([1]),
              "label_length": tf.TensorShape([1])
          },
          tf.TensorShape([None]))
  )

  # Repeat and batch the dataset.
  dataset = dataset.repeat(repeat)

  # Pad each feature to the max length within the batch.
  dataset = dataset.padded_batch(
      batch_size=batch_size,
      padded_shapes=(
          {
              "features": tf.TensorShape([None, num_feature_bins, 1]),
              "input_length": tf.TensorShape([1]),
              "label_length": tf.TensorShape([1])
          },
          tf.TensorShape([None]))
  )

  # Prefetch to improve the speed of the input pipeline.
  dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
  return dataset
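

# A minimal end-to-end usage sketch, assuming the sibling data.featurizer
# module, a manifest csv, and a vocabulary file are available; the paths and
# hyperparameter values below are placeholders, not values from this file:
#
#   audio_conf = AudioConfig(
#       sample_rate=16000, window_ms=20, stride_ms=10, normalize=True)
#   dataset_conf = DatasetConfig(
#       audio_config=audio_conf,
#       data_path="/path/to/train.csv",             # hypothetical manifest
#       vocab_file_path="/path/to/vocabulary.txt",  # hypothetical vocab file
#       sortagrad=True)
#   speech_dataset = DeepSpeechDataset(dataset_conf)
#   train_dataset = input_fn(
#       batch_size=32, deep_speech_dataset=speech_dataset, repeat=1)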