"""Search algorithms for transducer models."""
from typing import List
from typing import Optional
from typing import Union
import numpy as np
import torch
from espnet.nets.pytorch_backend.transducer.utils import create_lm_batch_state
from espnet.nets.pytorch_backend.transducer.utils import init_lm_state
from espnet.nets.pytorch_backend.transducer.utils import is_prefix
from espnet.nets.pytorch_backend.transducer.utils import recombine_hyps
from espnet.nets.pytorch_backend.transducer.utils import select_lm_state
from espnet.nets.pytorch_backend.transducer.utils import substract
from espnet.nets.transducer_decoder_interface import Hypothesis
from espnet.nets.transducer_decoder_interface import NSCHypothesis
from espnet.nets.transducer_decoder_interface import TransducerDecoderInterface
class BeamSearchTransducer:
"""Beam search implementation for transducer."""
def __init__(
self,
decoder: Union[TransducerDecoderInterface, torch.nn.Module],
joint_network: torch.nn.Module,
beam_size: int,
        lm: Optional[torch.nn.Module] = None,
lm_weight: float = 0.1,
search_type: str = "default",
max_sym_exp: int = 2,
u_max: int = 50,
nstep: int = 1,
prefix_alpha: int = 1,
score_norm: bool = True,
nbest: int = 1,
):
"""Initialize transducer beam search.
Args:
decoder: Decoder class to use
joint_network: Joint Network class
beam_size: Number of hypotheses kept during search
lm: LM class to use
lm_weight: lm weight for soft fusion
search_type: type of algorithm to use for search
max_sym_exp: number of maximum symbol expansions at each time step ("tsd")
u_max: maximum output sequence length ("alsd")
nstep: number of maximum expansion steps at each time step ("nsc")
prefix_alpha: maximum prefix length in prefix search ("nsc")
score_norm: normalize final scores by length ("default")
nbest: number of returned final hypothesis
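
        Example:
            A minimal usage sketch; ``decoder``, ``joint_net`` and ``enc_out``
            stand in for objects built elsewhere::

                searcher = BeamSearchTransducer(decoder, joint_net, beam_size=5)
                nbest_hyps = searcher(enc_out)  # enc_out: (T_max, D_enc)
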
"""
self.decoder = decoder
self.joint_network = joint_network
self.beam_size = beam_size
self.hidden_size = decoder.dunits
self.vocab_size = decoder.odim
self.blank = decoder.blank
if self.beam_size <= 1:
self.search_algorithm = self.greedy_search
elif search_type == "default":
self.search_algorithm = self.default_beam_search
elif search_type == "tsd":
self.search_algorithm = self.time_sync_decoding
elif search_type == "alsd":
self.search_algorithm = self.align_length_sync_decoding
elif search_type == "nsc":
self.search_algorithm = self.nsc_beam_search
else:
            raise NotImplementedError(f"Unsupported search type: {search_type}")
self.lm = lm
self.lm_weight = lm_weight
if lm is not None:
self.use_lm = True
            self.is_wordlm = hasattr(lm.predictor, "wordlm")
self.lm_predictor = lm.predictor.wordlm if self.is_wordlm else lm.predictor
self.lm_layers = len(self.lm_predictor.rnn)
else:
self.use_lm = False
self.max_sym_exp = max_sym_exp
self.u_max = u_max
self.nstep = nstep
self.prefix_alpha = prefix_alpha
self.score_norm = score_norm
self.nbest = nbest
def __call__(self, h: torch.Tensor) -> Union[List[Hypothesis], List[NSCHypothesis]]:
"""Perform beam search.
Args:
h: Encoded speech features (T_max, D_enc)
Returns:
nbest_hyps: N-best decoding results
"""
self.decoder.set_device(h.device)
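        # Decoders exposing a `decoders` module list (the custom decoder)
        # handle their data type internally; others need it set explicitly.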
if not hasattr(self.decoder, "decoders"):
self.decoder.set_data_type(h.dtype)
nbest_hyps = self.search_algorithm(h)
return nbest_hyps
def sort_nbest(
self, hyps: Union[List[Hypothesis], List[NSCHypothesis]]
) -> Union[List[Hypothesis], List[NSCHypothesis]]:
"""Sort hypotheses by score or score given sequence length.
Args:
hyps: list of hypotheses
Return:
hyps: sorted list of hypotheses
"""
if self.score_norm:
hyps.sort(key=lambda x: x.score / len(x.yseq), reverse=True)
else:
hyps.sort(key=lambda x: x.score, reverse=True)
return hyps[: self.nbest]
def greedy_search(self, h: torch.Tensor) -> List[Hypothesis]:
"""Greedy search implementation for transformer-transducer.
Args:
h: Encoded speech features (T_max, D_enc)
Returns:
hyp: 1-best decoding results
"""
dec_state = self.decoder.init_state(1)
hyp = Hypothesis(score=0.0, yseq=[self.blank], dec_state=dec_state)
cache = {}
y, state, _ = self.decoder.score(hyp, cache)
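        # Frame-synchronous greedy decoding: for each encoder frame, take the
        # argmax of the joint network output. A non-blank prediction extends
        # the hypothesis and advances the decoder state; blank moves on to
        # the next frame.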
        for hi in h:
ytu = torch.log_softmax(self.joint_network(hi, y), dim=-1)
logp, pred = torch.max(ytu, dim=-1)
if pred != self.blank:
hyp.yseq.append(int(pred))
hyp.score += float(logp)
hyp.dec_state = state
y, state, _ = self.decoder.score(hyp, cache)
return [hyp]
def default_beam_search(self, h: torch.Tensor) -> List[Hypothesis]:
"""Beam search implementation.
Args:
x: Encoded speech features (T_max, D_enc)
Returns:
nbest_hyps: N-best decoding results
"""
beam = min(self.beam_size, self.vocab_size)
beam_k = min(beam, (self.vocab_size - 1))
dec_state = self.decoder.init_state(1)
kept_hyps = [Hypothesis(score=0.0, yseq=[self.blank], dec_state=dec_state)]
cache = {}
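        # One expansion round per encoder frame (cf. Graves, 2012): pop the
        # best pending hypothesis, add its blank extension to kept_hyps, and
        # push its top-k non-blank extensions back for further expansion.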
for hi in h:
hyps = kept_hyps
kept_hyps = []
while True:
max_hyp = max(hyps, key=lambda x: x.score)
hyps.remove(max_hyp)
y, state, lm_tokens = self.decoder.score(max_hyp, cache)
ytu = torch.log_softmax(self.joint_network(hi, y), dim=-1)
top_k = ytu[1:].topk(beam_k, dim=-1)
kept_hyps.append(
Hypothesis(
score=(max_hyp.score + float(ytu[0:1])),
yseq=max_hyp.yseq[:],
dec_state=max_hyp.dec_state,
lm_state=max_hyp.lm_state,
)
)
if self.use_lm:
lm_state, lm_scores = self.lm.predict(max_hyp.lm_state, lm_tokens)
else:
lm_state = max_hyp.lm_state
for logp, k in zip(*top_k):
score = max_hyp.score + float(logp)
if self.use_lm:
score += self.lm_weight * lm_scores[0][k + 1]
hyps.append(
Hypothesis(
score=score,
yseq=max_hyp.yseq[:] + [int(k + 1)],
dec_state=state,
lm_state=lm_state,
)
)
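                # Expansion within this frame stops once at least `beam`
                # blank-extended hypotheses outscore the best hypothesis still
                # waiting to be expanded.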
hyps_max = float(max(hyps, key=lambda x: x.score).score)
kept_most_prob = sorted(
[hyp for hyp in kept_hyps if hyp.score > hyps_max],
key=lambda x: x.score,
)
if len(kept_most_prob) >= beam:
kept_hyps = kept_most_prob
break
return self.sort_nbest(kept_hyps)
def time_sync_decoding(self, h: torch.Tensor) -> List[Hypothesis]:
"""Time synchronous beam search implementation.
Based on https://ieeexplore.ieee.org/document/9053040
Args:
h: Encoded speech features (T_max, D_enc)
Returns:
nbest_hyps: N-best decoding results
"""
beam = min(self.beam_size, self.vocab_size)
beam_state = self.decoder.init_state(beam)
B = [
Hypothesis(
yseq=[self.blank],
score=0.0,
dec_state=self.decoder.select_state(beam_state, 0),
)
]
cache = {}
if self.use_lm and not self.is_wordlm:
B[0].lm_state = init_lm_state(self.lm_predictor)
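        # Time-synchronous decoding: each frame allows up to max_sym_exp
        # symbol expansions. A collects blank extensions (duplicate label
        # sequences merged via log-sum-exp); D collects label extensions,
        # which become the candidates C for the next expansion round.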
for hi in h:
A = []
C = B
h_enc = hi.unsqueeze(0)
for v in range(self.max_sym_exp):
D = []
beam_y, beam_state, beam_lm_tokens = self.decoder.batch_score(
C,
beam_state,
cache,
self.use_lm,
)
beam_logp = torch.log_softmax(self.joint_network(h_enc, beam_y), dim=-1)
beam_topk = beam_logp[:, 1:].topk(beam, dim=-1)
                seq_A = [a.yseq for a in A]
for i, hyp in enumerate(C):
if hyp.yseq not in seq_A:
A.append(
Hypothesis(
score=(hyp.score + float(beam_logp[i, 0])),
yseq=hyp.yseq[:],
dec_state=hyp.dec_state,
lm_state=hyp.lm_state,
)
)
else:
dict_pos = seq_A.index(hyp.yseq)
A[dict_pos].score = np.logaddexp(
A[dict_pos].score, (hyp.score + float(beam_logp[i, 0]))
)
if v < (self.max_sym_exp - 1):
if self.use_lm:
beam_lm_states = create_lm_batch_state(
[c.lm_state for c in C], self.lm_layers, self.is_wordlm
)
beam_lm_states, beam_lm_scores = self.lm.buff_predict(
beam_lm_states, beam_lm_tokens, len(C)
)
for i, hyp in enumerate(C):
for logp, k in zip(beam_topk[0][i], beam_topk[1][i] + 1):
new_hyp = Hypothesis(
score=(hyp.score + float(logp)),
yseq=(hyp.yseq + [int(k)]),
dec_state=self.decoder.select_state(beam_state, i),
lm_state=hyp.lm_state,
)
if self.use_lm:
new_hyp.score += self.lm_weight * beam_lm_scores[i, k]
new_hyp.lm_state = select_lm_state(
beam_lm_states, i, self.lm_layers, self.is_wordlm
)
D.append(new_hyp)
C = sorted(D, key=lambda x: x.score, reverse=True)[:beam]
B = sorted(A, key=lambda x: x.score, reverse=True)[:beam]
return self.sort_nbest(B)
def align_length_sync_decoding(self, h: torch.Tensor) -> List[Hypothesis]:
"""Alignment-length synchronous beam search implementation.
Based on https://ieeexplore.ieee.org/document/9053040
Args:
h: Encoded speech features (T_max, D_enc)
Returns:
nbest_hyps: N-best decoding results
"""
beam = min(self.beam_size, self.vocab_size)
h_length = int(h.size(0))
u_max = min(self.u_max, (h_length - 1))
beam_state = self.decoder.init_state(beam)
B = [
Hypothesis(
yseq=[self.blank],
score=0.0,
dec_state=self.decoder.select_state(beam_state, 0),
)
]
final = []
cache = {}
if self.use_lm and not self.is_wordlm:
B[0].lm_state = init_lm_state(self.lm_predictor)
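        # Alignment-length synchronous decoding: i indexes the combined
        # alignment length t + u, so hypotheses of different output lengths u
        # attend to different encoder frames t within the same step. A
        # hypothesis that consumes the last frame with a blank is final.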
for i in range(h_length + u_max):
A = []
B_ = []
h_states = []
for hyp in B:
u = len(hyp.yseq) - 1
t = i - u + 1
if t > (h_length - 1):
continue
B_.append(hyp)
h_states.append((t, h[t]))
if B_:
beam_y, beam_state, beam_lm_tokens = self.decoder.batch_score(
B_,
beam_state,
cache,
self.use_lm,
)
                h_enc = torch.stack([hs[1] for hs in h_states])
beam_logp = torch.log_softmax(self.joint_network(h_enc, beam_y), dim=-1)
beam_topk = beam_logp[:, 1:].topk(beam, dim=-1)
if self.use_lm:
beam_lm_states = create_lm_batch_state(
[b.lm_state for b in B_], self.lm_layers, self.is_wordlm
)
beam_lm_states, beam_lm_scores = self.lm.buff_predict(
beam_lm_states, beam_lm_tokens, len(B_)
)
                for j, hyp in enumerate(B_):
                    new_hyp = Hypothesis(
                        score=(hyp.score + float(beam_logp[j, 0])),
                        yseq=hyp.yseq[:],
                        dec_state=hyp.dec_state,
                        lm_state=hyp.lm_state,
                    )
                    A.append(new_hyp)
                    if h_states[j][0] == (h_length - 1):
                        final.append(new_hyp)
                    for logp, k in zip(beam_topk[0][j], beam_topk[1][j] + 1):
                        new_hyp = Hypothesis(
                            score=(hyp.score + float(logp)),
                            yseq=(hyp.yseq[:] + [int(k)]),
                            dec_state=self.decoder.select_state(beam_state, j),
                            lm_state=hyp.lm_state,
                        )
                        if self.use_lm:
                            new_hyp.score += self.lm_weight * beam_lm_scores[j, k]
                            new_hyp.lm_state = select_lm_state(
                                beam_lm_states, j, self.lm_layers, self.is_wordlm
                            )
                        A.append(new_hyp)
B = sorted(A, key=lambda x: x.score, reverse=True)[:beam]
B = recombine_hyps(B)
if final:
return self.sort_nbest(final)
else:
return B
def nsc_beam_search(self, h: torch.Tensor) -> List[NSCHypothesis]:
"""N-step constrained beam search implementation.
Based and modified from https://arxiv.org/pdf/2002.03577.pdf.
Please reference ESPnet (b-flo, PR #2444) for any usage outside ESPnet
until further modifications.
Note: the algorithm is not in his "complete" form but works almost as
intended.
Args:
h: Encoded speech features (T_max, D_enc)
Returns:
nbest_hyps: N-best decoding results
"""
beam = min(self.beam_size, self.vocab_size)
beam_k = min(beam, (self.vocab_size - 1))
beam_state = self.decoder.init_state(beam)
init_tokens = [
NSCHypothesis(
yseq=[self.blank],
score=0.0,
dec_state=self.decoder.select_state(beam_state, 0),
)
]
cache = {}
beam_y, beam_state, beam_lm_tokens = self.decoder.batch_score(
init_tokens,
beam_state,
cache,
self.use_lm,
)
state = self.decoder.select_state(beam_state, 0)
if self.use_lm:
beam_lm_states, beam_lm_scores = self.lm.buff_predict(
None, beam_lm_tokens, 1
)
lm_state = select_lm_state(
beam_lm_states, 0, self.lm_layers, self.is_wordlm
)
lm_scores = beam_lm_scores[0]
else:
lm_state = None
lm_scores = None
kept_hyps = [
NSCHypothesis(
yseq=[self.blank],
score=0.0,
dec_state=state,
y=[beam_y[0]],
lm_state=lm_state,
lm_scores=lm_scores,
)
]
for hi in h:
hyps = sorted(kept_hyps, key=lambda x: len(x.yseq), reverse=True)
kept_hyps = []
h_enc = hi.unsqueeze(0)
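            # Prefix search: when one hypothesis is a prefix of another (with
            # at most prefix_alpha extra labels), add to the longer one the
            # probability of reaching it by extending the shorter prefix,
            # merging via log-sum-exp.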
for j, hyp_j in enumerate(hyps[:-1]):
for hyp_i in hyps[(j + 1) :]:
curr_id = len(hyp_j.yseq)
next_id = len(hyp_i.yseq)
if (
is_prefix(hyp_j.yseq, hyp_i.yseq)
and (curr_id - next_id) <= self.prefix_alpha
):
ytu = torch.log_softmax(
self.joint_network(hi, hyp_i.y[-1]), dim=-1
)
curr_score = hyp_i.score + float(ytu[hyp_j.yseq[next_id]])
for k in range(next_id, (curr_id - 1)):
ytu = torch.log_softmax(
self.joint_network(hi, hyp_j.y[k]), dim=-1
)
curr_score += float(ytu[hyp_j.yseq[k + 1]])
hyp_j.score = np.logaddexp(hyp_j.score, curr_score)
S = []
V = []
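            # N-step constrained expansion: S gathers blank-terminated
            # extensions, V gathers label extensions; V is re-expanded up to
            # nstep times before moving to the next frame.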
for n in range(self.nstep):
beam_y = torch.stack([hyp.y[-1] for hyp in hyps])
beam_logp = torch.log_softmax(self.joint_network(h_enc, beam_y), dim=-1)
beam_topk = beam_logp[:, 1:].topk(beam_k, dim=-1)
for i, hyp in enumerate(hyps):
S.append(
NSCHypothesis(
yseq=hyp.yseq[:],
score=hyp.score + float(beam_logp[i, 0:1]),
y=hyp.y[:],
dec_state=hyp.dec_state,
lm_state=hyp.lm_state,
lm_scores=hyp.lm_scores,
)
)
for logp, k in zip(beam_topk[0][i], beam_topk[1][i] + 1):
score = hyp.score + float(logp)
if self.use_lm:
score += self.lm_weight * float(hyp.lm_scores[k])
V.append(
NSCHypothesis(
yseq=hyp.yseq[:] + [int(k)],
score=score,
y=hyp.y[:],
dec_state=hyp.dec_state,
lm_state=hyp.lm_state,
lm_scores=hyp.lm_scores,
)
)
V.sort(key=lambda x: x.score, reverse=True)
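                # Remove candidates duplicating a current hypothesis, then
                # prune to the beam width (`substract` is the helper's
                # spelling in espnet.nets.pytorch_backend.transducer.utils).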
V = substract(V, hyps)[:beam]
beam_state = self.decoder.create_batch_states(
beam_state,
[v.dec_state for v in V],
[v.yseq for v in V],
)
beam_y, beam_state, beam_lm_tokens = self.decoder.batch_score(
V,
beam_state,
cache,
self.use_lm,
)
if self.use_lm:
beam_lm_states = create_lm_batch_state(
[v.lm_state for v in V], self.lm_layers, self.is_wordlm
)
beam_lm_states, beam_lm_scores = self.lm.buff_predict(
beam_lm_states, beam_lm_tokens, len(V)
)
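                # Intermediate steps keep expanding from the refreshed decoder
                # and LM states; the final step also folds in the blank score
                # (when nstep > 1) before the (S + V) beam is pruned.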
if n < (self.nstep - 1):
for i, v in enumerate(V):
v.y.append(beam_y[i])
v.dec_state = self.decoder.select_state(beam_state, i)
if self.use_lm:
v.lm_state = select_lm_state(
beam_lm_states, i, self.lm_layers, self.is_wordlm
)
v.lm_scores = beam_lm_scores[i]
hyps = V[:]
else:
beam_logp = torch.log_softmax(
self.joint_network(h_enc, beam_y), dim=-1
)
for i, v in enumerate(V):
if self.nstep != 1:
v.score += float(beam_logp[i, 0])
v.y.append(beam_y[i])
v.dec_state = self.decoder.select_state(beam_state, i)
if self.use_lm:
v.lm_state = select_lm_state(
beam_lm_states, i, self.lm_layers, self.is_wordlm
)
v.lm_scores = beam_lm_scores[i]
kept_hyps = sorted((S + V), key=lambda x: x.score, reverse=True)[:beam]
return self.sort_nbest(kept_hyps)