PyTorch · ssl-aasist · custom_code

ash56 committed (verified) · Commit 211c22d · 1 parent: a1d9110

Add files using upload-large-folder tool

This view is limited to 50 files because the commit contains too many changes; see the raw diff for the complete change set.

Files changed (50)
  1. fairseq/examples/speech_text_joint_to_text/criterions/multi_modality_cross_entropy.py +101 -0
  2. fairseq/examples/speech_text_joint_to_text/criterions/text_guide_cross_entropy_acc.py +224 -0
  3. fairseq/examples/speech_text_joint_to_text/data/pair_denoising_dataset.py +318 -0
  4. fairseq/examples/speech_text_joint_to_text/docs/ende-mustc.md +118 -0
  5. fairseq/examples/speech_text_joint_to_text/docs/iwslt2021.md +76 -0
  6. fairseq/examples/speech_text_joint_to_text/docs/pre-training.md +192 -0
  7. fairseq/examples/speech_text_joint_to_text/models/__init__.py +8 -0
  8. fairseq/examples/speech_text_joint_to_text/models/joint_speech_text_pretrain_transformer.py +698 -0
  9. fairseq/examples/speech_text_joint_to_text/models/s2t_dualinputtransformer.py +1093 -0
  10. fairseq/examples/speech_text_joint_to_text/models/s2t_dualinputwavtransformer.py +526 -0
  11. fairseq/examples/speech_text_joint_to_text/models/s2t_dualinputxmtransformer.py +584 -0
  12. fairseq/examples/speech_text_joint_to_text/scripts/convert_model.py +71 -0
  13. fairseq/examples/speech_text_joint_to_text/scripts/g2p_encode.py +191 -0
  14. fairseq/examples/speech_text_joint_to_text/tasks/__init__.py +8 -0
  15. fairseq/examples/speech_text_joint_to_text/tasks/pair_denoising.py +447 -0
  16. fairseq/examples/speech_text_joint_to_text/tasks/speech_text_denoise_pretrain.py +654 -0
  17. fairseq/examples/speech_text_joint_to_text/tasks/speech_text_joint.py +377 -0
  18. fairseq/examples/speech_to_speech/README.md +7 -0
  19. fairseq/examples/speech_to_speech/__init__.py +6 -0
  20. fairseq/examples/speech_to_speech/asr_bleu/README.md +34 -0
  21. fairseq/examples/speech_to_speech/asr_bleu/__init__.py +0 -0
  22. fairseq/examples/speech_to_speech/asr_bleu/asr_model_cfgs.json +198 -0
  23. fairseq/examples/speech_to_speech/asr_bleu/compute_asr_bleu.py +244 -0
  24. fairseq/examples/speech_to_speech/asr_bleu/requirements.txt +7 -0
  25. fairseq/examples/speech_to_speech/asr_bleu/utils.py +306 -0
  26. fairseq/examples/speech_to_speech/benchmarking/README.md +31 -0
  27. fairseq/examples/speech_to_speech/benchmarking/configs/2StageS2ST.yaml +19 -0
  28. fairseq/examples/speech_to_speech/benchmarking/configs/3StageS2ST.yaml +28 -0
  29. fairseq/examples/speech_to_speech/benchmarking/configs/DirectS2U.yaml +22 -0
  30. fairseq/examples/speech_to_speech/benchmarking/configs/S2T.yaml +13 -0
  31. fairseq/examples/speech_to_speech/benchmarking/core.py +487 -0
  32. fairseq/examples/speech_to_speech/benchmarking/data_utils.py +264 -0
  33. fairseq/examples/speech_to_speech/benchmarking/get_metrics.py +162 -0
  34. fairseq/examples/speech_to_speech/docs/data_augmentation.md +435 -0
  35. fairseq/examples/speech_to_speech/docs/direct_s2st_discrete_units.md +181 -0
  36. fairseq/examples/speech_to_speech/docs/enhanced_direct_s2st_discrete_units.md +125 -0
  37. fairseq/examples/speech_to_speech/docs/textless_s2st_real_data.md +89 -0
  38. fairseq/examples/speech_to_speech/generate_waveform_from_code.py +116 -0
  39. fairseq/examples/speech_to_speech/preprocessing/__init__.py +4 -0
  40. fairseq/examples/speech_to_speech/preprocessing/data_utils.py +88 -0
  41. fairseq/examples/speech_to_speech/preprocessing/prep_s2spect_data.py +169 -0
  42. fairseq/examples/speech_to_speech/preprocessing/prep_s2ut_data.py +114 -0
  43. fairseq/examples/speech_to_speech/preprocessing/prep_sn_data.py +88 -0
  44. fairseq/examples/speech_to_speech/preprocessing/prep_sn_output_data.py +58 -0
  45. fairseq/examples/speech_to_speech/unity/__init__.py +7 -0
  46. fairseq/examples/speech_to_speech/unity/sequence_generator.py +626 -0
  47. fairseq/examples/speech_to_speech/unity/sequence_generator_multi_decoder.py +267 -0
  48. fairseq/examples/speech_to_text/README.md +77 -0
  49. fairseq/examples/speech_to_text/data_utils.py +383 -0
  50. fairseq/examples/speech_to_text/docs/covost_example.md +140 -0
fairseq/examples/speech_text_joint_to_text/criterions/multi_modality_cross_entropy.py ADDED
@@ -0,0 +1,101 @@
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import torch

from fairseq import utils
from fairseq.criterions import register_criterion
from fairseq.criterions.label_smoothed_cross_entropy import (
    LabelSmoothedCrossEntropyCriterion,
    LabelSmoothedCrossEntropyCriterionConfig,
    label_smoothed_nll_loss,
)


@register_criterion(
    "speech_text_pretrain_cross_entropy",
    dataclass=LabelSmoothedCrossEntropyCriterionConfig,
)
class SpeechTextPreTrainCrossEntCriterion(LabelSmoothedCrossEntropyCriterion):
    def __init__(self, task, sentence_avg, label_smoothing, report_accuracy=False):
        super().__init__(
            task, sentence_avg, label_smoothing, report_accuracy=report_accuracy
        )

    def forward(self, model, sample, reduce=True):
        net_output = model(**sample["net_input"])
        loss, nll_loss, nsentences, ntokens, n_correct = self.compute_loss(
            model, net_output, sample, reduce=reduce
        )
        sample_size = nsentences if self.sentence_avg else ntokens
        logging_output = {
            "loss": loss.data,
            "nll_loss": nll_loss.data,
            "ntokens": ntokens,
            "nsentences": nsentences,
            "sample_size": sample_size,
        }
        if self.report_accuracy:
            logging_output["n_correct"] = utils.item(n_correct)
            logging_output["total"] = utils.item(ntokens)
        return loss, sample_size, logging_output

    def get_lprobs_and_target(self, model, net_output, sample):
        lprobs = model.get_normalized_probs(net_output, log_probs=True)
        target = model.get_targets(sample, net_output)
        assert self.ignore_prefix_size == 0
        if self.ignore_prefix_size > 0:
            if getattr(lprobs, "batch_first", False):
                lprobs = lprobs[:, self.ignore_prefix_size :, :].contiguous()
                target = target[:, self.ignore_prefix_size :].contiguous()
            else:
                lprobs = lprobs[self.ignore_prefix_size :, :, :].contiguous()
                target = target[self.ignore_prefix_size :, :].contiguous()
        return lprobs, target

    def compute_loss(self, model, net_output, sample, reduce=True):
        lprobs, target = self.get_lprobs_and_target(model, net_output, sample)
        n_correct = 0
        if isinstance(target, dict):
            t_lprobs = target["target_logprobs"]

            if not lprobs.batch_first:
                lprobs = lprobs.transpose(0, 1)
                t_lprobs = t_lprobs.transpose(0, 1)
            nsentences, seq_len = lprobs.size()[:2]
            ntokens = nsentences * seq_len
            t_probs = t_lprobs.exp()
            mask_indices = (
                net_output[1]["mask_indices"][0]
                if len(net_output[1]["mask_indices"]) > 0
                else None
            )

            # mask_indices is True for those masking frames
            if mask_indices is not None:  # B X T
                t_probs = t_probs.masked_fill(mask_indices.eq(False).unsqueeze(-1), 0)
                ntokens = mask_indices.int().sum()
            t_probs = t_probs.detach()
            t_lprobs = t_lprobs.detach()
            loss = (
                -(t_probs * (lprobs - t_lprobs)).sum()
                if reduce
                else -(t_probs * (lprobs - t_lprobs)).sum(-1, keepdim=True)
            )
            nll_loss = loss
        else:
            nsentences = target.size(0)
            mask = target.ne(self.padding_idx)
            loss, nll_loss = label_smoothed_nll_loss(
                lprobs.view(-1, lprobs.size(-1)),
                target.view(-1),
                self.eps,
                ignore_index=self.padding_idx,
                reduce=reduce,
            )
            n_correct = torch.sum(
                lprobs.argmax(-1).masked_select(mask).eq(target.masked_select(mask))
            )
            ntokens = torch.sum(mask)
        return loss, nll_loss, nsentences, ntokens, n_correct
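
In the dict-target branch of `compute_loss` above, the criterion applies an online self-training (distillation) term only on masked frames: the teacher distribution is zeroed on unmasked positions, detached, and used to weight the student/teacher log-probability gap, and `ntokens` counts only the masked frames. The toy snippet below re-creates just that arithmetic with invented shapes and values; it is an illustration, not part of this commit:

```python
import torch

# Toy dimensions (illustrative only): batch B=2, frames T=4, classes C=5.
torch.manual_seed(0)
student_lprobs = torch.log_softmax(torch.randn(2, 4, 5), dim=-1)
teacher_lprobs = torch.log_softmax(torch.randn(2, 4, 5), dim=-1)
mask_indices = torch.tensor([[True, False, True, True],
                             [False, True, False, True]])  # B x T, True = masked frame

# Zero the teacher distribution on unmasked frames, as compute_loss() does.
t_probs = teacher_lprobs.exp()
t_probs = t_probs.masked_fill(mask_indices.eq(False).unsqueeze(-1), 0).detach()

# KL-style objective: -sum p_teacher * (log p_student - log p_teacher).
loss = -(t_probs * (student_lprobs - teacher_lprobs.detach())).sum()
ntokens = mask_indices.int().sum()  # only masked frames contribute

print(loss.item(), int(ntokens))
```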
fairseq/examples/speech_text_joint_to_text/criterions/text_guide_cross_entropy_acc.py ADDED
@@ -0,0 +1,224 @@
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import math

import torch
import torch.nn.functional as F
from fairseq import utils
from fairseq.criterions import FairseqCriterion, register_criterion
from fairseq.criterions.label_smoothed_cross_entropy import label_smoothed_nll_loss
from fairseq.logging import metrics


@register_criterion("guided_label_smoothed_cross_entropy_with_accuracy")
class GuidedCrossEntAccCriterion(FairseqCriterion):
    def __init__(
        self,
        task,
        sentence_avg,
        guide_alpha,
        text_input_cost_ratio,
        label_smoothing,
        disable_text_guide_update_num=0,
        attentive_cost_regularization=0,
    ):
        """
        guide_alpha: alpha to interpolate nll and kd loss
        text_input_cost_ratio: loss ratio for text only input data
        label_smoothing: label smoothing ratio
        disable_text_guide_update_num: only use nll loss for the first N updates
        attentive_cost_regularization: ratio of attentive cost
        """
        super().__init__(task)
        self.alpha = guide_alpha
        self.attn_beta = attentive_cost_regularization
        self.sentence_avg = sentence_avg
        self.eps = label_smoothing
        self.text_input_cost_ratio = text_input_cost_ratio
        self.disable_update_num = disable_text_guide_update_num
        assert self.alpha >= 0 and self.alpha <= 1.0

    @staticmethod
    def add_args(parser):
        """Add criterion-specific arguments to the parser."""
        # fmt: off
        parser.add_argument('--label-smoothing', default=0., type=float, metavar='D',
                            help='epsilon for label smoothing, 0 means no label smoothing')
        # fmt: off
        parser.add_argument('--guide-alpha', default=0., type=float, metavar='D',
                            help='alpha to merge kd cost from text to speech input with ce loss')
        # fmt: off
        parser.add_argument('--disable-text-guide-update-num', default=0, type=int, metavar='D',
                            help='disable guided target from text for the first N updates.')
        parser.add_argument("--attentive-cost-regularization", default=0.0, type=float, metavar='D',
                            help="use encoder attentive loss regularization with cost ratio D")
        parser.add_argument("--attentive-cost-without-normalize", action='store_true',
                            help="Don't do normalization during attentive cost computation")

    def forward(self, model, sample, reduce=True):
        reduction = 'sum' if reduce else 'none'
        net_input = sample["net_input"]
        net_output = model(**net_input)
        attn_cost = None
        lprobs = model.get_normalized_probs(net_output, log_probs=True)
        is_dual_input = True if net_input['src_tokens'] is not None and net_input.get('src_txt_tokens') is not None else False
        target = model.get_targets(sample, net_output)
        src_token_num = 0
        if is_dual_input:
            # lprobs_spch from speech encoder and lprobs_text from text encoder
            lprobs_spch, lprobs_text = torch.chunk(lprobs, 2)
            lprobs_spch.batch_first = lprobs.batch_first
            lprobs_text.batch_first = lprobs.batch_first

            speech_loss, speech_nll_loss, speech_correct, speech_total = \
                self.guide_loss_and_acc(model, lprobs_spch, lprobs_text, target, reduce=(reduction == 'sum'))
            text_loss, text_nll_loss, text_correct, text_total = self.compute_loss_and_acc(model, lprobs_text, target, reduction=reduction)
            loss = (speech_loss + text_loss)
            nll_loss = (speech_nll_loss + text_nll_loss)
            correct = speech_correct + text_correct
            total = speech_total + text_total

            attn_cost = net_output[1].get('attn_cost')
            if attn_cost is not None:
                # attn_cost is batch_first and padding tokens have been masked already
                src_token_num = attn_cost.ne(0).sum()
                attn_cost = attn_cost.sum()
                loss = loss + attn_cost * self.attn_beta
            else:
                attn_cost = 0
        else:
            loss, nll_loss, correct, total = self.compute_loss_and_acc(model, lprobs, target, reduction=reduction)
            if sample["net_input"]['src_tokens'] is None:  # text input only
                loss = loss * self.text_input_cost_ratio
            speech_loss = None
            speech_nll_loss = None

        sample_size, logging_output = self.get_logging_output(
            sample, loss, nll_loss, correct, total, src_token_num, speech_loss, speech_nll_loss, attn_cost, is_dual_input
        )
        return loss, sample_size, logging_output

    def compute_loss_and_acc(self, model, lprobs, target, reduction='sum'):
        if not lprobs.batch_first:
            lprobs = lprobs.transpose(0, 1)
        lprobs = lprobs.view(-1, lprobs.size(-1))  # -> (B x T) x C
        target = target.view(-1)
        loss, nll_loss = label_smoothed_nll_loss(
            lprobs, target, self.eps, ignore_index=self.padding_idx, reduce=(reduction == 'sum'),
        )

        mask = target.ne(self.padding_idx)
        correct = torch.sum(lprobs.argmax(1).masked_select(mask).eq(target.masked_select(mask)))
        total = torch.sum(mask)
        return loss, nll_loss, correct, total

    def guide_loss_and_acc(self, model, lprobs, lprobs_teacher, target, reduce=True):
        """ lprobs_teacher is used as guide for lprobs """
        if self.alpha == 0.0 or model.num_updates < self.disable_update_num:
            return self.compute_loss_and_acc(model, lprobs, target, reduction=('sum' if reduce else 'none'))
        if not lprobs.batch_first:
            lprobs = lprobs.transpose(0, 1)
            lprobs_teacher = lprobs_teacher.transpose(0, 1)

        lprobs = lprobs.view(-1, lprobs.size(-1)).float()  # -> (B x T) x C
        lprobs_teacher = lprobs_teacher.view(-1, lprobs_teacher.size(-1)).float()  # -> (B x T) x C
        target = target.view(-1)
        loss = F.nll_loss(lprobs, target, ignore_index=self.padding_idx, reduction='sum' if reduce else 'none')
        nll_loss = loss
        probs_teacher = lprobs_teacher.exp().masked_fill_(target.unsqueeze(-1).eq(self.padding_idx), 0)
        probs_teacher = probs_teacher.detach()
        guide_loss = -(probs_teacher*lprobs).sum() if reduce else -(probs_teacher*lprobs).sum(-1, keepdim=True)
        loss = self.alpha*guide_loss + (1.0 - self.alpha)*loss

        mask = target.ne(self.padding_idx)
        correct = torch.sum(lprobs.argmax(1).masked_select(mask).eq(target.masked_select(mask)))
        total = torch.sum(mask)
        return loss, nll_loss, correct, total

    def get_logging_output(
        self,
        sample,
        loss,
        nll_loss,
        correct,
        total,
        src_token_num=0,
        speech_loss=None,
        speech_nll_loss=None,
        attn_cost=None,
        is_dual_input=False,
    ):

        sample_size = (
            sample["target"].size(0) if self.sentence_avg else sample["ntokens"]
        )
        mul_size = 2 if is_dual_input else 1

        logging_output = {
            "loss": utils.item(loss.data),  # * sample['ntokens'],
            "nll_loss": utils.item(nll_loss.data),  # * sample['ntokens'],
            "ntokens": sample["ntokens"]*mul_size,
            "nsentences": sample["target"].size(0)*mul_size,
            "sample_size": sample_size*mul_size,
            "correct": utils.item(correct.data),
            "total": utils.item(total.data),
            "src_token_num": utils.item(src_token_num.data) if src_token_num > 0 else 0,
            "nframes": torch.sum(sample["net_input"]["src_lengths"]).item(),
        }

        if speech_loss is not None:
            logging_output["speech_loss"] = utils.item(speech_loss.data)
            logging_output["speech_nll_loss"] = utils.item(speech_nll_loss.data)
            logging_output["sample_size_speech_cost"] = sample_size
            logging_output["speech_attn_loss"] = attn_cost

        return sample_size*mul_size, logging_output

    @staticmethod
    def aggregate_logging_outputs(logging_outputs):
        """Aggregate logging outputs from data parallel training."""
        correct_sum = sum(log.get("correct", 0) for log in logging_outputs)
        total_sum = sum(log.get("total", 0) for log in logging_outputs)
        src_token_sum = sum(log.get("src_token_num", 0) for log in logging_outputs)
        loss_sum = sum(log.get("loss", 0) for log in logging_outputs)
        nll_loss_sum = sum(log.get("nll_loss", 0) for log in logging_outputs)
        ntokens = sum(log.get("ntokens", 0) for log in logging_outputs)
        nsentences = sum(log.get("nsentences", 0) for log in logging_outputs)
        sample_size = sum(log.get("sample_size", 0) for log in logging_outputs)
        nframes = sum(log.get("nframes", 0) for log in logging_outputs)
        speech_loss_sum = sum(log.get("speech_loss", 0) for log in logging_outputs)
        speech_nll_loss_sum = sum(log.get("speech_nll_loss", 0) for log in logging_outputs)
        speech_attn_loss_sum = sum(log.get("speech_attn_loss", 0) for log in logging_outputs)
        sample_size_speech = sum(log.get("sample_size_speech_cost", 0) for log in logging_outputs)

        agg_output = {
            "loss": loss_sum / sample_size / math.log(2) if sample_size > 0 else 0.0,
            "nll_loss": nll_loss_sum / sample_size / math.log(2) if sample_size > 0 else 0.0,
            # if args.sentence_avg, then sample_size is nsentences, and loss
            # is per-sentence loss; else sample_size is ntokens, and the loss
            # becomes per-output token loss
            "speech_loss": speech_loss_sum / sample_size_speech / math.log(2) if sample_size_speech > 0 else 0.0,
            "speech_nll_loss": speech_nll_loss_sum / sample_size_speech / math.log(2) if sample_size_speech > 0 else 0.0,
            "speech_attn_loss": speech_attn_loss_sum / src_token_sum / math.log(2) if src_token_sum > 0 else 0.0,
            "ntokens": ntokens,
            "nsentences": nsentences,
            "nframes": nframes,
            "sample_size": sample_size,
            "acc": correct_sum * 100.0 / total_sum if total_sum > 0 else 0.0,
            "correct": correct_sum,
            "total": total_sum,
            "src_token_num": src_token_sum,
            # total is the number of validate tokens
        }
        return agg_output

    @classmethod
    def reduce_metrics(cls, logging_outputs):
        """Aggregate logging outputs from data parallel training."""
        agg_logging_outputs = cls.aggregate_logging_outputs(logging_outputs)
        for k, v in agg_logging_outputs.items():
            if k in {'nsentences', 'ntokens', 'sample_size'}:
                continue
            metrics.log_scalar(k, v, round=3)
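
Written out, the dual-input objective assembled in `forward` above combines a guided (distillation) term on the speech branch, label-smoothed cross-entropy on the text branch, and an optional cross-attentive regularizer. With α = `--guide-alpha`, β = `--attentive-cost-regularization`, speech-branch (student) log-probabilities \(\log p_s\) and text-branch (teacher) probabilities \(p_t\), and up to the sum/none reductions used in the code:

$$
\mathcal{L}_{\mathrm{speech}} = \alpha\Big(-\sum_i p_t(i)\,\log p_s(i)\Big) + (1-\alpha)\,\mathcal{L}_{\mathrm{NLL}}(p_s),
\qquad
\mathcal{L} = \mathcal{L}_{\mathrm{speech}} + \mathcal{L}_{\mathrm{text}} + \beta\,\mathcal{L}_{\mathrm{attn}}
$$

where \(\mathcal{L}_{\mathrm{text}}\) is the label-smoothed cross-entropy of the text branch and \(\mathcal{L}_{\mathrm{attn}}\) is the summed attentive cost returned by the encoder.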
fairseq/examples/speech_text_joint_to_text/data/pair_denoising_dataset.py ADDED
@@ -0,0 +1,318 @@
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import copy
import math
import re

import torch

from fairseq.data import data_utils
from fairseq.data.language_pair_dataset import LanguagePairDataset


# Part of the code is modified from DenoisingDataset
# compared with DenoisingDataset, no permute_sentences or documents (rotate_ratio, permute_sentence_ratio)
class LanguagePairDenoisingDataset(LanguagePairDataset):
    def __init__(
        self,
        src,
        src_sizes,
        src_dict,
        tgt,
        tgt_sizes,
        tgt_dict,
        mask_idx,
        mask_whole_words,
        seed,
        args,
        left_pad_source=True,
        left_pad_target=False,
        shuffle=True,
        input_feeding=True,
        remove_eos_from_source=False,
        append_eos_to_target=False,
        align_dataset=None,
        constraints=None,
        append_bos=False,
        eos=None,
        num_buckets=0,
        src_lang_id=None,
        tgt_lang_id=None,
        pad_to_multiple=1,
    ):
        super().__init__(
            src,
            src_sizes,
            src_dict,
            tgt,
            tgt_sizes,
            tgt_dict,
            left_pad_source,
            left_pad_target,
            shuffle,
            input_feeding,
            remove_eos_from_source,
            append_eos_to_target,
            align_dataset,
            constraints,
            append_bos,
            eos,
            num_buckets,
            src_lang_id,
            tgt_lang_id,
            pad_to_multiple,
        )

        self.mask_idx = mask_idx
        self.mask_whole_word = mask_whole_words
        self.mask_ratio = args.mask
        self.random_ratio = args.mask_random
        self.insert_ratio = args.insert

        self.replace_length = args.replace_length

        if self.replace_length not in [-1, 0, 1]:
            raise ValueError(f"invalid arg: replace_length={self.replace_length}")
        if args.mask_length not in ["subword", "word", "span-poisson"]:
            raise ValueError(f"invalid arg: mask-length={args.mask_length}")
        if args.mask_length == "subword" and args.replace_length not in [0, 1]:
            raise ValueError("if using subwords, use replace-length=1 or 0")

        self.mask_span_distribution = None
        if args.mask_length == "span-poisson":
            # Text infilling: "A number of text spans are sampled, with span lengths drawn from a Poisson distribution (λ = 3). Each span is replaced with a single [MASK] token. 0-length spans correspond to the insertion of [MASK] tokens."
            _lambda = args.poisson_lambda

            lambda_to_the_k = 1
            e_to_the_minus_lambda = math.exp(-_lambda)
            k_factorial = 1
            ps = []
            for k in range(0, 128):
                ps.append(e_to_the_minus_lambda * lambda_to_the_k / k_factorial)
                lambda_to_the_k *= _lambda
                k_factorial *= k + 1
                if ps[-1] < 0.0000001:
                    break
            ps = torch.FloatTensor(ps)
            self.mask_span_distribution = torch.distributions.Categorical(ps)

        self.epoch = 0
        self.seed = seed

        def _is_phoneme(x):
            if re.search("<lang:", x) or x in (
                "<mask>",
                "<sil>",
                "<pad>",
                "<s>",
                "</s>",
                "<unk>",
            ):
                return False
            return True

        self.voc_valid_ids = torch.LongTensor(
            [i for i, x in enumerate(self.src_dict.symbols) if _is_phoneme(x)]
        )
        self.voc_valid_size = self.voc_valid_ids.size(0)

    @property
    def can_reuse_epoch_itr_across_epochs(self):
        return False

    def set_epoch(self, epoch, **unused):
        self.epoch = epoch

    def __getitem__(self, index):
        tgt_item = self.tgt[index] if self.tgt is not None else None
        src_item = copy.deepcopy(self.src[index])
        with data_utils.numpy_seed(self.seed, self.epoch, index):
            source = src_item
            assert source[-1] == self.eos
            if self.mask_ratio > 0:
                source = self.add_whole_word_mask(source, self.mask_ratio)

            if self.insert_ratio > 0:
                source = self.add_insertion_noise(source, self.insert_ratio)
            src_item = source

        if self.append_eos_to_target:
            eos = self.tgt_dict.eos() if self.tgt_dict else self.src_dict.eos()
            if self.tgt and self.tgt[index][-1] != eos:
                tgt_item = torch.cat([self.tgt[index], torch.LongTensor([eos])])

        if self.append_bos:
            bos = self.tgt_dict.bos() if self.tgt_dict else self.src_dict.bos()
            if self.tgt and self.tgt[index][0] != bos:
                tgt_item = torch.cat([torch.LongTensor([bos]), self.tgt[index]])

            bos = self.src_dict.bos()
            if src_item[0] != bos:
                src_item = torch.cat([torch.LongTensor([bos]), src_item])

        if self.remove_eos_from_source:
            eos = self.src_dict.eos()
            if src_item[-1] == eos:
                src_item = src_item[:-1]

        example = {
            "id": index,
            "source": src_item,
            "target": tgt_item,
        }
        if self.align_dataset is not None:
            example["alignment"] = self.align_dataset[index]
        if self.constraints is not None:
            example["constraints"] = self.constraints[index]
        if self.src_lang_id is not None:
            example["src_lang_id"] = self.src_lang_id
        if self.tgt_lang_id is not None:
            example["tgt_lang_id"] = self.tgt_lang_id
        return example

    # following functions are borrowed from denoising_dataset
    def word_starts(self, source):
        if self.mask_whole_word is not None:
            is_word_start = self.mask_whole_word.gather(0, source)
        else:
            is_word_start = torch.ones(source.size())
        is_word_start[0] = 0
        is_word_start[-1] = 0
        return is_word_start

    def add_whole_word_mask(self, source, p):
        is_word_start = self.word_starts(source)
        num_to_mask = int(math.ceil(is_word_start.float().sum() * p))
        num_inserts = 0
        if num_to_mask == 0:
            return source

        if self.mask_span_distribution is not None:
            lengths = self.mask_span_distribution.sample(sample_shape=(num_to_mask,))

            # Make sure we have enough to mask
            cum_length = torch.cumsum(lengths, 0)
            while cum_length[-1] < num_to_mask:
                lengths = torch.cat(
                    [
                        lengths,
                        self.mask_span_distribution.sample(sample_shape=(num_to_mask,)),
                    ],
                    dim=0,
                )
                cum_length = torch.cumsum(lengths, 0)

            # Trim to masking budget
            i = 0
            while cum_length[i] < num_to_mask:
                i += 1
            lengths[i] = num_to_mask - (0 if i == 0 else cum_length[i - 1])
            num_to_mask = i + 1
            lengths = lengths[:num_to_mask]

            # Handle 0-length mask (inserts) separately
            lengths = lengths[lengths > 0]
            num_inserts = num_to_mask - lengths.size(0)
            num_to_mask -= num_inserts
            if num_to_mask == 0:
                return self.add_insertion_noise(source, num_inserts / source.size(0))

            assert (lengths > 0).all()
        else:
            lengths = torch.ones((num_to_mask,)).long()
        assert is_word_start[-1] == 0
        word_starts = is_word_start.nonzero(as_tuple=False)
        indices = word_starts[
            torch.randperm(word_starts.size(0))[:num_to_mask]
        ].squeeze(1)
        mask_random = torch.FloatTensor(num_to_mask).uniform_() < self.random_ratio

        source_length = source.size(0)
        assert source_length - 1 not in indices
        to_keep = torch.ones(source_length, dtype=torch.bool)
        is_word_start[
            -1
        ] = 255  # acts as a long length, so spans don't go over the end of doc
        if self.replace_length == 0:
            to_keep[indices] = 0
        else:
            # keep index, but replace it with [MASK]
            source[indices] = self.mask_idx
            source[indices[mask_random]] = self.voc_valid_ids[
                torch.randint(0, self.voc_valid_size - 1, size=(mask_random.sum(),))
            ]

        if self.mask_span_distribution is not None:
            assert len(lengths.size()) == 1
            assert lengths.size() == indices.size()
            lengths -= 1
            while indices.size(0) > 0:
                assert lengths.size() == indices.size()
                lengths -= is_word_start[indices + 1].long()
                uncompleted = lengths >= 0
                indices = indices[uncompleted] + 1
                mask_random = mask_random[uncompleted]
                lengths = lengths[uncompleted]
                if self.replace_length != -1:
                    # delete token
                    to_keep[indices] = 0
                else:
                    # keep index, but replace it with [MASK]
                    source[indices] = self.mask_idx
                    source[indices[mask_random]] = self.voc_valid_ids[
                        torch.randint(
                            0, self.voc_valid_size - 1, size=(mask_random.sum(),)
                        )
                    ]
        else:
            # A bit faster when all lengths are 1
            while indices.size(0) > 0:
                uncompleted = is_word_start[indices + 1] == 0
                indices = indices[uncompleted] + 1
                mask_random = mask_random[uncompleted]
                if self.replace_length != -1:
                    # delete token
                    to_keep[indices] = 0
                else:
                    # keep index, but replace it with [MASK]
                    source[indices] = self.mask_idx
                    source[indices[mask_random]] = self.voc_valid_ids[
                        torch.randint(
                            0, self.voc_valid_size - 1, size=(mask_random.sum(),)
                        )
                    ]

                assert source_length - 1 not in indices

        source = source[to_keep]

        if num_inserts > 0:
            source = self.add_insertion_noise(source, num_inserts / source.size(0))

        return source

    def add_insertion_noise(self, tokens, p):
        if p == 0.0:
            return tokens

        num_tokens = len(tokens)
        n = int(math.ceil(num_tokens * p))

        noise_indices = torch.randperm(num_tokens + n - 2)[:n] + 1
        noise_mask = torch.zeros(size=(num_tokens + n,), dtype=torch.bool)
        noise_mask[noise_indices] = 1
        result = torch.LongTensor(n + len(tokens)).fill_(-1)

        num_random = int(math.ceil(n * self.random_ratio))
        result[noise_indices[num_random:]] = self.mask_idx
        result[noise_indices[:num_random]] = self.voc_valid_ids[
            torch.randint(0, self.voc_valid_size - 1, size=(num_random,))
        ]

        result[~noise_mask] = tokens

        assert (result >= 0).all()
        return result
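
The `span-poisson` branch of the constructor above builds a truncated Poisson(λ) categorical over span lengths by accumulating the terms e^(-λ) λ^k / k! until they drop below 1e-7 (capped at k < 128); length-0 samples later become pure insertions. A standalone sketch of the same construction, with λ = 3.5 chosen arbitrarily for illustration:

```python
import math

import torch


def truncated_poisson(_lambda: float, max_k: int = 128, tol: float = 1e-7):
    """Categorical over span lengths 0..K used for BART-style text infilling."""
    lambda_to_the_k = 1.0
    e_to_the_minus_lambda = math.exp(-_lambda)
    k_factorial = 1.0
    ps = []
    for k in range(max_k):
        ps.append(e_to_the_minus_lambda * lambda_to_the_k / k_factorial)
        lambda_to_the_k *= _lambda
        k_factorial *= k + 1
        if ps[-1] < tol:
            break
    # Categorical normalizes the (truncated) probabilities internally.
    return torch.distributions.Categorical(torch.FloatTensor(ps))


dist = truncated_poisson(3.5)             # corresponds to --poisson-lambda
lengths = dist.sample(sample_shape=(8,))  # sampled span lengths; 0 => insert a mask
print(lengths)
```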
fairseq/examples/speech_text_joint_to_text/docs/ende-mustc.md ADDED
@@ -0,0 +1,118 @@
[[Back]](..)

# Joint Speech Text Training for the MuST-C English to German Speech Translation task

Joint Training Baseline: it is based on the paper ["A general multi-task learning framework to leverage text data for speech to text tasks"](https://arxiv.org/pdf/2010.11338.pdf)

Enhanced Joint Training: the joint training is enhanced with pre-trained models, cross attentive regularization and online knowledge distillation, based on the paper ["Improving Speech Translation by Understanding and Learning from the Auxiliary Text Translation Task"](https://research.fb.com/publications/improving-speech-translation-by-understanding-and-learning-from-the-auxiliary-text-translation-task)

## Prepare Data
#### Download files
- SentencePiece model [spm.model](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/must_c/en_de/spm.model)
- Dictionary [dict.txt](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/must_c/en_de/dict.txt)
- Config [config.yaml](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/must_c/en_de/config.yaml)
#### Prepare the MuST-C data set
- Please follow the data preparation in the [S2T example](https://github.com/pytorch/fairseq/blob/main/examples/speech_to_text/docs/mustc_example.md)
- Convert the source text under the "src_text" column in the tsv file into its phoneme representation.
```bash
python examples/speech_text_joint_to_text/scripts/g2p_encode.py \
  --lower-case --do-filter --use-word-start --no-punc \
  --reserve-word examples/speech_text_joint_to_text/configs/mustc_noise.list \
  --data-path ${must_c_en_de_src_text} \
  --out-path ${must_c_en_de_src_text_pho}
```
- Replace the source text under the "src_text" column in the tsv file with the corresponding phoneme representation generated in the step above.
  Below is a snapshot of the MuST-C en-de dev tsv:
```
+ id audio n_frames tgt_text src_text speaker
+ ted_767_0 en-de/flac.zip:10071514743:48445 56160 Heute spreche ich zu Ihnen über Energie und Klima. ▁AY1 M ▁G OW1 IH0 NG ▁T UW1 ▁T AO1 K ▁T AH0 D EY1 ▁AH0 B AW1 T ▁EH1 N ER0 JH IY0 ▁AH0 N D ▁K L AY1 M AH0 T spk.767_
+ ted_767_1 en-de/flac.zip:1214217978:205678 226080 Und das überrascht vielleicht etwas, weil sich meine Vollzeitbeschäftigung bei der Stiftung hauptsächlich um Impfstoffe und Saatgut dreht, um die Dinge, die wir erfinden und liefern müssen um den ärmsten 2 Milliarden ein besseres Leben zu ermöglichen. ▁AH0 N D ▁DH AE1 T ▁M AY1 T ▁S IY1 M ▁AH0 ▁B IH1 T ▁S ER0 P R AY1 Z IH0 NG ▁B IH0 K AO1 Z ▁M AY1 ▁F UH1 L ▁T AY1 M ▁W ER1 K ▁AE1 T ▁DH AH0 ▁F AW0 N D EY1 SH AH0 N ▁IH1 Z ▁M OW1 S T L IY0 ▁AH0 B AW1 T ▁V AE2 K S IY1 N Z ▁AH0 N D ▁S IY1 D Z ▁AH0 B AW1 T ▁DH AH0 ▁TH IH1 NG Z ▁DH AE1 T ▁W IY1 ▁N IY1 D ▁T UW1 ▁IH0 N V EH1 N T ▁AH0 N D ▁D IH0 L IH1 V ER0 ▁T UW1 ▁HH EH1 L P ▁DH AH0 ▁P UH1 R IH0 S T ▁T UW1 ▁B IH1 L Y AH0 N ▁L AY1 V ▁B EH1 T ER0 ▁L IH1 V Z spk.767_
```
- Prepare the phoneme dictionary and save it to $MANIFEST_ROOT as [src_dict.txt](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/must_c/en_de/src_dict.txt)
#### Prepare WMT text data
- [Download WMT data](https://github.com/pytorch/fairseq/blob/main/examples/translation/prepare-wmt14en2de.sh)
- Convert the source text (English) into its phoneme representation as above
- Generate binary parallel files with fairseq-preprocess for training and validation. The source input is the English phoneme representation and the target input is German SentencePiece tokens. The output is saved under $parallel_text_data

## Training
The model is trained with 8 V100 GPUs.

#### Download pretrained models
- [pretrain_encoder](https://dl.fbaipublicfiles.com/fairseq/s2t/mustc_joint_asr_transformer_m.pt)
- [pretrain_nmt](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/must_c/en_de/checkpoint_mt.pt)

#### Training scripts
- Jointly trained model from scratch
```bash
python train.py ${MANIFEST_ROOT} \
  --save-dir ${save_dir} \
  --num-workers 8 \
  --task speech_text_joint_to_text \
  --arch dualinputs2ttransformer_s \
  --user-dir examples/speech_text_joint_to_text \
  --max-epoch 100 --update-mix-data \
  --optimizer adam --lr-scheduler inverse_sqrt \
  --lr 0.001 --update-freq 4 --clip-norm 10.0 \
  --criterion guided_label_smoothed_cross_entropy_with_accuracy \
  --label-smoothing 0.1 --max-tokens 10000 --max-tokens-text 10000 \
  --max-positions-text 400 --seed 2 --speech-encoder-layers 12 \
  --text-encoder-layers 6 --encoder-shared-layers 6 --decoder-layers 6 \
  --dropout 0.1 --warmup-updates 20000 \
  --text-sample-ratio 0.25 --parallel-text-data ${parallel_text_data} \
  --text-input-cost-ratio 0.5 --enc-grad-mult 2.0 --add-speech-eos \
  --log-format json --langpairs en-de --noise-token '"'"'▁NOISE'"'"' \
  --mask-text-ratio 0.0 --max-tokens-valid 20000 --ddp-backend no_c10d \
  --log-interval 100 --data-buffer-size 50 --config-yaml config.yaml \
  --keep-last-epochs 10
```
- Jointly trained model with good initialization, cross attentive loss and online knowledge distillation
```bash
python train.py ${MANIFEST_ROOT} \
  --save-dir ${save_dir} \
  --num-workers 8 \
  --task speech_text_joint_to_text \
  --arch dualinputs2ttransformer_m \
  --user-dir examples/speech_text_joint_to_text \
  --max-epoch 100 --update-mix-data \
  --optimizer adam --lr-scheduler inverse_sqrt \
  --lr 0.002 --update-freq 4 --clip-norm 10.0 \
  --criterion guided_label_smoothed_cross_entropy_with_accuracy \
  --guide-alpha 0.8 --disable-text-guide-update-num 5000 \
  --label-smoothing 0.1 --max-tokens 10000 --max-tokens-text 10000 \
  --max-positions-text 400 --seed 2 --speech-encoder-layers 12 \
  --text-encoder-layers 6 --encoder-shared-layers 6 --decoder-layers 6 \
  --dropout 0.1 --warmup-updates 20000 --attentive-cost-regularization 0.02 \
  --text-sample-ratio 0.25 --parallel-text-data ${parallel_text_data} \
  --text-input-cost-ratio 0.5 --enc-grad-mult 2.0 --add-speech-eos \
  --log-format json --langpairs en-de --noise-token '"'"'▁NOISE'"'"' \
  --mask-text-ratio 0.0 --max-tokens-valid 20000 --ddp-backend no_c10d \
  --log-interval 100 --data-buffer-size 50 --config-yaml config.yaml \
  --load-pretrain-speech-encoder ${pretrain_encoder} \
  --load-pretrain-decoder ${pretrain_nmt} \
  --load-pretrain-text-encoder-last ${pretrain_nmt} \
  --keep-last-epochs 10
```

## Evaluation
```bash
python ./fairseq_cli/generate.py \
  ${MANIFEST_ROOT} \
  --task speech_text_joint_to_text \
  --max-tokens 25000 \
  --nbest 1 \
  --results-path ${infer_results} \
  --batch-size 512 \
  --path ${model} \
  --gen-subset tst-COMMON_st \
  --config-yaml config.yaml \
  --scoring sacrebleu \
  --beam 5 --lenpen 1.0 \
  --user-dir examples/speech_text_joint_to_text \
  --load-speech-only
```

## Results (Joint training with initialization + CAR + online KD)
|Direction|En-De | En-Es | En-Fr |
|---|---|---|---|
|BLEU|27.4| 31.2 | 37.6 |
|checkpoint | [link](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/must_c/en_de/checkpoint_ave_10.pt) |[link](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/must_c/en_es/checkpoint_ave_10.pt)|[link](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/must_c/en_fr/checkpoint_ave_10.pt)|
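
The preparation steps above run `g2p_encode.py` and then substitute its output into the "src_text" column of the tsv manifests. A small hypothetical helper for that substitution (not part of this commit; file names are placeholders, and it assumes one phoneme line per tsv row, in the same order) might look like:

```python
def replace_src_text(tsv_in: str, pho_txt: str, tsv_out: str) -> None:
    """Overwrite the 'src_text' column of a MuST-C tsv with phoneme lines."""
    with open(pho_txt, encoding="utf-8") as f:
        phonemes = [line.rstrip("\n") for line in f]
    with open(tsv_in, encoding="utf-8") as fin, open(tsv_out, "w", encoding="utf-8") as fout:
        header = fin.readline().rstrip("\n").split("\t")
        col = header.index("src_text")
        fout.write("\t".join(header) + "\n")
        for row, pho in zip(fin, phonemes):
            fields = row.rstrip("\n").split("\t")
            fields[col] = pho
            fout.write("\t".join(fields) + "\n")


# e.g. replace_src_text("dev_st.tsv", "dev_st.pho", "dev_st_pho.tsv")  # hypothetical file names
```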
fairseq/examples/speech_text_joint_to_text/docs/iwslt2021.md ADDED
@@ -0,0 +1,76 @@
[[Back]](..)

# Joint Speech Text Training for the 2021 IWSLT multilingual speech translation

This directory contains the code from the paper ["FST: the FAIR Speech Translation System for the IWSLT21 Multilingual Shared Task"](https://arxiv.org/pdf/2107.06959.pdf).

## Prepare Data
#### Download files
- SentencePiece model [spm.model](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/iwslt/iwslt_data/spm.model)
- Dictionary [tgt_dict.txt](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/iwslt/iwslt_data/dict.txt)
- Config [config.yaml](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/iwslt/iwslt_data/config.yaml)

#### Prepare
- Please follow the data preparation in [speech-to-text](https://github.com/pytorch/fairseq/blob/main/examples/speech_to_text/docs/mtedx_example.md) with the option "--use-audio-input" for raw audio tsv files.
- Prepare tsv files with phoneme-based source text (under column 'src_text') as in the [MuST-C](ende-mustc.md) example.


## Training

#### Download pretrained models
- [Pretrained mbart model](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/iwslt/iwslt_data/mbart.pt)
- [Pretrained w2v model](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/iwslt/iwslt_data/xlsr_53_56k.pt)


#### Training scripts

```bash
python train.py ${MANIFEST_ROOT} \
  --save-dir ${save_dir} \
  --user-dir examples/speech_text_joint_to_text \
  --train-subset train_es_en_tedx,train_es_es_tedx,train_fr_en_tedx,train_fr_es_tedx,train_fr_fr_tedx,train_it_it_tedx,train_pt_en_tedx,train_pt_pt_tedx \
  --valid-subset valid_es_en_tedx,valid_es_es_tedx,valid_es_fr_tedx,valid_es_it_tedx,valid_es_pt_tedx,valid_fr_en_tedx,valid_fr_es_tedx,valid_fr_fr_tedx,valid_fr_pt_tedx,valid_it_en_tedx,valid_it_es_tedx,valid_it_it_tedx,valid_pt_en_tedx,valid_pt_es_tedx,valid_pt_pt_tedx \
  --config-yaml config.yaml --ddp-backend no_c10d \
  --num-workers 2 --task speech_text_joint_to_text \
  --criterion guided_label_smoothed_cross_entropy_with_accuracy \
  --label-smoothing 0.3 --guide-alpha 0.8 \
  --disable-text-guide-update-num 5000 --arch dualinputxmtransformer_base \
  --max-tokens 500000 --max-sentences 3 --max-tokens-valid 800000 \
  --max-source-positions 800000 --enc-grad-mult 2.0 \
  --attentive-cost-regularization 0.02 --optimizer adam \
  --clip-norm 1.0 --log-format simple --log-interval 200 \
  --keep-last-epochs 5 --seed 1 \
  --w2v-path ${w2v_path} \
  --load-pretrained-mbart-from ${mbart_path} \
  --max-update 1000000 --update-freq 4 \
  --skip-invalid-size-inputs-valid-test \
  --skip-encoder-projection --save-interval 1 \
  --attention-dropout 0.3 --mbart-dropout 0.3 \
  --finetune-w2v-params all --finetune-mbart-decoder-params all \
  --finetune-mbart-encoder-params all --stack-w2v-mbart-encoder \
  --drop-w2v-layers 12 --normalize \
  --lr 5e-05 --lr-scheduler inverse_sqrt --warmup-updates 5000
```

## Evaluation
```bash
python ./fairseq_cli/generate.py \
  ${MANIFEST_ROOT} \
  --task speech_text_joint_to_text \
  --user-dir ./examples/speech_text_joint_to_text \
  --load-speech-only --gen-subset test_es_en_tedx \
  --path ${model} \
  --max-source-positions 800000 \
  --skip-invalid-size-inputs-valid-test \
  --config-yaml config.yaml \
  --infer-target-lang en \
  --max-tokens 800000 \
  --beam 5 \
  --results-path ${RESULTS_DIR} \
  --scoring sacrebleu
```
The trained model can be downloaded [here](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/iwslt/iwslt_data/checkpoint17.pt)

|direction|es_en|fr_en|pt_en|it_en|fr_es|pt_es|it_es|es_es|fr_fr|pt_pt|it_it|
|---|---|---|---|---|---|---|---|---|---|---|---|
|BLEU|31.62|36.93|35.07|27.12|38.87|35.57|34.13|74.59|74.64|70.84|69.76|
fairseq/examples/speech_text_joint_to_text/docs/pre-training.md ADDED
@@ -0,0 +1,192 @@
[[Back]](..)

# Unified Speech-Text Pre-training for Speech Translation and Recognition

This directory contains the pre-training recipes from the paper ["Unified Speech-Text Pre-training for Speech Translation and Recognition"](https://arxiv.org/abs/2204.05409).

## Librispeech ASR Pre-training
### Prepare Data
#### Download files
#### Prepare pre-training data
- Text to text task (T2T): prepare the binary data following steps similar to [EN_DE Joint training](./ende-mustc.md). The source data is presented as phoneme token sequences and the target data is coded as subword tokens via SentencePiece. The text data is downloaded from [openslr](https://www.openslr.org/12)
- Self-supervised speech learning task (SSL): the data is prepared as in [wav2vec 2.0](https://github.com/pytorch/fairseq/tree/main/examples/wav2vec/README.md)
- Speech to phoneme classification task (S2P): the tsv file contains 5 fields: "id", "audio", "n_frames", "tgt_text", and "align". The "tgt_text" field corresponds to the phoneme-based representation of the speech data, and the "align" field contains the alignment information. The phoneme-level forced alignment for the labelled speech data (i.e. Librispeech) can be obtained via [kaldi](http://kaldi-asr.org) or [MFA](https://montrealcorpustools.github.io/Montreal-Forced-Aligner/). The segmentation information is normalized to 0~1 over the whole utterance. A snapshot of the tsv file is below:
```
+ id audio n_frames tgt_text align
+ 116-288045-0000 /librispeech/dev-other/116/288045/116-288045-0000.flac 170400 <sil> ▁AE1 Z AY1 ▁AH0 P R OW1 CH T ▁DH AH1 ▁S IH1 T IY0 <sil> AY1 ▁HH ER1 D ▁B EH1 L Z ▁R IH1 NG IH0 NG <sil> ▁AE1 N D AH0 ▁L IH1 T AH0 L ▁L EY1 T ER0 AY1 ▁F AW1 N D ▁DH AH0 ▁S T R IY1 T S ▁AH0 S T IH1 R ▁W IH0 TH ▁TH R AO1 NG Z ▁AH0 V ▁W EH1 L ▁D R EH1 S T ▁P IY1 P AH0 L ▁IH1 N ▁F AE1 M L IY0 ▁G R UW1 P S <sil> ▁W EH1 N D IH0 NG ▁DH EH1 R ▁W EY1 <sil> ▁HH IH1 DH ER0 ▁AH0 N D ▁TH IH1 DH ER0 <sil> 0.047977 0.056444 0.064911 0.075259 0.081844 0.089370 0.095014 0.104421 0.109125 0.111947 0.115710 0.120414 0.134525 0.141110 0.143932 0.174036 0.176858 0.190028 0.199436 0.207902 0.218250 0.224835 0.231421 0.242709 0.251176 0.257761 0.263405 0.268109 0.270931 0.290687 0.342427 0.349953 0.353716 0.356538 0.360301 0.363123 0.365945 0.368768 0.371590 0.376294 0.384760 0.394167 0.401693 0.409219 0.419567 0.430856 0.441204 0.444026 0.446849 0.449671 0.456256 0.463782 0.471308 0.477893 0.486359 0.491063 0.494826 0.501411 0.512700 0.517404 0.520226 0.534337 0.540922 0.545626 0.550329 0.559737 0.568203 0.583255 0.592662 0.600188 0.603951 0.611477 0.619003 0.624647 0.634055 0.639699 0.646284 0.653810 0.659454 0.664158 0.670743 0.682032 0.687676 0.692380 0.708373 0.713076 0.719661 0.729069 0.740357 0.744120 0.748824 0.752587 0.761994 0.770461 0.781750 0.790216 0.805268 0.808090 0.823142 0.832549 0.836312 0.840075 0.843838 0.851364 0.854186 0.857008 0.862653 0.878645 0.898401 0.901223 0.906867 0.913452 0.920038 0.926623 0.934149 0.939793 0.942615 0.945437 0.952023 0.957667 0.977422 1.000000
17
+
18
+ ```
19
+ - Speech to text task (S2T): The data preparation follow the steps in [EN_DE Joint training](./ende-mustc.md).
20
+
21
+ #### Prepare fine-tuning data:
22
+ We re-use the data from T2T and S2T tasks in the fine-tuning stage.
23
+
24
+ ### Model Build
25
+ #### Pre-training
26
+ ```
27
+ python train.py $T2T_DATA \
28
+ --save-dir $SAVE_PRE_PATH --user-dir examples/speech_text_joint_to_text --task speech_text_joint_denoising \
29
+ --criterion speech_text_pretrain_cross_entropy --optimizer adam --weight-decay 0.01 --config-yaml config_s2p.yaml --config-s2s-yaml config.yaml --ddp-backend no_c10d \
30
+ --lang-pairs pho-wrd --num-workers 4 --log-interval 500 --save-interval-updates 5000 --keep-interval-updates 1 --no-emb-update-unsup --report-accuracy --lr 0.001 --end-learning-rate 1e-06 \
31
+ --lr-scheduler polynomial_decay --warmup-updates 10000 --total-num-update 800000 --update-freq 6 --validate-interval-updates 10000 --train-subset train \
32
+ --valid-subset valid,valid_sup_speech,valid_sup_speech_s2s,valid_unsup_speech --dataset-impl mmap \
33
+ --sup-speech-data $S2P_DATA_PATH --sup-speech-train-subset train_960.ali --sup-speech-valid-subset dev-clean.ali --sup-speech-s2s-data $S2T_DATA_PATH \
34
+ --sup-speech-s2s-train-subset train --sup-speech-s2s-valid-subset dev-clean --unsup-speech-train-data $SSL_DATA_PATH/train.tsv --unsup-speech-valid-data $SSL_DATA_PATH/valid.tsv \
35
+ --batch-size 200 --batch-size-valid 150 --max-source-positions 1024 --max-target-positions 1024 --max-text-tokens 3072 --max-speech-positions 600000 \
36
+ --max-sample-size 750000 --min-sample-size 64000 --max-speech-tokens 750000 --max-tokens-valid 750000 --skip-invalid-size-inputs-valid-test \
37
+ --unsupervised-speech-sample-ratio 3.0 --supervised-speech-sample-ratio 5 --supervised-speech-s2s-sample-ratio 5 --text-sample-ratio 1.0 --mask 0.3 --mask-random 0.1 \
38
+ --mask-length span-poisson --speech-sup-mask-prob 0.3 --speech-unsup-mask-prob 0.7 --use-mask-whole-words --arch speech_text_pretrain_bart_base_stack \
39
+ --no-scale-feature --activation-fn gelu --speech-extractor-mode default --stacked-encoder all --encoder-normalize-before --decoder-normalize-before \
40
+ --encoder-learned-pos --decoder-learned-pos --dropout 0.1 --load-pretrained-mbart-encoder-from $BART --load-pretrained-mbart-decoder-from $BART
41
+ ```
42
+ The current implementation also supports model pre-training without the forced alignment supervised data. In this case, CTC is used to optimize the S2P task. We need to do following changes for the setting:
43
+ 1. options to be added
44
+ ```
45
+ --use-sup-speech-ctc --criterion speech_text_pretrain_compound
46
+ ```
47
+ 2. options to be deleted
48
+ ```
49
+ --same-data-update --criterion speech_text_pretrain_cross_entropy
50
+ ```
51
+ However, we find the CTC based pre-training is still worse than the forced alignment based setting. It could be partially due to the inferior pre-training setting that we re-use the forced alignment based pre-training setting for the CTC based pre-training.
52
+
53
+ #### Fine-tuning
54
+ ```
55
+ python train.py $S2T_DATA_PATH \
56
+ --save-dir $SAVE_FT_PATH --num-workers 8 --task speech_text_joint_to_text --arch dualinputs2twavtransformer_base_stack \
57
+ --user-dir examples/speech_text_joint_to_text --max-update 100000 --optimizer adam --lr-scheduler inverse_sqrt --lr 0.0003 --update-freq 3 --clip-norm 10.0 \
58
+ --criterion guided_label_smoothed_cross_entropy_with_accuracy --guide-alpha 0.8 --label-smoothing 0.1 --warmup-updates 20000 --attentive-cost-regularization 0.02 \
59
+ --enc-grad-mult 2.0 --max-tokens 800000 --max-source-positions 800000 --max-tokens-text 10000 --max-positions-text 1024 --max-target-positions 1024 --no-scale-feature \
60
+ --activation-fn gelu --load-pretrained-speech-text-encoder $SAVE_PRE_PATH/checkpoint_last.pt --load-pretrained-speech-text-decoder $SAVE_PRE_PATH/checkpoint_last.pt \
61
+ --encoder-normalize-before --decoder-normalize-before --speech-extractor-mode default --speech-mask-channel-length 64 --speech-mask-channel-prob 0.5 \
62
+ --speech-mask-length 10 --speech-mask-prob 0.65 --text-sample-ratio 0.25 --mask-text-ratio 0.3 --mask-text-type random --parallel-text-data text_bin \
63
+ --text-input-cost-ratio 0.5 --langpairs pho-wrd --update-mix-data --log-format json --max-tokens-valid 800000 --ddp-backend no_c10d --log-interval 500 \
64
+ --config-yaml config.yaml --skip-invalid-size-inputs-valid-test --keep-last-epochs 50 --layernorm-embedding --encoder-learned-pos --decoder-learned-pos
65
+ ```
66
+
67
+ ### Evaluation
68
+ The last 10 epoch models from fine-tuning is conducted model average to get $FINAL_MODEL
69
+ ```
70
+ python ./fairseq_cli/generate.py \
71
+ $S2T_DATA_PATH \
72
+ --task speech_text_joint_to_text \
73
+ --max-tokens 800000 \
74
+ --max-source-positions 800000 \
75
+ --nbest 1 \
76
+ --results-path $RESULTS_LOG \
77
+ --batch-size 512 \
78
+ --path $FINAL_MODEL \
79
+ --gen-subset $SUBSET \
80
+ --config-yaml config.yaml \
81
+ --scoring wer \
82
+ --beam 10 --lenpen 1.0 examples/speech_text_joint_to_text \
83
+ --user-dir examples/speech_text_joint_to_text --load-speech-only \
84
+ --model-overrides {'load_pretrained_speech_text_decoder':'','load_pretrained_speech_text_encoder':''}
85
+ ```
86
+
87
+ ### Results and models
88
+ | | dev-clean | dev-other | test-clean | test-other |
89
+ |---|---|---|---|---|
90
+ | WER| 2.0 | 4.4 | 2.1 |4.6 |
91
+
92
+ **Model Links**:
93
+ - [config_s2p.yaml](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/acl2022/librispeech/pretrain/config_s2p.yaml): Config for S2P
94
+ - [spm.model](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/acl2022/librispeech/finetuned/spm.model): Sentence Piece model
95
+ - [src_dict.txt](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/acl2022/librispeech/finetuned/src_dict.txt): Source Phoneme Dictionary
96
+ - [tgt_dict.txt](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/acl2022/librispeech/finetuned/tgt_dict.txt): Target Sentence Piece Dictionary
97
+ - [config.yaml](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/acl2022/librispeech/finetuned/config.yaml): Config for S2T
98
+ - [BART](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/acl2022/librispeech/pretrain/bart.pt): trained from Librispeech text data
99
+ - [Joint Pre-trained model](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/acl2022/librispeech/pretrain/checkpoint6.pt): model pre-trained with 960 hours Librispeech data (S2P, S2T) Librispeech text training data (T2T) and Librilight data (SSL)
100
+ - [Fine-tuned model](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/acl2022/librispeech/finetuned/checkpoint_ave_10.pt): the pre-trained model is fined one 960 hours Librispeech speech and text data. (S2T + T2T)
101
+
102
+ ## MuST-C
103
+ ### Prepare Data
104
+ Compared with the ASR Librispeech ASR recipe, the differences are below:
105
+ - Replace the speech data with corresponding MuST-C data
106
+ - Parallel text data from WMT is replaced the Librispeech text data
107
+
108
+ ### Model Build
109
+ #### Pre-training
110
+ EN-DE is used as an example
111
+ ```
112
+ python train.py $TXT_DATA \
113
+ --save-dir $SAVE_PRE_PATH --user-dir examples/speech_text_joint_to_text --task speech_text_joint_denoising --criterion speech_text_pretrain_cross_entropy --optimizer adam --weight-decay 0.01 \
114
+ --config-yaml config_s2p.yaml --config-s2s-yaml config.yaml --ddp-backend no_c10d --lang-pairs-bitext en-fr --num-workers 4 --log-interval 500 --save-interval-updates 5000 --keep-interval-updates 1 \
115
+ --no-emb-update-unsup --use-decoder-output-proj --report-accuracy --lr 0.001 --end-learning-rate 1e-06 --lr-scheduler polynomial_decay --warmup-updates 10000 --total-num-update 800000 \
116
+ --update-freq 8 --validate-interval-updates 10000 --train-subset train --valid-subset valid_sup_speech,valid_sup_speech_s2s,valid_unsup_speech --dataset-impl mmap \
117
+ --sup-speech-data $S2P_DATA_PATH --sup-speech-train-subset train --sup-speech-valid-subset dev --sup-speech-s2s-data $S2T_DATA_PATH --sup-speech-s2s-train-subset train \
118
+ --sup-speech-s2s-valid-subset dev --unsup-speech-train-data $SSL_DATA_PATH/train.tsv --unsup-speech-valid-data $SSL_DATA_PATH/valid.tsv --batch-size 200 --batch-size-valid 100 \
119
+ --max-source-positions 1024 --max-target-positions 1024 --max-text-tokens 2048 --max-speech-positions 600000 --max-sample-size 600000 --min-sample-size 64000 \
120
+ --max-speech-tokens 600000 --max-tokens-valid 600000 --skip-invalid-size-inputs-valid-test --unsupervised-speech-sample-ratio 1.2 --supervised-speech-sample-ratio 10 \
121
+ --supervised-speech-s2s-sample-ratio 10 --bitext-sample-ratio 0.5 --mask 0.3 --mask-random 0.1 --mask-length span-poisson --speech-sup-mask-prob 0.3 \
122
+ --speech-unsup-mask-prob 0.7 --use-mask-whole-words --arch speech_text_pretrain_bart_base_stack --no-scale-feature --activation-fn gelu --speech-extractor-mode default \
123
+ --stacked-encoder s2s --encoder-normalize-before --decoder-normalize-before --encoder-learned-pos --decoder-learned-pos --dropout 0.1 \
124
+ --load-pretrained-mbart-encoder-from $EN_FR_NMT --load-pretrained-mbart-decoder-from $EN_FR_NMT
125
+ ```
126
+ #### Fine-tuning
127
+ ```
128
+ python train.py $S2T_DATA_PATH \
129
+ --save-dir $SAVE_FT_PATH --num-workers 8 --task speech_text_joint_to_text --arch dualinputs2twavtransformer_base_stack --user-dir examples/speech_text_joint_to_text \
130
+ --max-epoch 25 --update-mix-data --optimizer adam --lr-scheduler inverse_sqrt --lr 0.0003 --update-freq 4 --clip-norm 10.0 --warmup-updates 20000 \
131
+ --criterion guided_label_smoothed_cross_entropy_with_accuracy --guide-alpha 0.8 --attentive-cost-regularization 0.02 --enc-grad-mult 2.0 --label-smoothing 0.1 \
132
+ --max-tokens 800000 --max-source-positions 800000 --max-tokens-text 10000 --max-positions-text 1024 --load-pretrained-speech-text-encoder $SAVE_PRE_PATH/checkpoint_last.pt \
133
+ --load-pretrained-speech-text-decoder $SAVE_PRE_PATH/checkpoint_last.pt --speech-mask-channel-length 64 --speech-mask-channel-prob 0.5 --speech-mask-length 10 \
134
+ --speech-mask-prob 0.65 --text-sample-ratio 0.05 --mask-text-ratio 0.3 --mask-text-type random --parallel-text-data data-bin-wt --text-input-cost-ratio 0.5 \
135
+ --langpairs en-fr --log-format json --max-tokens-valid 800000 --ddp-backend no_c10d --log-interval 100 --config-yaml config.yaml --skip-invalid-size-inputs-valid-test \
136
+ --noise-token '▁NOISE' --keep-last-epochs 40 --layernorm-embedding --encoder-learned-pos --decoder-learned-pos --activation-fn gelu \
137
+ --speech-extractor-mode default --max-target-positions 1024 --encoder-normalize-before --decoder-normalize-before
138
+ ```
139
+
140
+ ### Evaluation
141
+ The last 10 epoch models from fine-tuning is conducted model average to get $FINAL_MODEL
+ ```
+ python fairseq_cli/generate.py \
+ $S2T_DATA_PATH \
+ --task speech_text_joint_to_text \
+ --nbest 1 \
+ --max-tokens 800000 \
+ --max-source-positions 800000 \
+ --results-path $RESULTS_LOG \
+ --batch-size 512 \
+ --path $FINAL_MODEL \
+ --gen-subset $SUBSET \
+ --config-yaml config.yaml \
+ --scoring sacrebleu \
+ --beam 10 --lenpen 1.0 \
+ --user-dir examples/speech_text_joint_to_text --load-speech-only \
+ --model-overrides "{'load_pretrained_speech_text_decoder':'','load_pretrained_speech_text_encoder':''}"
+ ```
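+ With --results-path set, generate.py writes its hypotheses and scoring to a log file under $RESULTS_LOG (generate-$SUBSET.txt in recent fairseq versions; the exact file name is an assumption here), and the corpus-level sacrebleu result appears on the final "Generate" line:
+ ```
+ # inspect the corpus-level BLEU reported at the end of the generation log
+ tail -n 1 $RESULTS_LOG/generate-$SUBSET.txt
+ ```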
+
+
+ ### Results and models
+ | | en-fr | en-es | en-de |
+ |---|---|---|---|
+ | BLEU | 39.7 | 33.2 | 29.2 |
+
+
+ **Model Links**:
+ 1. DE
+ - [de config.yaml](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/acl2022/must_c/de/config.yaml)
+ - [de src_dict.txt](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/acl2022/must_c/de/src_dict.txt)
+ - [de tgt_dict.txt](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/acl2022/must_c/de/tgt_dict.txt)
+ - [de spm.model](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/acl2022/must_c/de/spm.model)
+ - [de pre-trained nmt model](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/acl2022/must_c/de/nmt.pt)
+ - [de pre-trained model](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/acl2022/must_c/de/checkpoint_pretraing.pt)
+ - [de fine-tuned model](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/acl2022/must_c/de/checkpoint_finetune_ave10.pt)
+ 2. ES
+ - [es config.yaml](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/acl2022/must_c/es/config.yaml)
+ - [es src_dict.txt](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/acl2022/must_c/es/src_dict.txt)
+ - [es tgt_dict.txt](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/acl2022/must_c/es/tgt_dict.txt)
+ - [es spm.model](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/acl2022/must_c/es/spm.model)
+ - [es pre-trained nmt model](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/acl2022/must_c/es/nmt.pt)
+ - [es pre-trained model](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/acl2022/must_c/es/checkpoint_pretraing.pt)
+ - [es fine-tuned model](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/acl2022/must_c/es/checkpoint_finetune_ave10.pt)
+ 3. FR
+ - [fr config.yaml](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/acl2022/must_c/fr/config.yaml)
+ - [fr src_dict.txt](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/acl2022/must_c/fr/src_dict.txt)
+ - [fr tgt_dict.txt](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/acl2022/must_c/fr/tgt_dict.txt)
+ - [fr spm.model](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/acl2022/must_c/fr/spm.model)
+ - [fr pre-trained nmt model](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/acl2022/must_c/fr/nmt.pt)
+ - [fr pre-trained model](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/acl2022/must_c/fr/checkpoint_pretraing.pt)
+ - [fr fine-tuned model](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/acl2022/must_c/fr/checkpoint_finetune_ave10.pt)
+ 4. [config_s2p.yaml](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/acl2022/must_c/config_s2p.yaml)
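+ For example, the DE assets listed above can be fetched with wget (a minimal sketch; $DOWNLOAD_DIR is a placeholder for your local target directory):
+ ```
+ # download the German config, dictionaries, sentencepiece model and checkpoints
+ mkdir -p $DOWNLOAD_DIR/de && cd $DOWNLOAD_DIR/de
+ for f in config.yaml src_dict.txt tgt_dict.txt spm.model nmt.pt checkpoint_pretraing.pt checkpoint_finetune_ave10.pt; do
+   wget https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/acl2022/must_c/de/$f
+ done
+ ```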
fairseq/examples/speech_text_joint_to_text/models/__init__.py ADDED
@@ -0,0 +1,8 @@
+ # Copyright (c) Facebook, Inc. and its affiliates.
+ #
+ # This source code is licensed under the MIT license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ import importlib
+ import os
+
fairseq/examples/speech_text_joint_to_text/models/joint_speech_text_pretrain_transformer.py ADDED
@@ -0,0 +1,698 @@
1
+ #!/usr/bin/env python3
2
+
3
+ import logging
4
+ from collections import OrderedDict, namedtuple
5
+ from typing import Dict, Optional
6
+
7
+ import torch
8
+ import torch.nn as nn
9
+ import torch.nn.functional as F
10
+ from torch import Tensor
11
+
12
+ from fairseq import checkpoint_utils, utils
13
+ from fairseq.file_io import PathManager
14
+ from fairseq.models import (
15
+ FairseqDecoder,
16
+ FairseqEncoderDecoderModel,
17
+ register_model,
18
+ register_model_architecture,
19
+ )
20
+ from fairseq.models.speech_to_text import (
21
+ MultiInputDecoder,
22
+ MultiModalityEncoder,
23
+ SpeechWavTransformerEncoder,
24
+ StackedSpeechWavTransformerEncoder,
25
+ )
26
+ from fairseq.models.transformer import (
27
+ TransformerDecoder,
28
+ TransformerEncoder,
29
+ TransformerModel,
30
+ )
31
+
32
+ logger = logging.getLogger(__name__)
33
+
34
+
35
+ class SpeechTextPreTrainEncoder(MultiModalityEncoder):
36
+ def __init__(
37
+ self,
38
+ dictionary,
39
+ sup_speech_encoder,
40
+ sup_s2s_speech_encoder,
41
+ unsup_speech_encoder,
42
+ text_encoder,
43
+ ):
44
+ super().__init__(dictionary)
45
+ self.sup_speech_encoder = sup_speech_encoder
46
+ self.sup_s2s_speech_encoder = sup_s2s_speech_encoder
47
+ self.unsup_speech_encoder = unsup_speech_encoder
48
+ self.text_encoder = text_encoder
49
+
50
+ @classmethod
51
+ def update_transformer_encoder_cfg(cls, args, update_dict):
52
+ cfg = dict(args._get_kwargs())
53
+ for fkey in update_dict.keys():
54
+ cfg[fkey] = update_dict[fkey]
55
+ cfg.pop("_name", None) # remove the internal "_name" key
56
+ model_args = namedtuple("args", cfg.keys())(*cfg.values())
57
+ return model_args
58
+
59
+ @classmethod
60
+ def build_text_encoder(cls, args, src_dictionary):
61
+ enc_emb = nn.Embedding(
62
+ len(src_dictionary), args.encoder_embed_dim, src_dictionary.pad()
63
+ )
64
+ model_args = cls.update_transformer_encoder_cfg(
65
+ args, {"encoder_layers": args.text_encoder_layers}
66
+ )
67
+ text_encoder = TransformerEncoder(model_args, src_dictionary, enc_emb)
68
+ return text_encoder
69
+
70
+ @classmethod
71
+ def build_speech_encoder(cls, args):
72
+ model_args = cls.update_transformer_encoder_cfg(
73
+ args,
74
+ {
75
+ "encoder_layers": args.speech_encoder_layers,
76
+ "speech_mask_prob": args.speech_sup_mask_prob,
77
+ },
78
+ )
79
+ speech_encoder = SpeechWavTransformerEncoder(model_args)
80
+ return speech_encoder
81
+
82
+ @classmethod
83
+ def share_layers(cls, src_layers, tgt_layers): # share layer but not dropout
84
+ # share parameters in src_layers with tgt_layers
85
+ assert len(src_layers) == len(tgt_layers)
86
+ for i, ly in enumerate(src_layers):
87
+ tly = tgt_layers[i]
88
+ tly.self_attn = ly.self_attn
89
+ tly.self_attn_layer_norm = ly.self_attn_layer_norm
90
+ tly.activation_fn = ly.activation_fn
91
+ tly.normalize_before = ly.normalize_before
92
+ tly.fc1 = ly.fc1
93
+ tly.fc2 = ly.fc2
94
+ tly.final_layer_norm = ly.final_layer_norm
95
+ if hasattr(tly, "encoder_attn"):
96
+ tly.encoder_attn = ly.encoder_attn
97
+ tly.encoder_attn_layer_norm = ly.encoder_attn_layer_norm
98
+ return tgt_layers
99
+
100
+ @classmethod
101
+ def build_unsup_speech_encoder(cls, args, sup_speech_encoder):
102
+ model_args = cls.update_transformer_encoder_cfg(
103
+ args,
104
+ {
105
+ "encoder_layers": args.speech_encoder_layers,
106
+ "speech_mask_prob": args.speech_unsup_mask_prob,
107
+ "encoder_layerdrop": 0.0,
108
+ "decoder_layerdrop": 0.0,
109
+ "dropout": args.speech_unsup_dropout,
110
+ "activation_dropout": args.speech_unsup_dropout,
111
+ "attention_dropout": 0.0,
112
+ "dropout_features": args.speech_unsup_feature_dropout,
113
+ "dropout_input": args.speech_unsup_feature_dropout,
114
+ },
115
+ )
116
+
117
+ unsup_speech_encoder = SpeechWavTransformerEncoder(model_args, alway_mask=True)
118
+ unsup_speech_encoder.layer_norm = sup_speech_encoder.layer_norm
119
+ unsup_speech_encoder.layers = cls.share_layers(
120
+ sup_speech_encoder.layers, unsup_speech_encoder.layers
121
+ )
122
+ unsup_speech_encoder.mask_emb = sup_speech_encoder.mask_emb
123
+ unsup_speech_encoder.embed_positions = sup_speech_encoder.embed_positions
124
+ unsup_speech_encoder.feat_layer_norm = sup_speech_encoder.feat_layer_norm
125
+ unsup_speech_encoder.feat_proj = sup_speech_encoder.feat_proj
126
+ unsup_speech_encoder.subsample = sup_speech_encoder.subsample
127
+ return unsup_speech_encoder
128
+
129
+ @classmethod
130
+ def build_encoder(cls, args, dictionary):
131
+ text_encoder = cls.build_text_encoder(args, dictionary)
132
+ if getattr(args, "load_pretrained_mbart_encoder_from", None):
133
+ text_encoder = checkpoint_utils.load_pretrained_component_from_model(
134
+ component=text_encoder,
135
+ checkpoint=args.load_pretrained_mbart_encoder_from,
136
+ )
137
+ speech_encoder = cls.build_speech_encoder(args)
138
+ if getattr(args, "load_pretrained_feature_extractor_from", None):
139
+
140
+ def load_feature_extractor(component, checkpoint):
141
+ if not PathManager.exists(checkpoint):
142
+ raise IOError("Model file not found: {}".format(checkpoint))
143
+ state = checkpoint_utils.load_checkpoint_to_cpu(checkpoint)
144
+ component_state_dict = OrderedDict()
145
+
146
+ component_prefix = "feature_extractor"
147
+ for key in state["model"].keys():
148
+ if key.startswith(component_prefix):
149
+ component_subkey = key[len(component_prefix) + 1 :]
150
+ component_state_dict[component_subkey] = state["model"][key]
151
+ component.load_state_dict(component_state_dict, strict=True)
152
+ return component
153
+
154
+ speech_encoder.subsample = load_feature_extractor(
155
+ speech_encoder.subsample, args.load_pretrained_feature_extractor_from
156
+ )
157
+ speech_s2s_encoder = speech_encoder
158
+ unsup_speech_encoder = cls.build_unsup_speech_encoder(args, speech_encoder)
159
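+ # "s2s" stacks the text encoder layers only on top of the supervised s2s speech encoder;
+ # "all" additionally stacks them on the supervised and unsupervised speech encoders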
+ if getattr(args, "stacked_encoder", "none") != "none":
160
+ if args.encoder_shared_text_layers_from_begin > 0:
161
+ raise ValueError(
162
+ "We can not stack encoders and share encoders at the same time!"
163
+ )
164
+ speech_s2s_encoder = StackedSpeechWavTransformerEncoder(
165
+ speech_encoder, text_encoder.layers, text_encoder.layer_norm
166
+ )
167
+ if args.stacked_encoder == "all":
168
+ speech_encoder = speech_s2s_encoder
169
+ unsup_speech_encoder = StackedSpeechWavTransformerEncoder(
170
+ unsup_speech_encoder, text_encoder.layers, text_encoder.layer_norm
171
+ )
172
+ else:
173
+ cls.share_speech_text_encoder(
174
+ speech_encoder, text_encoder, args.encoder_shared_text_layers_from_begin
175
+ )
176
+ return SpeechTextPreTrainEncoder(
177
+ dictionary,
178
+ speech_encoder,
179
+ speech_s2s_encoder,
180
+ unsup_speech_encoder,
181
+ text_encoder,
182
+ )
183
+
184
+ @classmethod
185
+ def share_speech_text_encoder(
186
+ cls, speech_encoder, text_encoder, shared_layers_from_begin
187
+ ):
188
+ if shared_layers_from_begin > 0:
189
+ num_text_encoder_layers = len(text_encoder.layers)
190
+ assert len(speech_encoder.layers) >= shared_layers_from_begin
191
+ assert num_text_encoder_layers >= shared_layers_from_begin
192
+ assert len(speech_encoder.layers) >= num_text_encoder_layers
193
+ for i, ly in enumerate(
194
+ speech_encoder.layers[
195
+ -num_text_encoder_layers : -num_text_encoder_layers
196
+ + shared_layers_from_begin
197
+ ]
198
+ ):
199
+ assert isinstance(text_encoder.layers[i], type(ly))
200
+ text_encoder.layers[i] = ly
201
+
202
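+ # pick the encoder matching the batch's data mode (supervised speech, unsupervised speech, or text/bitext)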
+ def select_encoder(self, mode, **kwargs):
203
+ if mode in ("speech", "sup_speech_ctc", "sup_speech_ali", "sup_speech_s2s"):
204
+ kwargs["features_only"] = True
205
+ if mode == "sup_speech_s2s":
206
+ return self.sup_s2s_speech_encoder, kwargs
207
+ return self.sup_speech_encoder, kwargs
208
+ elif mode == "unsup_speech":
209
+ kwargs["features_only"] = False
210
+ return self.unsup_speech_encoder, kwargs
211
+ elif mode in ("text", "bitext"):
212
+ return self.text_encoder, kwargs
213
+ else:
214
+ raise NotImplementedError(f"{mode} is not supported")
215
+ return None, kwargs
216
+
217
+ def forward(self, src_tokens, src_lengths=None, mode="", alignment=None, **kwargs):
218
+ return super().forward(src_tokens, src_lengths, mode, **kwargs)
219
+
220
+
221
+ # SpeechDummyDecoder works as an extension of encoder, so we could fit encoder only training into seq2seq training
222
+ class SpeechDummyDecoder(FairseqDecoder):
223
+ def __init__(
224
+ self,
225
+ dictionary,
226
+ output_embedding,
227
+ no_emb_update_unsup=False,
228
+ use_output_proj=False,
229
+ ):
230
+ super().__init__(dictionary)
231
+ self.output_embedding = output_embedding
232
+ num_embedding, num_dim = self.output_embedding.weight.size()
233
+ self.out_proj = (
234
+ None if use_output_proj is False else nn.Linear(num_dim, num_dim)
235
+ )
236
+ self.no_emb_update_unsup = no_emb_update_unsup
237
+
238
+ def extend_alignment(self, alignment, src_lengths, prev_output_tokens):
239
+ # alignment: B X N
240
+ # src_lengths: B X T
241
+ # prev_output_tokens: B X (N + 1)
242
+ tgt_tokens = prev_output_tokens[
243
+ :, 1:
244
+ ] # remove the leading start of sentence token
245
+ ext_alignment = (
246
+ torch.ones(len(src_lengths), src_lengths.max(), device=src_lengths.device)
247
+ .long()
248
+ .fill_(self.dictionary.pad())
249
+ )
250
+ for bs in range(src_lengths.size(0)):
251
+ tgt_length = tgt_tokens[bs].ne(self.dictionary.pad()).sum().item()
252
+ assert tgt_length == sum(alignment[bs].ne(1)) + 1
253
+ src_st = 0
254
+ for i in range(tgt_length):
255
+ tok = tgt_tokens[bs][i]
256
+ src_ed = (alignment[bs][i] * src_lengths[bs]).int().item()
257
+ ext_alignment[bs][src_st:src_ed].fill_(tok)
258
+ src_st = src_ed
259
+ return ext_alignment
260
+
261
+ def forward(
262
+ self,
263
+ prev_output_tokens,
264
+ encoder_out,
265
+ incremental_state=None,
266
+ mode="speech",
267
+ alignment=None,
268
+ **kwargs,
269
+ ):
270
+ """
271
+ Args:
272
+ prev_output_tokens (LongTensor): previous decoder outputs of shape
273
+ `(batch, tgt_len)`, for teacher forcing
274
+ encoder_out (optional): output from the encoder, used for
275
+ encoder-side attention
276
+ incremental_state (dict): dictionary used for storing state during
277
+ :ref:`Incremental decoding`
278
+ features_only (bool, optional): only return features without
279
+ applying output layer (default: False).
280
+ full_context_alignment (bool, optional): don't apply
281
+ auto-regressive mask to self-attention (default: False).
282
+
283
+ Returns:
284
+ sup_speech_ctc:
285
+ dictionary{"logits": logits, "padding_mask": padding_mask}
286
+ sup_speech_ali and unsup_speech:
287
+ tuple:
288
+ - the decoder's output of shape `(batch, tgt_len, vocab)`
289
+ - a dictionary with any model-specific outputs
290
+ """
291
+ emb_weight = self.output_embedding.weight
292
+ if (
293
+ mode == "unsup_speech" and self.no_emb_update_unsup
294
+ ): # no gradient for embedding here
295
+ emb_weight = emb_weight.detach()
296
+ enc_out = (
297
+ encoder_out["encoder_out"][0]
298
+ if self.out_proj is None
299
+ else self.out_proj(encoder_out["encoder_out"][0])
300
+ )
301
+ logits = F.linear(enc_out, emb_weight, None).transpose(0, 1) # B X T X C
302
+ others = None
303
+ if mode in (
304
+ "speech",
305
+ "sup_speech_ctc",
306
+ ): # speech data with label, do force alignment
307
+ if len(encoder_out["encoder_padding_mask"]) > 0:
308
+ padding_mask = encoder_out["encoder_padding_mask"][0]
309
+ logits = logits.masked_fill(padding_mask, float("-inf"))
310
+ else:
311
+ seq_len, bsz = encoder_out["encoder_out"][0].size()[:2]
312
+ padding_mask = torch.zeros(
313
+ bsz, seq_len, device=encoder_out["encoder_out"][0].device
314
+ ).bool()
315
+ return {"x": logits, "padding_mask": padding_mask}
316
+ elif mode == "sup_speech_ali":
317
+ src_lengths = None
318
+ if len(encoder_out["encoder_padding_mask"]) > 0:
319
+ src_lengths = (1 - encoder_out["encoder_padding_mask"][0].long()).sum(
320
+ -1
321
+ )
322
+ else:
323
+ seq_len, bsz = encoder_out["encoder_out"][0].size()[:2]
324
+ src_lengths = (
325
+ torch.ones(bsz, device=encoder_out["encoder_out"][0].device).long()
326
+ * seq_len
327
+ )
328
+ assert alignment is not None
329
+ alignment = self.extend_alignment(
330
+ alignment, src_lengths, prev_output_tokens
331
+ )
332
+ others = {"pseudo_target_tokens": alignment}
333
+ elif mode == "unsup_speech":
334
+ enc_out_ori = (
335
+ encoder_out["encoder_unmasked_out"][0]
336
+ if self.out_proj is None
337
+ else self.out_proj(encoder_out["encoder_unmasked_out"][0])
338
+ )
339
+ logits_ori = F.linear(enc_out_ori, emb_weight, None).transpose(0, 1)
340
+ if len(encoder_out["encoder_padding_mask"]) > 0:
341
+ encoder_padding_mask = encoder_out["encoder_padding_mask"][0]
342
+ logits_ori = logits_ori.masked_fill(encoder_padding_mask, float("-inf"))
343
+ pseudo_labels = utils.log_softmax(logits_ori, dim=-1)
344
+ others = {
345
+ "pseudo_target_logprobs": pseudo_labels,
346
+ "padding_mask": encoder_out["encoder_padding_mask"], # B X T
347
+ "mask_indices": encoder_out[
348
+ "mask_indices"
349
+ ], # True for masked frames B X T
350
+ }
351
+ return logits, others
352
+
353
+ def get_normalized_probs(
354
+ self,
355
+ net_output: Dict[str, Tensor],
356
+ log_probs: bool,
357
+ sample: Optional[Dict[str, Tensor]] = None,
358
+ ):
359
+ return self.get_normalized_probs_scriptable(
360
+ (net_output["x"], None), log_probs, sample
361
+ )
362
+
363
+
364
+ class SpeechTextPreTrainDecoder(MultiInputDecoder):
365
+ def __init__(self, dictionary, speech_decoder, text_decoder):
366
+ super().__init__(dictionary)
367
+ self.speech_decoder = speech_decoder
368
+ self.text_decoder = text_decoder
369
+
370
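+ # dispatch to the dummy speech decoder (CTC, alignment and unsupervised speech objectives)
+ # or to the text decoder (text, bitext and supervised speech-to-text s2s)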
+ def select_decoder(self, mode, **kwargs):
371
+ if mode == "unsup_speech":
372
+ kwargs["mode"] = mode
373
+ return self.speech_decoder, kwargs
374
+ if mode in ("text", "bitext"):
375
+ return self.text_decoder, kwargs
376
+ if mode in ("speech", "sup_speech_ctc", "sup_speech_ali"):
377
+ kwargs["mode"] = mode
378
+ return self.speech_decoder, kwargs
379
+ if mode in ("speech", "sup_speech_s2s"):
380
+ if "alignment" in kwargs:
381
+ del kwargs["alignment"]
382
+ return self.text_decoder, kwargs
383
+
384
+ raise NotImplementedError(f"{mode} is not supported")
385
+ return None, kwargs
386
+
387
+ def get_normalized_probs(
388
+ self,
389
+ net_output,
390
+ log_probs,
391
+ sample=None,
392
+ ):
393
+ """Get normalized probabilities (or log probs) from a net's output."""
394
+ if isinstance(net_output, dict):
395
+ return self.speech_decoder.get_normalized_probs(
396
+ net_output, log_probs, sample
397
+ )
398
+ return self.text_decoder.get_normalized_probs(net_output, log_probs, sample)
399
+
400
+ @classmethod
401
+ def build_text_decoder(cls, args, tgt_dictionary, dec_emb_share=None):
402
+ dec_emb = (
403
+ nn.Embedding(
404
+ len(tgt_dictionary), args.decoder_embed_dim, tgt_dictionary.pad()
405
+ )
406
+ if dec_emb_share is None
407
+ else dec_emb_share
408
+ )
409
+ text_decoder = TransformerDecoder(args, tgt_dictionary, dec_emb)
410
+ return text_decoder
411
+
412
+ @classmethod
413
+ def build_dummy_speech_decoder(cls, args, dictionary, dec_emb_share=None):
414
+ dec_emb = (
415
+ nn.Embedding(len(dictionary), args.decoder_embed_dim, dictionary.pad())
416
+ if dec_emb_share is None
417
+ else dec_emb_share
418
+ )
419
+ speech_decoder = SpeechDummyDecoder(
420
+ dictionary,
421
+ dec_emb,
422
+ no_emb_update_unsup=getattr(args, "no_emb_update_unsup", False),
423
+ use_output_proj=getattr(args, "use_decoder_output_proj", False),
424
+ )
425
+ return speech_decoder
426
+
427
+ @classmethod
428
+ def build_decoder(
429
+ cls, args, text_dictionary, speech_dictionary, speech_output_embedding
430
+ ):
431
+ text_decoder = cls.build_text_decoder(args, text_dictionary)
432
+ speech_decoder = cls.build_dummy_speech_decoder(
433
+ args, speech_dictionary, speech_output_embedding
434
+ )
435
+ if getattr(args, "load_pretrained_mbart_decoder_from", None):
436
+ text_decoder = checkpoint_utils.load_pretrained_component_from_model(
437
+ component=text_decoder,
438
+ checkpoint=args.load_pretrained_mbart_decoder_from,
439
+ )
440
+ return SpeechTextPreTrainDecoder(text_dictionary, speech_decoder, text_decoder)
441
+
442
+
443
+ @register_model("speech_text_pretrain_bart")
444
+ class SpeechTextPreTrainModel(FairseqEncoderDecoderModel):
445
+ def __init__(self, encoder, decoder):
446
+ super().__init__(encoder, decoder)
447
+ self.num_updates = 0
448
+
449
+ def forward(
450
+ self, src_tokens, src_lengths, prev_output_tokens, src_lang_ids=None, **kwargs
451
+ ):
452
+ if src_lang_ids is not None:
453
+ encoder_out = self.encoder(
454
+ src_tokens, src_lengths=src_lengths, src_lang_ids=src_lang_ids, **kwargs
455
+ )
456
+ else:
457
+ encoder_out = self.encoder(src_tokens, src_lengths=src_lengths, **kwargs)
458
+ decoder_out = self.decoder(
459
+ prev_output_tokens, encoder_out=encoder_out, **kwargs
460
+ )
461
+ return decoder_out
462
+
463
+ def max_positions(self):
464
+ return None # it is provided in task
465
+
466
+ def get_targets(self, sample, net_output):
467
+ mode = sample["net_input"]["mode"]
468
+ if mode == "unsup_speech":
469
+ return {"target_logprobs": net_output[1]["pseudo_target_logprobs"]}
470
+ if mode == "sup_speech_ali":
471
+ return net_output[1]["pseudo_target_tokens"]
472
+ return sample["target"]
473
+
474
+ def get_normalized_probs(
475
+ self,
476
+ net_output,
477
+ log_probs,
478
+ sample=None,
479
+ ):
480
+ # net_output['encoder_out'] is a (B, T, D) tensor
481
+ lprobs = self.get_normalized_probs_scriptable(net_output, log_probs, sample)
482
+ lprobs.batch_first = True
483
+ return lprobs
484
+
485
+ @staticmethod
486
+ def add_args(parser):
487
+ TransformerModel.add_args(parser)
488
+ SpeechWavTransformerEncoder.add_args(parser)
489
+ parser.add_argument(
490
+ "--speech-sup-mask-prob",
491
+ type=float,
492
+ help="probability of replacing a token with mask (sup-speech)",
493
+ )
494
+ parser.add_argument(
495
+ "--speech-unsup-mask-prob",
496
+ type=float,
497
+ help="probability of replacing a token with mask (unsup-speech)",
498
+ )
499
+ parser.add_argument(
500
+ "--load-pretrained-mbart-encoder-from",
501
+ type=str,
502
+ metavar="STR",
503
+ help="model to take text encoder weights from (for initialization)",
504
+ )
505
+
506
+ parser.add_argument(
507
+ "--load-pretrained-mbart-decoder-from",
508
+ type=str,
509
+ metavar="STR",
510
+ help="model to take text decoder weights from (for initialization)",
511
+ )
512
+
513
+ parser.add_argument(
514
+ "--load-pretrained-feature-extractor-from",
515
+ type=str,
516
+ metavar="STR",
517
+ help="model to take feature extractor weights from (for initialization)",
518
+ )
519
+
520
+ parser.add_argument(
521
+ "--speech-unsup-dropout",
522
+ type=float,
523
+ default=0,
524
+ help="dropout for unsupervised speech encoder",
525
+ )
526
+
527
+ parser.add_argument(
528
+ "--speech-unsup-feature-dropout",
529
+ type=float,
530
+ default=0,
531
+ help="dropout for unsupervised speech feature encoder",
532
+ )
533
+
534
+ parser.add_argument(
535
+ "--encoder-shared-text-layers-from-begin",
536
+ type=int,
537
+ help="number of text encoder layers shared with speech encoder (from first layer)",
538
+ )
539
+
540
+ parser.add_argument(
541
+ "--stacked-encoder",
542
+ default="none",
543
+ choices=["none", "s2s", "all"],
544
+ help="stack speech and text encoders",
545
+ )
546
+
547
+ parser.add_argument("--use-decoder-output-proj", action="store_true")
548
+
549
+ @classmethod
550
+ def build_model(cls, args, task):
551
+ encoder = SpeechTextPreTrainEncoder.build_encoder(args, task.src_dict)
552
+ decoder = SpeechTextPreTrainDecoder.build_decoder(
553
+ args, task.tgt_dict, task.src_dict, encoder.text_encoder.embed_tokens
554
+ )
555
+ model = SpeechTextPreTrainModel(encoder, decoder)
556
+ return model
557
+
558
+ def upgrade_state_dict(self, state_dict):
559
+ """Upgrade old state dicts to work with newer code."""
560
+ if "decoder.speech_decoder.output_projection.weight" in state_dict:
561
+ del state_dict["decoder.speech_decoder.output_projection.weight"]
562
+ self.upgrade_state_dict_named(state_dict, "")
563
+
564
+
565
+ @register_model_architecture(
566
+ "speech_text_pretrain_bart", "speech_text_pretrain_bart_base"
567
+ )
568
+ def speech_text_pretrain_bart_base(args):
569
+ # speech masking
570
+ args.dropout_input = getattr(args, "dropout_input", 0)
571
+ args.dropout_features = getattr(args, "dropout_features", 0)
572
+ args.speech_mask_length = getattr(args, "speech_mask_length", 10)
573
+ args.speech_mask_prob = getattr(args, "speech_mask_prob", 0.65)
574
+ args.speech_sup_mask_prob = getattr(args, "speech_sup_mask_prob", 0.3)
575
+ args.speech_unsup_mask_prob = getattr(
576
+ args, "speech_unsup_mask_prob", args.speech_mask_prob
577
+ )
578
+ args.speech_mask_selection = getattr(args, "speech_mask_selection", "static")
579
+ args.speech_mask_other = getattr(args, "speech_mask_other", 0)
580
+ args.speech_mask_min_space = getattr(args, "speech_mask_min_space", 1)
581
+ args.speech_no_mask_overlap = getattr(args, "speech_no_mask_overlap", False)
582
+
583
+ args.speech_mask_channel_length = getattr(args, "speech_mask_channel_length", 10)
584
+ args.speech_mask_channel_prob = getattr(args, "speech_mask_channel_prob", 0.0)
585
+ args.speech_mask_channel_selection = getattr(
586
+ args, "speech_mask_channel_selection", "static"
587
+ )
588
+ args.speech_mask_channel_other = getattr(args, "speech_mask_channel_other", 0)
589
+ args.speech_mask_channel_min_space = getattr(
590
+ args, "speech_mask_channel_min_space", 1
591
+ )
592
+ args.speech_no_mask_channel_overlap = getattr(
593
+ args, "speech_no_mask_channel_overlap", False
594
+ )
595
+ args.no_scale_feature = getattr(args, "no_scale_feature", False)
596
+ args.feature_grad_mult = getattr(args, "feature_grad_mult", 1.0) # 0.1
597
+
598
+ # Transformer
599
+ args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 768)
600
+ args.encoder_ffn_embed_dim = getattr(
601
+ args, "encoder_ffn_embed_dim", args.encoder_embed_dim * 4
602
+ )
603
+ args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 12)
604
+ args.encoder_normalize_before = getattr(args, "encoder_normalize_before", False)
605
+ args.encoder_layerdrop = getattr(args, "encoder_layerdrop", 0)
606
+ args.encoder_learned_pos = getattr(args, "encoder_learned_pos", False)
607
+ args.speech_conv_bias = getattr(args, "speech_conv_bias", False)
608
+
609
+ args.decoder_embed_dim = getattr(args, "decoder_embed_dim", args.encoder_embed_dim)
610
+ args.decoder_ffn_embed_dim = getattr(
611
+ args, "decoder_ffn_embed_dim", args.encoder_ffn_embed_dim
612
+ )
613
+ args.decoder_attention_heads = getattr(
614
+ args, "decoder_attention_heads", args.encoder_attention_heads
615
+ )
616
+ args.decoder_normalize_before = getattr(args, "decoder_normalize_before", False)
617
+ args.decoder_learned_pos = getattr(args, "decoder_learned_pos", False)
618
+ args.dropout = getattr(args, "dropout", 0.1)
619
+ args.attention_dropout = getattr(args, "attention_dropout", args.dropout)
620
+ args.activation_dropout = getattr(args, "activation_dropout", 0.0)
621
+ args.activation_fn = getattr(args, "activation_fn", "relu") # gelu?
622
+ args.adaptive_softmax_cutoff = getattr(args, "adaptive_softmax_cutoff", None)
623
+ args.adaptive_softmax_dropout = getattr(args, "adaptive_softmax_dropout", 0)
624
+
625
+ args.speech_unsup_dropout = getattr(args, "speech_unsup_dropout", 0)
626
+ args.speech_unsup_feature_dropout = getattr(args, "speech_unsup_feature_dropout", 0)
627
+
628
+ args.tie_adaptive_weights = getattr(args, "tie_adaptive_weights", False)
629
+ args.share_decoder_input_output_embed = getattr(
630
+ args, "share_decoder_input_output_embed", False
631
+ )
632
+ args.no_token_positional_embeddings = getattr(
633
+ args, "no_token_positional_embeddings", False
634
+ )
635
+ args.adaptive_input = getattr(args, "adaptive_input", False)
636
+ args.decoder_layerdrop = getattr(args, "decoder_layerdrop", 0.0)
637
+ args.decoder_output_dim = getattr(
638
+ args, "decoder_output_dim", args.decoder_embed_dim
639
+ )
640
+ args.layernorm_embedding = getattr(args, "layernorm_embedding", False)
641
+ args.no_scale_embedding = getattr(args, "no_scale_embedding", False)
642
+ args.quant_noise_pq = getattr(args, "quant_noise_pq", 0)
643
+
644
+ args.speech_encoder_layers = getattr(args, "speech_encoder_layers", 12)
645
+ args.text_encoder_layers = getattr(args, "text_encoder_layers", 6)
646
+ args.encoder_shared_text_layers_from_begin = getattr(
647
+ args, "encoder_shared_text_layers_from_begin", 6
648
+ )
649
+ args.decoder_layers = getattr(args, "decoder_layers", 6)
650
+
651
+ args.no_emb_update_unsup = getattr(args, "no_emb_update_unsup", False)
652
+
653
+
654
+ @register_model_architecture(
655
+ "speech_text_pretrain_bart", "speech_text_pretrain_bart_base_stack"
656
+ )
657
+ def speech_text_pretrain_bart_base_stack(args):
658
+ args.speech_encoder_layers = getattr(args, "speech_encoder_layers", 6)
659
+ args.text_encoder_layers = getattr(args, "text_encoder_layers", 6)
660
+ args.encoder_shared_text_layers_from_begin = getattr(
661
+ args, "encoder_shared_text_layers_from_begin", 0
662
+ )
663
+ args.stacked_encoder = getattr(args, "stacked_encoder", "all")
664
+ args.layernorm_embedding = getattr(args, "layernorm_embedding", True)
665
+ speech_text_pretrain_bart_base(args)
666
+
667
+
668
+ @register_model_architecture(
669
+ "speech_text_pretrain_bart", "speech_text_pretrain_bart_large"
670
+ )
671
+ def speech_text_pretrain_bart_large(args):
672
+ args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 1024)
673
+ args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 16)
674
+ args.speech_encoder_layers = getattr(args, "speech_encoder_layers", 24)
675
+ args.text_encoder_layers = getattr(args, "text_encoder_layers", 12)
676
+ args.encoder_shared_text_layers_from_begin = getattr(
677
+ args, "encoder_shared_text_layers_from_begin", 12
678
+ )
679
+ args.decoder_layers = getattr(args, "decoder_layers", 12)
680
+ args.dropout = getattr(args, "dropout", 0.3)
681
+ speech_text_pretrain_bart_base(args)
682
+
683
+
684
+ @register_model_architecture(
685
+ "speech_text_pretrain_bart", "speech_text_pretrain_bart_large_stack"
686
+ )
687
+ def speech_text_pretrain_bart_large_stack(args):
688
+ args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 1024)
689
+ args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 16)
690
+ args.speech_encoder_layers = getattr(args, "speech_encoder_layers", 6)
691
+ args.text_encoder_layers = getattr(args, "text_encoder_layers", 12)
692
+ args.encoder_shared_text_layers_from_begin = getattr(
693
+ args, "encoder_shared_text_layers_from_begin", 0
694
+ )
695
+ args.decoder_layers = getattr(args, "decoder_layers", 12)
696
+ args.stacked_encoder = getattr(args, "stacked_encoder", "s2s")
697
+ args.layernorm_embedding = getattr(args, "layernorm_embedding", True)
698
+ speech_text_pretrain_bart_base(args)
fairseq/examples/speech_text_joint_to_text/models/s2t_dualinputtransformer.py ADDED
@@ -0,0 +1,1093 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import logging
7
+ from collections import namedtuple
8
+
9
+ import torch
10
+ import torch.nn as nn
11
+ from fairseq import checkpoint_utils
12
+ from fairseq import utils
13
+ from fairseq.models import (
14
+ FairseqEncoder,
15
+ FairseqDecoder,
16
+ FairseqEncoderDecoderModel,
17
+ register_model,
18
+ register_model_architecture,
19
+ )
20
+ from fairseq.models.fairseq_encoder import EncoderOut
21
+ from fairseq.models.speech_to_text import (
22
+ TransformerDecoder,
23
+ S2TTransformerEncoder,
24
+ )
25
+ from fairseq.models.transformer import TransformerEncoder
26
+ from fairseq.modules import (
27
+ TransformerEncoderLayer,
28
+ GradMultiply,
29
+ LayerNorm,
30
+ )
31
+
32
+ logger = logging.getLogger(__name__)
33
+
34
+
35
+ class SpeechEoSEncoder(FairseqEncoder):
36
+ def __init__(self, encoder, eos_num, feat_dim, adapter_type="None", adapter_dim=0):
37
+ super().__init__(None)
38
+ self.encoder = encoder
39
+ self.eos_num = eos_num # number of EoS frames to append; set from the downsampling rate of the speech input features
40
+ self.eos_emb = (
41
+ nn.Parameter(torch.zeros(1, feat_dim), requires_grad=True)
42
+ if eos_num > 0
43
+ else None
44
+ )
45
+ self.adapter = self.add_adapter(adapter_type, adapter_dim)
46
+
47
+ def add_adapter(self, adapter_type, adapter_dim):
48
+ def _make_identity(linear, eps=1e-5):
49
+ assert isinstance(linear, nn.Linear)
50
+ linear.weight.data.mul_(eps)
51
+ linear.weight.data.fill_diagonal_(1.0)
52
+ if linear.bias is not None:
53
+ linear.bias.data.mul_(eps)
54
+
55
+ adapter = None
56
+ if adapter_type == "Linear":
57
+ assert adapter_dim > 0
58
+ adapter = nn.Sequential(
59
+ nn.Linear(adapter_dim, adapter_dim), LayerNorm(adapter_dim)
60
+ )
61
+ # initialize the adapter as identity matrix first
62
+ _make_identity(adapter[0])
63
+
64
+ elif adapter_type == "MLP":
65
+ assert adapter_dim > 0
66
+ # assume the model is pre-norm model
67
+ adapter = nn.Sequential(
68
+ nn.Linear(adapter_dim, 2 * adapter_dim),
69
+ nn.ReLU(),
70
+ nn.Linear(2 * adapter_dim, adapter_dim),
71
+ LayerNorm(adapter_dim),
72
+ )
73
+ _make_identity(adapter[0])
74
+ _make_identity(adapter[2])
75
+ return adapter
76
+
77
+ def add_eos(self, src_tokens, src_lengths):
78
+ bsz, max_seq_len, fdim = src_tokens.size()
79
+ if self.eos_num > 0:
80
+ src_token_eos = torch.zeros(
81
+ [bsz, max_seq_len + self.eos_num, fdim],
82
+ dtype=src_tokens.dtype,
83
+ device=src_tokens.device,
84
+ )
85
+ src_token_eos[:, :max_seq_len] = src_tokens
86
+ for bi in range(bsz):
87
+ src_token_eos[bi][
88
+ src_lengths[bi] : src_lengths[bi] + self.eos_num
89
+ ] = self.eos_emb.expand(self.eos_num, fdim)
90
+ src_lengths = src_lengths + self.eos_num
91
+ src_tokens = src_token_eos
92
+ return src_tokens, src_lengths
93
+
94
+ def apply_adapter(self, enc_out):
95
+ if self.adapter is None:
96
+ return enc_out
97
+ rst = self.adapter(enc_out.encoder_out)
98
+ if enc_out.encoder_padding_mask is not None:
99
+ rst.masked_fill_(
100
+ enc_out.encoder_padding_mask.transpose(0, 1).unsqueeze(-1), 0
101
+ )
102
+ return EncoderOut(
103
+ encoder_out=rst,
104
+ encoder_padding_mask=enc_out.encoder_padding_mask,
105
+ encoder_embedding=enc_out.encoder_embedding,
106
+ encoder_states=enc_out.encoder_states,
107
+ src_tokens=enc_out.src_tokens,
108
+ src_lengths=enc_out.src_lengths,
109
+ )
110
+
111
+ def forward(self, src_tokens, src_lengths=None, return_all_hiddens=False, **kwargs):
112
+ """
113
+ src_tokens: padded tensor (B, T, C * feat)
114
+ src_lengths: tensor of original lengths of input utterances (B,)
115
+ """
116
+ src_tokens, src_lengths = self.add_eos(src_tokens, src_lengths)
117
+ enc_out = self.encoder(src_tokens, src_lengths, return_all_hiddens)
118
+ enc_out = self.apply_adapter(enc_out)
119
+ return enc_out
120
+
121
+ def reorder_encoder_out(self, encoder_out, new_order):
122
+ return self.encoder.reorder_encoder_out(encoder_out, new_order)
123
+
124
+
125
+ class DualInputEncoder(FairseqEncoder):
126
+ def __init__(
127
+ self,
128
+ args,
129
+ spch_encoder,
130
+ text_encoder,
131
+ dictionary,
132
+ cross_attentive_loss_before_last_layer=-1,
133
+ ):
134
+ super().__init__(dictionary)
135
+
136
+ self.spch_encoder = spch_encoder
137
+ self.text_encoder = text_encoder
138
+ self.enc_grad_mult = args.enc_grad_mult
139
+ self.cross_attentive_loss_before_last_layer = (
140
+ cross_attentive_loss_before_last_layer
141
+ )
142
+ self.use_cross_attentive_loss = (
143
+ False if cross_attentive_loss_before_last_layer <= -1 else True
144
+ )
145
+ self.enc2_along_grad_mult = args.enc2_along_grad_mult
146
+
147
+ @classmethod
148
+ def set_shared_layer(cls, share_level, src_layer, tgt_layer):
149
+ """
150
+ share parameters from tgt_layer to src_layer
151
+ share_level:
152
+ 0: share everything
153
+ 1: share everything but different model
154
+ 2: share weight but not bias, layernorm
155
+ """
156
+ if share_level == 0:
157
+ return tgt_layer
158
+ if isinstance(src_layer, nn.Linear):
159
+ return tgt_layer
160
+ if isinstance(src_layer, TransformerEncoderLayer):
161
+ assert src_layer.embed_dim == tgt_layer.embed_dim
162
+ assert src_layer.normalize_before == tgt_layer.normalize_before
163
+ if share_level == 1:
164
+ src_layer.fc1 = tgt_layer.fc1
165
+ src_layer.fc2 = tgt_layer.fc2
166
+ src_layer.self_attn = tgt_layer.self_attn
167
+ src_layer.final_layer_norm = tgt_layer.final_layer_norm
168
+ src_layer.self_attn_layer_norm = tgt_layer.self_attn_layer_norm
169
+ src_layer.layernorm_embedding = tgt_layer.layernorm_embedding
170
+ else:
171
+ src_layer.fc1.weight = tgt_layer.fc1.weight
172
+ src_layer.fc2.weight = tgt_layer.fc2.weight
173
+ src_layer.self_attn.k_proj.weight = tgt_layer.self_attn.k_proj.weight
174
+ src_layer.self_attn.v_proj.weight = tgt_layer.self_attn.v_proj.weight
175
+ src_layer.self_attn.q_proj.weight = tgt_layer.self_attn.q_proj.weight
176
+ src_layer.self_attn.out_proj.weight = (
177
+ tgt_layer.self_attn.out_proj.weight
178
+ )
179
+ else:
180
+ if share_level == 1:
181
+ return tgt_layer
182
+ return src_layer
183
+
184
+ @classmethod
185
+ def build_spch_encoder(cls, args):
186
+ cfg = {
187
+ "input_feat_per_channel": args.input_feat_per_channel,
188
+ "input_channels": args.input_channels,
189
+ "conv_kernel_sizes": args.conv_kernel_sizes,
190
+ "conv_channels": args.conv_channels,
191
+ "encoder_embed_dim": args.encoder_embed_dim,
192
+ "encoder_ffn_embed_dim": args.encoder_ffn_embed_dim,
193
+ "encoder_layers": args.speech_encoder_layers,
194
+ "encoder_layerdrop": args.encoder_layerdrop,
195
+ "encoder_attention_heads": args.encoder_attention_heads,
196
+ "max_source_positions": args.max_source_positions,
197
+ "dropout": args.dropout,
198
+ "encoder_normalize_before": args.encoder_normalize_before,
199
+ "activation_dropout": args.activation_dropout,
200
+ "attention_dropout": args.attention_dropout,
201
+ "activation_fn": args.activation_fn,
202
+ "layernorm_embedding": args.layernorm_embedding,
203
+ "no_token_positional_embeddings": args.no_token_positional_embeddings,
204
+ "no_scale_embedding": args.no_scale_embedding,
205
+ "quant_noise_pq": args.quant_noise_pq,
206
+ "encoder_freezing_updates": 0,
207
+ }
208
+ model_args = namedtuple("args", cfg.keys())(*cfg.values())
209
+ spch_encoder = S2TTransformerEncoder(model_args)
210
+ if args.add_speech_eos:
211
+ spch_encoder = SpeechEoSEncoder(
212
+ spch_encoder,
213
+ 2 * len(args.conv_kernel_sizes.split(",")),
214
+ args.input_feat_per_channel,
215
+ adapter_type=getattr(args, "speech_encoder_adapter_type", "None"),
216
+ adapter_dim=args.encoder_embed_dim,
217
+ )
218
+ return spch_encoder
219
+
220
+ @classmethod
221
+ def build_text_encoder(cls, args, src_dictionary, spch_encoder):
222
+ if args.encoder_shared_layers > 0:
223
+ mx_shared_layers = (
224
+ args.speech_encoder_layers
225
+ if args.speech_encoder_layers < args.text_encoder_layers
226
+ else args.text_encoder_layers
227
+ )
228
+ args.encoder_shared_layers = (
229
+ args.encoder_shared_layers
230
+ if args.encoder_shared_layers <= mx_shared_layers
231
+ else mx_shared_layers
232
+ )
233
+ cfg = {
234
+ "encoder_embed_dim": args.encoder_text_embed_dim,
235
+ "encoder_ffn_embed_dim": args.encoder_ffn_embed_dim,
236
+ "encoder_layers": args.text_encoder_layers,
237
+ "encoder_layerdrop": args.encoder_layerdrop,
238
+ "encoder_attention_heads": args.encoder_attention_heads,
239
+ "encoder_learned_pos": args.encoder_learned_pos,
240
+ "max_source_positions": args.max_source_positions,
241
+ "dropout": args.dropout,
242
+ "encoder_normalize_before": args.encoder_normalize_before,
243
+ "activation_dropout": args.activation_dropout,
244
+ "attention_dropout": args.attention_dropout,
245
+ "activation_fn": args.activation_fn,
246
+ "adaptive_input": args.adaptive_input,
247
+ "no_token_positional_embeddings": args.no_token_positional_embeddings,
248
+ "no_scale_embedding": args.no_scale_embedding,
249
+ "quant_noise_pq": args.quant_noise_pq,
250
+ }
251
+ model_args = namedtuple("args", cfg.keys())(*cfg.values())
252
+ enc_emb = nn.Embedding(
253
+ len(src_dictionary), model_args.encoder_embed_dim, src_dictionary.pad()
254
+ )
255
+ text_encoder = TransformerEncoder(model_args, src_dictionary, enc_emb)
256
+ if args.add_speech_eos:
257
+ spch_encoder = spch_encoder.encoder
258
+ if args.encoder_shared_layers > 0:
259
+ text_encoder.layer_norm = cls.set_shared_layer(
260
+ args.encoder_shared_layer_level,
261
+ text_encoder.layer_norm,
262
+ spch_encoder.layer_norm,
263
+ )
264
+ for i, ly in enumerate(
265
+ spch_encoder.transformer_layers[-args.encoder_shared_layers :]
266
+ ):
267
+ ly_id = i + args.text_encoder_layers - args.encoder_shared_layers
268
+ if not isinstance(text_encoder.layers[ly_id], type(ly)):
269
+ if text_encoder.layers[ly_id]._get_name() not in ('TransformerEncoderLayerBase', 'TransformerEncoderLayer'):
270
+ raise ValueError("The shared layers are expected from the same class")
271
+ text_encoder.layers[ly_id] = cls.set_shared_layer(
272
+ args.encoder_shared_layer_level,
273
+ text_encoder.layers[ly_id],
274
+ ly,
275
+ )
276
+ return text_encoder
277
+
278
+ def mult_rst_grad(self, rst, ratio):
279
+ assert isinstance(rst, dict) # instead of EncoderOut
280
+ assert len(rst["encoder_out"]) == 1
281
+ rst["encoder_out"][0] = GradMultiply.apply(rst["encoder_out"][0], ratio)
282
+ return rst
283
+
284
+ def process_attentive_loss_states(self, rst, interstates):
285
+ assert isinstance(rst, dict) # instead of EncoderOut
286
+ rst["encoder_states"] = interstates
287
+ return rst
288
+
289
+ def forward(
290
+ self,
291
+ src_tokens,
292
+ src_lengths=None,
293
+ src_txt_tokens=None,
294
+ src_txt_lengths=None,
295
+ **kwargs
296
+ ):
297
+ """
298
+ Args:
299
+ src_tokens: padded tensor (B, T, C * feat)
300
+ src_lengths: tensor of original lengths of input utterances (speech) (B,)
301
+ src_txt_tokens: padded tensor (B, T)
302
+ src_txt_lengths: tensor of original lengths of input utterances (text) (B,)
303
+ """
304
+ # src_tokens only: inference
305
+ # src_tokens, src_lengths: speech only training
306
+ # src_txt_tokens, src_txt_lengths: text only training
307
+ # all valid: speech + text training
308
+
309
+ if src_tokens is None and src_txt_tokens is None:
310
+ raise ValueError(
311
+ "src_tokens and src_txt_tokens cannot be None at the same time"
312
+ )
313
+ ret1 = None
314
+ ret2 = None
315
+ return_all_hiddens = False
316
+ if src_tokens is not None:
317
+ if (
318
+ self.use_cross_attentive_loss and src_txt_tokens is not None
319
+ ): # remove self.training so we can get attn score during validation step
320
+ return_all_hiddens = True
321
+ ret1 = self.spch_encoder(
322
+ src_tokens, src_lengths, return_all_hiddens=return_all_hiddens
323
+ )
324
+
325
+ if self.use_cross_attentive_loss and src_txt_tokens is not None:
326
+ assert self.cross_attentive_loss_before_last_layer < len(
327
+ ret1["encoder_states"]
328
+ )
329
+ ret1 = self.process_attentive_loss_states(
330
+ ret1,
331
+ ret1["encoder_states"][
332
+ -self.cross_attentive_loss_before_last_layer - 1
333
+ ],
334
+ )
335
+
336
+ if src_txt_tokens is not None:
337
+ ret2 = self.text_encoder(
338
+ src_txt_tokens, src_txt_lengths, return_all_hiddens=return_all_hiddens
339
+ )
340
+ if return_all_hiddens:
341
+ if self.cross_attentive_loss_before_last_layer == len(
342
+ self.text_encoder.layers
343
+ ):
344
+ text_embedding, _ = self.text_encoder.forward_embedding(
345
+ src_txt_tokens
346
+ )
347
+ text_embedding = text_embedding.transpose(0, 1)
348
+ ret2 = self.process_attentive_loss_states(ret2, text_embedding)
349
+ else:
350
+ assert self.cross_attentive_loss_before_last_layer < len(
351
+ self.text_encoder.layers
352
+ )
353
+ ret2 = self.process_attentive_loss_states(
354
+ ret2,
355
+ ret2["encoder_states"][
356
+ -self.cross_attentive_loss_before_last_layer - 1
357
+ ],
358
+ )
359
+
360
+ def merge_output(rst1, rst2):
361
+ if rst1 is None:
362
+ if not (self.enc2_along_grad_mult == 1.0 or self.training):
363
+ rst2 = self.mult_rst_grad(rst2, self.enc2_along_grad_mult)
364
+ return rst2
365
+ if rst2 is None:
366
+ return rst1
367
+ if self.enc_grad_mult != 1.0 and self.training:
368
+ rst1 = self.mult_rst_grad(rst1, self.enc_grad_mult)
369
+ rst2 = self.mult_rst_grad(rst2, self.enc_grad_mult)
370
+ rst = (rst1, rst2)
371
+ return rst
372
+
373
+ return merge_output(ret1, ret2)
374
+
375
+ def reorder_encoder_out(self, encoder_out, new_order):
376
+ assert self.training is False # used for inference only
377
+ return self.spch_encoder.reorder_encoder_out(encoder_out, new_order)
378
+
379
+
380
+ # TransformerMultiInputDecoder: take one or two encoder inputs
381
+ class TransformerMultiInputDecoder(FairseqDecoder):
382
+ def __init__(
383
+ self,
384
+ dictionary,
385
+ spch_decoder,
386
+ text_decoder,
387
+ compute_cross_attentive_loss=False,
388
+ cross_attentive_loss_with_norm=True,
389
+ cross_attentive_loss_reverse=False,
390
+ ):
391
+
392
+ super().__init__(dictionary)
393
+ self.spch_decoder = spch_decoder
394
+ self.text_decoder = text_decoder
395
+ self.compute_cross_attentive_loss = compute_cross_attentive_loss
396
+ self.cross_attentive_loss_with_norm = cross_attentive_loss_with_norm
397
+ self.cross_attentive_loss_reverse = cross_attentive_loss_reverse
398
+
399
+ @classmethod
400
+ def share_spchdecoder(cls, task_args, text_decoder, spch_decoder):
401
+ if task_args.decoder_shared_layer_level == 0:
402
+ return text_decoder
403
+ assert text_decoder.embed_tokens == spch_decoder.embed_tokens
404
+ spch_decoder.project_in_dim = text_decoder.project_in_dim
405
+ spch_decoder.embed_positions = text_decoder.embed_positions
406
+ spch_decoder.layernorm_embedding = text_decoder.layernorm_embedding
407
+ spch_decoder.project_out_dim = text_decoder.project_out_dim
408
+ spch_decoder.adaptive_softmax = text_decoder.adaptive_softmax
409
+ if task_args.decoder_shared_layer_level == 1:
410
+ spch_decoder.output_projection = text_decoder.output_projection
411
+ spch_decoder.layer_norm = text_decoder.layer_norm
412
+ else: # 2
413
+ spch_decoder.output_projection.weight = (
414
+ text_decoder.output_projection.weight
415
+ )
416
+ for i, ly in enumerate(text_decoder.layers):
417
+ sly = spch_decoder.layers[i]
418
+ sly.self_attn = ly.self_attn
419
+ sly.self_attn_layer_norm = ly.self_attn_layer_norm
420
+ # sly.encoder_attn = ly.encoder_attn
421
+ if (
422
+ task_args.decoder_shared_layer_level == 1
423
+ ): # share everything, but under different models
424
+ sly.encoder_attn = ly.encoder_attn
425
+ sly.encoder_attn_layer_norm = ly.encoder_attn_layer_norm
426
+ sly.fc1 = ly.fc1
427
+ sly.fc2 = ly.fc2
428
+ sly.final_layer_norm = ly.final_layer_norm
429
+ else: # decoder_shared_layer_level == 2: keep separate encoder_attn_layer_norm and bias
430
+ sly.encoder_attn.k_proj.weight = ly.encoder_attn.k_proj.weight
431
+ sly.encoder_attn.v_proj.weight = ly.encoder_attn.v_proj.weight
432
+ sly.encoder_attn.q_proj.weight = ly.encoder_attn.q_proj.weight
433
+ sly.encoder_attn.out_proj.weight = ly.encoder_attn.out_proj.weight
434
+ sly.fc1.weight = ly.fc1.weight
435
+ sly.fc2.weight = ly.fc2.weight
436
+
437
+ return spch_decoder
438
+
439
+ def cross_attentive_loss(
440
+ self, teacher_states, student_states, teacher_masking, student_masking, eps=1e-6
441
+ ):
442
+ x = teacher_states.transpose(0, 1) # from T X B X D to B X T X D
443
+ y = student_states.transpose(0, 1)
444
+ if self.cross_attentive_loss_with_norm:
445
+ x = x / (x.norm(dim=2, keepdim=True) + eps)
446
+ y = y / (y.norm(dim=2, keepdim=True) + eps)
447
+ dim = x.size(-1)
448
+ # lengths: batch X seqLen
449
+ sim_scores_xy = torch.bmm(x, y.transpose(1, 2)) # batch X lenx X leny
450
+ if y.dtype == torch.float16:
451
+ sim_scores_xy = sim_scores_xy.float()
452
+ y = y.float()
453
+ x = x.float()
454
+ if teacher_masking != []:
455
+ assert len(teacher_masking) == 1
456
+ sim_scores_xy = sim_scores_xy.masked_fill(
457
+ teacher_masking[0].unsqueeze(-1), float("-inf")
458
+ )
459
+ if student_masking != []:
460
+ sim_scores_xy = sim_scores_xy.masked_fill(
461
+ student_masking[0].unsqueeze(1), float("-inf")
462
+ )
463
+ # do masking
464
+ y_weights = utils.softmax(sim_scores_xy, dim=-1)
465
+ if teacher_masking != []:
466
+ y_weights = y_weights.masked_fill(teacher_masking[0].unsqueeze(-1), 0)
467
+ x_reconstruct_from_y = torch.bmm(y_weights, y)
468
+
469
+ sim_scores_xx = torch.bmm(x, x.transpose(1, 2)) # batch X lenx X lenx
470
+ x_weights = utils.softmax(sim_scores_xx, dim=-1)
471
+ if teacher_masking != []:
472
+ x_weights = x_weights.masked_fill(teacher_masking[0].unsqueeze(-1), 0)
473
+
474
+ # no gradient for teacher state
475
+ x_reconstruct_from_x = torch.bmm(x_weights, x).detach()
476
+ cost = (x_reconstruct_from_x - x_reconstruct_from_y).norm(dim=2)
477
+ if teacher_masking != []:
478
+ cost = cost.masked_fill(teacher_masking[0], 0)
479
+
480
+ if not self.cross_attentive_loss_with_norm:
481
+ cost = cost / dim
482
+ return cost
483
+
484
+ def forward(
485
+ self,
486
+ prev_output_tokens,
487
+ encoder_out,
488
+ incremental_state=None,
489
+ has_txt_input=False,
490
+ **kwargs
491
+ ):
492
+ """
493
+ Args:
494
+ prev_output_tokens (LongTensor): previous decoder outputs of shape
495
+ `(batch, tgt_len)`, for input feeding/teacher forcing. If there are
496
+ two or more input during training, they will share the same prev_output_tokens
497
+ encoder_out (tuple[Tensor]): output from the encoder, used for
498
+ encoder-side attention. It will be tuple if there are more inputs, but a tensor
499
+ if only one input
500
+ incremental_state ([dict]): dictionary used for storing state during
501
+ :ref:`Incremental decoding`. It is only valid for inference, only from single
502
+ input
503
+ Returns:
504
+ tuple:
505
+ - the last decoder layer's output of shape `(batch, tgt_len,
506
+ vocab)`. If there are N inputs, batch will be N bigger than a single input
507
+ - the last decoder layer's attention weights of shape `(batch,
508
+ tgt_len, src_len)`
509
+ """
510
+ assert not isinstance(encoder_out, EncoderOut)
511
+ if isinstance(encoder_out, tuple): # training with multiple inputs
512
+ rst = []
513
+ assert len(encoder_out) == 2
514
+ for i, eo in enumerate(encoder_out):
515
+ assert incremental_state is None
516
+ if i == 0:
517
+ rst.append(
518
+ self.spch_decoder(prev_output_tokens, eo, incremental_state)
519
+ )
520
+ else:
521
+ rst.append(
522
+ self.text_decoder(prev_output_tokens, eo, incremental_state)
523
+ )
524
+ dec_out = torch.cat([r[0] for r in rst], dim=0)
525
+ attn_cost = None
526
+ if self.compute_cross_attentive_loss:
527
+ assert isinstance(encoder_out[0], dict)
528
+ if self.cross_attentive_loss_reverse:
529
+ attn_cost = self.cross_attentive_loss(
530
+ teacher_states=encoder_out[1]["encoder_states"], # text_states
531
+ student_states=encoder_out[0]["encoder_states"], # spch_states
532
+ teacher_masking=encoder_out[1]["encoder_padding_mask"],
533
+ student_masking=encoder_out[0]["encoder_padding_mask"],
534
+ )
535
+ else:
536
+ attn_cost = self.cross_attentive_loss(
537
+ teacher_states=encoder_out[0]["encoder_states"], # spch_states
538
+ student_states=encoder_out[1]["encoder_states"], # text_states
539
+ teacher_masking=encoder_out[0]["encoder_padding_mask"],
540
+ student_masking=encoder_out[1]["encoder_padding_mask"],
541
+ )
542
+
543
+ return (dec_out, {"attn_cost": attn_cost})
544
+ else: # inference or training with one input
545
+ if has_txt_input:
546
+ return self.text_decoder(
547
+ prev_output_tokens, encoder_out, incremental_state
548
+ )
549
+ return self.spch_decoder(prev_output_tokens, encoder_out, incremental_state)
550
+
551
+
552
+ # Note:
553
+ # dual input transformer:
554
+ # encoder: S2TTransformerEncoder for speech + TransformerEncoder for text
555
+ # decoder: TransformerDecoder for text
556
+ @register_model("dual_input_s2t_transformer")
557
+ class DualInputS2TTransformerModel(FairseqEncoderDecoderModel):
558
+ def __init__(self, encoder, decoder):
559
+ super().__init__(encoder, decoder)
560
+ self.num_updates = 0
561
+
562
+ def max_positions(self):
563
+ return None # it is provided in task
564
+
565
+ @staticmethod
566
+ def add_args(parser):
567
+ """Add model-specific arguments to the parser."""
568
+ # encoder 1: S2TTransformerEncoder for speech
569
+ parser.add_argument(
570
+ "--conv-kernel-sizes",
571
+ type=str,
572
+ metavar="N",
573
+ help="kernel sizes of Conv1d subsampling layers",
574
+ )
575
+ parser.add_argument(
576
+ "--conv-channels",
577
+ type=int,
578
+ metavar="N",
579
+ help="# of channels in Conv1d subsampling layers",
580
+ )
581
+ parser.add_argument(
582
+ "--enc-output-dim",
583
+ type=int,
584
+ metavar="N",
585
+ help="""
586
+ encoder output dimension, can be None. If specified, projecting the
587
+ transformer output to the specified dimension""",
588
+ )
589
+ # standard Transformer
590
+ parser.add_argument(
591
+ "--activation-fn",
592
+ type=str,
593
+ default="relu",
594
+ choices=utils.get_available_activation_fns(),
595
+ help="activation function to use",
596
+ )
597
+ parser.add_argument(
598
+ "--dropout", type=float, metavar="D", help="dropout probability"
599
+ )
600
+ parser.add_argument(
601
+ "--attention-dropout",
602
+ type=float,
603
+ metavar="D",
604
+ help="dropout probability for attention weights",
605
+ )
606
+ parser.add_argument(
607
+ "--activation-dropout",
608
+ "--relu-dropout",
609
+ type=float,
610
+ metavar="D",
611
+ help="dropout probability after activation in FFN.",
612
+ )
613
+ parser.add_argument(
614
+ "--encoder-embed-dim",
615
+ type=int,
616
+ metavar="N",
617
+ help="encoder embedding dimension",
618
+ )
619
+ parser.add_argument(
620
+ "--encoder-text-embed-dim",
621
+ type=int,
622
+ metavar="N",
623
+ help="encoder text embedding dimension",
624
+ )
625
+ parser.add_argument(
626
+ "--encoder-ffn-embed-dim",
627
+ type=int,
628
+ metavar="N",
629
+ help="encoder embedding dimension for FFN",
630
+ )
631
+ parser.add_argument(
632
+ "--encoder-attention-heads",
633
+ type=int,
634
+ metavar="N",
635
+ help="num encoder attention heads",
636
+ )
637
+ parser.add_argument(
638
+ "--decoder-embed-dim",
639
+ type=int,
640
+ metavar="N",
641
+ help="decoder embedding dimension",
642
+ )
643
+ parser.add_argument(
644
+ "--decoder-ffn-embed-dim",
645
+ type=int,
646
+ metavar="N",
647
+ help="decoder embedding dimension for FFN",
648
+ )
649
+ parser.add_argument(
650
+ "--decoder-layers", type=int, metavar="N", help="num decoder layers"
651
+ )
652
+ parser.add_argument(
653
+ "--decoder-attention-heads",
654
+ type=int,
655
+ metavar="N",
656
+ help="num decoder attention heads",
657
+ )
658
+ parser.add_argument(
659
+ "--layernorm-embedding",
660
+ action="store_true",
661
+ help="add layernorm to embedding",
662
+ )
663
+ parser.add_argument(
664
+ "--no-scale-embedding",
665
+ action="store_true",
666
+ help="if True, don't scale embeddings",
667
+ )
668
+ # non-standard transformer parameters
669
+ parser.add_argument(
670
+ "--speech-encoder-layers",
671
+ type=int,
672
+ metavar="N",
673
+ help="num speech encoder layers",
674
+ )
675
+ parser.add_argument(
676
+ "--text-encoder-layers",
677
+ type=int,
678
+ metavar="N",
679
+ help="num text encoder layers",
680
+ )
681
+ parser.add_argument(
682
+ "--encoder-shared-layers",
683
+ type=int,
684
+ metavar="N",
685
+ help="num shared encoder layers",
686
+ )
687
+ parser.add_argument(
688
+ "--encoder-shared-layer-level",
689
+ type=int,
690
+ metavar="N",
691
+ default=0,
692
+ choices=[0, 1, 2],
693
+ help="share layer level: 0: all share; 1: all share with separate model; 2: share weight but not bias and layernorm",
694
+ )
695
+
696
+ parser.add_argument(
697
+ "--decoder-shared-layer-level",
698
+ default=0,
699
+ choices=[0, 1, 2],
700
+ type=int,
701
+ metavar="N",
702
+ help="0: share everything; 1: share everything with a different model; 2: do not share layer_norm and bias",
703
+ )
704
+ ###
705
+ parser.add_argument(
706
+ "--text-input-cost-ratio",
707
+ type=float,
708
+ default=1.0,
709
+ metavar="V",
710
+ help="text input cost ratio relative to speech input cost",
711
+ )
712
+ parser.add_argument(
713
+ "--init-scale",
714
+ type=float,
715
+ default=1.0,
716
+ metavar="V",
717
+ help="scale the initial weight by given factor",
718
+ )
719
+ parser.add_argument(
720
+ "--enc-grad-mult",
721
+ type=float,
722
+ metavar="V",
723
+ default=1.0,
724
+ help="multiply enc1 and enc2 gradient by V",
725
+ )
726
+ parser.add_argument(
727
+ "--enc2-along-grad-mult",
728
+ type=float,
729
+ metavar="V",
730
+ default=1.0,
731
+ help="multiply enc2 gradient by V if only enc2 is used",
732
+ )
733
+ parser.add_argument(
734
+ "--load-pretrain-encoder",
735
+ type=str,
736
+ default="",
737
+ metavar="EXPR",
738
+ help=""" path to the pretrained encoder """,
739
+ )
740
+ parser.add_argument(
741
+ "--load-pretrain-speech-encoder",
742
+ type=str,
743
+ default="",
744
+ metavar="EXPR",
745
+ help=""" path to the pretrained speech encoder """,
746
+ )
747
+ parser.add_argument(
748
+ "--load-pretrain-text-encoder",
749
+ type=str,
750
+ default="",
751
+ metavar="EXPR",
752
+ help=""" path to the pretrained text encoder """,
753
+ )
754
+ parser.add_argument(
755
+ "--load-pretrain-text-encoder-last",
756
+ type=str,
757
+ default="",
758
+ metavar="EXPR",
759
+ help=""" path to the pretrained text encoder """,
760
+ )
761
+ parser.add_argument(
762
+ "--load-pretrain-decoder",
763
+ type=str,
764
+ metavar="EXPR",
765
+ default="",
766
+ help=""" path to the pretrained decoder """,
767
+ )
768
+ parser.add_argument(
769
+ "--add-speech-eos",
770
+ action="store_true",
771
+ help="add eos token at the end of input feature",
772
+ )
773
+ parser.add_argument(
774
+ "--speech-encoder-adapter-type",
775
+ type=str,
776
+ metavar="EXPR",
777
+ default="None",
778
+ choices=["None", "Linear", "MLP"],
779
+ help="add speech encoder adapter",
780
+ )
781
+
782
+ @classmethod
783
+ def build_encoder(cls, args, task):
784
+ spch_encoder = DualInputEncoder.build_spch_encoder(args)
785
+ text_encoder = DualInputEncoder.build_text_encoder(
786
+ args, task.src_dict, spch_encoder
787
+ )
788
+ cross_attentive_loss_before_last_layer = (
789
+ 0 if getattr(args, "attentive_cost_regularization", 0.0) > 0.0 else -1
790
+ )
791
+ encoder = DualInputEncoder(
792
+ args,
793
+ spch_encoder,
794
+ text_encoder,
795
+ task.src_dict,
796
+ cross_attentive_loss_before_last_layer,
797
+ )
798
+ if args.init_scale != 1.0:
799
+ with torch.no_grad():
800
+ for param in encoder.parameters():
801
+ param.data.mul_(args.init_scale)
802
+ if args.load_pretrain_text_encoder != "":
803
+ checkpoint_utils.load_pretrained_component_from_model(
804
+ text_encoder, args.load_pretrain_text_encoder
805
+ )
806
+ if args.load_pretrain_speech_encoder != "":
807
+ if hasattr(spch_encoder, "encoder"):
808
+ checkpoint_utils.load_pretrained_component_from_model(
809
+ spch_encoder.encoder, args.load_pretrain_speech_encoder
810
+ )
811
+ else:
812
+ checkpoint_utils.load_pretrained_component_from_model(
813
+ spch_encoder, args.load_pretrain_speech_encoder
814
+ )
815
+ if (
816
+ args.load_pretrain_text_encoder_last != ""
817
+ ): # if the encoder is shared, the speech encoder parameters will be used;
818
+ # loading this last gives a chance to use a pre-trained MT encoder instead
819
+ checkpoint_utils.load_pretrained_component_from_model(
820
+ text_encoder, args.load_pretrain_text_encoder_last
821
+ )
822
+
823
+ if args.load_pretrain_encoder != "":
824
+ checkpoint_utils.load_pretrained_component_from_model(
825
+ encoder, args.load_pretrain_encoder
826
+ )
827
+ return encoder
828
+
829
+ @classmethod
830
+ def build_decoder(cls, args, task):
831
+ dec_cfg = {
832
+ "decoder_layerdrop": args.decoder_layerdrop,
833
+ "share_decoder_input_output_embed": args.share_decoder_input_output_embed,
834
+ "decoder_embed_dim": args.decoder_embed_dim,
835
+ "max_target_positions": args.max_target_positions,
836
+ "dropout": args.dropout,
837
+ "encoder_learned_pos": args.encoder_learned_pos,
838
+ "decoder_learned_pos": args.decoder_learned_pos,
839
+ "layernorm_embedding": args.layernorm_embedding,
840
+ "decoder_normalize_before": args.decoder_normalize_before,
841
+ "activation_dropout": args.activation_dropout,
842
+ "attention_dropout": args.attention_dropout,
843
+ "decoder_ffn_embed_dim": args.decoder_ffn_embed_dim,
844
+ "decoder_layers": args.decoder_layers,
845
+ "decoder_attention_heads": args.decoder_attention_heads,
846
+ "decoder_output_dim": args.decoder_embed_dim,
847
+ "no_scale_embedding": args.no_scale_embedding,
848
+ "adaptive_input": args.adaptive_input,
849
+ "quant_noise_pq": args.quant_noise_pq,
850
+ "adaptive_softmax_cutoff": args.adaptive_softmax_cutoff,
851
+ "tie_adaptive_weights": args.tie_adaptive_weights,
852
+ "no_token_positional_embeddings": args.no_token_positional_embeddings,
853
+ "encoder": {"embed_dim": args.encoder_embed_dim}
854
+ }
855
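+ # pack the config dict into a namedtuple so TransformerDecoder can read it via attribute access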
+ dec_cfg = namedtuple("args", dec_cfg.keys())(*dec_cfg.values())
856
+ dec_emb = nn.Embedding(
857
+ len(task.target_dictionary),
858
+ args.decoder_embed_dim,
859
+ task.target_dictionary.pad(),
860
+ )
861
+ compute_cross_attentive_loss = (
862
+ True if getattr(args, "attentive_cost_regularization", 0.0) > 0.0 else False
863
+ )
864
+ cross_attentive_loss_without_norm = getattr(
865
+ args, "attentive_cost_without_normalize", False
866
+ )
867
+ cross_attentive_loss_reverse = (
868
+ False # getattr(args, "attentive_cost_reverse", False)
869
+ )
870
+
871
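+ # two decoders (text and speech paths) are built from the same config and embedding table;
+ # share_spchdecoder then ties their parameters according to --decoder-shared-layer-level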
+ text_decoder = TransformerDecoder(dec_cfg, task.target_dictionary, dec_emb)
872
+ spch_decoder = TransformerDecoder(dec_cfg, task.target_dictionary, dec_emb)
873
+ spch_decoder = TransformerMultiInputDecoder.share_spchdecoder(
874
+ args, text_decoder, spch_decoder
875
+ )
876
+ decoder = TransformerMultiInputDecoder(
877
+ dictionary=task.target_dictionary,
878
+ spch_decoder=spch_decoder,
879
+ text_decoder=text_decoder,
880
+ compute_cross_attentive_loss=compute_cross_attentive_loss,
881
+ cross_attentive_loss_with_norm=True
882
+ if not cross_attentive_loss_without_norm
883
+ else False,
884
+ cross_attentive_loss_reverse=cross_attentive_loss_reverse,
885
+ )
886
+ if args.init_scale != 1.0:
887
+ with torch.no_grad():
888
+ for param in decoder.parameters():
889
+ param.data.mul_(args.init_scale)
890
+ if args.load_pretrain_decoder != "":
891
+ try:
892
+ checkpoint_utils.load_pretrained_component_from_model(
893
+ decoder, args.load_pretrain_decoder
894
+ )
895
+ except RuntimeError:
896
+ checkpoint_utils.load_pretrained_component_from_model(
897
+ decoder.text_decoder, args.load_pretrain_decoder
898
+ )
899
+ if args.decoder_shared_layer_level > 0:
900
+ checkpoint_utils.load_pretrained_component_from_model(
901
+ decoder.spch_decoder, args.load_pretrain_decoder
902
+ )
903
+
904
+ return decoder
905
+
906
+ @classmethod
907
+ def build_model(cls, args, task):
908
+ """Build a new model instance."""
909
+ # make sure that all args are properly defaulted
910
+ # (in case there are any new ones)
911
+ dualinputs2ttransformer_base(args)
912
+
913
+ encoder = cls.build_encoder(args, task)
914
+ decoder = cls.build_decoder(args, task)
915
+ return cls(encoder, decoder)
916
+
917
+ def get_normalized_probs(self, net_output, log_probs, sample=None):
918
+ # net_output['encoder_out'] is a (B, T, D) tensor
919
+ lprobs = super().get_normalized_probs(net_output, log_probs, sample)
920
+ lprobs.batch_first = True
921
+ return lprobs
922
+
923
+ def set_num_updates(self, num_updates):
924
+ """Set the number of parameters updates."""
925
+ super().set_num_updates(num_updates)
926
+ self.num_updates = num_updates
927
+
928
+ def forward(
929
+ self,
930
+ src_tokens,
931
+ src_lengths,
932
+ prev_output_tokens,
933
+ use_encoder_outputs=False,
934
+ src_txt_tokens=None,
935
+ src_txt_lengths=None,
936
+ mode="sup_speech",
937
+ **kwargs
938
+ ):
939
+ """
940
+ Run the forward pass for an encoder-decoder model.
941
+
942
+ First feed a batch of source tokens through the encoder. Then, feed the
943
+ encoder output and previous decoder outputs (i.e., teacher forcing) to
944
+ the decoder to produce the next outputs::
945
+
946
+ encoder_out = self.encoder(src_tokens, src_lengths)
947
+ return self.decoder(prev_output_tokens, encoder_out)
948
+
949
+ Args:
950
+ src_tokens (LongTensor): tokens in the source language of shape
951
+ `(batch, src_len)`
952
+ src_lengths (LongTensor): source sentence lengths of shape `(batch)`
953
+ prev_output_tokens (LongTensor): previous decoder outputs of shape
954
+ `(batch, tgt_len)`, for teacher forcing
955
+ mode = 'sup_speech' or 'text'
956
+
957
+ Returns:
958
+ tuple:
959
+ - the decoder's output of shape `(batch, tgt_len, vocab)`
960
+ - a dictionary with any model-specific outputs
961
+ """
962
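+ # in text-only mode the batch carries text in src_tokens; move it to the text inputs and clear the
+ # speech inputs so only the text branch of the encoder runs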
+ if mode == "text":
963
+ assert src_txt_tokens is None
964
+ src_txt_tokens = src_tokens
965
+ src_txt_lengths = src_lengths
966
+ src_tokens = None
967
+ src_lengths = None
968
+ encoder_out = self.encoder(
969
+ src_tokens,
970
+ src_lengths=src_lengths,
971
+ src_txt_tokens=src_txt_tokens,
972
+ src_txt_lengths=src_txt_lengths,
973
+ **kwargs
974
+ )
975
+ has_txt_input = True if src_txt_tokens is not None else False
976
+ decoder_out = self.decoder(
977
+ prev_output_tokens,
978
+ encoder_out=encoder_out,
979
+ has_txt_input=has_txt_input,
980
+ **kwargs
981
+ )
982
+ if use_encoder_outputs:
983
+ return decoder_out, encoder_out
984
+ return decoder_out
985
+
986
+
987
+ @register_model_architecture(
988
+ "dual_input_s2t_transformer", "dualinputs2ttransformer_base"
989
+ )
990
+ def dualinputs2ttransformer_base(args):
991
+ args.encoder_freezing_updates = getattr(args, "encoder_freezing_updates", 0)
992
+ # Convolutional subsampler
993
+ args.input_feat_per_channel = getattr(args, "input_feat_per_channel", 80)
994
+ args.conv_kernel_sizes = getattr(args, "conv_kernel_sizes", "5,5")
995
+ args.conv_channels = getattr(args, "conv_channels", 1024)
996
+ # Transformer
997
+ args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 512)
998
+ args.encoder_text_embed_dim = getattr(
999
+ args, "encoder_text_embed_dim", args.encoder_embed_dim
1000
+ )
1001
+ args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 2048)
1002
+ args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 8)
1003
+ args.encoder_normalize_before = getattr(args, "encoder_normalize_before", True)
1004
+ args.encoder_layerdrop = getattr(args, "encoder_layerdrop", 0)
1005
+ args.encoder_learned_pos = getattr(args, "encoder_learned_pos", False)
1006
+
1007
+ args.decoder_embed_dim = getattr(args, "decoder_embed_dim", args.encoder_embed_dim)
1008
+ args.decoder_ffn_embed_dim = getattr(
1009
+ args, "decoder_ffn_embed_dim", args.encoder_ffn_embed_dim
1010
+ )
1011
+ args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 8)
1012
+ args.decoder_normalize_before = getattr(args, "decoder_normalize_before", True)
1013
+ args.decoder_learned_pos = getattr(args, "decoder_learned_pos", False)
1014
+ args.dropout = getattr(args, "dropout", 0.1)
1015
+ args.attention_dropout = getattr(args, "attention_dropout", args.dropout)
1016
+ args.activation_dropout = getattr(args, "activation_dropout", args.dropout)
1017
+ args.activation_fn = getattr(args, "activation_fn", "relu")
1018
+ args.adaptive_softmax_cutoff = getattr(args, "adaptive_softmax_cutoff", None)
1019
+ args.adaptive_softmax_dropout = getattr(args, "adaptive_softmax_dropout", 0)
1020
+ args.tie_adaptive_weights = getattr(args, "tie_adaptive_weights", False)
1021
+ args.share_decoder_input_output_embed = getattr(
1022
+ args, "share_decoder_input_output_embed", False
1023
+ )
1024
+ args.no_token_positional_embeddings = getattr(
1025
+ args, "no_token_positional_embeddings", False
1026
+ )
1027
+ args.adaptive_input = getattr(args, "adaptive_input", False)
1028
+ args.decoder_layerdrop = getattr(args, "decoder_layerdrop", 0.0)
1029
+ args.decoder_output_dim = getattr(
1030
+ args, "decoder_output_dim", args.decoder_embed_dim
1031
+ )
1032
+ args.layernorm_embedding = getattr(args, "layernorm_embedding", False)
1033
+ args.no_scale_embedding = getattr(args, "no_scale_embedding", False)
1034
+ args.quant_noise_pq = getattr(args, "quant_noise_pq", 0)
1035
+
1036
+ args.speech_encoder_layers = getattr(args, "speech_encoder_layers", 10)
1037
+ args.text_encoder_layers = getattr(args, "text_encoder_layers", 6)
1038
+ args.encoder_shared_layers = getattr(args, "encoder_shared_layers", 0)
1039
+ args.decoder_layers = getattr(args, "decoder_layers", 6)
1040
+
1041
+ args.add_speech_eos = getattr(args, "add_speech_eos", False)
1042
+
1043
+
1044
+ @register_model_architecture("dual_input_s2t_transformer", "dualinputs2ttransformer_s")
1045
+ def dualinputs2ttransformer_s(args):
1046
+ args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 256)
1047
+ args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 256 * 4)
1048
+ args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 4)
1049
+ args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 4)
1050
+ args.dropout = getattr(args, "dropout", 0.1)
1051
+ args.speech_encoder_layers = getattr(args, "speech_encoder_layers", 7)
1052
+ args.text_encoder_layers = getattr(args, "text_encoder_layers", 7)
1053
+ args.decoder_layers = getattr(args, "decoder_layers", 7)
1054
+ dualinputs2ttransformer_base(args)
1055
+
1056
+
1057
+ @register_model_architecture("dual_input_s2t_transformer", "dualinputs2ttransformer_m")
1058
+ def dualinputs2ttransformer_m(args):
1059
+ args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 512)
1060
+ args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 512 * 4)
1061
+ args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 8)
1062
+ args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 8)
1063
+ args.dropout = getattr(args, "dropout", 0.15)
1064
+ args.speech_encoder_layers = getattr(args, "speech_encoder_layers", 10)
1065
+ args.text_encoder_layers = getattr(args, "text_encoder_layers", 6)
1066
+ args.decoder_layers = getattr(args, "decoder_layers", 6)
1067
+ dualinputs2ttransformer_base(args)
1068
+
1069
+
1070
+ @register_model_architecture("dual_input_s2t_transformer", "dualinputs2ttransformer_b")
1071
+ def dualinputs2ttransformer_b(args):
1072
+ args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 768)
1073
+ args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 768 * 4)
1074
+ args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 12)
1075
+ args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 12)
1076
+ args.dropout = getattr(args, "dropout", 0.15)
1077
+ args.speech_encoder_layers = getattr(args, "speech_encoder_layers", 12)
1078
+ args.text_encoder_layers = getattr(args, "text_encoder_layers", 6)
1079
+ args.decoder_layers = getattr(args, "decoder_layers", 6)
1080
+ dualinputs2ttransformer_base(args)
1081
+
1082
+
1083
+ @register_model_architecture("dual_input_s2t_transformer", "dualinputs2ttransformer_l")
1084
+ def dualinputs2ttransformer_l(args):
1085
+ args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 1024)
1086
+ args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 1024 * 4)
1087
+ args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 16)
1088
+ args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 16)
1089
+ args.dropout = getattr(args, "dropout", 0.2)
1090
+ args.speech_encoder_layers = getattr(args, "speech_encoder_layers", 12)
1091
+ args.text_encoder_layers = getattr(args, "text_encoder_layers", 6)
1092
+ args.decoder_layers = getattr(args, "decoder_layers", 6)
1093
+ dualinputs2ttransformer_base(args)
fairseq/examples/speech_text_joint_to_text/models/s2t_dualinputwavtransformer.py ADDED
@@ -0,0 +1,526 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import logging
7
+ from collections import OrderedDict, namedtuple
8
+
9
+ import torch.nn as nn
10
+
11
+ from fairseq import checkpoint_utils, utils
12
+ from fairseq.checkpoint_utils import load_checkpoint_to_cpu
13
+ from fairseq.file_io import PathManager
14
+ from fairseq.models import register_model, register_model_architecture
15
+ from fairseq.models.speech_to_text import (
16
+ SpeechWavTransformerEncoder,
17
+ StackedSpeechWavTransformerEncoder,
18
+ TransformerDecoder,
19
+ )
20
+ from fairseq.models.transformer import TransformerEncoder
21
+
22
+ from .s2t_dualinputtransformer import (
23
+ DualInputEncoder,
24
+ DualInputS2TTransformerModel,
25
+ TransformerMultiInputDecoder,
26
+ )
27
+
28
+ logger = logging.getLogger(__name__)
29
+
30
+
31
+ @register_model("dual_input_wav_transformer")
32
+ class DualInputWavTransformerModel(DualInputS2TTransformerModel):
33
+ def __init__(self, encoder, decoder):
34
+ super().__init__(encoder, decoder)
35
+
36
+ @staticmethod
37
+ def add_args(parser):
38
+ def add_transformer_args(parser):
39
+ # We can't use TransformerModel.add_args(parser), since it defines max-source-positions which is duplicated with tasks/speech_to_text.py
40
+ # Transformer
41
+ parser.add_argument(
42
+ "--activation-fn",
43
+ type=str,
44
+ default="relu",
45
+ choices=utils.get_available_activation_fns(),
46
+ help="activation function to use",
47
+ )
48
+ parser.add_argument(
49
+ "--dropout", type=float, metavar="D", help="dropout probability"
50
+ )
51
+ parser.add_argument(
52
+ "--attention-dropout",
53
+ type=float,
54
+ metavar="D",
55
+ help="dropout probability for attention weights",
56
+ )
57
+ parser.add_argument(
58
+ "--activation-dropout",
59
+ "--relu-dropout",
60
+ type=float,
61
+ metavar="D",
62
+ help="dropout probability after activation in FFN.",
63
+ )
64
+ parser.add_argument(
65
+ "--encoder-embed-dim",
66
+ type=int,
67
+ metavar="N",
68
+ help="encoder embedding dimension",
69
+ )
70
+ parser.add_argument(
71
+ "--encoder-ffn-embed-dim",
72
+ type=int,
73
+ metavar="N",
74
+ help="encoder embedding dimension for FFN",
75
+ )
76
+ parser.add_argument(
77
+ "--encoder-layers", type=int, metavar="N", help="num encoder layers"
78
+ )
79
+ parser.add_argument(
80
+ "--encoder-attention-heads",
81
+ type=int,
82
+ metavar="N",
83
+ help="num encoder attention heads",
84
+ )
85
+ parser.add_argument(
86
+ "--encoder-normalize-before",
87
+ action="store_true",
88
+ help="apply layernorm before each encoder block",
89
+ )
90
+ parser.add_argument(
91
+ "--decoder-embed-dim",
92
+ type=int,
93
+ metavar="N",
94
+ help="decoder embedding dimension",
95
+ )
96
+ parser.add_argument(
97
+ "--decoder-ffn-embed-dim",
98
+ type=int,
99
+ metavar="N",
100
+ help="decoder embedding dimension for FFN",
101
+ )
102
+ parser.add_argument(
103
+ "--decoder-layers", type=int, metavar="N", help="num decoder layers"
104
+ )
105
+ parser.add_argument(
106
+ "--decoder-attention-heads",
107
+ type=int,
108
+ metavar="N",
109
+ help="num decoder attention heads",
110
+ )
111
+ parser.add_argument(
112
+ "--decoder-normalize-before",
113
+ action="store_true",
114
+ help="apply layernorm before each decoder block",
115
+ )
116
+ parser.add_argument(
117
+ "--share-decoder-input-output-embed",
118
+ action="store_true",
119
+ help="share decoder input and output embeddings",
120
+ )
121
+ parser.add_argument(
122
+ "--layernorm-embedding",
123
+ action="store_true",
124
+ help="add layernorm to embedding",
125
+ )
126
+ parser.add_argument(
127
+ "--no-scale-embedding",
128
+ action="store_true",
129
+ help="if True, don't scale embeddings",
130
+ )
131
+
132
+ parser.add_argument(
133
+ "--encoder-learned-pos",
134
+ action="store_true",
135
+ help="use learned positional embeddings",
136
+ )
137
+ parser.add_argument(
138
+ "--decoder-learned-pos",
139
+ action="store_true",
140
+ help="use learned positional embeddings",
141
+ )
142
+
143
+ add_transformer_args(parser)
144
+ SpeechWavTransformerEncoder.add_args(parser)
145
+ parser.add_argument(
146
+ "--load-pretrained-speech-text-encoder",
147
+ type=str,
148
+ default="",
149
+ metavar="EXPR",
150
+ help=""" path to the pretrained speech text encoder from SpeechTextPreTrainModel """,
151
+ )
152
+ parser.add_argument(
153
+ "--load-pretrained-wav2vec-encoder",
154
+ type=str,
155
+ default="",
156
+ metavar="EXPR",
157
+ help=""" path to the pretrained speech encoder from wav2vec """,
158
+ )
159
+
160
+ parser.add_argument(
161
+ "--load-pretrained-speech-text-decoder",
162
+ type=str,
163
+ default="",
164
+ metavar="EXPR",
165
+ help=""" path to the pretrained speech text decoder from SpeechTextPreTrainModel """,
166
+ )
167
+ parser.add_argument(
168
+ "--load-pretrained-text-decoder",
169
+ type=str,
170
+ default="",
171
+ metavar="EXPR",
172
+ help=""" path to the pretrained text decoder """,
173
+ )
174
+ parser.add_argument(
175
+ "--load-init-encoder",
176
+ type=str,
177
+ default="",
178
+ metavar="EXPR",
179
+ help=""" path to load seed encoder model """,
180
+ )
181
+ parser.add_argument(
182
+ "--load-init-decoder",
183
+ type=str,
184
+ default="",
185
+ metavar="EXPR",
186
+ help=""" path to load seed decoder model """,
187
+ )
188
+
189
+ parser.add_argument(
190
+ "--text-input-cost-ratio",
191
+ type=float,
192
+ default=1.0,
193
+ metavar="V",
194
+ help="text input cost ratio relative to speech input cost",
195
+ )
196
+ parser.add_argument(
197
+ "--enc-grad-mult",
198
+ type=float,
199
+ metavar="V",
200
+ default=1.0,
201
+ help="multiply enc1 and enc2 gradient by V",
202
+ )
203
+ parser.add_argument(
204
+ "--enc2-along-grad-mult",
205
+ type=float,
206
+ metavar="V",
207
+ default=1.0,
208
+ help="multiply enc2 gradient by V if only enc2 is used",
209
+ )
210
+ parser.add_argument(
211
+ "--no-strict-check-pretrain-model",
212
+ action="store_true",
213
+ help="Don't apply strict model check for the pretrained model",
214
+ )
215
+
216
+ parser.add_argument(
217
+ "--stacked-encoder",
218
+ action="store_true",
219
+ help="stack speech and text encoders",
220
+ )
221
+
222
+ @classmethod
223
+ def update_transformer_encoder_cfg(cls, args, update_dict):
224
+ cfg = dict(args._get_kwargs())
225
+ for fkey in update_dict.keys():
226
+ cfg[fkey] = update_dict[fkey]
227
+ cfg.pop("_name", None) # drop the auto-generated "_name" key
228
+ model_args = namedtuple("args", cfg.keys())(*cfg.values())
229
+ return model_args
230
+
231
+ @classmethod
232
+ def build_text_encoder(cls, args, src_dictionary):
233
+ enc_emb = nn.Embedding(
234
+ len(src_dictionary), args.encoder_embed_dim, src_dictionary.pad()
235
+ )
236
+ model_args = cls.update_transformer_encoder_cfg(
237
+ args,
238
+ {
239
+ "encoder_layers": args.text_encoder_layers,
240
+ "max_source_positions": args.max_positions_text,
241
+ },
242
+ )
243
+ text_encoder = TransformerEncoder(model_args, src_dictionary, enc_emb)
244
+ return text_encoder
245
+
246
+ @classmethod
247
+ def build_speech_encoder(cls, args):
248
+ model_args = cls.update_transformer_encoder_cfg(
249
+ args, {"encoder_layers": args.speech_encoder_layers}
250
+ )
251
+ speech_encoder = SpeechWavTransformerEncoder(model_args)
252
+ return speech_encoder
253
+
254
+ @classmethod
255
+ def check_args(cls, condition, is_strict, msg):
256
+ if condition:
257
+ return
258
+ if is_strict:
259
+ raise ValueError(msg)
260
+ logger.warning(msg)
261
+
262
+ @classmethod
263
+ def build_encoder(cls, args, task):
264
+ # text_encoder = cls.build_text_encoder(args, task.source_dictionary )
265
+ text_encoder = cls.build_text_encoder(args, task.src_dict)
266
+ speech_encoder = cls.build_speech_encoder(args)
267
+ if args.load_pretrained_wav2vec_encoder:
268
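+ # map wav2vec checkpoint prefixes to the matching submodules of SpeechWavTransformerEncoder
+ # so each component can be loaded individually below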
+ component_pairs = (
269
+ ("feature_extractor", speech_encoder.subsample),
270
+ ("post_extract_proj", speech_encoder.feat_proj),
271
+ ("layer_norm", speech_encoder.feat_layer_norm),
272
+ ("encoder.pos_conv", speech_encoder.embed_positions),
273
+ ("encoder.layers", speech_encoder.layers),
274
+ ("encoder.layer_norm", speech_encoder.layer_norm),
275
+ ("mask_emb", speech_encoder.mask_emb),
276
+ )
277
+ state = cls.load_pretrained_speech_text_components(
278
+ args.load_pretrained_wav2vec_encoder, component_pairs
279
+ )
280
+ cls.check_args(
281
+ args.encoder_normalize_before
282
+ == state["cfg"]["model"]["layer_norm_first"],
283
+ not args.no_strict_check_pretrain_model,
284
+ f"encoder_normalize_before {args.encoder_normalize_before} doesn't match with the pretrained model",
285
+ )
286
+ cls.check_args(
287
+ args.activation_fn == state["cfg"]["model"]["activation_fn"],
288
+ not args.no_strict_check_pretrain_model,
289
+ f"activation_fn {args.activation_fn} doesn't match with the pretrained model",
290
+ )
291
+
292
+ if getattr(args, "stacked_encoder", False):
293
+ if args.encoder_shared_text_layers_from_begin > 0:
294
+ raise ValueError(
295
+ "We can not stack encoders and share encoders at the same time!"
296
+ )
297
+ speech_encoder = StackedSpeechWavTransformerEncoder(
298
+ speech_encoder, text_encoder.layers, text_encoder.layer_norm
299
+ )
300
+ else:
301
+ cls.share_speech_text_encoder(
302
+ speech_encoder, text_encoder, args.encoder_shared_text_layers_from_begin
303
+ )
304
+
305
+ cross_attentive_loss_before_last_layer = (
306
+ 0 if getattr(args, "attentive_cost_regularization", 0.0) > 0.0 else -1
307
+ )
308
+ encoder = DualInputEncoder(
309
+ args,
310
+ speech_encoder,
311
+ text_encoder,
312
+ task.src_dict,
313
+ cross_attentive_loss_before_last_layer,
314
+ )
315
+ if args.load_pretrained_speech_text_encoder:
316
+ component_pairs = (
317
+ ("encoder.sup_s2s_speech_encoder", encoder.spch_encoder),
318
+ ("encoder.text_encoder", encoder.text_encoder),
319
+ )
320
+ cls.load_pretrained_speech_text_components(
321
+ args.load_pretrained_speech_text_encoder, component_pairs
322
+ )
323
+ if getattr(args, "load_init_encoder", "") != "":
324
+ checkpoint_utils.load_pretrained_component_from_model(
325
+ encoder, args.load_init_encoder
326
+ )
327
+ return encoder
328
+
329
+ @classmethod
330
+ def build_text_decoder(cls, args, tgt_dictionary, dec_emb_share=None):
331
+ dec_emb = (
332
+ nn.Embedding(
333
+ len(tgt_dictionary), args.decoder_embed_dim, tgt_dictionary.pad()
334
+ )
335
+ if dec_emb_share is None
336
+ else dec_emb_share
337
+ )
338
+ text_decoder = TransformerDecoder(args, tgt_dictionary, dec_emb)
339
+ return text_decoder
340
+
341
+ @classmethod
342
+ def build_decoder(cls, args, task):
343
+ text_decoder = cls.build_text_decoder(args, task.target_dictionary)
344
+ compute_cross_attentive_loss = (
345
+ True if getattr(args, "attentive_cost_regularization", 0.0) > 0.0 else False
346
+ )
347
+ cross_attentive_loss_without_norm = getattr(
348
+ args, "attentive_cost_without_normalize", False
349
+ )
350
+ cross_attentive_loss_reverse = (
351
+ False # getattr(args, "attentive_cost_reverse", False)
352
+ )
353
+ if getattr(args, "load_pretrained_text_decoder", "") != "":
354
+ checkpoint_utils.load_pretrained_component_from_model(
355
+ text_decoder, args.load_pretrained_text_decoder
356
+ )
357
+
358
+ if args.load_pretrained_speech_text_decoder:
359
+ component_pairs = (("decoder.text_decoder", text_decoder),)
360
+ cls.load_pretrained_speech_text_components(
361
+ args.load_pretrained_speech_text_decoder, component_pairs
362
+ )
363
+
364
+ decoder = TransformerMultiInputDecoder(
365
+ dictionary=task.target_dictionary,
366
+ spch_decoder=text_decoder,
367
+ text_decoder=text_decoder,
368
+ compute_cross_attentive_loss=compute_cross_attentive_loss,
369
+ cross_attentive_loss_with_norm=True
370
+ if not cross_attentive_loss_without_norm
371
+ else False,
372
+ cross_attentive_loss_reverse=cross_attentive_loss_reverse,
373
+ )
374
+ if getattr(args, "load_init_decoder", "") != "":
375
+ checkpoint_utils.load_pretrained_component_from_model(
376
+ decoder, args.load_init_decoder
377
+ )
378
+ return decoder
379
+
380
+ @classmethod
381
+ def load_pretrained_speech_text_components(cls, checkpoint, component_pairs):
382
+ if not PathManager.exists(checkpoint):
383
+ raise IOError("Model file not found: {}".format(checkpoint))
384
+ state = load_checkpoint_to_cpu(checkpoint)
385
+ for component_type, component in component_pairs:
386
+ if isinstance(component, nn.parameter.Parameter):
387
+ component.data.copy_(state["model"][component_type])
388
+ else:
389
+ component_state_dict = OrderedDict()
390
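+ # gather parameters whose names start with the component prefix, stripping the prefix and the
+ # trailing dot so they match the submodule's own parameter names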
+ for key in state["model"].keys():
391
+ if key.startswith(component_type):
392
+ component_subkey = key[len(component_type) + 1 :]
393
+ component_state_dict[component_subkey] = state["model"][key]
394
+ component.load_state_dict(component_state_dict, strict=True)
395
+ return state
396
+
397
+ @classmethod
398
+ def share_speech_text_encoder(
399
+ cls, speech_encoder, text_encoder, shared_layers_from_begin
400
+ ):
401
+ if shared_layers_from_begin > 0:
402
+ num_text_encoder_layers = len(text_encoder.layers)
403
+ assert len(speech_encoder.layers) >= shared_layers_from_begin
404
+ assert num_text_encoder_layers >= shared_layers_from_begin
405
+ assert len(speech_encoder.layers) >= num_text_encoder_layers
406
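+ # tie the first shared_layers_from_begin text-encoder layers to the corresponding block of
+ # speech-encoder layers (the last num_text_encoder_layers speech layers line up with the text layers)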
+ for i, ly in enumerate(
407
+ speech_encoder.layers[
408
+ -num_text_encoder_layers : -num_text_encoder_layers
409
+ + shared_layers_from_begin
410
+ ]
411
+ ):
412
+ assert isinstance(text_encoder.layers[i], type(ly))
413
+ text_encoder.layers[i] = ly
414
+
415
+
416
+ @register_model_architecture(
417
+ "dual_input_wav_transformer", "dualinputs2twavtransformer_base"
418
+ )
419
+ def dualinputs2twavtransformer_base(args):
420
+ # speech masking
421
+ args.dropout_input = getattr(args, "dropout_input", 0)
422
+ args.dropout_features = getattr(args, "dropout_features", 0)
423
+ args.speech_mask_length = getattr(args, "speech_mask_length", 10)
424
+ args.speech_mask_prob = getattr(args, "speech_mask_prob", 0.65)
425
+ args.speech_mask_selection = getattr(args, "speech_mask_selection", "static")
426
+ args.speech_mask_other = getattr(args, "speech_mask_other", 0)
427
+ args.speech_mask_min_space = getattr(args, "speech_mask_min_space", 1)
428
+ args.speech_no_mask_overlap = getattr(args, "speech_no_mask_overlap", False)
429
+ args.speech_conv_bias = getattr(args, "speech_conv_bias", False)
430
+ args.speech_extractor_mode = getattr(args, "speech_extractor_mode", "default")
431
+ args.no_strict_check_pretrain_model = getattr(
432
+ args, "no_strict_check_pretrain_model", False
433
+ )
434
+
435
+ args.speech_mask_channel_length = getattr(args, "speech_mask_channel_length", 10)
436
+ args.speech_mask_channel_prob = getattr(args, "speech_mask_channel_prob", 0.0)
437
+ args.speech_mask_channel_selection = getattr(
438
+ args, "speech_mask_channel_selection", "static"
439
+ )
440
+ args.speech_mask_channel_other = getattr(args, "speech_mask_channel_other", 0)
441
+ args.speech_mask_channel_min_space = getattr(
442
+ args, "speech_mask_channel_min_space", 1
443
+ )
444
+ args.speech_no_mask_channel_overlap = getattr(
445
+ args, "speech_no_mask_channel_overlap", False
446
+ )
447
+ args.no_scale_feature = getattr(args, "no_scale_feature", False)
448
+ args.feature_grad_mult = getattr(args, "feature_grad_mult", 0.0) # 0.1
449
+
450
+ # Transformer
451
+ args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 768)
452
+ args.encoder_ffn_embed_dim = getattr(
453
+ args, "encoder_ffn_embed_dim", args.encoder_embed_dim * 4
454
+ )
455
+ args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 12)
456
+ args.encoder_normalize_before = getattr(args, "encoder_normalize_before", False)
457
+ args.encoder_layerdrop = getattr(args, "encoder_layerdrop", 0.1)
458
+ args.encoder_learned_pos = getattr(args, "encoder_learned_pos", False)
459
+
460
+ args.decoder_embed_dim = getattr(args, "decoder_embed_dim", args.encoder_embed_dim)
461
+ args.decoder_ffn_embed_dim = getattr(
462
+ args, "decoder_ffn_embed_dim", args.encoder_ffn_embed_dim
463
+ )
464
+ args.decoder_attention_heads = getattr(
465
+ args, "decoder_attention_heads", args.encoder_attention_heads
466
+ )
467
+ args.decoder_normalize_before = getattr(args, "decoder_normalize_before", False)
468
+ args.decoder_learned_pos = getattr(args, "decoder_learned_pos", False)
469
+ args.dropout = getattr(args, "dropout", 0.1)
470
+ args.attention_dropout = getattr(args, "attention_dropout", 0)
471
+ args.activation_dropout = getattr(args, "activation_dropout", args.dropout)
472
+ args.activation_fn = getattr(args, "activation_fn", "relu") # gelu?
473
+ args.adaptive_softmax_cutoff = getattr(args, "adaptive_softmax_cutoff", None)
474
+ args.adaptive_softmax_dropout = getattr(args, "adaptive_softmax_dropout", 0)
475
+ args.tie_adaptive_weights = getattr(args, "tie_adaptive_weights", False)
476
+ args.share_decoder_input_output_embed = getattr(
477
+ args, "share_decoder_input_output_embed", False
478
+ )
479
+ args.no_token_positional_embeddings = getattr(
480
+ args, "no_token_positional_embeddings", False
481
+ )
482
+ args.adaptive_input = getattr(args, "adaptive_input", False)
483
+ args.decoder_layerdrop = getattr(args, "decoder_layerdrop", 0.0)
484
+ args.decoder_output_dim = getattr(
485
+ args, "decoder_output_dim", args.decoder_embed_dim
486
+ )
487
+ args.layernorm_embedding = getattr(args, "layernorm_embedding", False)
488
+ args.no_scale_embedding = getattr(args, "no_scale_embedding", False)
489
+ args.quant_noise_pq = getattr(args, "quant_noise_pq", 0)
490
+
491
+ args.speech_encoder_layers = getattr(args, "speech_encoder_layers", 12)
492
+ args.text_encoder_layers = getattr(args, "text_encoder_layers", 6)
493
+ args.encoder_shared_text_layers_from_begin = getattr(
494
+ args, "encoder_shared_text_layers_from_begin", 6
495
+ )
496
+ args.decoder_layers = getattr(args, "decoder_layers", 6)
497
+
498
+
499
+ @register_model_architecture(
500
+ "dual_input_wav_transformer", "dualinputs2twavtransformer_base_stack"
501
+ )
502
+ def dualinputs2twavtransformer_base_stack(args):
503
+ args.speech_encoder_layers = getattr(args, "speech_encoder_layers", 6)
504
+ args.text_encoder_layers = getattr(args, "text_encoder_layers", 6)
505
+ args.encoder_shared_text_layers_from_begin = getattr(
506
+ args, "encoder_shared_text_layers_from_begin", 0
507
+ )
508
+ args.decoder_layers = getattr(args, "decoder_layers", 6)
509
+ args.stacked_encoder = getattr(args, "stacked_encoder", True)
510
+ args.layernorm_embedding = getattr(args, "layernorm_embedding", True)
511
+ dualinputs2twavtransformer_base(args)
512
+
513
+
514
+ @register_model_architecture(
515
+ "dual_input_wav_transformer", "dualinputs2twavtransformer_large"
516
+ )
517
+ def dualinputs2twavtransformer_large(args):
518
+ args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 1024)
519
+ args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 16)
520
+ args.speech_encoder_layers = getattr(args, "speech_encoder_layers", 24)
521
+ args.text_encoder_layers = getattr(args, "text_encoder_layers", 12)
522
+ args.encoder_shared_text_layers_from_begin = getattr(
523
+ args, "encoder_shared_text_layers_from_begin", 12
524
+ )
525
+ args.decoder_layers = getattr(args, "decoder_layers", 12)
526
+ dualinputs2twavtransformer_base(args)
fairseq/examples/speech_text_joint_to_text/models/s2t_dualinputxmtransformer.py ADDED
@@ -0,0 +1,584 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import copy
7
+
8
+ import torch.nn as nn
9
+ from fairseq import checkpoint_utils
10
+ from fairseq import utils
11
+ from fairseq.data.data_utils import lengths_to_padding_mask
12
+ from fairseq.models import (
13
+ register_model,
14
+ register_model_architecture,
15
+ FairseqEncoder,
16
+ )
17
+ from fairseq.models.speech_to_text import Wav2VecEncoderWithAdaptor
18
+ from fairseq.models.speech_to_text.xm_transformer import (
19
+ set_default_adaptor_args,
20
+ set_default_w2v_encoder_args,
21
+ need_finetuning
22
+ )
23
+ from fairseq.models.transformer import TransformerEncoder, TransformerDecoder
24
+ from fairseq.models.wav2vec import TransformerSentenceEncoderLayer
25
+ from fairseq.utils import safe_hasattr
26
+
27
+ from .s2t_dualinputtransformer import (
28
+ DualInputS2TTransformerModel,
29
+ TransformerMultiInputDecoder,
30
+ DualInputEncoder,
31
+ )
32
+
33
+
34
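+ # wraps a pretrained wav2vec TransformerSentenceEncoderLayer, reusing its submodules but returning
+ # only the hidden states so it matches the interface expected of the mbart encoder layers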
+ class TransformerSentenceEncoderLayerStd(TransformerSentenceEncoderLayer):
35
+ def __init__(self, sent_enc_layer):
36
+ super(TransformerSentenceEncoderLayer, self).__init__()
37
+ self.embedding_dim = sent_enc_layer.embedding_dim
38
+ self.dropout = sent_enc_layer.dropout
39
+ self.activation_dropout = sent_enc_layer.activation_dropout
40
+
41
+ # Initialize blocks
42
+ self.activation_fn = sent_enc_layer.activation_fn
43
+ self.self_attn = sent_enc_layer.self_attn
44
+
45
+ self.dropout1 = sent_enc_layer.dropout1
46
+ self.dropout2 = sent_enc_layer.dropout2
47
+ self.dropout3 = sent_enc_layer.dropout3
48
+
49
+ self.layer_norm_first = sent_enc_layer.layer_norm_first
50
+
51
+ # layer norm associated with the self attention layer
52
+ self.self_attn_layer_norm = sent_enc_layer.self_attn_layer_norm
53
+ self.fc1 = sent_enc_layer.fc1
54
+ self.fc2 = sent_enc_layer.fc2
55
+
56
+ # layer norm associated with the position wise feed-forward NN
57
+ self.final_layer_norm = sent_enc_layer.final_layer_norm
58
+
59
+ def forward(
60
+ self,
61
+ x,
62
+ self_attn_mask=None,
63
+ self_attn_padding_mask=None,
64
+ need_weights=None,
65
+ att_args=None,
66
+ ):
67
+ x, attn = super().forward(
68
+ x, self_attn_mask, self_attn_padding_mask, need_weights, att_args
69
+ )
70
+ return x
71
+
72
+
73
+ # TODO retire SharedEncoder
74
+ class SharedEncoder(FairseqEncoder):
75
+ def __init__(self, wav2vec_enc, mbart_enc, adaptor, shared_layers):
76
+ super().__init__(None)
77
+ self.w2v_encoder = wav2vec_enc
78
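+ # detach the top shared_layers wav2vec layers from the speech encoder; they are kept here and
+ # substituted into the mbart encoder below so both modalities run through the same modules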
+ self.shared_layers = self.w2v_encoder.w2v_model.encoder.layers[-shared_layers:]
79
+ self.w2v_encoder.w2v_model.encoder.layers = (
80
+ self.w2v_encoder.w2v_model.encoder.layers[:-shared_layers]
81
+ )
82
+ self.adaptor = adaptor
83
+ if self.shared_layers[-1].layer_norm_first:
84
+ self.final_layer_norm = mbart_enc.layer_norm
85
+ else:
86
+ mbart_enc.layer_norm = None
87
+ self.final_layer_norm = None
88
+ shared_layer_from = len(mbart_enc.layers) - shared_layers
89
+ if shared_layer_from < 0:
90
+ shared_layer_from = 0
91
+ for layer_id, layer in enumerate(self.shared_layers):
92
+ mbart_enc.layers[
93
+ shared_layer_from + layer_id
94
+ ] = TransformerSentenceEncoderLayerStd(layer)
95
+
96
+ def forward(self, src_tokens, src_lengths=None, **kwargs):
97
+ padding_mask = lengths_to_padding_mask(src_lengths)
98
+ if not padding_mask.any():
99
+ padding_mask = None
100
+
101
+ out = self.w2v_encoder.forward(src_tokens, padding_mask, tbc=True)
102
+ x = out["encoder_out"]
103
+ enc_padding_mask = None
104
+ if out["encoder_padding_mask"] is not None:
105
+ enc_padding_mask = out["encoder_padding_mask"].transpose(
106
+ 0, 1
107
+ ) # T X B --> B X T
108
+
109
+ x, enc_padding_mask = self.adaptor(x, enc_padding_mask)
110
+ for layer in self.shared_layers:
111
+ x, _ = layer(x, enc_padding_mask)
112
+ if self.final_layer_norm is not None:
113
+ x = self.final_layer_norm(x)
114
+
115
+ return {
116
+ "encoder_out": [x], # T x B x C
117
+ "encoder_padding_mask": [enc_padding_mask]
118
+ if enc_padding_mask is not None
119
+ else [], # B x T
120
+ "encoder_embedding": [], # B x T x C
121
+ "encoder_states": [], # List[T x B x C]
122
+ "src_tokens": [],
123
+ "src_lengths": [],
124
+ }
125
+
126
+
127
+ class StackedWav2VecEncoderWithAdaptor(FairseqEncoder):
128
+ def __init__(
129
+ self,
130
+ wav2vec_enc,
131
+ mbart_enc_layers,
132
+ mbart_layer_norm,
133
+ adaptor,
134
+ drop_w2v_layers=0,
135
+ ):
136
+ super().__init__(None)
137
+ self.w2v_encoder = wav2vec_enc
138
+ self.adaptor = adaptor
139
+ self.mbart_encoder_layers = mbart_enc_layers
140
+ self.final_layer_norm = mbart_layer_norm
141
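+ # optionally drop the top wav2vec layers before stacking the mbart encoder layers on top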
+ if drop_w2v_layers > 0:
142
+ self.w2v_encoder.w2v_model.encoder.layers = (
143
+ self.w2v_encoder.w2v_model.encoder.layers[:-drop_w2v_layers]
144
+ )
145
+
146
+ def forward(self, src_tokens, src_lengths=None, return_all_hiddens=False, **kwargs):
147
+ padding_mask = lengths_to_padding_mask(src_lengths)
148
+ if not padding_mask.any():
149
+ padding_mask = None
150
+
151
+ out = self.w2v_encoder.forward(src_tokens, padding_mask, tbc=True)
152
+ x = out["encoder_out"]
153
+ enc_padding_mask = None
154
+ if out["padding_mask"] is not None:
155
+ enc_padding_mask = out["padding_mask"] # B X T
156
+
157
+ x, enc_padding_mask = self.adaptor(x, enc_padding_mask)
158
+ encoder_states = []
159
+ for layer in self.mbart_encoder_layers:
160
+ x = layer(x, enc_padding_mask)
161
+ if return_all_hiddens:
162
+ encoder_states.append(x)
163
+ if self.final_layer_norm is not None:
164
+ x = self.final_layer_norm(x)
165
+
166
+ return {
167
+ "encoder_out": [x], # T x B x C
168
+ "encoder_padding_mask": [enc_padding_mask]
169
+ if enc_padding_mask is not None
170
+ else [], # B x T
171
+ "encoder_embedding": [], # B x T x C
172
+ "encoder_states": encoder_states, # List[T x B x C]
173
+ "src_tokens": [],
174
+ "src_lengths": [],
175
+ }
176
+
177
+ def reorder_encoder_out(self, encoder_out, new_order):
178
+ new_encoder_out = (
179
+ []
180
+ if len(encoder_out["encoder_out"]) == 0
181
+ else [x.index_select(1, new_order) for x in encoder_out["encoder_out"]]
182
+ )
183
+
184
+ new_encoder_padding_mask = (
185
+ []
186
+ if len(encoder_out["encoder_padding_mask"]) == 0
187
+ else [
188
+ x.index_select(0, new_order)
189
+ for x in encoder_out["encoder_padding_mask"]
190
+ ]
191
+ )
192
+
193
+ new_encoder_embedding = (
194
+ []
195
+ if len(encoder_out["encoder_embedding"]) == 0
196
+ else [
197
+ x.index_select(0, new_order) for x in encoder_out["encoder_embedding"]
198
+ ]
199
+ )
200
+
201
+ encoder_states = encoder_out["encoder_states"]
202
+ if len(encoder_states) > 0:
203
+ for idx, state in enumerate(encoder_states):
204
+ encoder_states[idx] = state.index_select(1, new_order)
205
+
206
+ return {
207
+ "encoder_out": new_encoder_out, # T x B x C
208
+ "encoder_padding_mask": new_encoder_padding_mask, # B x T
209
+ "encoder_embedding": new_encoder_embedding, # B x T x C
210
+ "encoder_states": encoder_states, # List[T x B x C]
211
+ "src_tokens": [], # B x T
212
+ "src_lengths": [], # B x 1
213
+ }
214
+
215
+
216
+ # Note:
217
+ # dual input transformer:
218
+ # encoder: wav2vec for speech + mbart encoder for text
219
+ # decoder: mbart decoder for text
220
+ @register_model("dual_input_xm_transformer")
221
+ class DualInputXMTransformerModel(DualInputS2TTransformerModel):
222
+ def __init__(self, encoder, decoder):
223
+ super().__init__(encoder, decoder)
224
+
225
+ @staticmethod
226
+ def add_args(parser):
227
+ """Add model-specific arguments to the parser."""
228
+ # wav2vec encoder
229
+ Wav2VecEncoderWithAdaptor.add_args(parser)
230
+ # add_decoder_args(parser)
231
+ # mbart Transformer
232
+ parser.add_argument(
233
+ "--activation-fn",
234
+ type=str,
235
+ default="relu",
236
+ choices=utils.get_available_activation_fns(),
237
+ help="activation function to use",
238
+ )
239
+
240
+ parser.add_argument(
241
+ "--mbart-dropout", type=float, metavar="D", help="dropout probability"
242
+ )
243
+ parser.add_argument(
244
+ "--mbart-attention-dropout",
245
+ type=float,
246
+ metavar="D",
247
+ help="dropout probability for attention weights",
248
+ )
249
+ parser.add_argument(
250
+ "--mbart-activation-dropout",
251
+ type=float,
252
+ metavar="D",
253
+ help="dropout probability after activation in FFN.",
254
+ )
255
+
256
+ parser.add_argument(
257
+ "--encoder-embed-dim",
258
+ type=int,
259
+ metavar="N",
260
+ help="encoder embedding dimension",
261
+ )
262
+ parser.add_argument(
263
+ "--encoder-ffn-embed-dim",
264
+ type=int,
265
+ metavar="N",
266
+ help="encoder embedding dimension for FFN",
267
+ )
268
+ parser.add_argument(
269
+ "--encoder-layers", type=int, metavar="N", help="num encoder layers"
270
+ )
271
+ parser.add_argument(
272
+ "--encoder-attention-heads",
273
+ type=int,
274
+ metavar="N",
275
+ help="num encoder attention heads",
276
+ )
277
+ parser.add_argument(
278
+ "--encoder-normalize-before",
279
+ action="store_true",
280
+ help="apply layernorm before each encoder block",
281
+ )
282
+
283
+ parser.add_argument(
284
+ "--decoder-embed-dim",
285
+ type=int,
286
+ metavar="N",
287
+ help="decoder embedding dimension",
288
+ )
289
+ parser.add_argument(
290
+ "--decoder-ffn-embed-dim",
291
+ type=int,
292
+ metavar="N",
293
+ help="decoder embedding dimension for FFN",
294
+ )
295
+ parser.add_argument(
296
+ "--decoder-layers", type=int, metavar="N", help="num decoder layers"
297
+ )
298
+ parser.add_argument(
299
+ "--decoder-attention-heads",
300
+ type=int,
301
+ metavar="N",
302
+ help="num decoder attention heads",
303
+ )
304
+ parser.add_argument(
305
+ "--decoder-normalize-before",
306
+ action="store_true",
307
+ help="apply layernorm before each decoder block",
308
+ )
309
+ parser.add_argument(
310
+ "--layernorm-embedding",
311
+ action="store_true",
312
+ help="add layernorm to embedding",
313
+ )
314
+ parser.add_argument(
315
+ "--no-scale-embedding",
316
+ action="store_true",
317
+ help="if True, dont scale embeddings",
318
+ )
319
+ parser.add_argument(
320
+ "--load-pretrained-mbart-from",
321
+ type=str,
322
+ metavar="STR",
323
+ help="model to take text encoder decoder weights from (for initialization)",
324
+ )
325
+ # parser.add_argument("--finetune-w2v-params", type=str, metavar="STR",
326
+ # help="comma-separated param strings to finetune.")
327
+ parser.add_argument(
328
+ "--finetune-mbart-decoder-params",
329
+ type=str,
330
+ metavar="STR",
331
+ help="comma-separated param strings to finetune.",
332
+ )
333
+ parser.add_argument(
334
+ "--finetune-mbart-encoder-params",
335
+ type=str,
336
+ metavar="STR",
337
+ help="comma-separated param strings to finetune.",
338
+ )
339
+ parser.add_argument(
340
+ "--skip-encoder-projection",
341
+ action="store_true",
342
+ help="skip the projection layer in encoder",
343
+ )
344
+
345
+ parser.add_argument(
346
+ "--enc-grad-mult",
347
+ type=float,
348
+ metavar="V",
349
+ default=1.0,
350
+ help="multiply enc1 and enc2 gradient by V",
351
+ )
352
+ parser.add_argument(
353
+ "--enc2-along-grad-mult",
354
+ type=float,
355
+ metavar="V",
356
+ default=1.0,
357
+ help="multiply enc2 gradient by V if only enc2 is used",
358
+ )
359
+ parser.add_argument(
360
+ "--text-input-cost-ratio",
361
+ type=float,
362
+ default=1.0,
363
+ metavar="V",
364
+ help="text input cost ratio relative to speech input cost",
365
+ )
366
+ parser.add_argument(
367
+ "--stack-w2v-mbart-encoder",
368
+ action="store_true",
369
+ help="stack w2v and mbart encoder",
370
+ )
371
+ parser.add_argument(
372
+ "--stack-w2v-mbart-nonorm-encoder",
373
+ action="store_true",
374
+ help="stack w2v and mbart encoder",
375
+ )
376
+ parser.add_argument(
377
+ "--no-final-norm-decoder", action="store_true", help="no layer norm"
378
+ )
379
+ parser.add_argument(
380
+ "--drop-w2v-layers",
381
+ type=int,
382
+ default=0,
383
+ metavar="N",
384
+ help="drop w2v encoder layers",
385
+ )
386
+
387
+ parser.add_argument(
388
+ "--share-w2v-text-encoder",
389
+ action="store_true",
390
+ help="share w2v encoder layers with text encoder",
391
+ )
392
+ parser.add_argument(
393
+ "--shared-w2v-layers",
394
+ type=int,
395
+ default=0,
396
+ metavar="N",
397
+ help="shared encoder layers from w2v encoder",
398
+ )
399
+
400
+ @classmethod
401
+ def build_encoder(cls, args, task):
402
+ _args = copy.deepcopy(args)
403
+ _args.dropout = args.mbart_dropout
404
+ _args.attention_dropout = args.mbart_attention_dropout
405
+ _args.activation_dropout = args.mbart_activation_dropout
406
+ _args.max_source_positions = 1024
407
+ enc_emb = nn.Embedding(
408
+ len(task.src_dict), _args.encoder_embed_dim, task.src_dict.pad()
409
+ )
410
+ text_encoder = TransformerEncoder(_args, task.src_dict, enc_emb)
411
+ spch_encoder = Wav2VecEncoderWithAdaptor(args)
412
+ if getattr(args, "load_pretrained_mbart_from", None):
413
+ text_encoder = checkpoint_utils.load_pretrained_component_from_model(
414
+ component=text_encoder, checkpoint=args.load_pretrained_mbart_from
415
+ )
416
+ if getattr(args, "stack_w2v_mbart_encoder", False):
417
+ assert getattr(args, "share_w2v_text_encoder", False) is False
418
+ spch_encoder = StackedWav2VecEncoderWithAdaptor(
419
+ spch_encoder.w2v_encoder,
420
+ text_encoder.layers,
421
+ text_encoder.layer_norm,
422
+ spch_encoder.adaptor,
423
+ args.drop_w2v_layers,
424
+ )
425
+ elif getattr(args, "stack_w2v_mbart_nonorm_encoder", False):
426
+ text_encoder.layer_norm = None
427
+ spch_encoder = StackedWav2VecEncoderWithAdaptor(
428
+ spch_encoder.w2v_encoder,
429
+ text_encoder.layers,
430
+ text_encoder.layer_norm,
431
+ spch_encoder.adaptor,
432
+ args.drop_w2v_layers,
433
+ )
434
+ elif getattr(args, "share_w2v_text_encoder", False):
435
+ spch_encoder = SharedEncoder(
436
+ spch_encoder.w2v_encoder,
437
+ text_encoder,
438
+ spch_encoder.adaptor,
439
+ args.shared_w2v_layers,
440
+ )
441
+
442
+ for k, p in spch_encoder.named_parameters():
443
+ # Freeze pretrained models by default
444
+ if safe_hasattr(
445
+ args, "finetune_w2v_params"
446
+ ) and need_finetuning(args.finetune_w2v_params, k):
447
+ p.requires_grad = True
448
+ else:
449
+ p.requires_grad = False
450
+ for k, p in text_encoder.named_parameters():
451
+ # Freeze pretrained models by default
452
+ if safe_hasattr(
453
+ args, "finetune_mbart_encoder_params"
454
+ ) and need_finetuning(
455
+ args.finetune_mbart_encoder_params, k
456
+ ):
457
+ p.requires_grad = True
458
+ else:
459
+ p.requires_grad = False
460
+ cross_attentive_loss_before_last_layer = (
461
+ 0 if getattr(args, "attentive_cost_regularization", 0.0) > 0.0 else -1
462
+ )
463
+ encoder = DualInputEncoder(
464
+ args,
465
+ spch_encoder,
466
+ text_encoder,
467
+ task.src_dict,
468
+ cross_attentive_loss_before_last_layer,
469
+ )
470
+ return encoder
471
+
472
+ @classmethod
473
+ def build_decoder(cls, args, task):
474
+ _args = copy.deepcopy(args)
475
+ _args.dropout = args.mbart_dropout
476
+ _args.attention_dropout = args.mbart_attention_dropout
477
+ _args.activation_dropout = args.mbart_activation_dropout
478
+ _args.max_target_positions = 1024
479
+ dec_emb = nn.Embedding(
480
+ len(task.tgt_dict), _args.encoder_embed_dim, task.tgt_dict.pad()
481
+ )
482
+ decoder = TransformerDecoder(_args, task.tgt_dict, dec_emb)
483
+ if getattr(args, "load_pretrained_mbart_from", None):
484
+ decoder = checkpoint_utils.load_pretrained_component_from_model(
485
+ component=decoder, checkpoint=args.load_pretrained_mbart_from
486
+ )
487
+ if getattr(args, "no_final_norm_decoder", False):
488
+ decoder.layer_norm = None
489
+ for k, p in decoder.named_parameters():
490
+ # Freeze pretrained models by default
491
+ if safe_hasattr(
492
+ args, "finetune_mbart_decoder_params"
493
+ ) and need_finetuning(
494
+ args.finetune_mbart_decoder_params, k
495
+ ):
496
+ p.requires_grad = True
497
+ else:
498
+ p.requires_grad = False
499
+
500
+ compute_cross_attentive_loss = (
501
+ True if getattr(args, "attentive_cost_regularization", 0.0) > 0.0 else False
502
+ )
503
+ cross_attentive_loss_without_norm = getattr(
504
+ args, "attentive_cost_without_normalize", False
505
+ )
506
+ cross_attentive_loss_reverse = (
507
+ False # getattr(args, "attentive_cost_reverse", False)
508
+ )
509
+ decoder = TransformerMultiInputDecoder(
510
+ dictionary=task.target_dictionary,
511
+ spch_decoder=decoder,
512
+ text_decoder=decoder,
513
+ compute_cross_attentive_loss=compute_cross_attentive_loss,
514
+ cross_attentive_loss_with_norm=True
515
+ if not cross_attentive_loss_without_norm
516
+ else False,
517
+ cross_attentive_loss_reverse=cross_attentive_loss_reverse,
518
+ )
519
+ return decoder
520
+
521
+ @classmethod
522
+ def build_model(cls, args, task):
523
+ """Build a new model instance."""
524
+ # make sure that all args are properly defaulted
525
+ # (in case there are any new ones)
526
+ dualinputxmtransformer_base(args)
527
+
528
+ encoder = cls.build_encoder(args, task)
529
+ decoder = cls.build_decoder(args, task)
530
+ return cls(encoder, decoder)
531
+
532
+
533
+ @register_model_architecture("dual_input_xm_transformer", "dualinputxmtransformer_base")
534
+ def dualinputxmtransformer_base(args):
535
+ # wav2vec encoder
536
+ set_default_w2v_encoder_args(args)
537
+ set_default_adaptor_args(args)
538
+
539
+ # mbart model
540
+ args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 1024)
541
+ args.encoder_ffn_embed_dim = getattr(
542
+ args, "encoder_ffn_embed_dim", 4 * args.encoder_embed_dim
543
+ )
544
+ args.encoder_layers = getattr(args, "encoder_layers", 12)
545
+ args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 16)
546
+ args.encoder_normalize_before = getattr(args, "encoder_normalize_before", True)
547
+ args.encoder_layerdrop = getattr(args, "encoder_layerdrop", 0)
548
+ args.encoder_learned_pos = getattr(args, "encoder_learned_pos", True)
549
+
550
+ args.decoder_embed_path = getattr(args, "decoder_embed_path", None)
551
+ args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 1024)
552
+ args.decoder_ffn_embed_dim = getattr(args, "decoder_ffn_embed_dim", 4 * 1024)
553
+ args.decoder_layers = getattr(args, "decoder_layers", 12)
554
+ args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 16)
555
+ args.decoder_normalize_before = getattr(args, "decoder_normalize_before", True)
556
+ args.decoder_learned_pos = getattr(args, "decoder_learned_pos", True)
557
+ args.decoder_layerdrop = getattr(args, "decoder_layerdrop", 0.0)
558
+
559
+ args.adaptive_input = getattr(args, "adaptive_input", False)
560
+
561
+ args.mbart_attention_dropout = getattr(args, "mbart_attention_dropout", 0.0)
562
+ args.mbart_activation_dropout = getattr(args, "mbart_activation_dropout", 0.0)
563
+ args.mbart_dropout = getattr(args, "mbart_dropout", 0.1)
564
+ args.adaptive_softmax_cutoff = getattr(args, "adaptive_softmax_cutoff", None)
565
+ args.adaptive_softmax_dropout = getattr(args, "adaptive_softmax_dropout", 0)
566
+ args.share_decoder_input_output_embed = getattr(
567
+ args, "share_decoder_input_output_embed", True
568
+ )
569
+ args.no_token_positional_embeddings = getattr(
570
+ args, "no_token_positional_embeddings", False
571
+ )
572
+
573
+ args.decoder_output_dim = getattr(
574
+ args, "decoder_output_dim", args.decoder_embed_dim
575
+ )
576
+ args.decoder_input_dim = getattr(args, "decoder_input_dim", args.decoder_embed_dim)
577
+
578
+ args.no_scale_embedding = getattr(args, "no_scale_embedding", False)
579
+ args.quant_noise_pq = getattr(args, "quant_noise_pq", 0)
580
+ args.layernorm_embedding = getattr(args, "layernorm_embedding", True)
581
+
582
+ args.activation_fn = getattr(args, "activation_fn", "gelu")
583
+ args.pooler_activation_fn = getattr(args, "pooler_activation_fn", "tanh")
584
+ args.pooler_dropout = getattr(args, "pooler_dropout", 0.0)
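The encoder and decoder builders above freeze every pretrained parameter and then selectively re-enable gradients through a name filter. Below is a minimal, self-contained sketch of that pattern; the real `need_finetuning` helper is defined elsewhere in this example, so the regex-matching behaviour assumed here is illustrative only.

# Illustration only (not part of the upload): freeze-by-default with selective unfreezing.
import re
import torch.nn as nn

def need_finetuning_sketch(finetune_params, param_name):
    # assumed behaviour: comma-separated regular expressions matched against parameter names
    return any(re.search(pattern, param_name) for pattern in finetune_params.split(","))

toy_module = nn.Sequential(nn.Linear(4, 4), nn.LayerNorm(4))  # stand-in for a pretrained encoder
finetune_w2v_params = r"1\."  # hypothetical --finetune-w2v-params value: unfreeze only module 1

for name, p in toy_module.named_parameters():
    # pretrained weights stay frozen unless the name filter selects them
    p.requires_grad = need_finetuning_sketch(finetune_w2v_params, name)

print({name: p.requires_grad for name, p in toy_module.named_parameters()})
# {'0.weight': False, '0.bias': False, '1.weight': True, '1.bias': True}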
fairseq/examples/speech_text_joint_to_text/scripts/convert_model.py ADDED
@@ -0,0 +1,71 @@
1
+ #!/usr/bin/env python3
2
+ # Copyright (c) Facebook, Inc. and its affiliates.
3
+ #
4
+ # This source code is licensed under the MIT license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import argparse
8
+ import re
9
+ from collections import OrderedDict
10
+
11
+ import torch
12
+
13
+ from fairseq.file_io import PathManager
14
+
15
+
16
+ def is_update(param_name, module_name):
17
+ if module_name in param_name:
18
+ return True
19
+ return False
20
+
21
+
22
+ def load_checkpoint(src_cpt):
23
+
24
+ with PathManager.open(src_cpt, "rb") as f:
25
+ state_src = torch.load(
26
+ f,
27
+ map_location=(
28
+ lambda s, _: torch.serialization.default_restore_location(s, "cpu")
29
+ ),
30
+ )
31
+
32
+ return state_src
33
+
34
+
35
+ def save_checkpoint(tgt_cpt, states):
36
+
37
+ with PathManager.open(tgt_cpt, "wb") as f:
38
+ torch.save(
39
+ states,
40
+ f,
41
+ )
42
+
43
+
44
+ # convert the joint pre-trained checkpoint into a BART-style (text-only encoder/decoder) model
45
+ def main():
46
+ parser = argparse.ArgumentParser()
47
+ # fmt: off
48
+ parser.add_argument('--input-model', required=True,
49
+ help='Input checkpoint file path.')
50
+ parser.add_argument('--output-model', required=True,
51
+ help='Output checkpoint file path.')
52
+ # fmt: on
53
+ args = parser.parse_args()
54
+ print(args)
55
+
56
+ states = load_checkpoint(args.input_model)
57
+ model = states["model"]
58
+ new_model = OrderedDict()
59
+ for key in model.keys():
60
+ if re.search("^encoder.text_encoder", key):
61
+ new_key = re.sub("encoder.text_encoder", "encoder", key)
62
+ new_model[new_key] = model[key]
63
+ elif re.search("^decoder.text_decoder", key):
64
+ new_key = re.sub("decoder.text_decoder", "decoder", key)
65
+ new_model[new_key] = model[key]
66
+ states["model"] = new_model
67
+ save_checkpoint(args.output_model, states)
68
+
69
+
70
+ if __name__ == "__main__":
71
+ main()
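The loop above keeps only the text branch of the joint checkpoint and renames its keys so the result looks like a plain encoder/decoder (BART-style) model; it is invoked as `python convert_model.py --input-model <in.pt> --output-model <out.pt>` (paths illustrative). A toy run of the renaming logic, using made-up keys:

import re
from collections import OrderedDict

# made-up keys mimicking the joint checkpoint layout handled above
model = OrderedDict(
    [
        ("encoder.text_encoder.layers.0.fc1.weight", 1),
        ("decoder.text_decoder.layers.0.fc1.weight", 2),
        ("encoder.spch_encoder.w2v_encoder.conv.weight", 3),  # matches neither pattern -> dropped
    ]
)
new_model = OrderedDict()
for key in model.keys():
    if re.search("^encoder.text_encoder", key):
        new_model[re.sub("encoder.text_encoder", "encoder", key)] = model[key]
    elif re.search("^decoder.text_decoder", key):
        new_model[re.sub("decoder.text_decoder", "decoder", key)] = model[key]

print(list(new_model.keys()))
# ['encoder.layers.0.fc1.weight', 'decoder.layers.0.fc1.weight']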
fairseq/examples/speech_text_joint_to_text/scripts/g2p_encode.py ADDED
@@ -0,0 +1,191 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import argparse
7
+ import itertools
8
+ import logging
9
+ import re
10
+ import time
11
+
12
+ from g2p_en import G2p
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+ FAIL_SENT = "FAILED_SENTENCE"
17
+
18
+
19
+ def parse():
20
+ parser = argparse.ArgumentParser()
21
+ parser.add_argument("--data-path", type=str, required=True)
22
+ parser.add_argument("--out-path", type=str, required=True)
23
+ parser.add_argument("--lower-case", action="store_true")
24
+ parser.add_argument("--do-filter", action="store_true")
25
+ parser.add_argument("--use-word-start", action="store_true")
26
+ parser.add_argument("--dup-vowel", default=1, type=int)
27
+ parser.add_argument("--dup-consonant", default=1, type=int)
28
+ parser.add_argument("--no-punc", action="store_true")
29
+ parser.add_argument("--reserve-word", type=str, default="")
30
+ parser.add_argument(
31
+ "--reserve-first-column",
32
+ action="store_true",
33
+ help="first column is sentence id",
34
+ )
35
+ ###
36
+ parser.add_argument("--parallel-process-num", default=1, type=int)
37
+ parser.add_argument("--logdir", default="")
38
+ args = parser.parse_args()
39
+ return args
40
+
41
+
42
+ def process_sent(sent, g2p, res_wrds, args):
43
+ sents = pre_process_sent(sent, args.do_filter, args.lower_case, res_wrds)
44
+ pho_seqs = [do_g2p(g2p, s, res_wrds, i == 0) for i, s in enumerate(sents)]
45
+ pho_seq = (
46
+ [FAIL_SENT]
47
+ if [FAIL_SENT] in pho_seqs
48
+ else list(itertools.chain.from_iterable(pho_seqs))
49
+ )
50
+ if args.no_punc:
51
+ pho_seq = remove_punc(pho_seq)
52
+ if args.dup_vowel > 1 or args.dup_consonant > 1:
53
+ pho_seq = dup_pho(pho_seq, args.dup_vowel, args.dup_consonant)
54
+ if args.use_word_start:
55
+ pho_seq = add_word_start(pho_seq)
56
+ return " ".join(pho_seq)
57
+
58
+
59
+ def remove_punc(sent):
60
+ ns = []
61
+ regex = re.compile("[^a-zA-Z0-9 ]")
62
+ for p in sent:
63
+ if (not regex.search(p)) or p == FAIL_SENT:
64
+ if p == " " and (len(ns) == 0 or ns[-1] == " "):
65
+ continue
66
+ ns.append(p)
67
+ return ns
68
+
69
+
70
+ def do_g2p(g2p, sent, res_wrds, is_first_sent):
71
+ if sent in res_wrds:
72
+ pho_seq = [res_wrds[sent]]
73
+ else:
74
+ pho_seq = g2p(sent)
75
+ if not is_first_sent:
76
+ pho_seq = [" "] + pho_seq # add space to separate
77
+ return pho_seq
78
+
79
+
80
+ def pre_process_sent(sent, do_filter, lower_case, res_wrds):
81
+ if do_filter:
82
+ sent = re.sub("-", " ", sent)
83
+ sent = re.sub("—", " ", sent)
84
+ if len(res_wrds) > 0:
85
+ wrds = sent.split()
86
+ wrds = ["SPLIT_ME " + w + " SPLIT_ME" if w in res_wrds else w for w in wrds]
87
+ sents = [x.strip() for x in " ".join(wrds).split("SPLIT_ME") if x.strip() != ""]
88
+ else:
89
+ sents = [sent]
90
+ if lower_case:
91
+ sents = [s.lower() if s not in res_wrds else s for s in sents]
92
+ return sents
93
+
94
+
95
+ def dup_pho(sent, dup_v_num, dup_c_num):
96
+ """
97
+ duplicate phonemes, as defined in cmudict
98
+ http://www.speech.cs.cmu.edu/cgi-bin/cmudict
99
+ """
100
+ if dup_v_num == 1 and dup_c_num == 1:
101
+ return sent
102
+ ns = []
103
+ for p in sent:
104
+ ns.append(p)
105
+ if re.search(r"\d$", p):
106
+ for i in range(1, dup_v_num):
107
+ ns.append(f"{p}-{i}P")
108
+ elif re.search(r"\w", p):
109
+ for i in range(1, dup_c_num):
110
+ ns.append(f"{p}-{i}P")
111
+ return ns
112
+
113
+
114
+ def add_word_start(sent):
115
+ ns = []
116
+ do_add = True
117
+ ws = "▁"
118
+ for p in sent:
119
+ if do_add:
120
+ p = ws + p
121
+ do_add = False
122
+ if p == " ":
123
+ do_add = True
124
+ else:
125
+ ns.append(p)
126
+ return ns
127
+
128
+
129
+ def load_reserve_word(reserve_word):
130
+ if reserve_word == "":
131
+ return []
132
+ with open(reserve_word, "r") as fp:
133
+ res_wrds = [x.strip().split() for x in fp.readlines() if x.strip() != ""]
134
+ assert sum([0 if len(x) == 2 else 1 for x in res_wrds]) == 0
135
+ res_wrds = dict(res_wrds)
136
+ return res_wrds
137
+
138
+
139
+ def process_sents(sents, args):
140
+ g2p = G2p()
141
+ out_sents = []
142
+ res_wrds = load_reserve_word(args.reserve_word)
143
+ for sent in sents:
144
+ col1 = ""
145
+ if args.reserve_first_column:
146
+ col1, sent = sent.split(None, 1)
147
+ sent = process_sent(sent, g2p, res_wrds, args)
148
+ if args.reserve_first_column and col1 != "":
149
+ sent = f"{col1} {sent}"
150
+ out_sents.append(sent)
151
+ return out_sents
152
+
153
+
154
+ def main():
155
+ args = parse()
156
+ out_sents = []
157
+ with open(args.data_path, "r") as fp:
158
+ sent_list = [x.strip() for x in fp.readlines()]
159
+ if args.parallel_process_num > 1:
160
+ try:
161
+ import submitit
162
+ except ImportError:
163
+ logger.warning(
164
+ "submitit is not found and only one job is used to process the data"
165
+ )
166
+ submitit = None
167
+
168
+ if args.parallel_process_num == 1 or submitit is None:
169
+ out_sents = process_sents(sent_list, args)
170
+ else:
171
+ # process sentences with parallel computation
172
+ lsize = len(sent_list) // args.parallel_process_num + 1
173
+ executor = submitit.AutoExecutor(folder=args.logdir)
174
+ executor.update_parameters(timeout_min=1000, cpus_per_task=4)
175
+ jobs = []
176
+ for i in range(args.parallel_process_num):
177
+ job = executor.submit(
178
+ process_sents, sent_list[lsize * i : lsize * (i + 1)], args
179
+ )
180
+ jobs.append(job)
181
+ is_running = True
182
+ while is_running:
183
+ time.sleep(5)
184
+ is_running = sum([job.done() for job in jobs]) < len(jobs)
185
+ out_sents = list(itertools.chain.from_iterable([job.result() for job in jobs]))
186
+ with open(args.out_path, "w") as fp:
187
+ fp.write("\n".join(out_sents) + "\n")
188
+
189
+
190
+ if __name__ == "__main__":
191
+ main()
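For reference, the phoneme post-processing helpers above behave as follows on a hand-written CMUdict-style sequence. This assumes the script is importable as `g2p_encode` (e.g. when run from its own directory); the phoneme list itself is illustrative.

from g2p_encode import add_word_start, dup_pho

phones = ["HH", "AH0", "L", "OW1", " ", "W", "ER1", "L", "D"]  # "hello world" in CMUdict phonemes

print(dup_pho(phones, dup_v_num=2, dup_c_num=1))
# vowels (phonemes ending in a stress digit) receive one extra "-1P" copy:
# ['HH', 'AH0', 'AH0-1P', 'L', 'OW1', 'OW1-1P', ' ', 'W', 'ER1', 'ER1-1P', 'L', 'D']

print(add_word_start(phones))
# spaces are dropped and each word-initial phoneme is prefixed with the word-start marker:
# ['▁HH', 'AH0', 'L', 'OW1', '▁W', 'ER1', 'L', 'D']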
fairseq/examples/speech_text_joint_to_text/tasks/__init__.py ADDED
@@ -0,0 +1,8 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import importlib
7
+ import os
8
+
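As uploaded, this `__init__.py` only carries the license header plus `import importlib` and `import os`. In fairseq example packages these imports usually drive an auto-import loop so every task module's `@register_task` decorator runs on package import; the sketch below shows that conventional loop as an assumption, it is not part of the uploaded file.

import importlib
import os

# conventional fairseq auto-registration loop (assumed, not present in this upload)
for file in sorted(os.listdir(os.path.dirname(__file__))):
    if file.endswith(".py") and not file.startswith("_"):
        task_name = file[: file.find(".py")]
        importlib.import_module("examples.speech_text_joint_to_text.tasks." + task_name)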
fairseq/examples/speech_text_joint_to_text/tasks/pair_denoising.py ADDED
@@ -0,0 +1,447 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import itertools
7
+ import logging
8
+ import os
9
+ import re
10
+
11
+ import numpy as np
12
+ import torch
13
+
14
+ from examples.speech_text_joint_to_text.data.pair_denoising_dataset import (
15
+ LanguagePairDenoisingDataset,
16
+ )
17
+ from fairseq import utils
18
+ from fairseq.data import (
19
+ ConcatDataset,
20
+ Dictionary,
21
+ LanguagePairDataset,
22
+ ResamplingDataset,
23
+ TransformEosConcatLangPairDataset,
24
+ TransformEosLangPairDataset,
25
+ data_utils,
26
+ indexed_dataset,
27
+ )
28
+ from fairseq.data.encoders.utils import get_whole_word_mask
29
+ from fairseq.tasks import register_task
30
+ from fairseq.tasks.translation import TranslationTask
31
+
32
+ logger = logging.getLogger(__name__)
33
+
34
+
35
+ def gen_whole_word_mask(args, dictionary):
36
+ def is_beginning_of_word(i):
37
+ if i < dictionary.nspecial:
38
+ # special elements are always considered beginnings
39
+ return True
40
+ tok = dictionary[i]
41
+ if tok.startswith("madeupword"):
42
+ return True
43
+
44
+ if tok in ["<unk>", "<s>", "</s>", "<pad>"]:
45
+ return True
46
+ return tok.startswith("\u2581")
47
+
48
+ if args.use_mask_whole_words:
49
+ mask_whole_words = torch.ByteTensor(
50
+ list(map(is_beginning_of_word, range(len(dictionary))))
51
+ )
52
+ else:
53
+ # without a bpe model for phoneme tokens, every token will be treated as a word-leading token
54
+ return get_whole_word_mask(args, dictionary)
55
+ return mask_whole_words
56
+
57
+
58
+ @register_task("paired_denoising")
59
+ class PairedDenoisingTask(TranslationTask):
60
+
61
+ LANG_TAG_TEMPLATE = "<lang:{}>" # Tag for language (target)
62
+
63
+ @staticmethod
64
+ def add_args(parser):
65
+ TranslationTask.add_args(parser)
66
+ # bart setting
67
+ parser.add_argument(
68
+ "--mask",
69
+ default=0.0,
70
+ type=float,
71
+ help="fraction of words/subwords that will be masked",
72
+ )
73
+ parser.add_argument(
74
+ "--mask-random",
75
+ default=0.0,
76
+ type=float,
77
+ help="instead of using [MASK], use random token this often",
78
+ )
79
+ parser.add_argument(
80
+ "--insert",
81
+ default=0.0,
82
+ type=float,
83
+ help="insert this percentage of additional random tokens",
84
+ )
85
+ parser.add_argument(
86
+ "--poisson-lambda",
87
+ default=3.0,
88
+ type=float,
89
+ help="randomly shuffle sentences for this proportion of inputs",
90
+ )
91
+ parser.add_argument(
92
+ "--mask-length",
93
+ default="span-poisson",
94
+ type=str,
95
+ choices=["subword", "word", "span-poisson"],
96
+ help="mask length to choose",
97
+ )
98
+ parser.add_argument(
99
+ "--replace-length",
100
+ default=1,
101
+ type=int,
102
+ help="when masking N tokens, replace with 0, 1, or N tokens (use -1 for N)",
103
+ )
104
+
105
+ # multi-lingual
106
+ parser.add_argument(
107
+ "--multilang-sampling-alpha",
108
+ type=float,
109
+ default=1.0,
110
+ help="smoothing alpha for sample ratios across multiple datasets",
111
+ )
112
+ parser.add_argument(
113
+ "--lang-pairs",
114
+ default="",
115
+ metavar="PAIRS",
116
+ help="comma-separated list of language pairs (in training order): phnen-en,phnfr-fr,phnit-it. Do masking",
117
+ )
118
+ parser.add_argument(
119
+ "--lang-pairs-bitext",
120
+ default="",
121
+ metavar="PAIRS",
122
+ help="comma-separated list of language pairs (in training order): en-de,en-fr,de-fr. No masking",
123
+ )
124
+ parser.add_argument("--add-src-lang-token", default=False, action="store_true")
125
+ parser.add_argument("--add-tgt-lang-token", default=False, action="store_true")
126
+ parser.add_argument(
127
+ "--no-whole-word-mask-langs",
128
+ type=str,
129
+ default="",
130
+ metavar="N",
131
+ help="languages without spacing between words dont support whole word masking",
132
+ )
133
+ parser.add_argument(
134
+ "--use-mask-whole-words", default=False, action="store_true"
135
+ )
136
+
137
+ @classmethod
138
+ def setup_task(cls, args, **kwargs):
139
+ """Setup the task."""
140
+ paths = args.data.split(":")
141
+ assert len(paths) > 0
142
+ src_dict = Dictionary.load(
143
+ os.path.join(paths[0], "src_dict.txt")
144
+ ) # assume all languages share a source dictionary
145
+ tgt_dict = Dictionary.load(
146
+ os.path.join(paths[0], "tgt_dict.txt")
147
+ ) # assume all languages share a target dictionary
148
+
149
+ lang_pairs = args.lang_pairs + "," + args.lang_pairs_bitext
150
+ lang_pairs = re.sub(",$", "", re.sub("^,", "", lang_pairs))
151
+ src_langs = [lp.split("-")[0] for lp in lang_pairs.split(",")]
152
+ tgt_langs = [lp.split("-")[1] for lp in lang_pairs.split(",")]
153
+
154
+ if args.add_src_lang_token:
155
+ for lang in src_langs:
156
+ assert (
157
+ src_dict.index(PairedDenoisingTask.LANG_TAG_TEMPLATE.format(lang))
158
+ != src_dict.unk()
159
+ )
160
+ if args.add_tgt_lang_token:
161
+ for lang in tgt_langs:
162
+ assert (
163
+ tgt_dict.index(PairedDenoisingTask.LANG_TAG_TEMPLATE.format(lang))
164
+ != tgt_dict.unk()
165
+ )
166
+
167
+ logger.info("source dictionary: {} types".format(len(src_dict)))
168
+ logger.info("target dictionary: {} types".format(len(tgt_dict)))
169
+ if not hasattr(args, "shuffle_instance"):
170
+ args.shuffle_instance = False
171
+ return cls(args, src_dict, tgt_dict)
172
+
173
+ def __init__(self, args, src_dict, tgt_dict):
174
+ super().__init__(args, src_dict, tgt_dict)
175
+ # check mask token
176
+ self.mask_idx = self.src_dict.index("<mask>")
177
+ assert self.mask_idx != self.src_dict.unk()
178
+ self.lang_pairs = args.lang_pairs
179
+ self.lang_pairs_bitext = args.lang_pairs_bitext
180
+ self.args = args
181
+
182
+ @classmethod
183
+ def language_pair_denoising_dataset(
184
+ cls,
185
+ data_path,
186
+ do_mask,
187
+ split,
188
+ src,
189
+ src_dict,
190
+ tgt,
191
+ tgt_dict,
192
+ mask_idx,
193
+ mask_whole_words,
194
+ seed,
195
+ args,
196
+ dataset_impl,
197
+ combine=False,
198
+ left_pad_source=True,
199
+ left_pad_target=False,
200
+ max_source_positions=1024,
201
+ max_target_positions=1024,
202
+ shuffle=True,
203
+ src_lang_id=None,
204
+ tgt_lang_id=None,
205
+ ):
206
+ def split_exists(split, src, tgt, lang, data_path):
207
+ filename = os.path.join(
208
+ data_path, "{}.{}-{}.{}".format(split, src, tgt, lang)
209
+ )
210
+ return indexed_dataset.dataset_exists(filename, impl=dataset_impl)
211
+
212
+ src_datasets = []
213
+ tgt_datasets = []
214
+
215
+ for k in itertools.count():
216
+ split_k = split + (str(k) if k > 0 else "")
217
+
218
+ # infer langcode
219
+ if split_exists(split_k, src, tgt, src, data_path):
220
+ prefix = os.path.join(data_path, "{}.{}-{}.".format(split_k, src, tgt))
221
+ elif split_exists(split_k, tgt, src, src, data_path):
222
+ prefix = os.path.join(data_path, "{}.{}-{}.".format(split_k, tgt, src))
223
+ else:
224
+ if k > 0:
225
+ break
226
+ else:
227
+ raise FileNotFoundError(
228
+ "Dataset not found: {} ({})".format(split, data_path)
229
+ )
230
+
231
+ src_dataset = data_utils.load_indexed_dataset(
232
+ prefix + src, src_dict, dataset_impl
233
+ )
234
+ src_datasets.append(src_dataset)
235
+
236
+ tgt_dataset = data_utils.load_indexed_dataset(
237
+ prefix + tgt, tgt_dict, dataset_impl
238
+ )
239
+ if tgt_dataset is not None:
240
+ tgt_datasets.append(tgt_dataset)
241
+
242
+ logger.info(
243
+ "{} {} {}-{} {} examples".format(
244
+ data_path, split_k, src, tgt, len(src_datasets[-1])
245
+ )
246
+ )
247
+
248
+ if not combine:
249
+ break
250
+
251
+ assert len(src_datasets) == len(tgt_datasets) or len(tgt_datasets) == 0
252
+
253
+ if len(src_datasets) == 1:
254
+ src_dataset = src_datasets[0]
255
+ tgt_dataset = tgt_datasets[0] if len(tgt_datasets) > 0 else None
256
+ else:
257
+ sample_ratios = [1] * len(src_datasets)
258
+ src_dataset = ConcatDataset(src_datasets, sample_ratios)
259
+ if len(tgt_datasets) > 0:
260
+ tgt_dataset = ConcatDataset(tgt_datasets, sample_ratios)
261
+ else:
262
+ tgt_dataset = None
263
+
264
+ eos = None
265
+
266
+ tgt_dataset_sizes = tgt_dataset.sizes if tgt_dataset is not None else None
267
+ if not do_mask:
268
+ return LanguagePairDataset(
269
+ src_dataset,
270
+ src_dataset.sizes,
271
+ src_dict,
272
+ tgt_dataset,
273
+ tgt_dataset_sizes,
274
+ tgt_dict,
275
+ left_pad_source=left_pad_source,
276
+ left_pad_target=left_pad_target,
277
+ eos=eos,
278
+ shuffle=shuffle,
279
+ src_lang_id=src_lang_id,
280
+ tgt_lang_id=tgt_lang_id,
281
+ )
282
+
283
+ return LanguagePairDenoisingDataset(
284
+ src_dataset,
285
+ src_dataset.sizes,
286
+ src_dict,
287
+ tgt_dataset,
288
+ tgt_dataset_sizes,
289
+ tgt_dict,
290
+ mask_idx,
291
+ mask_whole_words,
292
+ seed,
293
+ args,
294
+ left_pad_source=left_pad_source,
295
+ left_pad_target=left_pad_target,
296
+ eos=eos,
297
+ shuffle=shuffle,
298
+ src_lang_id=src_lang_id,
299
+ tgt_lang_id=tgt_lang_id,
300
+ )
301
+
302
+ def _get_sample_prob(self, dataset_lens):
303
+ """
304
+ Get smoothed sampling probability by language. This helps low-resource
305
+ languages by upsampling them.
306
+ """
307
+ prob = dataset_lens / dataset_lens.sum()
308
+ smoothed_prob = prob ** self.args.multilang_sampling_alpha
309
+ smoothed_prob = smoothed_prob / smoothed_prob.sum()
310
+ return smoothed_prob
311
+
312
+ def resample_datasets(self, lang_datasets, lang_pairs_all, epoch):
313
+ # For train subset, additionally up or down sample languages.
314
+ if self.args.multilang_sampling_alpha == 1.0:
315
+ return lang_datasets
316
+
317
+ dataset_lengths = np.array(
318
+ [len(d) for d in lang_datasets],
319
+ dtype=float,
320
+ )
321
+ sample_probs = self._get_sample_prob(dataset_lengths)
322
+ logger.info(
323
+ "Sample probability by language pair: {}".format(
324
+ {
325
+ lp: "{0:.4f}".format(sample_probs[id])
326
+ for id, lp in enumerate(lang_pairs_all)
327
+ }
328
+ )
329
+ )
330
+ size_ratio = (sample_probs * dataset_lengths.sum()) / dataset_lengths
331
+ logger.info(
332
+ "Up/Down Sampling ratio by language: {}".format(
333
+ {
334
+ lp: "{0:.2f}".format(size_ratio[id])
335
+ for id, lp in enumerate(lang_pairs_all)
336
+ }
337
+ )
338
+ )
339
+
340
+ resampled_lang_datasets = [
341
+ ResamplingDataset(
342
+ lang_datasets[i],
343
+ size_ratio=size_ratio[i],
344
+ seed=self.args.seed,
345
+ epoch=epoch,
346
+ replace=size_ratio[i] >= 1.0,
347
+ )
348
+ for i, d in enumerate(lang_datasets)
349
+ ]
350
+ return resampled_lang_datasets
351
+
352
+ def load_dataset_only(
353
+ self, split, lang_pairs, do_mask=True, epoch=1, combine=False
354
+ ):
355
+ paths = utils.split_paths(self.args.data)
356
+ assert len(paths) > 0
357
+ data_path = paths[(epoch - 1) % len(paths)]
358
+
359
+ # TODO unk token will be considered as first word too, though it might be an unknown phoneme within a word
360
+ # get_whole_word_mask returns a tensor (size V x 1) indicating whether each token is a word-start token
361
+ mask_whole_src_words = gen_whole_word_mask(self.args, self.src_dict)
362
+ language_without_segmentations = self.args.no_whole_word_mask_langs.split(",")
363
+ lang_datasets = []
364
+ eos_bos = []
365
+ lang_pairs = lang_pairs.split(",") if lang_pairs != "" else []
366
+ assert len(lang_pairs) > 0
367
+ for lp in lang_pairs:
368
+ src, tgt = lp.split("-")
369
+ lang_mask_whole_src_words = (
370
+ mask_whole_src_words
371
+ if src not in language_without_segmentations
372
+ else None
373
+ )
374
+
375
+ end_token = (
376
+ self.source_dictionary.index(
377
+ PairedDenoisingTask.LANG_TAG_TEMPLATE.format(src)
378
+ )
379
+ if self.args.add_src_lang_token
380
+ else None
381
+ )
382
+ bos_token = (
383
+ self.target_dictionary.index(
384
+ PairedDenoisingTask.LANG_TAG_TEMPLATE.format(tgt)
385
+ )
386
+ if self.args.add_tgt_lang_token
387
+ else None
388
+ )
389
+ src_lang_id = None
390
+
391
+ if self.args.add_src_lang_token or self.args.add_tgt_lang_token:
392
+ eos_bos.append((end_token, bos_token))
393
+
394
+ dataset = PairedDenoisingTask.language_pair_denoising_dataset(
395
+ data_path,
396
+ do_mask,
397
+ split,
398
+ src,
399
+ self.source_dictionary,
400
+ tgt,
401
+ self.target_dictionary,
402
+ self.mask_idx,
403
+ lang_mask_whole_src_words,
404
+ self.args.seed,
405
+ self.args,
406
+ self.args.dataset_impl,
407
+ combine=combine,
408
+ left_pad_source=utils.eval_bool(self.args.left_pad_source),
409
+ left_pad_target=utils.eval_bool(self.args.left_pad_target),
410
+ max_source_positions=self.args.max_source_positions,
411
+ max_target_positions=self.args.max_target_positions,
412
+ src_lang_id=src_lang_id,
413
+ )
414
+
415
+ lang_datasets.append(dataset)
416
+
417
+ if len(lang_datasets) == 0:
418
+ return
419
+ elif len(lang_datasets) == 1:
420
+ dataset = lang_datasets[0]
421
+ if self.args.add_src_lang_token or self.args.add_tgt_lang_token:
422
+ end_token, bos_token = eos_bos[0]
423
+ dataset = TransformEosLangPairDataset(
424
+ dataset,
425
+ src_eos=self.source_dictionary.eos(),
426
+ new_src_eos=end_token,
427
+ tgt_bos=self.target_dictionary.eos(),
428
+ new_tgt_bos=bos_token,
429
+ )
430
+ else:
431
+ end_tokens = [item[0] for item in eos_bos if item[0] is not None]
432
+ bos_tokens = [item[1] for item in eos_bos if item[1] is not None]
433
+ lang_datasets = self.resample_datasets(lang_datasets, lang_pairs, epoch)
434
+ dataset = TransformEosConcatLangPairDataset(
435
+ lang_datasets,
436
+ self.source_dictionary.eos(),
437
+ self.target_dictionary.eos(),
438
+ new_src_eos=end_tokens,
439
+ new_tgt_bos=bos_tokens,
440
+ )
441
+ return dataset
442
+
443
+ # split in (train, valid, test, ...)
444
+ def load_dataset(self, split, epoch=1, combine=False, **kwargs):
445
+ self.datasets[split] = self.load_dataset_only(
446
+ split, self.lang_pairs, epoch=epoch, combine=combine
447
+ )
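The `--multilang-sampling-alpha` smoothing implemented by `_get_sample_prob` and `resample_datasets` above is a temperature-style re-weighting of dataset sizes. A standalone numpy sketch with two toy language pairs (sizes and alpha are made up):

import numpy as np

dataset_lengths = np.array([1_000_000.0, 10_000.0])  # toy sizes: high- vs low-resource pair
alpha = 0.3  # --multilang-sampling-alpha < 1 flattens the distribution

prob = dataset_lengths / dataset_lengths.sum()
smoothed = prob ** alpha
smoothed = smoothed / smoothed.sum()
size_ratio = (smoothed * dataset_lengths.sum()) / dataset_lengths

print(np.round(smoothed, 3))    # sampling probability per pair: [0.799 0.201]
print(np.round(size_ratio, 2))  # up/down-sampling ratio fed to ResamplingDataset: [ 0.81 20.28]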
fairseq/examples/speech_text_joint_to_text/tasks/speech_text_denoise_pretrain.py ADDED
@@ -0,0 +1,654 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+ import logging
6
+ import os
7
+ import re
8
+ from argparse import Namespace
9
+ from pathlib import Path
10
+
11
+ from fairseq.data import ConcatDataset, Dictionary, encoders
12
+ from fairseq.data.audio.multi_modality_dataset import (
13
+ FileAudioDatasetWrapper,
14
+ ModalityDatasetItem,
15
+ MultiModalityDataset,
16
+ )
17
+ from fairseq.data.audio.speech_to_text_joint_dataset import (
18
+ S2TJointDataConfig,
19
+ SpeechToTextJointDatasetCreator,
20
+ )
21
+ from fairseq.data.iterators import GroupedEpochBatchIterator
22
+ from fairseq.tasks import register_task
23
+
24
+ from .pair_denoising import PairedDenoisingTask
25
+
26
+ logger = logging.getLogger(__name__)
27
+
28
+
29
+ @register_task("speech_text_joint_denoising")
30
+ class SpeechTextJointDenoisingPreTask(PairedDenoisingTask):
31
+ """
32
+ Joint denoising training task for speech and text.
33
+ """
34
+
35
+ SIL_TOKEN = "sil"
36
+
37
+ @classmethod
38
+ def add_args(cls, parser):
39
+ PairedDenoisingTask.add_args(parser)
40
+ # set max tokens and position
41
+ parser.add_argument(
42
+ "--max-text-tokens",
43
+ type=int,
44
+ metavar="N",
45
+ default=1024,
46
+ help="maximum samples for encoder text input ",
47
+ )
48
+ parser.add_argument(
49
+ "--max-speech-tokens",
50
+ type=int,
51
+ metavar="N",
52
+ default=50000,
53
+ help="maximum samples for encoder speech input ",
54
+ )
55
+ parser.add_argument(
56
+ "--max-speech-positions",
57
+ type=int,
58
+ metavar="N",
59
+ default=400,
60
+ help="maximum tokens for per encoder text input ",
61
+ )
62
+
63
+ parser.add_argument(
64
+ "--max-sample-size",
65
+ type=int,
66
+ metavar="N",
67
+ default=32000,
68
+ help="max sample size to crop to for batching (unsupervised speech) ",
69
+ )
70
+ parser.add_argument(
71
+ "--min-sample-size",
72
+ type=int,
73
+ metavar="N",
74
+ default=4000,
75
+ help="min sample size to crop to for batching (unsupervised speech) ",
76
+ )
77
+
78
+ # set mini-batch ratio for different modalities/subtasks
79
+ # s2p
80
+ parser.add_argument(
81
+ "--supervised-speech-sample-ratio",
82
+ default="1",
83
+ type=str,
84
+ metavar="N",
85
+ help="Multiple Ratio for speech dataset with transcripts ",
86
+ )
87
+ # s2t
88
+ parser.add_argument(
89
+ "--supervised-speech-s2s-sample-ratio",
90
+ default="1",
91
+ type=str,
92
+ metavar="N",
93
+ help="Multiple Ratio for speech dataset with transcripts ",
94
+ )
95
+ # ssl
96
+ parser.add_argument(
97
+ "--unsupervised-speech-sample-ratio",
98
+ default="1",
99
+ type=str,
100
+ metavar="N",
101
+ help="Multiple Ratio for speech dataset without transcripts ",
102
+ )
103
+ # t2t with monolingual data (masking)
104
+ parser.add_argument(
105
+ "--text-sample-ratio",
106
+ default="1",
107
+ type=str,
108
+ metavar="N",
109
+ help="Multiple Ratio for text set ",
110
+ )
111
+ # t2t with parallel data (no masking)
112
+ parser.add_argument(
113
+ "--bitext-sample-ratio",
114
+ default="1",
115
+ type=str,
116
+ metavar="N",
117
+ help="Multiple Ratio for text set (bitext) ",
118
+ )
119
+ # train_subset is "train", "valid", etc.
120
+ # parallel data is loaded according to the lang_pairs and lang_pairs_bitext strings from args.data
121
+ # (un)supervised speech is loaded from args.(un)sup_speech_{train,valid}_subset
122
+ parser.add_argument(
123
+ "--sup-speech-data", default="", help="path to supervised speech data"
124
+ )
125
+ parser.add_argument(
126
+ "--sup-speech-train-subset",
127
+ default="",
128
+ help="supervised speech training subsets",
129
+ )
130
+ parser.add_argument(
131
+ "--sup-speech-valid-subset",
132
+ default="",
133
+ help="supervised speech validation subsets",
134
+ )
135
+ parser.add_argument(
136
+ "--config-yaml",
137
+ default="config.yaml",
138
+ help="supervised speech configuration yaml file",
139
+ )
140
+ parser.add_argument(
141
+ "--sup-speech-s2s-data", default="", help="path to supervised speech data"
142
+ )
143
+ parser.add_argument(
144
+ "--sup-speech-s2s-train-subset",
145
+ default="",
146
+ help="supervised speech training subsets",
147
+ )
148
+ parser.add_argument(
149
+ "--sup-speech-s2s-valid-subset",
150
+ default="",
151
+ help="supervised speech validation subsets",
152
+ )
153
+ parser.add_argument(
154
+ "--config-s2s-yaml",
155
+ default="config.yaml",
156
+ help="supervised speech configuration yaml file",
157
+ )
158
+ parser.add_argument(
159
+ "--unsup-speech-train-data",
160
+ default="",
161
+ help="path to unsupervised speech training data (tsv)",
162
+ )
163
+ parser.add_argument(
164
+ "--unsup-speech-valid-data",
165
+ default="",
166
+ help="path to unsupervised speech valid data (tsv)",
167
+ )
168
+ parser.add_argument(
169
+ "--sample-rate",
170
+ type=int,
171
+ metavar="N",
172
+ default=16000,
173
+ help="input audio sampling rate",
174
+ )
175
+ parser.add_argument(
176
+ "--no-emb-update-unsup",
177
+ default=False,
178
+ action="store_true",
179
+ help="no update for output embedding during unsupervised_speech mode",
180
+ )
181
+ parser.add_argument("--same-data-update", default=False, action="store_true")
182
+
183
+ # used for sup_speech_ali
184
+ parser.add_argument(
185
+ "--use-sup-speech-ctc",
186
+ default=False,
187
+ action="store_true",
188
+ help="use speech_sup_ctc instead of speech_sup_ali",
189
+ )
190
+
191
+ @classmethod
192
+ def setup_task(cls, args, **kwargs):
193
+ """Setup the task."""
194
+ paths = args.data.split(":")
195
+ assert len(paths) > 0
196
+ src_dict = Dictionary.load(
197
+ os.path.join(paths[0], "src_dict.txt")
198
+ ) # assume all languages share a source dictionary
199
+ tgt_dict = Dictionary.load(
200
+ os.path.join(paths[0], "tgt_dict.txt")
201
+ ) # assume all languages share a target dictionary
202
+
203
+ lang_pairs = args.lang_pairs + "," + args.lang_pairs_bitext
204
+ lang_pairs = re.sub(",$", "", re.sub("^,", "", lang_pairs))
205
+ if lang_pairs != "":
206
+ src_langs = [lp.split("-")[0] for lp in lang_pairs.split(",")]
207
+ tgt_langs = [lp.split("-")[1] for lp in lang_pairs.split(",")]
208
+ else:
209
+ src_langs = []
210
+ tgt_langs = []
211
+
212
+ if args.add_src_lang_token:
213
+ for lang in src_langs:
214
+ assert (
215
+ src_dict.index(PairedDenoisingTask.LANG_TAG_TEMPLATE.format(lang))
216
+ != src_dict.unk()
217
+ )
218
+ if args.add_tgt_lang_token:
219
+ for lang in tgt_langs:
220
+ assert (
221
+ tgt_dict.index(PairedDenoisingTask.LANG_TAG_TEMPLATE.format(lang))
222
+ != tgt_dict.unk()
223
+ )
224
+
225
+ logger.info("source dictionary: {} types".format(len(src_dict)))
226
+ logger.info("target dictionary: {} types".format(len(tgt_dict)))
227
+ if not hasattr(args, "shuffle_instance"):
228
+ args.shuffle_instance = False
229
+ return cls(args, src_dict, tgt_dict)
230
+
231
+ def __init__(self, args, src_dict, tgt_dict):
232
+ super().__init__(args, src_dict, tgt_dict)
233
+ self.data_cfg = S2TJointDataConfig(
234
+ Path(args.sup_speech_data) / args.config_yaml
235
+ )
236
+ logger.info(
237
+ f"load supervised speech data configure from {Path(args.sup_speech_data) / args.config_yaml}"
238
+ )
239
+ self.data_s2s_cfg = (
240
+ S2TJointDataConfig(Path(args.sup_speech_s2s_data) / args.config_s2s_yaml)
241
+ if args.sup_speech_s2s_train_subset != ""
242
+ else None
243
+ )
244
+ if self.data_s2s_cfg is not None:
245
+ logger.info(
246
+ f"load supervised sequece to sequence speech data configure from {Path(args.sup_speech_s2s_data) / args.config_yaml}"
247
+ )
248
+
249
+ def parse_data_ratio(sample_ratio):
250
+ ratios = sample_ratio.split(",")
251
+ if len(ratios) == 1:
252
+ return [float(ratios[0])]
253
+ epoch_ratios = []
254
+ for item in ratios:
255
+ ep, r = item.split(":")
256
+ ep = int(ep)
257
+ r = float(r)
258
+ assert ep > 0 # epoch is 1 based
259
+ assert ep >= len(epoch_ratios)
260
+
261
+ if len(epoch_ratios) == 0:
262
+ epoch_ratios.append(
263
+ r
264
+ ) # epoch_ratios[0] is not used, but we still set it to the first value to keep things simple.
265
+ while len(epoch_ratios) < ep:
266
+ epoch_ratios.append(epoch_ratios[-1])
267
+ epoch_ratios.append(r)
268
+ return epoch_ratios
269
+
270
+ self.sup_ratio = parse_data_ratio(args.supervised_speech_sample_ratio)
271
+ self.sup_s2s_ratio = parse_data_ratio(args.supervised_speech_s2s_sample_ratio)
272
+ self.text_ratio = parse_data_ratio(args.text_sample_ratio)
273
+ self.bitext_ratio = parse_data_ratio(args.bitext_sample_ratio)
274
+ self.unsup_ratio = parse_data_ratio(args.unsupervised_speech_sample_ratio)
275
+ self.sample_mode = None
276
+
277
+ def build_model(self, args):
278
+ args.input_feat_per_channel = self.data_cfg.input_feat_per_channel
279
+ args.input_channels = self.data_cfg.input_channels
280
+ return super().build_model(args)
281
+
282
+ def build_tokenizer(self, data_cfg, msg=""):
283
+ logger.info(f"pre-tokenizer {msg}: {data_cfg.pre_tokenizer}")
284
+ return encoders.build_tokenizer(Namespace(**data_cfg.pre_tokenizer))
285
+
286
+ def build_bpe(self, data_cfg, msg=""):
287
+ logger.info(f"tokenizer {msg}: {data_cfg.bpe_tokenizer}")
288
+ return encoders.build_bpe(Namespace(**data_cfg.bpe_tokenizer))
289
+
290
+ @classmethod
291
+ def resolve_data_type(cls, split, use_sup_speech_ctc):
292
+ if len(split.split("_")) == 1:
293
+ # default case, train or valid
294
+ is_train = split
295
+ dtype = "text"
296
+ else:
297
+ is_train, dtype = split.split("_", 1)
298
+ is_train = True if is_train == "train" else False
299
+ if dtype == "sup_speech":
300
+ dtype = "sup_speech_ctc" if use_sup_speech_ctc else "sup_speech_ali"
301
+ assert dtype in (
302
+ "text",
303
+ "bitext",
304
+ "sup_speech_ali",
305
+ "sup_speech_s2s",
306
+ "unsup_speech",
307
+ "sup_speech_ctc",
308
+ ), f"failed resolving {split} (it resulted into: {dtype} ; is_train={is_train})"
309
+ return is_train, dtype
310
+
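# Editorial illustration (not part of the uploaded file): how split names flow through
# resolve_data_type above, e.g. with use_sup_speech_ctc=False:
#   "train_sup_speech"     -> (True,  "sup_speech_ali")   # "sup_speech_ctc" when --use-sup-speech-ctc is set
#   "train_bitext"         -> (True,  "bitext")
#   "valid_sup_speech_s2s" -> (False, "sup_speech_s2s")
#   "valid_unsup_speech"   -> (False, "unsup_speech")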
311
+ def create_modalitydatasetitem(self, dtype, dataset):
312
+ dsitem = None
313
+ if dtype in ("text", "bitext"):
314
+ dsitem = ModalityDatasetItem(
315
+ dtype,
316
+ dataset,
317
+ (self.args.max_source_positions, self.args.max_target_positions),
318
+ self.args.max_text_tokens,
319
+ self.args.batch_size,
320
+ )
321
+ elif dtype in ("sup_speech_ctc", "sup_speech_ali", "sup_speech_s2s"):
322
+ dsitem = ModalityDatasetItem(
323
+ dtype,
324
+ dataset,
325
+ (self.args.max_speech_positions, self.args.max_target_positions),
326
+ self.args.max_speech_tokens,
327
+ self.args.batch_size,
328
+ )
329
+ elif dtype == "unsup_speech":
330
+ dsitem = ModalityDatasetItem(
331
+ dtype, dataset, 1e8, self.args.max_speech_tokens, self.args.batch_size
332
+ )
333
+ else:
334
+ raise ValueError(f"{dtype} is not supported")
335
+ return dsitem
336
+
337
+ def load_dataset(self, split, epoch=1, combine=False, **kwargs):
338
+ def _get_sup_src_tgt_dict(src_dict, tgt_dict, use_s2s_sup_decoder):
339
+ if use_s2s_sup_decoder:
340
+ return None, tgt_dict
341
+ # use src_dict as tgt_dict here, since we use the source dictionary as the target for force alignment
342
+ return None, src_dict
343
+
344
+ is_train, dtype = self.resolve_data_type(split, self.args.use_sup_speech_ctc)
345
+
346
+ # Note we use --add-tgt-lang-token instead of data_cfg.prepend_tgt_lang_tag_no_change to set target language tag in the text dataset
347
+ # Verify add_tgt_lang_token and prepend_tgt_lang_tag_no_change are same
348
+
349
+ # Note we use --multilang-sampling-alpha instead of data_cfg.sampling_text_alpha to set text data sampling
350
+ if is_train:
351
+ msets = []
352
+ # train split, load everything into one
353
+ if self.lang_pairs != "":
354
+ text_dataset = self.load_dataset_only(
355
+ "train", self.lang_pairs, epoch=epoch, combine=combine
356
+ )
357
+ dsitem = self.create_modalitydatasetitem("text", text_dataset)
358
+ msets.append(dsitem)
359
+ if self.lang_pairs_bitext != "": # load bitext
360
+ bitext_dataset = self.load_dataset_only(
361
+ "train_bitext",
362
+ self.lang_pairs_bitext,
363
+ do_mask=False,
364
+ epoch=epoch,
365
+ combine=combine,
366
+ )
367
+ dsitem = self.create_modalitydatasetitem("bitext", bitext_dataset)
368
+ msets.append(dsitem)
369
+ if self.args.sup_speech_train_subset != "":
370
+ pre_tokenizer = self.build_tokenizer(self.data_cfg)
371
+ bpe_tokenizer = self.build_bpe(self.data_cfg)
372
+
373
+ append_eos = True
374
+ sup_speech_type = "sup_speech_ali"
375
+ if self.args.use_sup_speech_ctc:
376
+ # CTC mode
377
+ sup_speech_type = "sup_speech_ctc"
378
+ append_eos = False # CTC doesn't need eos in the target
379
+
380
+ src_dict, tgt_dict = _get_sup_src_tgt_dict(
381
+ self.src_dict, self.tgt_dict, False
382
+ )
383
+ sup_speech_dataset = SpeechToTextJointDatasetCreator.from_tsv(
384
+ self.args.sup_speech_data,
385
+ self.data_cfg,
386
+ self.args.sup_speech_train_subset,
387
+ tgt_dict=tgt_dict,
388
+ src_dict=src_dict,
389
+ pre_tokenizer=pre_tokenizer,
390
+ bpe_tokenizer=bpe_tokenizer,
391
+ src_pre_tokenizer=None,
392
+ src_bpe_tokenizer=None,
393
+ is_train_split=is_train,
394
+ epoch=epoch,
395
+ seed=self.args.seed,
396
+ append_eos=append_eos,
397
+ )
398
+ dsitem = self.create_modalitydatasetitem(
399
+ sup_speech_type, sup_speech_dataset
400
+ )
401
+ msets.append(dsitem)
402
+
403
+ if self.args.sup_speech_s2s_train_subset != "":
404
+ pre_tokenizer = self.build_tokenizer(self.data_s2s_cfg, msg="(s2s)")
405
+ bpe_tokenizer = self.build_bpe(self.data_s2s_cfg, msg="(s2s)")
406
+
407
+ # make sure self.data_cfg.prepend_tgt_lang_tag_no_change == self.args.add_tgt_lang_token
408
+ src_dict, tgt_dict = _get_sup_src_tgt_dict(
409
+ self.src_dict, self.tgt_dict, True
410
+ )
411
+ sup_speech_s2s_dataset = SpeechToTextJointDatasetCreator.from_tsv(
412
+ self.args.sup_speech_s2s_data,
413
+ self.data_s2s_cfg,
414
+ self.args.sup_speech_s2s_train_subset,
415
+ tgt_dict=tgt_dict,
416
+ src_dict=src_dict,
417
+ pre_tokenizer=pre_tokenizer,
418
+ bpe_tokenizer=bpe_tokenizer,
419
+ src_pre_tokenizer=None,
420
+ src_bpe_tokenizer=None,
421
+ is_train_split=is_train,
422
+ epoch=epoch,
423
+ seed=self.args.seed,
424
+ )
425
+ dsitem = self.create_modalitydatasetitem(
426
+ "sup_speech_s2s", sup_speech_s2s_dataset
427
+ )
428
+ msets.append(dsitem)
429
+ if self.args.unsup_speech_train_data != "":
430
+ unsup_speech_dataset = FileAudioDatasetWrapper(
431
+ self.args.unsup_speech_train_data,
432
+ self.args.sample_rate,
433
+ max_sample_size=self.args.max_sample_size,
434
+ min_sample_size=self.args.min_sample_size,
435
+ normalize=False,
436
+ )
437
+ dsitem = self.create_modalitydatasetitem(
438
+ "unsup_speech", unsup_speech_dataset
439
+ )
440
+ msets.append(dsitem)
441
+
442
+ pre_train_dataset = MultiModalityDataset(msets)
443
+ self.datasets[split] = pre_train_dataset
444
+ else: # validation split, load them for each type of data
445
+ if dtype == "text":
446
+ text_dataset = self.load_dataset_only(
447
+ split, self.lang_pairs, epoch=epoch, combine=combine
448
+ )
449
+ dsitem = self.create_modalitydatasetitem("text", text_dataset)
450
+ self.datasets[split] = MultiModalityDataset([dsitem])
451
+ elif dtype == "bitext":
452
+ bitext_dataset = self.load_dataset_only(
453
+ split,
454
+ self.lang_pairs_bitext,
455
+ do_mask=False,
456
+ epoch=epoch,
457
+ combine=combine,
458
+ )
459
+ dsitem = self.create_modalitydatasetitem("bitext", bitext_dataset)
460
+ self.datasets[split] = MultiModalityDataset([dsitem])
461
+
462
+ elif dtype in ("sup_speech_ctc", "sup_speech_ali"):
463
+ assert self.args.sup_speech_valid_subset != ""
464
+ pre_tokenizer = self.build_tokenizer(self.data_cfg)
465
+ bpe_tokenizer = self.build_bpe(self.data_cfg)
466
+ append_eos = True
467
+ if dtype == "sup_speech_ctc":
468
+ # CTC mode
469
+ append_eos = False # CTC doesn't need eos
470
+ assert self.args.use_sup_speech_ctc
471
+
472
+ datasets = []
473
+ for split_name in self.args.sup_speech_valid_subset.split(","):
474
+ src_dict, tgt_dict = _get_sup_src_tgt_dict(
475
+ self.src_dict, self.tgt_dict, False
476
+ )
477
+ datasets.append(
478
+ SpeechToTextJointDatasetCreator.from_tsv(
479
+ self.args.sup_speech_data,
480
+ self.data_cfg,
481
+ split_name,
482
+ tgt_dict=tgt_dict,
483
+ src_dict=src_dict,
484
+ pre_tokenizer=pre_tokenizer,
485
+ bpe_tokenizer=bpe_tokenizer,
486
+ src_pre_tokenizer=None,
487
+ src_bpe_tokenizer=None,
488
+ is_train_split=is_train,
489
+ epoch=epoch,
490
+ seed=self.args.seed,
491
+ append_eos=append_eos,
492
+ )
493
+ )
494
+
495
+ dset = datasets[0] if len(datasets) == 1 else ConcatDataset(datasets)
496
+ dsitem = self.create_modalitydatasetitem(dtype, dset)
497
+ self.datasets[split] = MultiModalityDataset([dsitem])
498
+
499
+ elif dtype == "sup_speech_s2s":
500
+ assert self.args.sup_speech_s2s_valid_subset != ""
501
+ pre_tokenizer = self.build_tokenizer(self.data_s2s_cfg)
502
+ bpe_tokenizer = self.build_bpe(self.data_s2s_cfg)
503
+ datasets = []
504
+ for split_name in self.args.sup_speech_s2s_valid_subset.split(","):
505
+ src_dict, tgt_dict = _get_sup_src_tgt_dict(
506
+ self.src_dict, self.tgt_dict, True
507
+ )
508
+ datasets.append(
509
+ SpeechToTextJointDatasetCreator.from_tsv(
510
+ self.args.sup_speech_s2s_data,
511
+ self.data_s2s_cfg,
512
+ split_name,
513
+ tgt_dict=tgt_dict,
514
+ src_dict=src_dict,
515
+ pre_tokenizer=pre_tokenizer,
516
+ bpe_tokenizer=bpe_tokenizer,
517
+ src_pre_tokenizer=None,
518
+ src_bpe_tokenizer=None,
519
+ is_train_split=is_train,
520
+ epoch=epoch,
521
+ seed=self.args.seed,
522
+ )
523
+ )
524
+
525
+ dset = datasets[0] if len(datasets) == 1 else ConcatDataset(datasets)
526
+ dsitem = self.create_modalitydatasetitem("sup_speech_s2s", dset)
527
+ self.datasets[split] = MultiModalityDataset([dsitem])
528
+ elif dtype == "unsup_speech":
529
+ assert self.args.unsup_speech_valid_data != ""
530
+ unsup_speech_dataset = FileAudioDatasetWrapper(
531
+ self.args.unsup_speech_valid_data,
532
+ self.args.sample_rate,
533
+ max_sample_size=self.args.max_sample_size,
534
+ min_sample_size=self.args.min_sample_size,
535
+ normalize=False,
536
+ )
537
+ dsitem = self.create_modalitydatasetitem(
538
+ "unsup_speech", unsup_speech_dataset
539
+ )
540
+ self.datasets[split] = MultiModalityDataset([dsitem])
541
+ else:
542
+ raise ValueError(f"Unsupported type {dtype}")
543
+
544
+ def get_sample_ratio(self, epoch):
545
+ sup_ratio = (
546
+ self.sup_ratio[epoch] if len(self.sup_ratio) > epoch else self.sup_ratio[-1]
547
+ )
548
+ sup_s2s_ratio = (
549
+ self.sup_s2s_ratio[epoch]
550
+ if len(self.sup_s2s_ratio) > epoch
551
+ else self.sup_s2s_ratio[-1]
552
+ )
553
+ unsup_ratio = (
554
+ self.unsup_ratio[epoch]
555
+ if len(self.unsup_ratio) > epoch
556
+ else self.unsup_ratio[-1]
557
+ )
558
+ text_ratio = (
559
+ self.text_ratio[epoch]
560
+ if len(self.text_ratio) > epoch
561
+ else self.text_ratio[-1]
562
+ )
563
+ bitext_ratio = (
564
+ self.bitext_ratio[epoch]
565
+ if len(self.bitext_ratio) > epoch
566
+ else self.bitext_ratio[-1]
567
+ )
568
+ return text_ratio, bitext_ratio, sup_ratio, sup_s2s_ratio, unsup_ratio
569
+
570
+ def get_batch_iterator(
571
+ self,
572
+ dataset,
573
+ max_tokens=None,
574
+ max_sentences=None,
575
+ max_positions=None,
576
+ ignore_invalid_inputs=False,
577
+ required_batch_size_multiple=1,
578
+ seed=1,
579
+ num_shards=1,
580
+ shard_id=0,
581
+ num_workers=0,
582
+ epoch=0,
583
+ data_buffer_size=0,
584
+ disable_iterator_cache=False,
585
+ skip_remainder_batch=False,
586
+ grouped_shuffling=False,
587
+ update_epoch_batch_itr=False,
588
+ ):
589
+
590
+ assert isinstance(dataset, MultiModalityDataset)
591
+ if len(dataset.id_to_mode) == 1:
592
+ max_positions = dataset.max_positions[0]
593
+ max_tokens = dataset.max_tokens[0]
594
+ max_sentences = dataset.max_sentences[0]
595
+ return super().get_batch_iterator(
596
+ dataset,
597
+ max_tokens,
598
+ max_sentences,
599
+ max_positions,
600
+ ignore_invalid_inputs,
601
+ required_batch_size_multiple,
602
+ seed,
603
+ num_shards,
604
+ shard_id,
605
+ num_workers,
606
+ epoch,
607
+ data_buffer_size,
608
+ disable_iterator_cache,
609
+ skip_remainder_batch=skip_remainder_batch,
610
+ )
611
+
612
+ mult_ratio = []
613
+ (
614
+ text_ratio,
615
+ bitext_ratio,
616
+ sup_ratio,
617
+ sup_s2s_ratio,
618
+ unsup_ratio,
619
+ ) = self.get_sample_ratio(epoch)
620
+ for mode in dataset.id_to_mode:
621
+ if mode in ("sup_speech_ctc", "sup_speech_ali"):
622
+ mult_ratio.append(sup_ratio)
623
+ elif mode == "sup_speech_s2s":
624
+ mult_ratio.append(sup_s2s_ratio)
625
+ elif mode == "text":
626
+ mult_ratio.append(text_ratio)
627
+ elif mode == "bitext":
628
+ mult_ratio.append(bitext_ratio)
629
+ elif mode == "unsup_speech":
630
+ mult_ratio.append(unsup_ratio)
631
+
632
+ # initialize the dataset with the correct starting epoch
633
+ dataset.set_epoch(epoch)
634
+
635
+ batch_samplers = dataset.get_batch_samplers(
636
+ mult_ratio, required_batch_size_multiple, seed
637
+ )
638
+
639
+ # return a reusable, sharded iterator
640
+ epoch_iter = GroupedEpochBatchIterator(
641
+ dataset=dataset,
642
+ collate_fn=dataset.collater,
643
+ batch_samplers=batch_samplers,
644
+ seed=seed,
645
+ num_shards=num_shards,
646
+ shard_id=shard_id,
647
+ num_workers=num_workers,
648
+ epoch=epoch,
649
+ mult_rate=max(self.args.update_freq) if self.args.same_data_update else 1,
650
+ buffer_size=data_buffer_size,
651
+ skip_remainder_batch=skip_remainder_batch,
652
+ )
653
+ self.dataset_to_epoch_iter[dataset] = {} # refresh it every epoch
654
+ return epoch_iter
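The per-modality sample ratios parsed in `__init__` (`parse_data_ratio`) and consumed per epoch in `get_sample_ratio` accept either a single value or an epoch-keyed schedule such as "1:0.5,3:1.0". A standalone restatement of that parsing, to make the format concrete (the schedule string here is illustrative):

def parse_data_ratio(sample_ratio):
    ratios = sample_ratio.split(",")
    if len(ratios) == 1:
        return [float(ratios[0])]  # constant ratio for every epoch
    epoch_ratios = []
    for item in ratios:
        ep, r = item.split(":")
        ep, r = int(ep), float(r)
        if not epoch_ratios:
            epoch_ratios.append(r)  # index 0 is unused: epochs are 1-based
        while len(epoch_ratios) < ep:
            epoch_ratios.append(epoch_ratios[-1])
        epoch_ratios.append(r)
    return epoch_ratios

ratios = parse_data_ratio("1:0.5,3:1.0")
print(ratios)  # [0.5, 0.5, 0.5, 1.0]
for epoch in (1, 2, 3, 5):
    # same fallback as get_sample_ratio: reuse the last value once the schedule runs out
    print(epoch, ratios[epoch] if len(ratios) > epoch else ratios[-1])
# epoch 1 -> 0.5, epoch 2 -> 0.5, epoch 3 -> 1.0, epoch 5 -> 1.0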
fairseq/examples/speech_text_joint_to_text/tasks/speech_text_joint.py ADDED
@@ -0,0 +1,377 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+ import logging
6
+ import os
7
+ from argparse import Namespace
8
+ from pathlib import Path
9
+
10
+ import torch
11
+ from fairseq.data import (
12
+ encoders,
13
+ Dictionary,
14
+ ResamplingDataset,
15
+ TransformEosLangPairDataset,
16
+ ConcatDataset,
17
+ )
18
+ from fairseq.data.iterators import GroupedEpochBatchIterator
19
+ from fairseq.data.audio.multi_modality_dataset import (
20
+ MultiModalityDataset,
21
+ LangPairMaskDataset,
22
+ ModalityDatasetItem,
23
+ )
24
+ from fairseq.data.audio.speech_to_text_dataset import (
25
+ SpeechToTextDataset,
26
+ SpeechToTextDatasetCreator,
27
+ )
28
+ from fairseq.data.audio.speech_to_text_joint_dataset import (
29
+ S2TJointDataConfig,
30
+ SpeechToTextJointDatasetCreator,
31
+ )
32
+ from fairseq.tasks import register_task
33
+ from fairseq.tasks.speech_to_text import SpeechToTextTask
34
+ from fairseq.tasks.translation import load_langpair_dataset
35
+
36
+ logger = logging.getLogger(__name__)
37
+ LANG_TAG_TEMPLATE = "<lang:{}>"
38
+
39
+
40
+ @register_task("speech_text_joint_to_text")
41
+ class SpeechTextJointToTextTask(SpeechToTextTask):
42
+ """
43
+ Task for joint training speech and text to text.
44
+ """
45
+
46
+ @classmethod
47
+ def add_args(cls, parser):
48
+ """Add task-specific arguments to the parser."""
49
+ super(SpeechTextJointToTextTask, cls).add_args(parser)
50
+ ###
51
+ parser.add_argument(
52
+ "--parallel-text-data",
53
+ default="",
54
+ help="path to parallel text data directory",
55
+ )
56
+ parser.add_argument(
57
+ "--max-tokens-text",
58
+ type=int,
59
+ metavar="N",
60
+ help="maximum tokens for encoder text input ",
61
+ )
62
+ parser.add_argument(
63
+ "--max-positions-text",
64
+ type=int,
65
+ metavar="N",
66
+ default=400,
67
+ help="maximum tokens for per encoder text input ",
68
+ )
69
+ parser.add_argument(
70
+ "--langpairs",
71
+ default=None,
72
+ metavar="S",
73
+ help='language pairs for text training, separated with ","',
74
+ )
75
+ parser.add_argument(
76
+ "--speech-sample-ratio",
77
+ default=1,
78
+ type=float,
79
+ metavar="N",
80
+ help="Multiple Ratio for speech dataset with transcripts ",
81
+ )
82
+ parser.add_argument(
83
+ "--text-sample-ratio",
84
+ default=1,
85
+ type=float,
86
+ metavar="N",
87
+ help="Multiple Ratio for text set ",
88
+ )
89
+ parser.add_argument(
90
+ "--update-mix-data",
91
+ action="store_true",
92
+ help="use mixed data in one update when update-freq > 1",
93
+ )
94
+ parser.add_argument(
95
+ "--load-speech-only", action="store_true", help="load speech data only",
96
+ )
97
+ parser.add_argument(
98
+ "--mask-text-ratio",
99
+ type=float,
100
+ metavar="V",
101
+ default=0.0,
102
+ help="mask V source tokens for text only mode",
103
+ )
104
+ parser.add_argument(
105
+ "--mask-text-type",
106
+ default="random",
107
+ choices=["random", "tail"],
108
+ help="mask text typed",
109
+ )
110
+ parser.add_argument(
111
+ "--noise-token",
112
+ default="",
113
+ help="noise token for masking src text tokens if mask-text-ratio > 0",
114
+ )
115
+ parser.add_argument(
116
+ "--infer-target-lang",
117
+ default="",
118
+ metavar="S",
119
+ help="target language for inference",
120
+ )
121
+
122
+ def __init__(self, args, src_dict, tgt_dict, infer_tgt_lang_id=None):
123
+ super().__init__(args, tgt_dict)
124
+ self.src_dict = src_dict
125
+ self.data_cfg = S2TJointDataConfig(Path(args.data) / args.config_yaml)
126
+ assert self.tgt_dict.pad() == self.src_dict.pad()
127
+ assert self.tgt_dict.eos() == self.src_dict.eos()
128
+ self.speech_only = args.load_speech_only
129
+ self._infer_tgt_lang_id = infer_tgt_lang_id
130
+
131
+ @classmethod
132
+ def setup_task(cls, args, **kwargs):
133
+ """Setup the task (e.g., load dictionaries)."""
134
+ data_cfg = S2TJointDataConfig(Path(args.data) / args.config_yaml)
135
+ tgt_dict_path = Path(args.data) / data_cfg.vocab_filename
136
+ src_dict_path = Path(args.data) / data_cfg.src_vocab_filename
137
+ if (not os.path.isfile(src_dict_path)) or (not os.path.isfile(tgt_dict_path)):
138
+ raise FileNotFoundError("Dict not found: {}".format(args.data))
139
+ src_dict = Dictionary.load(src_dict_path.as_posix())
140
+ tgt_dict = Dictionary.load(tgt_dict_path.as_posix())
141
+
142
+ print("| src dictionary: {} types".format(len(src_dict)))
143
+ print("| tgt dictionary: {} types".format(len(tgt_dict)))
144
+
145
+ if args.parallel_text_data != "":
146
+ if not os.path.isabs(args.parallel_text_data):
147
+ args.parallel_text_data = os.path.join(
148
+ args.data, args.parallel_text_data
149
+ )
150
+
151
+ if args.langpairs is None:
152
+ raise Exception(
153
+ "Could not infer language pair, please provide it explicitly"
154
+ )
155
+ infer_tgt_lang_id = None
156
+ if args.infer_target_lang != "" and data_cfg.prepend_tgt_lang_tag_no_change:
157
+ tgt_lang_tag = SpeechToTextDataset.LANG_TAG_TEMPLATE.format(
158
+ args.infer_target_lang
159
+ )
160
+ infer_tgt_lang_id = tgt_dict.index(tgt_lang_tag)
161
+ assert infer_tgt_lang_id != tgt_dict.unk()
162
+ return cls(args, src_dict, tgt_dict, infer_tgt_lang_id=infer_tgt_lang_id)
163
+
164
+ def load_langpair_dataset(
165
+ self, prepend_tgt_lang_tag=False, sampling_alpha=1.0, epoch=0
166
+ ):
167
+ lang_pairs = []
168
+ text_dataset = None
169
+ split = "train"
170
+ for lp in self.args.langpairs.split(","):
171
+ src, tgt = lp.split("-")
172
+ text_dataset = load_langpair_dataset(
173
+ self.args.parallel_text_data,
174
+ split,
175
+ src,
176
+ self.src_dict,
177
+ tgt,
178
+ self.tgt_dict,
179
+ combine=True,
180
+ dataset_impl=None,
181
+ upsample_primary=1,
182
+ left_pad_source=False,
183
+ left_pad_target=False,
184
+ max_source_positions=self.args.max_positions_text,
185
+ max_target_positions=self.args.max_target_positions,
186
+ load_alignments=False,
187
+ truncate_source=False,
188
+ )
189
+ if prepend_tgt_lang_tag:
190
+ # TODO
191
+ text_dataset = TransformEosLangPairDataset(
192
+ text_dataset,
193
+ src_eos=self.src_dict.eos(),
194
+ tgt_bos=self.tgt_dict.eos(), # 'prev_output_tokens' starts with eos
195
+ new_tgt_bos=self.tgt_dict.index(LANG_TAG_TEMPLATE.format(tgt)),
196
+ )
197
+ lang_pairs.append(text_dataset)
198
+ if len(lang_pairs) > 1:
199
+ if sampling_alpha != 1.0:
200
+ size_ratios = SpeechToTextDatasetCreator.get_size_ratios(
201
+ self.args.langpairs.split(","),
202
+ [len(s) for s in lang_pairs],
203
+ alpha=sampling_alpha,
204
+ )
205
+ lang_pairs = [
206
+ ResamplingDataset(d, size_ratio=r, epoch=epoch, replace=(r >= 1.0))
207
+ for d, r in zip(lang_pairs, size_ratios)
208
+ ]
209
+ return ConcatDataset(lang_pairs)
210
+ return text_dataset
211
+
212
+ def inference_step(
213
+ self, generator, models, sample, prefix_tokens=None, constraints=None
214
+ ):
215
+ with torch.no_grad():
216
+ return generator.generate(
217
+ models,
218
+ sample,
219
+ prefix_tokens=prefix_tokens,
220
+ constraints=constraints,
221
+ bos_token=self._infer_tgt_lang_id,
222
+ )
223
+
224
+ def build_src_tokenizer(self, args):
225
+ logger.info(f"src-pre-tokenizer: {self.data_cfg.src_pre_tokenizer}")
226
+ return encoders.build_tokenizer(Namespace(**self.data_cfg.src_pre_tokenizer))
227
+
228
+ def build_src_bpe(self, args):
229
+ logger.info(f"tokenizer: {self.data_cfg.src_bpe_tokenizer}")
230
+ return encoders.build_bpe(Namespace(**self.data_cfg.src_bpe_tokenizer))
231
+
232
+ def load_dataset(self, split, epoch=1, combine=False, **kwargs):
233
+ """Load a given dataset split.
234
+
235
+ Args:
236
+ split (str): name of the split (e.g., train, valid, test)
237
+ """
238
+ is_train_split = split.startswith("train")
239
+ pre_tokenizer = self.build_tokenizer(self.args)
240
+ bpe_tokenizer = self.build_bpe(self.args)
241
+ src_pre_tokenizer = self.build_src_tokenizer(self.args)
242
+ src_bpe_tokenizer = self.build_src_bpe(self.args)
243
+ ast_dataset = SpeechToTextJointDatasetCreator.from_tsv(
244
+ self.args.data,
245
+ self.data_cfg,
246
+ split,
247
+ self.tgt_dict,
248
+ src_dict=None if self.speech_only else self.src_dict,
249
+ pre_tokenizer=pre_tokenizer,
250
+ bpe_tokenizer=bpe_tokenizer,
251
+ src_pre_tokenizer=src_pre_tokenizer,
252
+ src_bpe_tokenizer=src_bpe_tokenizer,
253
+ is_train_split=is_train_split,
254
+ epoch=epoch,
255
+ seed=self.args.seed,
256
+ )
257
+ noise_token_id = -1
258
+ text_dataset = None
259
+ if self.args.parallel_text_data != "" and is_train_split:
260
+ text_dataset = self.load_langpair_dataset(
261
+ self.data_cfg.prepend_tgt_lang_tag_no_change, 1.0, epoch=epoch,
262
+ )
263
+ if self.args.mask_text_ratio > 0:
264
+ # add mask
265
+ noise_token_id = (
266
+ self.src_dict.unk()
267
+ if self.args.noise_token == ""
268
+ else self.src_dict.index(self.args.noise_token)
269
+ )
270
+ text_dataset = LangPairMaskDataset(
271
+ text_dataset,
272
+ src_bos=self.src_dict.bos(),
273
+ src_eos=self.src_dict.eos(),
274
+ noise_id=noise_token_id,
275
+ mask_ratio=self.args.mask_text_ratio,
276
+ mask_type=self.args.mask_text_type,
277
+ )
278
+
279
+ if text_dataset is not None:
280
+ mdsets = [
281
+ ModalityDatasetItem(
282
+ "sup_speech",
283
+ ast_dataset,
284
+ (self.args.max_source_positions, self.args.max_target_positions),
285
+ self.args.max_tokens,
286
+ self.args.batch_size,
287
+ ),
288
+ ModalityDatasetItem(
289
+ "text",
290
+ text_dataset,
291
+ (self.args.max_positions_text, self.args.max_target_positions),
292
+ self.args.max_tokens_text
293
+ if self.args.max_tokens_text is not None
294
+ else self.args.max_tokens,
295
+ self.args.batch_size,
296
+ ),
297
+ ]
298
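+ # Wrap the supervised speech and parallel text datasets into one MultiModalityDataset,
+ # so the batch iterator below can sample batches from each modality separately.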
+ ast_dataset = MultiModalityDataset(mdsets)
299
+ self.datasets[split] = ast_dataset
300
+
301
+ @property
302
+ def target_dictionary(self):
303
+ """Return the :class:`~fairseq.data.Dictionary` for the language
304
+ model."""
305
+ return self.tgt_dict
306
+
307
+ @property
308
+ def source_dictionary(self):
309
+ """Return the source :class:`~fairseq.data.Dictionary` (if applicable
310
+ for this task)."""
311
+ return None if self.speech_only else self.src_dict
312
+
313
+ def get_batch_iterator(
314
+ self,
315
+ dataset,
316
+ max_tokens=None,
317
+ max_sentences=None,
318
+ max_positions=None,
319
+ ignore_invalid_inputs=False,
320
+ required_batch_size_multiple=1,
321
+ seed=1,
322
+ num_shards=1,
323
+ shard_id=0,
324
+ num_workers=0,
325
+ epoch=0,
326
+ data_buffer_size=0,
327
+ disable_iterator_cache=False,
328
+ skip_remainder_batch=False,
329
+ grouped_shuffling=False,
330
+ update_epoch_batch_itr=False,
331
+ ):
332
+
333
+ if not isinstance(dataset, MultiModalityDataset):
334
+ return super(SpeechTextJointToTextTask, self).get_batch_iterator(
335
+ dataset,
336
+ max_tokens,
337
+ max_sentences,
338
+ max_positions,
339
+ ignore_invalid_inputs,
340
+ required_batch_size_multiple,
341
+ seed,
342
+ num_shards,
343
+ shard_id,
344
+ num_workers,
345
+ epoch,
346
+ data_buffer_size,
347
+ disable_iterator_cache,
348
+ skip_remainder_batch=skip_remainder_batch,
349
+ update_epoch_batch_itr=update_epoch_batch_itr,
350
+ )
351
+
352
+ mult_ratio = [self.args.speech_sample_ratio, self.args.text_sample_ratio]
353
+ assert len(dataset.datasets) == 2
354
+
355
+ # initialize the dataset with the correct starting epoch
356
+ dataset.set_epoch(epoch)
357
+
358
+ batch_samplers = dataset.get_batch_samplers(
359
+ mult_ratio, required_batch_size_multiple, seed
360
+ )
361
+
362
+ # return a reusable, sharded iterator
363
+ epoch_iter = GroupedEpochBatchIterator(
364
+ dataset=dataset,
365
+ collate_fn=dataset.collater,
366
+ batch_samplers=batch_samplers,
367
+ seed=seed,
368
+ num_shards=num_shards,
369
+ shard_id=shard_id,
370
+ num_workers=num_workers,
371
+ epoch=epoch,
372
+ mult_rate=1 if self.args.update_mix_data else max(self.args.update_freq),
373
+ buffer_size=data_buffer_size,
374
+ skip_remainder_batch=skip_remainder_batch,
375
+ )
376
+ self.dataset_to_epoch_iter[dataset] = {} # refresh it every epoch
377
+ return epoch_iter
fairseq/examples/speech_to_speech/README.md ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ # Speech to speech translation (S2ST)
2
+
3
+ We provide the implementation and resources for the following work on speech-to-speech translation (S2ST):
4
+
5
+ * [Direct speech-to-speech translation with discrete units (Lee et al. 2021)](docs/direct_s2st_discrete_units.md)
6
+ * [Textless Speech-to-Speech Translation on Real Data (Lee et al. 2021)](docs/textless_s2st_real_data.md)
7
+ * [Enhanced Direct Speech-to-Speech Translation Using Self-supervised Pre-training and Data Augmentation](docs/enhanced_direct_s2st_discrete_units.md)
fairseq/examples/speech_to_speech/__init__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ from . import unity # noqa
fairseq/examples/speech_to_speech/asr_bleu/README.md ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ASR-BLEU evaluation toolkit
2
+
3
+ This toolkit provides a set of public ASR models used for the evaluation of different speech-to-speech translation systems at FAIR. It enables easier score comparisons between different systems' outputs.
4
+
5
+ The ASRGenerator wraps CTC-based ASR models from the HuggingFace and fairseq code bases. A torchaudio CTC decoder is built on top of it to decode the given audio files.
6
+
7
+ Please see `asr_model_cfgs.json` for a list of languages covered currently.
8
+
9
+ The high-level pipeline is simple by design: given a language tag, the script loads the corresponding ASR model, transcribes the model's predicted audio, and computes the BLEU score against the provided reference translations using sacrebleu. A minimal programmatic sketch of this pipeline is shown below.
10
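+
+ The same pipeline can also be driven programmatically. A minimal sketch, assuming a language key, generated audio files, and reference strings of your own (the values below are placeholders; `retrieve_asr_config` and `ASRGenerator` come from `utils.py` in this directory):
+
+ ```python
+ import sacrebleu
+ from utils import retrieve_asr_config, ASRGenerator
+
+ # Placeholder inputs: replace with your language key, generated audio and references.
+ lang = "es"
+ audio_files = ["generated/0_pred.wav", "generated/1_pred.wav"]
+ references = ["primera referencia", "segunda referencia"]
+
+ # Load the ASR model for the target language and transcribe each predicted audio file.
+ asr_config = retrieve_asr_config(lang, "oct22", json_path="./asr_model_cfgs.json")
+ asr_model = ASRGenerator(asr_config)
+ hypotheses = [asr_model.transcribe_audiofile(path) for path in audio_files]
+
+ # Compute corpus BLEU between the ASR transcripts and the reference translations.
+ print(sacrebleu.corpus_bleu(hypotheses, [references]))
+ ```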
+
11
+ # Dependencies
12
+
13
+ Please see `requirements.txt`.
14
+
15
+ # Usage examples
16
+
17
+ This toolkit has been used with:
18
+
19
+ * Speechmatrix project: https://github.com/facebookresearch/fairseq/tree/ust/examples/speech_matrix.
20
+
21
+ * Hokkien speech-to-speech translation project: https://github.com/facebookresearch/fairseq/tree/ust/examples/hokkien.
22
+
23
+ # Standalone run example
24
+
25
+ A high-level example; substitute the arguments for your setup:
26
+
27
+ ```bash
28
+ python compute_asr_bleu.py --lang <LANG> \
29
+ --audio_dirpath <PATH_TO_AUDIO_DIR> \
30
+ --reference_path <PATH_TO_REFERENCES_FILE> \
31
+ --reference_format txt
32
+ ```
33
+
34
+ For more details about the arguments, please see the script's argparse help.
fairseq/examples/speech_to_speech/asr_bleu/__init__.py ADDED
File without changes
fairseq/examples/speech_to_speech/asr_bleu/asr_model_cfgs.json ADDED
@@ -0,0 +1,198 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "en": {
3
+ "oct22": {
4
+ "desc": "Wav2Vec 2.0 Large (LV-60) + Self Training from https://github.com/facebookresearch/fairseq/tree/main/examples/wav2vec#pre-trained-models",
5
+ "ckpt_path": "https://dl.fbaipublicfiles.com/fairseq/wav2vec/wav2vec_vox_960h_pl.pt",
6
+ "dict_path": "https://dl.fbaipublicfiles.com/fairseq/wav2vec/dict.ltr.txt",
7
+ "model_type": "fairseq",
8
+ "lang": "en",
9
+ "post_process": "collapse"
10
+ }
11
+ },
12
+ "hok": {
13
+ "oct22": {
14
+ "desc": "Hokkien ASR model, for details check [TODO add paper link]",
15
+ "ckpt_path": "https://dl.fbaipublicfiles.com/ust_asr/hok/checkpoint_best.pt",
16
+ "dict_path": "https://dl.fbaipublicfiles.com/ust_asr/hok/dict.ltr.txt",
17
+ "model_type": "fairseq",
18
+ "lang": "hok",
19
+ "post_process": "none"
20
+ }
21
+ },
22
+ "es": {
23
+ "oct22": {
24
+ "model_path": "jonatasgrosman/wav2vec2-large-xlsr-53-spanish",
25
+ "model_type": "hf",
26
+ "lang": "es",
27
+ "post_process": "collapse"
28
+ }
29
+ },
30
+ "fr": {
31
+ "oct22": {
32
+ "model_path": "jonatasgrosman/wav2vec2-large-fr-voxpopuli-french",
33
+ "model_type": "hf",
34
+ "lang": "fr",
35
+ "post_process": "collapse"
36
+ }
37
+ },
38
+ "zh": {
39
+ "oct22": {
40
+ "model_path": "ydshieh/wav2vec2-large-xlsr-53-chinese-zh-cn-gpt",
41
+ "model_type": "hf",
42
+ "lang": "zh",
43
+ "post_process": "collapse"
44
+ }
45
+ },
46
+ "tr": {
47
+ "oct22": {
48
+ "model_path": "cahya/wav2vec2-large-xlsr-turkish-artificial-cv",
49
+ "model_type": "hf",
50
+ "lang": "tr",
51
+ "post_process": "collapse"
52
+ }
53
+ },
54
+ "ar": {
55
+ "oct22": {
56
+ "model_path": "jonatasgrosman/wav2vec2-large-xlsr-53-arabic",
57
+ "model_type": "hf",
58
+ "lang": "ar",
59
+ "post_process": "collapse"
60
+ }
61
+ },
62
+ "vi": {
63
+ "oct22": {
64
+ "model_path": "not-tanh/wav2vec2-large-xlsr-53-vietnamese",
65
+ "model_type": "hf",
66
+ "lang": "vi",
67
+ "post_process": "collapse"
68
+ }
69
+ },
70
+ "de": {
71
+ "oct22": {
72
+ "model_path": "jonatasgrosman/wav2vec2-xls-r-1b-german",
73
+ "model_type": "hf",
74
+ "lang": "de",
75
+ "post_process": "collapse"
76
+ }
77
+ },
78
+ "pl": {
79
+ "oct22": {
80
+ "model_path": "jonatasgrosman/wav2vec2-xls-r-1b-polish",
81
+ "model_type": "hf",
82
+ "lang": "pl",
83
+ "post_process": "collapse"
84
+ }
85
+ },
86
+ "it": {
87
+ "oct22": {
88
+ "model_path": "jonatasgrosman/wav2vec2-large-xlsr-53-italian",
89
+ "model_type": "hf",
90
+ "lang": "it",
91
+ "post_process": "collapse"
92
+ }
93
+ },
94
+ "pt": {
95
+ "oct22": {
96
+ "model_path": "jonatasgrosman/wav2vec2-xls-r-1b-portuguese",
97
+ "model_type": "hf",
98
+ "lang": "pt",
99
+ "post_process": "collapse"
100
+ }
101
+ },
102
+ "ro": {
103
+ "oct22": {
104
+ "model_path": "gigant/romanian-wav2vec2",
105
+ "model_type": "hf",
106
+ "lang": "ro",
107
+ "post_process": "collapse"
108
+ }
109
+ },
110
+ "cs": {
111
+ "oct22": {
112
+ "model_path": "comodoro/wav2vec2-xls-r-300m-cs-250",
113
+ "model_type": "hf",
114
+ "lang": "cs",
115
+ "post_process": "collapse"
116
+ }
117
+ },
118
+ "sk": {
119
+ "oct22": {
120
+ "model_path": "anuragshas/wav2vec2-xls-r-300m-sk-cv8-with-lm",
121
+ "model_type": "hf",
122
+ "lang": "sk",
123
+ "post_process": "collapse"
124
+ }
125
+ },
126
+ "sl": {
127
+ "oct22": {
128
+ "model_path": "anuragshas/wav2vec2-xls-r-300m-sl-cv8-with-lm",
129
+ "model_type": "hf",
130
+ "lang": "sl",
131
+ "post_process": "collapse"
132
+ }
133
+ },
134
+ "fi": {
135
+ "oct22": {
136
+ "model_path": "jonatasgrosman/wav2vec2-large-xlsr-53-finnish",
137
+ "model_type": "hf",
138
+ "lang": "fi",
139
+ "post_process": "collapse"
140
+ }
141
+ },
142
+ "hu": {
143
+ "oct22": {
144
+ "model_path": "jonatasgrosman/wav2vec2-large-xlsr-53-hungarian",
145
+ "model_type": "hf",
146
+ "lang": "hu",
147
+ "post_process": "collapse"
148
+ }
149
+ },
150
+ "et": {
151
+ "oct22": {
152
+ "model_path": "RASMUS/wav2vec2-xlsr-1b-et",
153
+ "model_type": "hf",
154
+ "lang": "et",
155
+ "post_process": "collapse"
156
+ }
157
+ },
158
+ "lt": {
159
+ "oct22": {
160
+ "model_path": "sammy786/wav2vec2-xlsr-lithuanian",
161
+ "model_type": "hf",
162
+ "lang": "lt",
163
+ "post_process": "collapse"
164
+ }
165
+ },
166
+ "nl": {
167
+ "oct22": {
168
+ "model_path": "jonatasgrosman/wav2vec2-xls-r-1b-dutch",
169
+ "model_type": "hf",
170
+ "lang": "nl",
171
+ "post_process": "collapse"
172
+ }
173
+ },
174
+ "lv": {
175
+ "oct22": {
176
+ "model_path": "reach-vb/wav2vec2-large-xls-r-1B-common_voice7-lv-ft",
177
+ "model_type": "hf",
178
+ "lang": "lv",
179
+ "post_process": "collapse"
180
+ }
181
+ },
182
+ "sv": {
183
+ "oct22": {
184
+ "model_path": "marinone94/xls-r-300m-sv-robust",
185
+ "model_type": "hf",
186
+ "lang": "sv",
187
+ "post_process": "collapse"
188
+ }
189
+ },
190
+ "hr": {
191
+ "oct22": {
192
+ "model_path": "classla/wav2vec2-xls-r-parlaspeech-hr",
193
+ "model_type": "hf",
194
+ "lang": "hr",
195
+ "post_process": "collapse"
196
+ }
197
+ }
198
+ }
fairseq/examples/speech_to_speech/asr_bleu/compute_asr_bleu.py ADDED
@@ -0,0 +1,244 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from typing import Dict, List
3
+ import sacrebleu
4
+ import pandas as pd
5
+ from glob import glob
6
+ from pathlib import Path
7
+ from utils import retrieve_asr_config, ASRGenerator
8
+ from tqdm import tqdm
9
+ from argparse import ArgumentParser
10
+
11
+
12
+ def merge_tailo_init_final(text):
13
+ """
14
+ Hokkien ASR hypothesis post-processing.
15
+ """
16
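+ # Merge syllable initials/finals: pieces accumulate until one ends in a tone digit,
+ # e.g. "li 2 ho 2" -> "li2 ho2" (illustrative example); "NULLINIT" markers are dropped.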
+ sps = text.strip().split()
17
+ results = []
18
+ last_syllable = ""
19
+ for sp in sps:
20
+ if sp == "NULLINIT" or sp == "nullinit":
21
+ continue
22
+ last_syllable += sp
23
+ if sp[-1].isnumeric():
24
+ results.append(last_syllable)
25
+ last_syllable = ""
26
+ if last_syllable != "":
27
+ results.append(last_syllable)
28
+ return " ".join(results)
29
+
30
+
31
+ def remove_tone(text):
32
+ """
33
+ Used for tone-less evaluation of Hokkien
34
+ """
35
+ return " ".join([t[:-1] for t in text.split()])
36
+
37
+
38
+ def extract_audio_for_eval(audio_dirpath: str, audio_format: str):
39
+ if audio_format == "n_pred.wav":
40
+ """
41
+ The assumption here is that 0_pred.wav corresponds to the reference at line position 0 from the reference manifest
42
+ """
43
+ audio_list = []
44
+ audio_fp_list = glob((Path(audio_dirpath) / "*_pred.wav").as_posix())
45
+ audio_fp_list = sorted(
46
+ audio_fp_list, key=lambda x: int(os.path.basename(x).split("_")[0])
47
+ )
48
+ for i in range(len(audio_fp_list)):
49
+ try:
50
+ audio_fp = (Path(audio_dirpath) / f"{i}_pred.wav").as_posix()
51
+ assert (
52
+ audio_fp in audio_fp_list
53
+ ), f"{Path(audio_fp).name} does not exist in {audio_dirpath}"
54
+ except AssertionError:
55
+ # check the audio with random speaker
56
+ audio_fp = Path(audio_dirpath) / f"{i}_spk*_pred.wav"
57
+ audio_fp = glob(
58
+ audio_fp.as_posix()
59
+ ) # resolve audio filepath with random speaker
60
+ assert len(audio_fp) == 1
61
+ audio_fp = audio_fp[0]
62
+
63
+ audio_list.append(audio_fp)
64
+ else:
65
+ raise NotImplementedError
66
+
67
+ return audio_list
68
+
69
+
70
+ def extract_text_for_eval(
71
+ references_filepath: str, reference_format: str, reference_tsv_column: str = None
72
+ ):
73
+ if reference_format == "txt":
74
+ reference_sentences = open(references_filepath, "r").readlines()
75
+ reference_sentences = [l.strip() for l in reference_sentences]
76
+ elif reference_format == "tsv":
77
+ tsv_df = pd.read_csv(references_filepath, sep="\t", quoting=3)
78
+ reference_sentences = tsv_df[reference_tsv_column].to_list()
79
+ reference_sentences = [l.strip() for l in reference_sentences]
80
+ else:
81
+ raise NotImplementedError
82
+
83
+ return reference_sentences
84
+
85
+
86
+ def compose_eval_data(
87
+ audio_dirpath: str,
88
+ audio_format: str,
89
+ references_filepath: str,
90
+ reference_format: str,
91
+ reference_tsv_column: str = None,
92
+ save_manifest_filepath=None,
93
+ ):
94
+ """
95
+ Speech matrix decoding pipeline produces audio with the following mask "N_pred.wav" where N is the order of the corresponding input sample
96
+ """
97
+
98
+ reference_sentences = extract_text_for_eval(
99
+ references_filepath, reference_format, reference_tsv_column
100
+ )
101
+ predicted_audio_fp_list = extract_audio_for_eval(audio_dirpath, audio_format)
102
+ assert len(predicted_audio_fp_list) == len(reference_sentences)
103
+
104
+ audio_text_pairs = [
105
+ (audio, reference)
106
+ for audio, reference in zip(predicted_audio_fp_list, reference_sentences)
107
+ ]
108
+
109
+ tsv_manifest = pd.DataFrame(audio_text_pairs, columns=["prediction", "reference"])
110
+
111
+ if save_manifest_filepath is not None:
112
+ tsv_manifest.to_csv(save_manifest_filepath, sep="\t", quoting=3)
113
+
114
+ return tsv_manifest
115
+
116
+
117
+ def load_eval_data_from_tsv(eval_data_filepath: str):
118
+ """
119
+ We may load the result of `compose_eval_data` directly if needed
120
+ """
121
+ eval_df = pd.read_csv(eval_data_filepath, sep="\t")
122
+
123
+ return eval_df
124
+
125
+
126
+ def run_asr_bleu(args):
127
+
128
+ asr_config = retrieve_asr_config(
129
+ args.lang, args.asr_version, json_path="./asr_model_cfgs.json"
130
+ )
131
+ asr_model = ASRGenerator(asr_config)
132
+
133
+ eval_manifest = compose_eval_data(
134
+ audio_dirpath=args.audio_dirpath,
135
+ audio_format=args.audio_format,
136
+ references_filepath=args.reference_path,
137
+ reference_format=args.reference_format,
138
+ reference_tsv_column=args.reference_tsv_column,
139
+ save_manifest_filepath=None,
140
+ )
141
+
142
+ prediction_transcripts = []
143
+ for _, eval_pair in tqdm(
144
+ eval_manifest.iterrows(),
145
+ desc="Transcribing predictions",
146
+ total=len(eval_manifest),
147
+ ):
148
+ transcription = asr_model.transcribe_audiofile(eval_pair.prediction)
149
+ prediction_transcripts.append(transcription.lower())
150
+
151
+ if args.lang == "hok":
152
+ prediction_transcripts = [
153
+ merge_tailo_init_final(text) for text in prediction_transcripts
154
+ ]
155
+
156
+ references = eval_manifest["reference"].tolist()
157
+ bleu_score = sacrebleu.corpus_bleu(prediction_transcripts, [references])
158
+
159
+ print(bleu_score)
160
+
161
+ return prediction_transcripts, bleu_score
162
+
163
+
164
+ def main():
165
+ parser = ArgumentParser(
166
+ description="This script computes the ASR-BLEU metric between model's generated audio and the text reference sequences."
167
+ )
168
+
169
+ parser.add_argument(
170
+ "--lang",
171
+ help="The target language used to initialize ASR model, see asr_model_cfgs.json for available languages",
172
+ type=str,
173
+ )
174
+ parser.add_argument(
175
+ "--asr_version",
176
+ type=str,
177
+ default="oct22",
178
+ help="For future support we add and extra layer of asr versions. The current most recent version is oct22 meaning October 2022",
179
+ )
180
+ parser.add_argument(
181
+ "--audio_dirpath",
182
+ type=str,
183
+ help="Path to the directory containing the audio predictions from the translation model",
184
+ )
185
+ parser.add_argument(
186
+ "--reference_path",
187
+ type=str,
188
+ help="Path to the file containing reference translations in the form of normalized text (to be compared to ASR predictions",
189
+ )
190
+ parser.add_argument(
191
+ "--reference_format",
192
+ choices=["txt", "tsv"],
193
+ help="Format of reference file. Txt means plain text format where each line represents single reference sequence",
194
+ )
195
+ parser.add_argument(
196
+ "--reference_tsv_column",
197
+ default=None,
198
+ type=str,
199
+ help="If format is tsv, then specify the column name which contains reference sequence",
200
+ )
201
+ parser.add_argument(
202
+ "--audio_format",
203
+ default="n_pred.wav",
204
+ choices=["n_pred.wav"],
205
+ help="Audio format n_pred.wav corresponds to names like 94_pred.wav or 94_spk7_pred.wav where spk7 is the speaker id",
206
+ )
207
+ parser.add_argument(
208
+ "--results_dirpath",
209
+ default=None,
210
+ type=str,
211
+ help="If specified, the resulting BLEU score will be written to this file path as txt file",
212
+ )
213
+ parser.add_argument(
214
+ "--transcripts_path",
215
+ default=None,
216
+ type=str,
217
+ help="If specified, the predicted transcripts will be written to this path as a txt file.",
218
+ )
219
+
220
+ args = parser.parse_args()
221
+
222
+ prediction_transcripts, bleu_score = run_asr_bleu(args)
223
+ result_filename = f"{args.reference_format}_{args.lang}_bleu.txt"
224
+ if args.results_dirpath is not None:
225
+ if not Path(args.results_dirpath).exists():
226
+ Path(args.results_dirpath).mkdir(parents=True)
227
+ with open(Path(args.results_dirpath) / result_filename, "w") as f:
228
+ f.write(bleu_score.format(width=2))
229
+
230
+ if args.transcripts_path is not None:
231
+ with open(args.transcripts_path, "w") as f:
232
+ for transcript in prediction_transcripts:
233
+ f.write(transcript + "\n")
234
+
235
+
236
+ if __name__ == "__main__":
237
+ main()
238
+
239
+
240
+ """
241
+ Example: load generated audio and references, and compute ASR-BLEU:
242
+
243
+ export lang=fi; split=vp && python compute_asr_bleu.py --lang $lang --audio_dirpath /checkpoint/hygong/S2S/speech_matrix_release_ckpts/generated_waveform_release/en-$lang/test_$split/checkpoint.pt --audio_format n_pred.wav --reference_path /large_experiments/ust/hygong/S2S/SpeechEncoder/manifests/vp-vp/en-$lang/test_$split.$lang --reference_format txt --results_dirpath ./
244
+ """
fairseq/examples/speech_to_speech/asr_bleu/requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ fairseq==0.12.2
2
+ pandas==1.4.3
3
+ sacrebleu==2.2.0
4
+ torch==1.12.1
5
+ torchaudio==0.12.1
6
+ tqdm==4.64.0
7
+ transformers==4.21.1
fairseq/examples/speech_to_speech/asr_bleu/utils.py ADDED
@@ -0,0 +1,306 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import re
3
+ import urllib.request
4
+ from pathlib import Path
5
+
6
+ import fairseq
7
+ import torch
8
+ from fairseq.data.data_utils import lengths_to_padding_mask
9
+ from tqdm import tqdm
10
+
11
+ try:
12
+ import torchaudio
13
+ from torchaudio.models.decoder import ctc_decoder
14
+ except ImportError:
15
+ raise ImportError("Upgrade torchaudio to 0.12 to enable CTC decoding")
16
+
17
+
18
+ class DownloadProgressBar(tqdm):
19
+ """A class to represent a download progress bar"""
20
+
21
+ def update_to(self, b=1, bsize=1, tsize=None) -> None:
22
+ """
23
+ Update the download progress
24
+ """
25
+ if tsize is not None:
26
+ self.total = tsize
27
+ self.update(b * bsize - self.n)
28
+
29
+
30
+ def retrieve_asr_config(lang_key: str, asr_version: str, json_path: str) -> dict:
31
+ """
32
+ Retrieve the asr model configs
33
+
34
+ Args:
35
+ lang_key: the lanuage type as the key name
36
+ json_path: the path of the config json file
37
+
38
+ Returns:
39
+ Dict of all the configs in the json file
40
+ """
41
+
42
+ with open(json_path, "r") as f:
43
+ asr_model_cfgs = json.load(f)
44
+ return asr_model_cfgs[lang_key][asr_version]
45
+
46
+
47
+ class ASRGenerator(object):
48
+ """A class to represent a ASR generator"""
49
+
50
+ def __init__(
51
+ self,
52
+ model_cfg: dict,
53
+ cache_dirpath: str = (Path.home() / ".cache" / "ust_asr").as_posix(),
54
+ ) -> None:
55
+ """
56
+ Construct all the necessary attributes of the ASRGenerator class
57
+
58
+ Args:
59
+ model_cfg: the dict of the asr model config
60
+ cache_dirpath: the default cache path is "Path.home()/.cache/ust_asr"
61
+ """
62
+
63
+ self.cache_dirpath = Path(cache_dirpath) / model_cfg["lang"]
64
+ self.model_cfg = model_cfg
65
+
66
+ self.use_cuda = torch.cuda.is_available()
67
+
68
+ torchaudio.set_audio_backend("sox_io")
69
+
70
+ if self.model_cfg["model_type"] == "hf":
71
+ self.prepare_hf_model(self.model_cfg)
72
+ elif self.model_cfg["model_type"] == "fairseq":
73
+ self.prepare_fairseq_model(self.model_cfg)
74
+ else:
75
+ raise NotImplementedError(
76
+ f"Model type {self.model_cfg['model_type']} is not supported"
77
+ )
78
+
79
+ if self.model_cfg["post_process"] == "collapse":
80
+ self.post_process_fn = lambda hypo: "".join(hypo).replace(
81
+ self.sil_token, " "
82
+ )
83
+ elif self.model_cfg["post_process"] == "none":
84
+ self.post_process_fn = lambda hypo: " ".join(hypo).replace(
85
+ self.sil_token, " "
86
+ )
87
+ else:
88
+ raise NotImplementedError
89
+
90
+ if self.use_cuda:
91
+ self.model.cuda()
92
+ self.model.eval()
93
+
94
+ self.decoder = ctc_decoder(
95
+ lexicon=None,
96
+ tokens=self.tokens,
97
+ lm=None,
98
+ nbest=1,
99
+ beam_size=1,
100
+ beam_size_token=None,
101
+ lm_weight=0.0,
102
+ word_score=0.0,
103
+ unk_score=float("-inf"),
104
+ sil_token=self.sil_token,
105
+ sil_score=0.0,
106
+ log_add=False,
107
+ blank_token=self.blank_token,
108
+ )
109
+
110
+ def prepare_hf_model(self, model_cfg: dict) -> None:
111
+ """
112
+ Prepare the huggingface asr model
113
+
114
+ Args:
115
+ model_cfg: dict with the relevant ASR config
116
+ """
117
+
118
+ def infer_silence_token(vocab: list):
119
+ """
120
+ Different HF checkpoints have different notion of silence token
121
+ such as | or " " (space)
122
+ Important: when adding new HF asr model in, check what silence token it uses
123
+ """
124
+ if "|" in vocab:
125
+ return "|"
126
+ elif " " in vocab:
127
+ return " "
128
+ else:
129
+ raise RuntimeError("Silence token is not found in the vocabulary")
130
+
131
+ try:
132
+ from transformers import (AutoFeatureExtractor, AutoTokenizer,
133
+ Wav2Vec2ForCTC, Wav2Vec2Processor)
134
+ except ImportError:
135
+ raise ImportError("Install transformers to load HF wav2vec model")
136
+
137
+ model_path = model_cfg["model_path"]
138
+ self.model = Wav2Vec2ForCTC.from_pretrained(model_path)
139
+ self.tokenizer = AutoTokenizer.from_pretrained(model_path)
140
+ self.preprocessor = AutoFeatureExtractor.from_pretrained(model_path)
141
+ self.processor = Wav2Vec2Processor.from_pretrained(model_path)
142
+
143
+ # extra unk tokens are there to make some models work e.g. Finnish ASR has some vocab issue
144
+ vocab_list = [
145
+ self.tokenizer.decoder.get(i, f"{self.tokenizer.unk_token}1")
146
+ for i in range(self.tokenizer.vocab_size)
147
+ ]
148
+
149
+ self.sampling_rate = self.preprocessor.sampling_rate
150
+ self.normalize_input = self.preprocessor.do_normalize
151
+ self.tokens = vocab_list
152
+ self.sil_token = infer_silence_token(vocab_list)
153
+ self.blank_token = self.tokenizer.pad_token
154
+
155
+ def prepare_fairseq_model(self, model_cfg: dict) -> None:
156
+ """
157
+ Prepare the fairseq asr model
158
+
159
+ Args:
160
+ model_cfg: the specific model config dict must have: (1) ckpt_path, (2) dict_path
161
+ """
162
+
163
+ def download_file(url: str, cache_dir: Path):
164
+ download_path = cache_dir / url.split("/")[-1]
165
+ if not (cache_dir / url.split("/")[-1]).exists():
166
+ with DownloadProgressBar(
167
+ unit="B", unit_scale=True, miniters=1, desc=url.split("/")[-1]
168
+ ) as t:
169
+ cache_dir.mkdir(parents=True, exist_ok=True)
170
+ urllib.request.urlretrieve(
171
+ url, filename=download_path.as_posix(), reporthook=t.update_to
172
+ )
173
+ else:
174
+ print(f"'{url}' exists in {cache_dir}")
175
+
176
+ return download_path.as_posix()
177
+
178
+ try:
179
+ ckpt_path = model_cfg["ckpt_path"]
180
+ dict_path = model_cfg["dict_path"]
181
+ except KeyError:
182
+ raise KeyError(
183
+ "Fairseq model cfg must provide (1) ckpt_path, (2) dict_path"
184
+ )
185
+
186
+ if re.search("^https", ckpt_path):
187
+ ckpt_path = download_file(ckpt_path, self.cache_dirpath)
188
+ if re.search("^https", dict_path):
189
+ dict_path = download_file(dict_path, self.cache_dirpath)
190
+
191
+ model, saved_cfg, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task(
192
+ [ckpt_path],
193
+ arg_overrides={
194
+ "task": "audio_finetuning",
195
+ "data": self.cache_dirpath.as_posix(),
196
+ }, # data must have dict in it
197
+ )
198
+
199
+ dict_lines = open(dict_path, "r").readlines()
200
+ tokens = [l.split()[0] for l in dict_lines]
201
+ # adding default fairseq special tokens
202
+ tokens = ["<s>", "<pad>", "</s>", "<unk>"] + tokens
203
+
204
+ self.model = model[0]
205
+ self.tokens = tokens
206
+
207
+ if "|" in tokens:
208
+ self.sil_token = "|"
209
+ else:
210
+ self.sil_token = tokens[
211
+ 2
212
+ ] # use eos as silence token if | not presented e.g., Hok ASR model
213
+ print(f"Inferring silence token from the dict: {self.sil_token}")
214
+ self.blank_token = self.tokens[0]
215
+
216
+ self.sampling_rate = saved_cfg.task.sample_rate
217
+ self.normalize_input = saved_cfg.task.normalize
218
+
219
+ @torch.inference_mode()
220
+ def load_audiofile(self, audio_path: str) -> torch.Tensor:
221
+ """
222
+ Load the audio file and apply resampling and normalization
223
+
224
+ Args:
225
+ audio_path: the audio file path
226
+
227
+ Returns:
228
+ audio_waveform: the audio waveform as a torch.Tensor object
229
+ """
230
+
231
+ audio_waveform, sampling_rate = torchaudio.load(audio_path)
232
+ # torchaudio.load returns (channels, frames); collapse multi-channel audio to mono
+ if audio_waveform.size(0) > 1:
233
+ audio_waveform = audio_waveform.mean(0, keepdim=True)
234
+ if self.sampling_rate != sampling_rate:
235
+ audio_waveform = torchaudio.functional.resample(
236
+ audio_waveform, sampling_rate, self.sampling_rate
237
+ )
238
+ if self.normalize_input:
239
+ # following fairseq raw audio dataset
240
+ audio_waveform = torch.nn.functional.layer_norm(
241
+ audio_waveform, audio_waveform.shape
242
+ )
243
+
244
+ return audio_waveform
245
+
246
+ @torch.inference_mode()
247
+ def compute_emissions(self, audio_input: torch.Tensor) -> torch.Tensor:
248
+ """
249
+ Compute the emissions for either fairseq or huggingface asr model
250
+
251
+ Args:
252
+ audio_input: the input audio waveform
253
+
254
+ Returns:
255
+ emissions: the logits of the encoded prediction.
256
+ """
257
+
258
+ if self.use_cuda:
259
+ audio_input = audio_input.to("cuda")
260
+ if isinstance(self.model, fairseq.models.wav2vec.wav2vec2_asr.Wav2VecCtc):
261
+ padding_mask = lengths_to_padding_mask(torch.tensor([audio_input.numel()]))
262
+ emissions = self.model.w2v_encoder(audio_input, padding_mask)[
263
+ "encoder_out"
264
+ ].transpose(0, 1)
265
+ else:
266
+ emissions = self.model(audio_input).logits
267
+
268
+ return emissions
269
+
270
+ def decode_emissions(self, emissions: torch.Tensor) -> str:
271
+ """
272
+ Decode the emissions and apply post process functions
273
+
274
+ Args:
275
+ emissions: the input Tensor object
276
+
277
+ Returns:
278
+ hypo: the str as the decoded transcriptions
279
+ """
280
+
281
+ emissions = emissions.cpu()
282
+ results = self.decoder(emissions)
283
+
284
+ # assuming the lexicon-free decoder and working with tokens
285
+ hypo = self.decoder.idxs_to_tokens(results[0][0].tokens)
286
+ hypo = self.post_process_fn(hypo)
287
+
288
+ return hypo
289
+
290
+ def transcribe_audiofile(self, audio_path: str, lower=True) -> str:
291
+ """
292
+ Transcribe the audio into string
293
+
294
+ Args:
295
+ audio_path: the input audio file path
296
+ lower: the case of the transcriptions with lowercase as the default
297
+
298
+ Returns:
299
+ hypo: the transcription result
300
+ """
301
+
302
+ asr_input = self.load_audiofile(audio_path)
303
+ emissions = self.compute_emissions(asr_input)
304
+ hypo = self.decode_emissions(emissions)
305
+
306
+ return hypo.strip().lower() if lower else hypo.strip()
fairseq/examples/speech_to_speech/benchmarking/README.md ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Benchmarking
2
+
3
+ ## Overview
4
+
5
+ The goal of this framework is to support benchmarking various speech-to-speech translation (S2ST) models in terms of runtime, maximum memory consumption, and total number of floating-point operations (FLOPS). It is a generic framework and can easily be extended to support any fairseq model. To benchmark performance accurately, the core inference modules are re-implemented based on fairseq_cli/generate.py (core.py/Processing) and examples/speech_to_text/generate_waveform.py (core.py/SpeechGeneration). To ensure that end-to-end and cascaded models are compared fairly, for cascaded models we only consider the model inference metrics at each stage, ignoring any intermediate data and I/O processing overhead. We run all benchmarks on CPU, since CPU is generally what is used in production environments and benchmarking library support for GPUs is limited. A minimal sketch of how these metrics are measured follows the list below.
6
+
7
+ 1. Runtime: Average time in seconds to run model inference on an example from a given dataset. We use the [timeit](https://docs.python.org/3/library/timeit.html) library to measure the runtime.
8
+ 2. Max memory: Maximum memory in MiB, averaged over model inference runs on all examples from the given dataset. We use the [memory_profiler](https://pypi.org/project/memory-profiler/) library to gather the memory footprint of a code snippet and take its maximum as the memory used by the code. For cascaded models, we take the maximum across all stages as the overall max-memory footprint.
9
+ 3. FLOPS: We compute the average number of floating-point operations needed to run model inference on an example from the given dataset. We use the [PAPI library](http://www.bnikolic.co.uk/blog/python/flops/2019/10/01/pytorch-count-flops.html) to count the number of FLOPS.
10
+
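+ A minimal sketch of how these three metrics can be measured around a single `forward(sample)` call (the `model` and `sample` objects below are placeholders; `core.py` in this directory implements the full benchmarking logic):
+
+ ```{python}
+ import timeit
+ from memory_profiler import memory_usage
+ from pypapi import events, papi_high as high
+
+ def measure_metrics(model, sample, repeat=3):
+     """Hypothetical helper mirroring core.py; model.forward(sample) must run on CPU."""
+     # 1. Runtime: average seconds per inference call, measured with timeit.
+     run_time = timeit.Timer(lambda: model.forward(sample)).timeit(repeat) / repeat
+
+     # 2. Max memory: peak MiB observed while running one inference call.
+     max_mem = max(memory_usage((model.forward, (sample,), {})))
+
+     # 3. FLOPS: double-precision operation count averaged over `repeat` calls.
+     high.start_counters([events.PAPI_DP_OPS])
+     for _ in range(repeat):
+         model.forward(sample)
+     flops = round(high.stop_counters()[0] / repeat)
+
+     return run_time, max_mem, flops
+ ```
+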
11
+ ## CLI Commands
12
+
13
+ ```{python}
14
+ CUBLAS_WORKSPACE_CONFIG=:4096:8 python examples/speech_to_speech/benchmarking/get_metrics.py '' --config $config
15
+ ```
16
+
17
+
18
+ ## Note:
19
+
20
+ 1. The npy dataset is a list of samples saved as a .npy file. Each sample is a dictionary with `id` and `net_input`; a sketch of how to create such a file follows the example below.
21
+ 2. The raw dataset is a list of raw audio paths, similar to the wav2vec2 input TSV file.
22
+
23
+ ```{python}
24
+ sample: {
25
+ "id": xx,
26
+ "net_input": {
27
+ "src_tokens": torch.tensor([]),
28
+ "src_lengths": torch.tensor([])
29
+ }
30
+ }
31
+ ```
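+
+ As a minimal sketch, a dataset in this format can be produced and reloaded with the helpers in `data_utils.py` (the file name below is a placeholder, and the import assumes the fairseq repository root is on `PYTHONPATH`):
+
+ ```{python}
+ import numpy as np
+ from examples.speech_to_speech.benchmarking.data_utils import (
+     generate_random_dataset,
+     load_dataset_npy,
+ )
+
+ # Build a small random dataset of 80-dim filterbank-like inputs and save it as .npy.
+ dataset, avg_len = generate_random_dataset(
+     T_range_min=100, T_range_max=500, B=1, D=80, dataset_size=10
+ )
+ np.save("benchmark_dataset.npy", dataset, allow_pickle=True)
+
+ # Reload it in the sample format expected by the benchmarking classes.
+ samples = load_dataset_npy("benchmark_dataset.npy", dataset_size=10)
+ ```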
fairseq/examples/speech_to_speech/benchmarking/configs/2StageS2ST.yaml ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ general:
2
+ dataset_path: $npy_dataset
3
+ cpu: True
4
+ model_type: 2StageS2ST
5
+ dataset_size: 1
6
+
7
+ stage1:
8
+ data: $data_bin_stage1
9
+ task: speech_to_text
10
+ path: $checkpoint_stage1
11
+ config_yaml: config.yaml
12
+ max_len_a: 2
13
+ max_len_b: 500
14
+
15
+ stage2:
16
+ data: $data_bin_stage2
17
+ task: text_to_speech
18
+ path: $checkpoint_stage2
19
+ config_yaml: config.yaml
fairseq/examples/speech_to_speech/benchmarking/configs/3StageS2ST.yaml ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ general:
2
+ dataset_path: $npy_dataset
3
+ cpu: True
4
+ model_type: 3StageS2ST
5
+ max_len_a: 2
6
+ max_len_b: 500
7
+ dataset_size: 1
8
+
9
+ stage1:
10
+ data: $data_bin_stage1
11
+ task: speech_to_text
12
+ path: $checkpoint_stage1
13
+ config_yaml: config.yaml
14
+ max_len_a: 2
15
+ max_len_b: 500
16
+
17
+ stage2:
18
+ data: $data_bin_stage2
19
+ task: translation
20
+ path: $checkpoint_stage2
21
+ config_yaml: config.yaml
22
+
23
+
24
+ stage3:
25
+ data: $data_bin_stage3
26
+ task: text_to_speech
27
+ path: $checkpoint_stage3
28
+ config_yaml: config.yaml
fairseq/examples/speech_to_speech/benchmarking/configs/DirectS2U.yaml ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ general:
2
+ dataset_path: $npy_dataset_path
3
+ cpu: True
4
+ model_type: S2UT
5
+ dataset_size: 5
6
+ dump_speech_waveforms_dir: $dump_waveforms_dir_path
7
+
8
+ stage1:
9
+ data: $data_bin
10
+ task: speech_to_speech
11
+ path: $checkpoint
12
+ config_yaml: config.yaml
13
+ max_len_b: 100000
14
+ beam: 10
15
+ target_is_code: True
16
+ max_target_positions: 3000
17
+ target_code_size: 100
18
+
19
+ stage2:
20
+ vocoder: $vocoder_path
21
+ vocoder_cfg: $vocoder_cfg_json
22
+ dur_prediction: True
fairseq/examples/speech_to_speech/benchmarking/configs/S2T.yaml ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ general:
2
+ dataset_path: $npy_dataset
3
+ cpu: True
4
+ model_type: S2T
5
+ dataset_size: 1
6
+
7
+ stage1:
8
+ data: $data_bin
9
+ task: speech_to_text
10
+ path: $checkpoint
11
+ config_yaml: config.yaml
12
+ max_len_a: 2
13
+ max_len_b: 500
fairseq/examples/speech_to_speech/benchmarking/core.py ADDED
@@ -0,0 +1,487 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import timeit
2
+ import logging
3
+ import torch
4
+ from pypapi import events, papi_high as high
5
+ from memory_profiler import memory_usage
6
+ from torch import nn
7
+ from argparse import Namespace
8
+ from fairseq.dataclass.utils import convert_namespace_to_omegaconf
9
+ from fairseq.data import data_utils as fairseq_data_utils
10
+ from fairseq import checkpoint_utils, tasks, utils
11
+ from fairseq.models.text_to_speech.vocoder import CodeHiFiGANVocoder
12
+ from examples.hubert.simple_kmeans.dump_hubert_feature import HubertFeatureReader
13
+ from examples.hubert.simple_kmeans.dump_km_label import ApplyKmeans
14
+ from fairseq_cli.generate import get_symbols_to_strip_from_output
15
+ import soundfile as sf
16
+ import ast
17
+ import json
18
+
19
+ logging.basicConfig()
20
+ logging.root.setLevel(logging.INFO)
21
+ logging.basicConfig(level=logging.INFO)
22
+ logger = logging.getLogger(__name__)
23
+
24
+
25
+ torch.manual_seed(1)
26
+ torch.use_deterministic_algorithms(True)  # replaces the removed torch.set_deterministic API
27
+
28
+
29
+ class BenchmarkingBase(nn.Module):
30
+ def __init__(self):
31
+ nn.Module.__init__(self)
32
+ self.s2x_task = None
33
+
34
+ def warm_up(self, sample, repeat):
35
+ """Warm up the model"""
36
+ for _i in range(repeat):
37
+ self.forward(sample)
38
+ logger.info(f"Model warmed up by running inference {repeat} times")
39
+
40
+ def benchmark_run_time(self, dataset, repeat):
41
+ """Benchmark average runtime for the model by calling benchmark_run_time_single_sample function"""
42
+ logger.info("Starting run time benchmarking")
43
+ time_elapsed = 0
44
+ for i, sample in enumerate(dataset):
45
+ time_elapsed += self.benchmark_run_time_single_sample(sample, repeat=repeat)
46
+ if i % 100 == 0:
47
+ logger.info(f"Benchmarked run time for {i}/{len(dataset)} samples")
48
+ total_time_elapsed = time_elapsed / len(dataset)
49
+ return total_time_elapsed
50
+
51
+ def benchmark_run_time_single_sample(self, sample, repeat):
52
+ """Benchmark average runtime for a single sample using timeit library. Units are seconds"""
53
+ timer = timeit.Timer(lambda: self.forward(sample))
54
+ time_elapsed = timer.timeit(repeat)
55
+ return time_elapsed / repeat
56
+
57
+ def count_flops(
58
+ self,
59
+ dataset,
60
+ repeat,
61
+ ):
62
+ """Use PYPAPI library to count average flops for model inference.
63
+ Note: It only works if the model is being run on cpu"""
64
+ logger.info("Starting flop counter")
65
+ high.start_counters([events.PAPI_DP_OPS])
66
+ for i, sample in enumerate(dataset):
67
+ for _r in range(repeat):
68
+ self.forward(sample)
69
+ if i % 100 == 0:
70
+ logger.info(f"Counted flops for {i}/{len(dataset)} samples")
71
+ flops = high.stop_counters()
72
+ flops = round(flops[0] / (repeat * len(dataset)))
73
+ return flops
74
+
75
+ def max_memory(self, dataset, repeat):
76
+ """Compute average max memory consumed by model inference. Units are MiB"""
77
+ logger.info("Starting memory benchmarking")
78
+ total_memory = 0
79
+ for i, sample in enumerate(dataset):
80
+ for _r in range(repeat):
81
+ total_memory += max(memory_usage((self.forward, (sample,), {})))
82
+ if i % 100 == 0:
83
+ logger.info(f"Benchmarked memory for {i}/{len(dataset)} samples")
84
+ total_memory = total_memory / (repeat * len(dataset))
85
+ return total_memory
86
+
87
+ def gather_all_metrics(self, dataset, repeat):
88
+ run_time = self.benchmark_run_time(dataset, repeat)
89
+ max_memory = self.max_memory(dataset, repeat)
90
+ flops = self.count_flops(dataset, repeat)
91
+
92
+ return run_time, max_memory, flops
93
+
94
+ def dump_final_speech_output(
95
+ self, dataset, output_dir, resample_fn, sample_rate, prefix=None
96
+ ):
97
+
98
+ for i, sample in enumerate(dataset):
99
+ hypo = self.forward(sample)[0]
100
+
101
+ def to_np(x):
102
+ return x.detach().cpu().numpy()
103
+
104
+ try:
105
+ wave_preds = to_np(resample_fn(hypo["waveform"]))
106
+ sf.write(
107
+ f"{output_dir}/{prefix}_{i}_pred.wav",
108
+ wave_preds,
109
+ sample_rate,
110
+ )
111
+ except Exception as e:
112
+ raise Exception(
113
+ f" Encountered {e} - Invalid waveform. Make sure the model outputs a waveform"
114
+ )
115
+
116
+
117
+ class Processing(BenchmarkingBase):
118
+ """Class similar to fairseq_cli/generate.py. Supports ASR, MT and ST model inference"""
119
+
120
+ def __init__(self, args):
121
+ super().__init__()
122
+ self.use_cuda = not getattr(args, "cpu", False)
123
+ self.setUp(args)
124
+ self.training = False
125
+ self.s2x_task = self.task
126
+
127
+ def setUp(self, cfg):
128
+ if isinstance(cfg, Namespace):
129
+ cfg = convert_namespace_to_omegaconf(cfg)
130
+
131
+ self.task = tasks.setup_task(cfg.task)
132
+ self.tgt_dict = self.task.target_dictionary
133
+
134
+ # Load ensemble
135
+ logger.info("loading model(s) from {}".format(cfg.common_eval.path))
136
+ models, _ = checkpoint_utils.load_model_ensemble(
137
+ utils.split_paths(cfg.common_eval.path),
138
+ arg_overrides={},
139
+ task=self.task,
140
+ suffix=cfg.checkpoint.checkpoint_suffix,
141
+ strict=False,
142
+ num_shards=cfg.checkpoint.checkpoint_shard_count,
143
+ )
144
+ if len(models) > 1:
145
+ raise Exception("Currently loading multiple models is not supported")
146
+ self.model = models[0]
147
+
148
+ # Optimize model for generation
149
+ if cfg.common.fp16:
150
+ self.model.half()
151
+ if self.use_cuda:
152
+ self.model.cuda()
153
+ self.model.prepare_for_inference_(cfg)
154
+
155
+ self.generator = self.task.build_generator(
156
+ [self.model],
157
+ cfg.generation,
158
+ extra_gen_cls_kwargs={},
159
+ )
160
+ # Handle tokenization and BPE
161
+ self.tokenizer = self.task.build_tokenizer(cfg.tokenizer)
162
+ self.bpe = self.task.build_bpe(cfg.bpe)
163
+ self.remove_bpe = cfg.common_eval.post_process
164
+
165
+ def encode_source(self, src):
166
+ """Method to generate source tokens from a string"""
167
+ if self.tokenizer is not None:
168
+ src = self.tokenizer.encode(src)
169
+ if self.bpe is not None:
170
+ src = self.bpe.encode(src)
171
+ src_tokens = self.task.source_dictionary.encode_line(src).long()
172
+ src_lens = src_tokens.size(0)
173
+ return {
174
+ "net_input": {
175
+ "src_tokens": src_tokens.view(1, src_lens),
176
+ "src_lengths": torch.tensor([src_lens]),
177
+ }
178
+ }
179
+
180
+ def decode_target(self, hypos):
181
+ """Method to decode target string from tokens"""
182
+ hypo_str = self.tgt_dict.string(
183
+ hypos[0][0]["tokens"].int().cpu(),
184
+ self.remove_bpe,
185
+ get_symbols_to_strip_from_output(self.generator),
186
+ )
187
+ if self.bpe is not None:
188
+ hypo_str = self.bpe.decode(hypo_str)
189
+ if self.tokenizer is not None:
190
+ hypo_str = self.tokenizer.decode(hypo_str)
191
+ return hypo_str
192
+
193
+ def forward(self, sample):
194
+ hypos = self.task.inference_step(
195
+ self.generator,
196
+ [self.model],
197
+ sample,
198
+ prefix_tokens=None,
199
+ constraints=None,
200
+ )
201
+ return hypos
202
+
203
+
204
+ class GenerateWaveformFromCode(BenchmarkingBase):
205
+ """Class to support waveform generation from code. Currently, vocoder only supports single speaker"""
206
+
207
+ def __init__(self, args):
208
+ super().__init__()
209
+ with open(args.vocoder_cfg) as f:
210
+ vocoder_cfg = json.load(f)
211
+ self.dur_prediction = args.dur_prediction
212
+ self.vocoder = CodeHiFiGANVocoder(args.vocoder, vocoder_cfg)
213
+
214
+ def format_units(self, input):
215
+ code = torch.LongTensor(list(map(int, input.strip().split()))).view(1, -1)
216
+ return {"code": code}
217
+
218
+ def generate_vocoder_input(self, dataset):
219
+ return [self.format_units(sample) for sample in dataset]
220
+
221
+ def forward(self, sample):
222
+ return [{"waveform": self.vocoder(sample, self.dur_prediction)}]
223
+
224
+
225
+ class HubertUnitExtractor(BenchmarkingBase):
226
+ def __init__(self, args):
227
+ super().__init__()
+ self.feature_reader = HubertFeatureReader(
228
+ args.hubert_ckpt_path, args.hubert_layer
229
+ )
230
+ self.kmeans = ApplyKmeans(args.hubert_km_path)
231
+
232
+ def forward(self, sample):
233
+ with torch.no_grad():
234
+ feat = []
235
+ for start in range(0, sample.size(1), self.feature_reader.max_chunk):
236
+ x_chunk = sample[:, start : start + self.feature_reader.max_chunk]
237
+ feat_chunk, _ = self.feature_reader.model.extract_features(
238
+ source=x_chunk,
239
+ padding_mask=None,
240
+ mask=False,
241
+ output_layer=self.feature_reader.layer,
242
+ )
243
+ feat.append(feat_chunk)
244
+ feat = torch.cat(feat, 1).squeeze(0)
245
+ return self.kmeans(feat).tolist()
246
+
247
+
248
+ class SpeechGeneration(BenchmarkingBase):
249
+ """Class similar to examples/text_to_speech/generate_waveform.py.
250
+ Supports models with speech generation as end goal (TTS, Direct S2ST models etc)"""
251
+
252
+ def __init__(self, args):
253
+ super().__init__()
254
+ self.use_cuda = not getattr(args, "cpu", False)
255
+ self.setUp(args)
256
+ self.s2x_task = self.task
257
+
258
+ def setUp(self, args):
259
+ if args.task == "speech_to_speech":
260
+ args.normalize_waveform = False
261
+ self.task = tasks.setup_task(args)
262
+ self.pre_tokenizer = self.task.build_tokenizer(args)
263
+ self.bpe_tokenizer = self.task.build_bpe(args)
264
+ try:
265
+ self.src_dict = self.task.src_dict
266
+ except Exception:
267
+ self.src_dict = None
268
+ ensemble, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(
269
+ [args.path],
270
+ arg_overrides=ast.literal_eval(args.model_overrides),
271
+ task=self.task,
272
+ strict=False,
273
+ )
274
+ self.model = ensemble[0]
275
+ if self.use_cuda:
276
+ self.model.cuda()
277
+ # criterion.cuda()
278
+ self.model.eval()
279
+ self.generator = self.task.build_generator(
280
+ [self.model],
281
+ args,
282
+ )
283
+
284
+ def processTextInput(self, text):
285
+ """Generate source tokens from text input"""
286
+ if self.pre_tokenizer is not None:
287
+ text = self.pre_tokenizer.encode(text)
288
+ if self.bpe_tokenizer is not None:
289
+ text = self.bpe_tokenizer.encode(text)
290
+ target = self.src_dict.encode_line(
291
+ text, add_if_not_exist=False, append_eos=True
292
+ ).long()
293
+ target = fairseq_data_utils.collate_tokens(
294
+ [target],
295
+ self.src_dict.pad(),
296
+ self.src_dict.eos(),
297
+ left_pad=False,
298
+ move_eos_to_beginning=False,
299
+ )
300
+ src_lengths = torch.tensor([target.size(1)], dtype=torch.long)
301
+ prev_output_tokens = None
302
+ sample = {
303
+ "net_input": {
304
+ "src_tokens": target,
305
+ "src_lengths": src_lengths,
306
+ "prev_output_tokens": prev_output_tokens,
307
+ }
308
+ }
309
+ sample = utils.move_to_cuda(sample) if self.use_cuda else sample
310
+ return sample
311
+
312
+ def forward(self, sample):
313
+ sample["speaker"] = None
314
+ output = self.generator.generate(self.model, sample) # , has_targ=False
315
+ return output
316
+
317
+
318
+ class S2UT(BenchmarkingBase):
319
+ """Class to support S2UT models. Also supports generating waveforms from the units predicted"""
320
+
321
+ def __init__(self, s2u_args, vocoder_args=None):
322
+ super().__init__()
323
+ self.s2u = Processing(s2u_args)
324
+ self.vocoder = None
325
+ if vocoder_args:
326
+ self.vocoder = GenerateWaveformFromCode(vocoder_args)
327
+ self.vocoder_input = None
328
+
329
+ def forward(self, sample):
330
+ s2u_hypos = self.s2u(sample)
331
+ s2u_output = self.s2u.decode_target(s2u_hypos)
332
+ if not self.vocoder:
333
+ return s2u_output
334
+ units = self.vocoder.format_units(s2u_output)
335
+ vocoder_output = self.vocoder(units)
336
+ return vocoder_output
337
+
338
+ def generate_s2u_outputs(self, dataset):
339
+ return [self.s2u.decode_target(self.s2u(sample)) for sample in dataset]
340
+
341
+ def compute_metrics(self, metric_type, dataset, repeat=None):
342
+ """Generic function to compute metrics ignoring the io processing time"""
343
+ if self.vocoder and not self.vocoder_input:
344
+ self.s2u_output = self.generate_s2u_outputs(dataset)
345
+ self.vocoder_input = self.vocoder.generate_vocoder_input(self.s2u_output)
346
+
347
+ s2u_metrics = getattr(self.s2u, metric_type)(
348
+ dataset,
349
+ repeat,
350
+ )
351
+ vocoder_metrics = 0
352
+ if self.vocoder:
353
+ vocoder_metrics = getattr(self.vocoder, metric_type)(
354
+ self.vocoder_input,
355
+ repeat,
356
+ )
357
+ print(
358
+ f"metric_type = {metric_type} s2u_metrics = {s2u_metrics} \t vocoder_metrics = {vocoder_metrics}"
359
+ )
360
+ if metric_type == "max_memory":
361
+ return max(s2u_metrics, vocoder_metrics)
362
+ else:
363
+ return s2u_metrics + vocoder_metrics
364
+
365
+ def benchmark_run_time(self, dataset, repeat):
366
+ return self.compute_metrics("benchmark_run_time", dataset, repeat)
367
+
368
+ def count_flops(self, dataset, repeat):
369
+ return self.compute_metrics("count_flops", dataset, repeat)
370
+
371
+ def max_memory(self, dataset, repeat):
372
+ return self.compute_metrics("max_memory", dataset, repeat)
373
+
374
+
375
+ class Cascaded2StageS2ST(BenchmarkingBase):
376
+ """ST + TTS"""
377
+
378
+ def __init__(self, s2t_args, tts_args):
379
+ super().__init__()
380
+ self.s2t = Processing(s2t_args)
381
+ self.s2x_task = self.s2t.task
382
+ self.tts = SpeechGeneration(tts_args) if tts_args else None
383
+ self.training = False
384
+ self.tts_inputs = None
385
+
386
+ def forward(self, sample):
387
+ if not self.tts:
388
+ raise Exception(
389
+ "Forward function is not callable without tts. Reinitialize the class with tts_args"
390
+ )
391
+ s2t_hypos = self.s2t(sample)
392
+ s2t_output = self.s2t.decode_target(s2t_hypos)
393
+ tts_input = self.tts.processTextInput(s2t_output)
394
+ tts_output = self.tts(tts_input)
395
+ return tts_output
396
+
397
+ def generate_s2t_outputs(self, dataset):
398
+ """Process dataset and generate s2t outputs"""
399
+ return [self.s2t.decode_target(self.s2t(sample)) for sample in dataset]
400
+
401
+ def generate_tts_inputs(self, dataset):
402
+ """Process dataset and generate tts inputs"""
403
+ return [self.tts.processTextInput(sample) for sample in dataset]
404
+
405
+ def compute_metrics(self, metric_type, dataset, repeat=None):
406
+ """Generic function to compute metrics ignoring the io processing time"""
407
+ if not self.tts_inputs:
408
+ s2t_outputs = self.generate_s2t_outputs(dataset)
409
+ self.tts_inputs = self.generate_tts_inputs(s2t_outputs)
410
+
411
+ s2t_metrics = getattr(self.s2t, metric_type)(
412
+ dataset,
413
+ repeat,
414
+ )
415
+
416
+ tts_metrics = getattr(self.tts, metric_type)(
417
+ self.tts_inputs,
418
+ repeat,
419
+ )
420
+ print(
421
+ f"metric_type = {metric_type} s2t_metrics = {s2t_metrics} \t tts_metrics = {tts_metrics}"
422
+ )
423
+ if metric_type == "max_memory":
424
+ return max(s2t_metrics, tts_metrics)
425
+ else:
426
+ return s2t_metrics + tts_metrics
427
+
428
+ def benchmark_run_time(self, dataset, repeat):
429
+ return self.compute_metrics("benchmark_run_time", dataset, repeat)
430
+
431
+ def count_flops(self, dataset, repeat):
432
+ return self.compute_metrics("count_flops", dataset, repeat)
433
+
434
+ def max_memory(self, dataset, repeat):
435
+ return self.compute_metrics("max_memory", dataset, repeat)
436
+
437
+
438
+ class Cascaded3StageS2ST(Cascaded2StageS2ST):
439
+ """ASR + MT + TTS"""
440
+
441
+ def __init__(self, s2t_args, tts_args, mt_args):
442
+ super().__init__(s2t_args, tts_args)
443
+ self.mt = Processing(mt_args)
444
+ self.mt_inputs = []
445
+
446
+ def forward(self, sample):
447
+ s2t_hypos = self.s2t(sample)
448
+ s2t_output = self.s2t.decode_target(s2t_hypos)
449
+ mt_input = self.mt.encode_source(s2t_output)
450
+ mt_hypos = self.mt(mt_input)
451
+ mt_output = self.mt.decode_target(mt_hypos)
452
+ tts_input = self.tts.processTextInput(mt_output)
453
+ tts_output = self.tts(tts_input)
454
+ return tts_output
455
+
456
+ def generate_mt_inputs(self, dataset):
457
+ """Process dataset to generate mt model inputs"""
458
+ return [self.mt.encode_source(sample) for sample in dataset]
459
+
460
+ def generate_mt_outputs(self, dataset):
461
+ """Process dataset to generate mt model outputs"""
462
+ return [self.mt.decode_target(self.mt(sample)) for sample in dataset]
463
+
464
+ def compute_metrics(self, metric_type, dataset, repeat=None):
465
+ """Generic function to compute metrics ignoring the io processing time"""
466
+ if not self.tts_inputs:
467
+ s2t_outputs = self.generate_s2t_outputs(dataset)
468
+ self.mt_inputs = self.generate_mt_inputs(s2t_outputs)
469
+ mt_outputs = self.generate_mt_outputs(self.mt_inputs)
470
+ self.tts_inputs = self.generate_tts_inputs(mt_outputs)
471
+
472
+ s2t_metrics = getattr(self.s2t, metric_type)(
473
+ dataset,
474
+ repeat,
475
+ )
476
+ mt_metrics = getattr(self.mt, metric_type)(self.mt_inputs, repeat)
477
+ tts_metrics = getattr(self.tts, metric_type)(
478
+ self.tts_inputs,
479
+ repeat,
480
+ )
481
+ print(
482
+ f"metric_type = {metric_type} s2t_metrics = {s2t_metrics} \t mt_metrics = {mt_metrics} \t tts_metrics = {tts_metrics}"
483
+ )
484
+ if metric_type == "max_memory":
485
+ return max(s2t_metrics, mt_metrics, tts_metrics)
486
+ else:
487
+ return s2t_metrics + mt_metrics + tts_metrics
fairseq/examples/speech_to_speech/benchmarking/data_utils.py ADDED
@@ -0,0 +1,264 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fairseq import tasks
2
+ import numpy as np
3
+ import logging
4
+ import random
5
+ from fairseq import options
6
+ import torch
7
+ import os
8
+ import soundfile as sf
9
+
10
+ from fairseq.data.audio.audio_utils import (
11
+ get_waveform,
12
+ parse_path,
13
+ )
14
+
15
+ logging.basicConfig()
16
+ logging.root.setLevel(logging.INFO)
17
+ logging.basicConfig(level=logging.INFO)
18
+ logger = logging.getLogger(__name__)
19
+
20
+ random.seed(1)
21
+ np.random.seed(1)
22
+ random_number_generator = np.random.RandomState(30)
23
+
24
+
25
+ def generate_random_data_sample(T, B=1, D=80):
26
+ """Generate random data sample given the T, B, D values"""
27
+ net_input = {
28
+ "src_tokens": torch.tensor(random_number_generator.randn(B, T, D)).float(),
29
+ "src_lengths": torch.tensor([T]),
30
+ }
31
+ return {"net_input": net_input}
32
+
33
+
34
+ def generate_random_dataset(T_range_min, T_range_max, B=1, D=80, dataset_size=100):
35
+ """Generate random dataset with T values within a given range, B, D"""
36
+ T_values = [random.randint(T_range_min, T_range_max) for i in range(dataset_size)]
37
+ dataset = []
38
+ for t in T_values:
39
+ dataset.append(generate_random_data_sample(t, B, D))
40
+ return dataset, sum(T_values) / dataset_size
41
+
42
+
43
+ def load_dataset_npy(file_name, dataset_size=None):
44
+ """Load dataset from a .npy file."""
45
+ data = np.load(file_name, allow_pickle=True)
46
+ if dataset_size:
47
+ data = data[:dataset_size]
48
+ return data
49
+
50
+
51
+ def load_dataset_raw_to_waveforms(
52
+ file_name,
53
+ dataset_size=None,
54
+ need_waveform=True,
55
+ sample_rate=16000,
56
+ read_using_soundfile=False,
57
+ ):
58
+ """Load raw dataset from w2v tsv file. Optionally get waveforms"""
59
+ data = []
60
+ with open(file_name, "r") as fp:
61
+ lines = fp.readlines()
62
+ data = [
63
+ os.path.join(lines[0].strip(), line.strip().split("\t")[0])
64
+ for line in lines[1:]
65
+ ]
66
+
67
+ if dataset_size:
68
+ data = data[:dataset_size]
69
+
70
+ if not need_waveform:
71
+ return data
72
+
73
+ features = []
74
+ if read_using_soundfile:
75
+ for _i, d in enumerate(data):
76
+ wav = sf.read(d)[0]
77
+ if wav.ndim == 2:
78
+ wav = wav.mean(-1)
79
+ features.append(torch.from_numpy(wav).float().view(1, -1))
80
+ else:
81
+ for i, d in enumerate(data):
82
+ _path, slice_ptr = parse_path(d)
83
+ if len(slice_ptr) == 0:
84
+ feat = get_waveform(
85
+ _path, always_2d=True, output_sample_rate=sample_rate
86
+ )[0]
87
+ features.append(
88
+ {
89
+ "id": i,
90
+ "net_input": {
91
+ "src_tokens": torch.tensor(feat),
92
+ "src_lengths": torch.tensor([feat.shape[1]]),
93
+ },
94
+ }
95
+ )
96
+ else:
97
+ raise Exception("Currently unsupported data format")
98
+ return features
99
+
100
+
101
+ def load_dataset_task(
102
+ args,
103
+ batch_size=1,
104
+ limit_size=None,
105
+ ref_dataset=None,
106
+ ):
107
+ """Loads dataset based on args by creating a task"""
108
+ if not args.data or not args.subset or not args.task:
109
+ raise Exception(
110
+ "Please provide necessary arguments to load the dataset - data, subset and task"
111
+ )
112
+ task = tasks.setup_task(args)
113
+
114
+ task.load_dataset(args.subset)
115
+ if not limit_size:
116
+ limit_size = len(task.dataset(args.subset))
117
+
118
+ iter = task.get_batch_iterator(
119
+ dataset=task.dataset(args.subset), max_sentences=batch_size
120
+ ).next_epoch_itr(shuffle=False)
121
+ dataset = []
122
+ for i, sample in enumerate(iter):
123
+ sample = {
124
+ "id": task.datasets[args.subset].ids[sample["id"].item()],
125
+ "net_input": {
126
+ "src_tokens": sample["net_input"]["src_tokens"],
127
+ "src_lengths": sample["net_input"]["src_lengths"],
128
+ },
129
+ }
130
+ dataset.append(sample)
131
+ if i == limit_size - 1:
132
+ break
133
+
134
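+ # Optionally keep only samples whose ids appear in a reference dataset; ids may
+ # differ by a split prefix (e.g. a leading "dev_"), hence the extra membership checks.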
+ if ref_dataset:
135
+ try:
136
+ ids = get_ids_from_dataset(ref_dataset)
137
+ except Exception as e:
138
+ raise Exception(f"{e} - Cannot extract ids from reference dataset")
139
+
140
+ filtered_dataset = []
141
+ for sample in dataset:
142
+ if (
143
+ sample["id"] in ids
144
+ or sample["id"][5:] in ids
145
+ or f"dev_{sample['id']}" in ids
146
+ ):
147
+ filtered_dataset.append(sample)
148
+ dataset = filtered_dataset
149
+
150
+ max_len, min_len, avg_len = get_dataset_stats(dataset)
151
+ print(
152
+ f"{args.subset} dataset stats : num_samples={len(dataset)} max_len = {max_len} min_len = {min_len} avg_len = {avg_len}"
153
+ )
154
+
155
+ return dataset
156
+
157
+
158
+ def randomly_sample_subset(dataset, size=500):
159
+ """Randomly sample subset from a dataset"""
160
+ random_indices = [random.randint(0, len(dataset) - 1) for i in range(size)]
161
+ return [dataset[i] for i in random_indices]
162
+
163
+
164
+ def get_short_data_subset(dataset, size=500):
165
+ """Get a subset of desired size by sorting based on src_lengths"""
166
+ return sort_dataset(dataset)[:size]
167
+
168
+
169
+ def get_long_data_subset(dataset, size=500):
170
+ """Get a subset of desired size by sorting based on src_lengths descending"""
171
+ return sort_dataset(dataset, reverse=True)[:size]
172
+
173
+
174
+ def sort_dataset(dataset, reverse=False):
175
+ return sorted(
176
+ dataset, key=lambda x: x["net_input"]["src_lengths"].item(), reverse=reverse
177
+ )
178
+
179
+
180
+ def save_dataset_npy(dataset, file_name):
181
+ """Save a dataset as .npy file"""
182
+ np.save(file_name, dataset)
183
+
184
+
185
+ def get_dataset_stats(dataset):
186
+ """Get stats about dataset based on src_lengths of samples"""
187
+ max_len = 0
188
+ min_len = 100000
189
+ avg_len = 0
190
+ for d in dataset:
191
+ max_len = max(max_len, d["net_input"]["src_lengths"].item())
192
+ min_len = min(min_len, d["net_input"]["src_lengths"].item())
193
+ avg_len += d["net_input"]["src_lengths"].item()
194
+
195
+ return max_len, min_len, avg_len / len(dataset)
196
+
197
+
198
+ def make_parser():
199
+ """
200
+ Additional args:
201
+ 1. Provide the dataset dir path using --data.
202
+ 2. Loading the dataset doesn't require config, provide --config-yaml to apply additional feature transforms
203
+ """
204
+ parser = options.get_speech_generation_parser()
205
+ parser.add_argument(
206
+ "--subset",
207
+ default=None,
208
+ type=str,
209
+ required=True,
210
+ help="Subset to use for dataset generation",
211
+ )
212
+ parser.add_argument(
213
+ "--dataset-save-dir",
214
+ default=None,
215
+ type=str,
216
+ required=False,
217
+ help="Dir path in which the datasets are to be saved",
218
+ )
219
+ parser.add_argument(
220
+ "--ref-dataset",
221
+ default=None,
222
+ type=str,
223
+ required=False,
224
+ help="If provided, the ids in the reference dataset will be used to filter the new dataset generated.",
225
+ )
226
+ parser.add_argument("--dataset-save-token", default="", type=str, required=False)
227
+
228
+ options.add_generation_args(parser)
229
+ return parser
230
+
231
+
232
+ def get_ids_from_dataset(dataset):
233
+ return {sample["id"]: 1 for sample in dataset}
234
+
235
+
236
+ def cli_main():
237
+ parser = make_parser()
238
+ args = options.parse_args_and_arch(parser)
239
+ dataset = load_dataset_task(args)
240
+
241
+ random_dataset = randomly_sample_subset(dataset)
242
+ short_dataset = get_short_data_subset(dataset)
243
+ long_dataset = get_long_data_subset(dataset)
244
+
245
+ if args.dataset_save_token:
246
+ args.dataset_save_token = f"_{args.dataset_save_token}_"
247
+
248
+ if args.dataset_save_dir:
249
+ save_dataset_npy(
250
+ random_dataset,
251
+ f"{args.dataset_save_dir}/random_dataset{args.dataset_save_token}w_ids.npy",
252
+ )
253
+ save_dataset_npy(
254
+ short_dataset,
255
+ f"{args.dataset_save_dir}/short_dataset{args.dataset_save_token}w_ids.npy",
256
+ )
257
+ save_dataset_npy(
258
+ long_dataset,
259
+ f"{args.dataset_save_dir}/long_dataset{args.dataset_save_token}w_ids.npy",
260
+ )
261
+
262
+
263
+ if __name__ == "__main__":
264
+ cli_main()
fairseq/examples/speech_to_speech/benchmarking/get_metrics.py ADDED
@@ -0,0 +1,162 @@
1
+ import copy
2
+ import torch
3
+ import logging
4
+ from argparse import Namespace
5
+ import yaml
6
+ from fairseq import options
7
+ from examples.speech_to_speech.benchmarking.core import (
8
+ Processing,
9
+ SpeechGeneration,
10
+ Cascaded2StageS2ST,
11
+ Cascaded3StageS2ST,
12
+ S2UT,
13
+ )
14
+ from examples.speech_to_speech.benchmarking.data_utils import (
15
+ load_dataset_npy,
16
+ load_dataset_raw_to_waveforms,
17
+ )
18
+
19
+
20
+ logging.basicConfig()
21
+ logging.root.setLevel(logging.INFO)
22
+ logging.basicConfig(level=logging.INFO)
23
+ logger = logging.getLogger(__name__)
24
+
25
+ torch.manual_seed(1)
26
+ torch.set_deterministic(True)
27
+
28
+
29
+ def make_parser():
30
+ """Note: As the names indicate use s2x_args(ex:ST, ASR etc) for models with speech input,
31
+ x2s_args for models with speech output(ex:TTS) and mt_args for translation models (ex: mt, T2U etc).
32
+ For direct S2ST models, use x2s_args to provide model details.
33
+ """
34
+ parser = options.get_speech_generation_parser()
35
+ parser.add_argument("--target-is-code", action="store_true", default=False)
36
+ parser.add_argument("--config", type=str)
37
+ parser.add_argument(
38
+ "--model-type",
39
+ default="S2U",
40
+ choices=["S2S", "TTS", "S2UT", "MT", "S2T", "2StageS2ST", "3StageS2ST"],
41
+ help="Choose one of the models. For model inference implementation, refer to core.py",
42
+ )
43
+ parser.add_argument(
44
+ "--dataset-path",
45
+ type=str,
46
+ help="""File to load dataset from. Assumes dataset is a list of samples.
47
+ Each sample is a dict of format {'net_input':{'src_tokens':torch.tenor(),'src_lengths':torch.tensor()}}""",
48
+ )
49
+ parser.add_argument(
50
+ "--dataset-type",
51
+ type=str,
52
+ default="npy",
53
+ choices=["npy", "raw"],
54
+ help="""Type of input dataset file""",
55
+ )
56
+ parser.add_argument(
57
+ "--read-using-sf",
58
+ type=str,
59
+ default=False,
60
+ help="""If sound file should be used to read the raw dataset""",
61
+ )
62
+ parser.add_argument(
63
+ "--dataset-size",
64
+ default=None,
65
+ type=int,
66
+ help="Dataset size to use for benchmarking",
67
+ )
68
+ parser.add_argument(
69
+ "--dump-speech-waveforms-dir",
70
+ default=None,
71
+ type=str,
72
+ help="Directory to dump the speech waveforms computed on the dataset.",
73
+ )
74
+ parser.add_argument(
75
+ "--dump-waveform-file-prefix",
76
+ default="",
77
+ type=str,
78
+ help="File name prefix for the saved speech waveforms",
79
+ )
80
+ parser.add_argument(
81
+ "--feat-dim", default=80, type=int, help="Input feature dimension"
82
+ )
83
+ parser.add_argument(
84
+ "--target-sr",
85
+ default=16000,
86
+ type=int,
87
+ help="Target sample rate for dumping waveforms",
88
+ )
89
+
90
+ options.add_generation_args(parser)
91
+ options.get_interactive_generation_parser(parser)
92
+ return parser
93
+
94
+
95
+ def cli_main():
96
+ parser = make_parser()
97
+ args = options.parse_args_and_arch(parser)
98
+
99
+ with open(
100
+ args.config,
101
+ "r",
102
+ ) as f:
103
+ config = yaml.load(f, Loader=yaml.FullLoader)
104
+ dict_args = vars(args)
105
+ dict_args.update(config["general"])
106
+ args = Namespace(**dict_args)
107
+
108
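+ # Build one Namespace per cascade stage: start from the shared/general args and
+ # overlay the stage-specific config section (stage1, stage2, stage3) when present.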
+ i = 1
109
+ stage_args = []
110
+ while i <= 3:
111
+ var = f"stage{i}"
112
+ tmp_args = copy.deepcopy(dict_args)
113
+ if var in config:
114
+ tmp_args.update(config[var])
115
+ stage_args.append(Namespace(**tmp_args))
116
+ i += 1
117
+ else:
118
+ break
119
+
120
+ if args.model_type == "S2S" or args.model_type == "TTS":
121
+ model = SpeechGeneration(stage_args[0])
122
+ elif args.model_type == "S2UT":
123
+ model = S2UT(stage_args[0], stage_args[1] if len(stage_args) > 1 else None)
124
+ elif args.model_type == "MT" or args.model_type == "S2T":
125
+ model = Processing(stage_args[0])
126
+ elif args.model_type == "2StageS2ST":
127
+ model = Cascaded2StageS2ST(stage_args[0], stage_args[1])
128
+ elif args.model_type == "3StageS2ST":
129
+ model = Cascaded3StageS2ST(stage_args[0], stage_args[2], stage_args[1])
130
+ else:
131
+ raise Exception(f"Currently unsupported model type {args.model_type}")
132
+
133
+ print(f"Evaluating on dataset - {args.dataset_path}\n")
134
+
135
+ if args.dataset_type == "npy":
136
+ dataset = load_dataset_npy(args.dataset_path, dataset_size=args.dataset_size)
137
+ elif args.dataset_type == "raw":
138
+ dataset = load_dataset_raw_to_waveforms(
139
+ args.dataset_path,
140
+ dataset_size=args.dataset_size,
141
+ read_using_soundfile=args.read_using_sf,
142
+ )
143
+ else:
144
+ raise Exception(f"Invalid dataset type {args.dataset_type}")
145
+
146
+ model.warm_up(sample=dataset[0], repeat=2)
147
+
148
+ run_time, memory, flops = model.gather_all_metrics(dataset, repeat=1)
149
+ print(f"run_time = {run_time}sec \tmemory = {memory}MiB \tflops = {flops}")
150
+
151
+ if args.dump_speech_waveforms_dir:
152
+ model.dump_final_speech_output(
153
+ dataset,
154
+ args.dump_speech_waveforms_dir,
155
+ lambda x: x,
156
+ args.target_sr,
157
+ prefix=args.dump_waveform_file_prefix,
158
+ )
159
+
160
+
161
+ if __name__ == "__main__":
162
+ cli_main()
fairseq/examples/speech_to_speech/docs/data_augmentation.md ADDED
@@ -0,0 +1,435 @@
1
+ # Noise and audio augmentation techniques
2
+
3
+ The noise and data augmentation techniques were written in an effort to understand how augmentation can affect model robustness and performance in both clean and noisy settings.
4
+
5
+ All transforms discussed in this section are subclasses of `AudioFeatureTransform`, `AudioWaveformTransform`, or `AudioDatasetTransform`. Each `Audio*Transform` interacts with the data differently. If you are interested in implementing your own transforms, it is highly advisable to review the differences (see [Adding your own transforms](https://github.com/facebookresearch/fairseq/blob/main/examples/speech_to_speech/docs/data_augmentation.md#adding-your-own-transforms)). If you are only applying the in-built transforms, you only need to make sure that the correct kind of transform is listed in the config (see [Using transforms](https://github.com/facebookresearch/fairseq/blob/main/examples/speech_to_speech/docs/data_augmentation.md#using-transforms)). These transforms can be applied to instances of `SpeechToTextDataset`.
6
+
7
+ ### Contents
8
+ [In-built transforms](https://github.com/facebookresearch/fairseq/blob/main/examples/speech_to_speech/docs/data_augmentation.md#in-built-transforms)
9
+
10
+ [Benchmark studies](https://github.com/facebookresearch/fairseq/blob/main/examples/speech_to_speech/docs/data_augmentation.md#benchmark-studies)
11
+
12
+ [Using transforms](https://github.com/facebookresearch/fairseq/blob/main/examples/speech_to_speech/docs/data_augmentation.md#using-transforms)
13
+
14
+ [Adding your own transforms](https://github.com/facebookresearch/fairseq/blob/main/examples/speech_to_speech/docs/data_augmentation.md#adding-your-own-transforms)
15
+
16
+
17
+ ## In-built transforms
18
+ ### 1. Utterance concatenation
19
+ Utterance concatenation is a data augmentation technique introduced as ConcatAug in [Translatotron 2: High-quality direct speech-to-speech translation
20
+ with voice preservation](https://arxiv.org/pdf/2107.08661.pdf).
21
+ With some parameterized probability, samples are concatenated with one other randomly chosen sample from the whole dataset. In the positive (concatenation) case, accessing `dataset[i]` will return a `SpeechToTextDatasetItem` where `source=source[i]+source[j]` and `target=target[i]+target[j]`. In the negative (skip concatenation) case, accessing `dataset[i]` will return a `SpeechToTextDatasetItem` where `source=source[i]` and `target=target[i]` as usual.
22
+
23
+ **Usage**: `concataugment` is an `AudioDatasetTransform` and has three configurable hyperparameters:
24
+ - `rate`: probability that any single access will result in the positive (concatenation) case. Defaults to 0.25.
25
+ - `max_tokens`: maximum number of tokens allowed for concatenated source sequences. This parameter is meant to limit the length of concatenated samples to avoid out-of-memory errors. Defaults to 3000.
26
+ - `attempts`: maximum number of invalid concatenation attempts before defaulting to the negative (skip concatenation) case. This parameter aims to limit excessive time spent trying to find candidate samples that are short enough to concatenate with. Defaults to 5.
27
+
28
+ Please be wary of OOMs while using this augmentation technique; we used smaller batch sizes as a workaround. The effective batch size is determined by the update frequency, the batch size hyperparameter, and the number of GPUs, so you may want to adjust these accordingly.
29
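+
+ As a toy illustration of the access semantics above (plain Python, no fairseq dependency; all names are made up for this sketch):
+ ```python
+ import random
+
+ def concat_access(sources, targets, i, rate=0.25, max_tokens=3000, attempts=5):
+     """Return (source, target) for index i, possibly concatenated with a random j."""
+     if random.random() < rate:  # positive (concatenation) case
+         for _ in range(attempts):
+             j = random.randrange(len(sources))
+             if j != i and len(sources[i]) + len(sources[j]) <= max_tokens:
+                 return sources[i] + sources[j], targets[i] + targets[j]
+     return sources[i], targets[i]  # negative (skip concatenation) case
+ ```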
+
30
+ ### 2. Noise augmentation suite
31
+
32
+ The four noise augmentation methods in this suite adhere to the following principle: with some parameterized probability, samples are overlaid with a noise track. The content of the noise track is specific to the method. The signal-to-noise ratio at which the noise track is overlaid is determined by drawing a value from a uniform distribution with parameterized endpoints. The first three methods are based on data augmentation methods suggested in Section 3.3 of [X-Vectors: Robust DNN Embeddings for Speaker Recognition](https://danielpovey.com/files/2018_icassp_xvectors.pdf).
33
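+
+ The SNR arithmetic shared by all four methods can be sketched as follows (a standalone numpy illustration with synthetic signals, not the library's exact implementation):
+ ```python
+ import numpy as np
+
+ def mix_at_snr(source, noise, snr_db):
+     """Overlay `noise` on `source` at the requested signal-to-noise ratio (dB)."""
+     reps = int(np.ceil(len(source) / len(noise)))
+     noise = np.tile(noise, reps)[: len(source)]  # repeat/trim the noise track to length
+     source_power = np.mean(source ** 2) + 1e-10
+     noise_power = np.mean(noise ** 2) + 1e-10
+     scale = np.sqrt(source_power / (noise_power * 10 ** (snr_db / 10)))
+     return source + scale * noise
+
+ rng = np.random.default_rng(0)
+ src, noi = rng.standard_normal(16000), rng.standard_normal(8000)
+ mixed = mix_at_snr(src, noi, snr_db=rng.uniform(5, 15))  # SNR drawn from [snr_min, snr_max]
+ ```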
+
34
+ #### 2.1. Music augmentation
35
+ For music augmentation, the noise track consists of one file uniformly randomly selected from a corpus of music files. The music file is cut to size, including being repeated to fill the original sample length if necessary.
36
+
37
+ **Usage**: `musicaugment` is an `AudioWaveformTransform` and has four configurable hyperparameters:
38
+ - `samples_path`: path where background music files are saved as audios (.wav files). No default.
39
+ - `rate`: probability that any single access will result in the positive (background music) case. Defaults to 0.25.
40
+ - `snr_min`: lower endpoint of the range from which a signal-to-noise ratio is uniformly randomly chosen with which to add background noise to the original source. Defaults to 5.
41
+ - `snr_max`: higher endpoint of the range from which a signal-to-noise ratio is uniformly randomly chosen with which to add background noise to the original source. Defaults to 15.
42
+
43
+ #### 2.2. Babble augmentation
44
+ For babble augmentation, the noise track consists of multiple audios uniformly randomly selected from a corpus of speech files. The number of speech audios in the background track is chosen randomly with equal probability between 3 and 7 audios.
45
+
46
+ **Usage**: `babbleaugment` is an `AudioWaveformTransform` and has four configurable hyperparameters:
47
+ - `samples_path`: path where background speech files are saved as audios (.wav files). No default.
48
+ - `rate`: probability that any single access will result in the positive (background speech) case. Defaults to 0.25.
49
+ - `snr_min`: lower endpoint of the range from which a signal-to-noise ratio is uniformly randomly chosen with which to add background noise to the original source. Defaults to 5.
50
+ - `snr_max`: higher endpoint of the range from which a signal-to-noise ratio is uniformly randomly chosen with which to add background noise to the original source. Defaults to 15.
51
+
52
+ #### 2.3. Sporadic noise augmentation
53
+ For sporadic noise augmentation, the noise track is mostly silent except for intermittent short clips of noise which are added at roughly a parameterized frequency. These clips are randomly chosen and cut from a corpus of noise files to lengths according to a parameterized Gaussian distribution.
54
+
55
+ **Usage**: `sporadicnoiseaugment` is an `AudioWaveformTransform` and has seven configurable hyperparameters:
56
+ - `samples_path`: path where background noise files are saved as audios (.wav files). No default.
57
+ - `rate`: probability that any single access will result in the positive (add a sporadic noise track) case. Defaults to 0.25.
58
+ - `snr_min`: lower endpoint of the range from which a signal-to-noise ratio is uniformly randomly chosen with which to add background noise to the original source. Defaults to 5.
59
+ - `snr_max`: higher endpoint of the range from which a signal-to-noise ratio is uniformly randomly chosen with which to add background noise to the original source. Defaults to 15.
60
+ - `noise_rate`: rate in noises per second at which noise clip will be added to the original sample
61
+ - `noise_len_mean`: mean of Gaussian normal distribution from which length of noise clip is chosen
62
+ - `noise_len_std`: standard deviation of Gaussian normal distribution from which length of noise clip is chosen
63
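+
+ A rough sketch of how such a mostly-silent noise track could be assembled (illustrative only; the defaults below and the assumption that clip lengths are given in seconds are ours, not the library's):
+ ```python
+ import numpy as np
+
+ def sporadic_noise_track(num_samples, sample_rate, noise_pool, noise_rate=0.25,
+                          noise_len_mean=0.2, noise_len_std=0.05, rng=None):
+     """Place short noise clips on an otherwise silent track at ~noise_rate clips/sec."""
+     rng = rng or np.random.default_rng()
+     track = np.zeros(num_samples)
+     n_clips = max(1, int(noise_rate * num_samples / sample_rate))
+     for _ in range(n_clips):
+         clip_len = max(1, int(rng.normal(noise_len_mean, noise_len_std) * sample_rate))
+         src = noise_pool[rng.integers(len(noise_pool))]
+         clip = src[: min(clip_len, num_samples)]
+         start = rng.integers(0, max(1, num_samples - len(clip)))
+         track[start : start + len(clip)] += clip
+     return track  # mixed into the source at a sampled SNR, as in the other methods
+ ```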
+
64
+ #### 2.4. Background noise augmentation
65
+ For background noise augmentation, the noise track is a single track uniformly randomly selected from a corpus of noise files. The noise file is cut to size, including being repeated to fill the original sample length if necessary.
66
+
67
+ **Usage**: `backgroundnoiseaugment` is an `AudioWaveformTransform` and has four configurable hyperparameters:
68
+ - `samples_path`: path where background noise files are saved as audios (.wav files). No default.
69
+ - `rate`: probability that any single access will result in the positive (background noise) case. Defaults to 0.25.
70
+ - `snr_min`: lower endpoint of the range from which a signal-to-noise ratio is uniformly randomly chosen with which to add background noise to the original source. Defaults to 5.
71
+ - `snr_max`: higher endpoint of the range from which a signal-to-noise ratio is uniformly randomly chosen with which to add background noise to the original source. Defaults to 15.
72
+
73
+ ### 3. Mixed babble and background noise augmentation with recognizable source speaker
74
+
75
+ This augmentation technique is based on Algorithm 1 in [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) and is similar to the noise augmentation suite techniques in that it has a background noise track. The noise track consists of either (1) another audio sample from the batch or (2) a background noise track. A key difference is that the length of the noise track is chosen from a uniform random distribution between 0 and half of the original sample length.
76
+
77
+ **Usage**: `noisyoverlapaugment` is an `AudioDatasetTransform` and has seven configurable hyperparameters:
78
+ - `noises_path`: path where background noise files are saved as audios (.wav files). No default.
79
+ - `rate`: probability that any single access will result in the positive (background noise) case. Defaults to 0.25.
80
+ - `mixing_noise_rate`: probability that in a positive (background noise) case, the noise track will consist of background noise (rather than babble from the batch). Defaults to 0.1.
81
+ - `noise_snr_min`: lower endpoint of the range from which a signal-to-noise ratio is uniformly randomly chosen with which to add background noise to the original source. Defaults to -5.
82
+ - `noise_snr_max`: higher endpoint of the range from which a signal-to-noise ratio is uniformly randomly chosen with which to add background noise to the original source. Defaults to 5.
83
+ - `utterance_snr_min`: lower endpoint of the range from which a signal-to-noise ratio is uniformly randomly chosen with which to add **another audio from the batch** to the original source. Defaults to -5.
84
+ - `utterance_snr_max`: higher endpoint of the range from which a signal-to-noise ratio is uniformly randomly chosen with which to add **another audio from the batch** to the original source. Defaults to 5.
85
+
86
+ ## Benchmark studies
87
+ ### Evaluation on clean data
88
+ Augmentation in training data|Hyperparameters|Training loss|BLEU (covost)|BLEU (epst)|BLEU (mtedx)
89
+ ---|---|---|---|---|---
90
+ None||3.954|24.984|23.962|24.448
91
+ ConcatAugment|rate = 0.25, max_tokens = 3000, attempts = 5|3.940|25.322|26.124|26.19
92
+ BabbleAugment|rate = 0.25, MUSAN speech, snr_min = (-5), snr_max = 5|3.957|24.226|23.186|22.368|
93
+ BackgroundNoiseAugment|rate = 0.1, MUSAN noises, snr_min = (-10), snr_max = 10|3.955|24.745|23.513|23.819
94
+ MusicAugment|rate = 0.25, MUSAN music, snr_min = 0, snr_max = 20|3.954|25.096|24.301|23.341|
95
+ SporadicNoiseAugment|rate = 0.1, noise_rate = 0.25, MUSAN noises, snr_min = 10, snr_max = 35|3.954|24.924|23.951|23.484|
96
+ MusicAugment + BabbleAugment + BackgroundNoiseAugment + SporadicNoiseAugment|as above, except limited rates to sum to 0.25: music (0.074), background (0.029), babble (0.074), sporadic (0.029)|3.953|24.874|23.675|24.249|
97
+ NoisyOverlapAugment|rate = 0.25, mixing_noise_rate = 0.5, MUSAN noises, utterance_snr_min = (-10), utterance_snr_max = 0, noise_snr_min = (-5), noise_snr_max = 20|3.954|24.949|24.015|23.768|
98
+
99
+ ### Evaluation on data with music noise added at SNR = (-5) - 5
100
+ Augmentation in training data|Training loss|BLEU (covost)|BLEU (epst)|BLEU (mtedx)
101
+ ---|---|---|---|---
102
+ None|3.954|15.785|21.105|16.944
103
+ ConcatAugment|3.940|17.186|23.255|18.24
104
+ BabbleAugment|3.957|19.158|22.064|17.116
105
+ BackgroundNoiseAugment|3.955|17.777|22.0|17.535|
106
+ MusicAugment|3.954|20.345|23.126|19.433|
107
+ SporadicNoiseAugment|3.954|15.927|21.382|14.736|
108
+ MusicAugment + BabbleAugment + BackgroundNoiseAugment + SporadicNoiseAugment|3.953|19.724|22.659|17.852|
109
+ NoisyOverlapAugment|3.954|17.49|22.142|17.207|
110
+
111
+ ### Evaluation on data with babble noise added at SNR = (-5) - 5
112
+ Augmentation in training data|Training loss|BLEU (covost)|BLEU (epst)|BLEU (mtedx)
113
+ ---|---|---|---|---
114
+ None|3.954|4.092|13.514|5.13
115
+ ConcatAugment|3.940|5.493|15.835|6.893
116
+ BabbleAugment|3.957|16.12|21.097|13.996
117
+ BackgroundNoiseAugment|3.955|4.691|15.784|5.982
118
+ MusicAugment|3.954|8.06|17.764|9.008
119
+ SporadicNoiseAugment|3.954|4.009|13.935|4.814
120
+ MusicAugment + BabbleAugment + BackgroundNoiseAugment + SporadicNoiseAugment|3.953|14.692|20.882|14.45
121
+ NoisyOverlapAugment|3.954|4.032|16.434|7.284
122
+
123
+ ### Evaluation on data with sporadic noise added at SNR = (-5) - 5
124
+ Augmentation in training data|Training loss|BLEU (covost)|BLEU (epst)|BLEU (mtedx)
125
+ ---|---|---|---|---
126
+ None|3.954|23.778|23.745|22.748
127
+ ConcatAugment|3.940|24.239|25.907|25.723
128
+ BabbleAugment|3.957|23.42|23.048|21.076
129
+ BackgroundNoiseAugment|3.955|23.998|23.467|22.494
130
+ MusicAugment|3.954|24.142|24.181|19.143
131
+ SporadicNoiseAugment|3.954|23.97|23.894|22.61
132
+ MusicAugment + BabbleAugment + BackgroundNoiseAugment + SporadicNoiseAugment|3.953|24.118|23.59|23.717
133
+ NoisyOverlapAugment|3.954|24.265|24.103|23.167
134
+
135
+ ### Evaluation on data with background noise added at SNR = (-5) - 5
136
+ Augmentation in training data|Training loss|BLEU (covost)|BLEU (epst)|BLEU (mtedx)
137
+ ---|---|---|---|---
138
+ None|3.954|20.201|22.525|19.66
139
+ ConcatAugment|3.940|20.904|24.706|21.353
140
+ BabbleAugment|3.957|20.687|22.374|18.907
141
+ BackgroundNoiseAugment|3.955|21.574|22.998|20.043
142
+ MusicAugment|3.954|21.65|23.529|19.87
143
+ SporadicNoiseAugment|3.954|20.578|22.577|19.096
144
+ MusicAugment + BabbleAugment + BackgroundNoiseAugment + SporadicNoiseAugment|3.953|21.811|23.144|20.986
145
+ NoisyOverlapAugment|3.954|21.312|23.153|20.302
146
+
147
+ ### Evaluation on data with all four types of noises added at SNR = (-5) - 5, each applied with prob 0.5
148
+ Augmentation in training data|Training loss|BLEU (covost)|BLEU (epst)|BLEU (mtedx)
149
+ ---|---|---|---|---
150
+ None|3.954|10.895|19.319|12.748
151
+ ConcatAugment|3.940|13.517|21.658|15.428
152
+ BabbleAugment|3.957|18.09|21.384|16.018
153
+ BackgroundNoiseAugment|3.955|12.837|20.719|13.933
154
+ MusicAugment|3.954|16.589|21.823|15.927
155
+ SporadicNoiseAugment|3.954|11.238|19.91|13.31
156
+ MusicAugment + BabbleAugment + BackgroundNoiseAugment + SporadicNoiseAugment|3.953|18.636|21.935|17.845
157
+ NoisyOverlapAugment|3.954|12.829|20.856|15.048
158
+
159
+ ### Evaluation on data with noisy overlap augment
160
+ Augmentation in training data|Training loss|BLEU (covost)|BLEU (epst)|BLEU (mtedx)
161
+ ---|---|---|---|---
162
+ None|3.954|21.245|22.24|20.994
163
+ ConcatAugment|3.940|21.611|24.247|23.068
164
+ BabbleAugment|3.957|21.867|21.987|20.099|
165
+ BackgroundNoiseAugment|3.955|21.533|21.806|19.717|
166
+ MusicAugment|3.954|21.823|22.643|20.847|
167
+ SporadicNoiseAugment|3.954|21.373|22.381|20.672|
168
+ MusicAugment + BabbleAugment + BackgroundNoiseAugment + SporadicNoiseAugment|3.953|22.206|22.414|21.375|
169
+ NoisyOverlapAugment|3.954|23.371|23.396|22.627|
170
+
171
+ ## Using transforms
172
+ Transforms are configurable.
173
+
174
+ 1. Please pay careful attention to the type of transform you are applying.
175
+ - `concataugment` and `noisyoverlapaugment` are instances of `AudioDatasetTransform` and should be listed in the config under `dataset_transforms`.
176
+ - `musicaugment`, `babbleaugment`, `sporadicnoiseaugment`, and `backgroundnoiseaugment` are instances of `AudioWaveformTransform` and should be listed under `waveform_transforms`.
177
+ - Instances of `AudioFeatureTransform` should be listed under `feature_transforms`.
178
+ 2. Feel free to apply these augmentations in different contexts, e.g., you may use a `_train` or `_eval` flag to specify when the transform will be applied. If the dataset at hand contains `train` in its name, those transforms under the `_train` flag will be applied; else, the remaining transforms will be applied.
179
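+
+ The split-based selection described in point 2 amounts to something like the following (an illustrative sketch, not the exact fairseq code):
+ ```python
+ def pick_transforms(section_cfg: dict, split_name: str) -> list:
+     """Return the transform names to apply for a given data split."""
+     key = "_train" if "train" in split_name else "_eval"
+     return section_cfg.get(key, [])
+
+ # e.g. pick_transforms({"_train": ["musicaugment"]}, "train_st") -> ["musicaugment"]
+ ```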
+
180
+ For example, you would add this to your config to apply the musicaugment transform to a training dataset:
181
+ ```yaml
182
+ musicaugment:
183
+ samples_path: ${MUSIC_PATH}
184
+ snr_min: 10
185
+ snr_max: 15
186
+ rate: 0.25
187
+ waveform_transforms:
188
+ _train:
189
+ - musicaugment
190
+ ```
191
+ or add this to apply the concataugment transform:
192
+ ```yaml
193
+ concataugment:
194
+ rate: 0.25
195
+ max_tokens: 3000
196
+ attempts: 5
197
+ dataset_transforms:
198
+ _train:
199
+ - concataugment
200
+ ```
201
+ You may also want to add multiple of one type of transform; here, we add multiple `AudioWaveformTransform`s:
202
+ ```yaml
203
+ musicaugment:
204
+ samples_path: ${MUSIC_PATH}
205
+ snr_min: 5
206
+ snr_max: 20
207
+ rate: 0.25
208
+ backgroundnoiseaugment:
209
+ samples_path: ${NOISES_PATH}
210
+ snr_min: 10
211
+ snr_max: 20
212
+ rate: 0.1
213
+ sporadicnoiseaugment:
214
+ samples_path: ${NOISES_PATH}
215
+ snr_min: 5
216
+ snr_max: 15
217
+ rate: 0.1
218
+ noise_rate: 0.25
219
+ waveform_transforms:
220
+ _train:
221
+ - musicaugment
222
+ - backgroundnoiseaugment
223
+ - sporadicnoiseaugment
224
+ ```
225
+
226
+ ## Adding your own transforms
227
+ Note: We store transform implementations in `fairseq/data/audio/*_transforms` directories. You may refer to these as examples while implementing your own transform.
228
+
229
+ ### Step 1. Picking the right class for your transform
230
+ The integration into `SpeechToTextDataset` is quite different for each kind of transform, so it is important to understand which one is best suited to your purposes.
231
+
232
+ **Feature transforms**
233
+ `AudioFeatureTransform` is a base class which allows **some transform to be applied to audio spectrograms** in the data loading step. One thing to note is that the source data is either saved as `np.ndarrays` or as audio files, and is to be returned either as features (spectrogram) or waveform. If and only if the data is to be returned as a spectrogram, then `AudioFeatureTransform`s will be applied.
234
+
235
+ **Waveform transforms**
236
+ `AudioWaveformTransform` is a base class which allows some **transform to be applied to waveforms** in the data loading step. As mentioned above, there are two source and return types to data loading for this dataset. If and only if the data is saved in audio file format, then `AudioWaveformTransform`s will be applied, whichever return type is used.
237
+
238
+ **Dataset transforms**
239
+ `AudioDatasetTransform` is a base class for transforms **based on more than one item in a dataset**, ex. concatenation of two random samples in a dataset. Rather than being applied in a consistent way, i.e., to all features or to all waveforms, the integration of a dataset transform is entirely specific. Adding a dataset transform requires actually editing the `fairseq/data/audio/speech_to_text_dataset.py` file.
240
+
241
+ ### Step 2. Setting up your transform (generic to all types of transforms)
242
+ Now that you know which kind of transform you would like to use, we are ready to implement it. This step is generic for all transform types, i.e., `TRANSFORM_TYPE` may be any of `feature`, `waveform`, or `dataset`. We will show how to build utterance concatenation (an `AudioDatasetTransform`) as an example.
243
+
244
+ Import the base class and registration function for your transform.
245
+ ```python
246
+ from fairseq.data.audio.dataset_transforms import (
247
+ AudioDatasetTransform,
248
+ register_audio_dataset_transform
249
+ )
250
+ ```
251
+
252
+ Define the class and register the transform. The name passed into the registration function is how your transform should be named in the config.
253
+ ```python
254
+ @register_audio_dataset_transform("concataugment")
255
+ class ConcatAugment(AudioDatasetTransform):
256
+ ```
257
+
258
+ We are now ready to add the basic important functions to our new class. In this example, `_DEFAULTS` refers to a dictionary with the default hyperparameter values that we defined. `from_config_dict` is called to instantiate the transform given hyperparameters from the config.
259
+ ```python
260
+ @classmethod
261
+ def from_config_dict(cls, config=None):
262
+ _config = {} if config is None else config
263
+ return ConcatAugment(
264
+ _config.get("rate", _DEFAULTS["rate"]),
265
+ _config.get("max_tokens", _DEFAULTS["max_tokens"]),
266
+ _config.get("attempts", _DEFAULTS["attempts"]),
267
+ )
268
+ ```
269
+ We edit the instantiation function `__init__` to track hyperparameters and do any setup work.
270
+ ```python
271
+ def __init__(
272
+ self,
273
+ rate=_DEFAULTS["rate"],
274
+ max_tokens=_DEFAULTS["max_tokens"],
275
+ attempts=_DEFAULTS["attempts"],
276
+ ):
277
+ self.rate, self.max_tokens, self.attempts = rate, max_tokens, attempts
278
+ ```
279
+ Lastly `__repr__` gives how the transform will be reported in an output log.
280
+ ```python
281
+ def __repr__(self):
282
+ return (
283
+ self.__class__.__name__
284
+ + "("
285
+ + ", ".join(
286
+ [
287
+ f"rate={self.rate}",
288
+ f"max_tokens={self.max_tokens}",
289
+ f"attempts={self.attempts}",
290
+ ]
291
+ )
292
+ + ")"
293
+ )
294
+ ```
295
+
296
+ ### Step 3. Adding the transform logic
297
+ At this point, we are ready to implement the actual transform logic. The flow from here is different for each of the three transforms, so follow the path that is relevant to you.
298
+ ### ...for feature transforms
299
+ The final step is implementing the `__call__` function, which applies the transform logic and **returns** the spectrogram with the transform applied. It should take exactly **two arguments**:
300
+ - `self`
301
+ - `x` (np.ndarray): the spectrogram for one source sample. (This is a positional argument, so you can use another parameter name like `spectrogram` instead of `x`.)
302
+
303
+ For example, this is the `__call__` function for GlobalCMVN (cepstral mean and variance normalization).
304
+ ```python
305
+ def __call__(self, x):
306
+ x = np.subtract(x, self.mean)
307
+ x = np.divide(x, self.std)
308
+ return x
309
+
310
+ ```
311
+ ### ...for waveform transforms
312
+ The final step is implementing the `__call__` function, which applies the transform logic. It should take exactly **three arguments**:
313
+ - `self`
314
+ - `source` (numpy.ndarray or torch.Tensor): source audio 2d waveform (channels x length)
315
+ - `sample_rate` (optional, defaults to None): sample rate of `source`
316
+
317
+ `__call__` **returns**:
318
+ - transformed audio waveform
319
+ - sample rate of transformed audio waveform
320
+
321
+ For example, this is the `__call__` function for augmentations in the Noise Augmentation Suite.
322
+ ```python
323
+ def __call__(self, source, sample_rate=None):
324
+ if np.random.random() > self.rate:
325
+ return source, sample_rate
326
+
327
+ noise = self._get_noise(
328
+ source.shape, always_2d=True, use_sample_rate=sample_rate
329
+ )
330
+ return self._mix(source, noise, rand_uniform(self.snr_min, self.snr_max)), sample_rate
331
+ ```
332
+
333
+ ### ...for dataset transforms
334
+ Dataset transforms are extremely flexible, and implementation involves directly integrating them into `fairseq/data/audio/speech_to_text_dataset.py` in transform-specific ways.
335
+ There are two basic components: (1) check whether or not this transform is part of this dataset instance using `self.dataset_transforms.has_transform(TRANSFORM_CLS)`, and (2) if so, get the transform using `self.dataset_transforms.get_transform(TRANSFORM_CLS)` & apply it.
336
+ Due to the case-by-case specificity, it is easier to demonstrate this by examples.
337
+
338
+ #### Example: NoisyOverlapAugment
339
+ This transform requires access to multiple items within the same batch at once.
340
+
341
+ **Logic**: We still use the transform classes to keep away the transform logic. For example, `__call__` of `NoisyOverlapAugment` class takes a list of source tokens for items in a mini-batch, applies noise/utterance as dictated by the transform, and returns the list of transformed source tokens for items in the mini-batch.
342
+
343
+ ```python
344
+ def __call__(self, sources):
345
+ for i, source in enumerate(sources):
346
+ if np.random.random() > self.rate:
347
+ continue
348
+
349
+ pri = source.numpy()
350
+
351
+ # ... some transform code omitted
352
+
353
+ pri[s_source : s_source + l] = np.add(
354
+ pri[s_source : s_source + l], np.multiply(scl, sec[s_sec : s_sec + l])
355
+ )
356
+ sources[i] = torch.from_numpy(pri).float()
357
+
358
+ return sources
359
+ ```
360
+
361
+ **Integration**: The `collater` function for `SpeechToTextDataset` is responsible for preparing a mini-batch for training, so we integrate NOAug through adding a few lines to the top of this function:
362
+ ```python
363
+ def collater(
364
+ self, samples: List[SpeechToTextDatasetItem], return_order: bool = False
365
+ ) -> Dict:
366
+ if len(samples) == 0:
367
+ return {}
368
+ indices = torch.tensor([x.index for x in samples], dtype=torch.long)
369
+
370
+ sources = [x.source for x in samples]
371
+
372
+ # NOAUG INTEGRATION BLOCK
373
+ # (1) Check whether or not this transform is part of this dataset instance
374
+ has_NOAug = self.dataset_transforms.has_transform(NoisyOverlapAugment)
375
+ # (2) If so, get & apply the transform
376
+ if has_NOAug and self.cfg.use_audio_input:
377
+ NOAug = self.dataset_transforms.get_transform(NoisyOverlapAugment)
378
+ sources = NOAug(sources)
379
+
380
+ frames = _collate_frames(sources, self.cfg.use_audio_input)
381
+ # sort samples by descending number of frames
382
+ n_frames = torch.tensor([x.size(0) for x in sources], dtype=torch.long)
383
+ n_frames, order = n_frames.sort(descending=True)
384
+ indices = indices.index_select(0, order)
385
+ frames = frames.index_select(0, order)
386
+
387
+ # ... rest of function
388
+ ```
389
+
390
+ #### Example: ConcatAugment
391
+ This transform requires access to another item within the dataset at once.
392
+
393
+ **Logic**: We abstract the logic for picking indices to concatenate by adding a `find_indices` function to the `ConcatAugment` class, which takes one index in the dataset and finds a compatible second index to concatenate source and target tokens.
394
+ ```python
395
+ def find_indices(self, index: int, n_frames: List[int], n_samples: int):
396
+ # skip conditions: application rate, max_tokens limit exceeded
397
+ if np.random.random() > self.rate:
398
+ return [index]
399
+ if self.max_tokens and n_frames[index] > self.max_tokens:
400
+ return [index]
401
+
402
+ # pick second sample to concatenate
403
+ for _ in range(self.attempts):
404
+ index2 = np.random.randint(0, n_samples)
405
+ if index2 != index and (
406
+ not self.max_tokens
407
+ or n_frames[index] + n_frames[index2] < self.max_tokens
408
+ ):
409
+ return [index, index2]
410
+
411
+ return [index]
412
+ ```
413
+
414
+ **Integration**: `SpeechToTextDataset` uses a custom `__getitem__(self, index)` function (called in the background when you write `dataset[i]`). We edited this function (as well as `_get_source_audio` and `get_tokenized_tgt_text`) to achieve the desired transform effect where accessing `dataset[i]` will return a `SpeechToTextDatasetItem` where `source=source[i]+source[j]` and `target=target[i]+target[j]`.
415
+ ```python
416
+ def __getitem__(self, index: int) -> SpeechToTextDatasetItem:
417
+
418
+ # CONCATAUGMENT INTEGRATION BLOCK
419
+ # (1) Check whether or not this transform is part of this dataset instance
420
+ has_concat = self.dataset_transforms.has_transform(ConcatAugment)
421
+ # (2) If so, get & apply the transform
422
+ if has_concat:
423
+ concat = self.dataset_transforms.get_transform(ConcatAugment)
424
+ indices = concat.find_indices(index, self.n_frames, self.n_samples)
425
+
426
+ source = self._get_source_audio(indices if has_concat else index)
427
+ source = self.pack_frames(source)
428
+
429
+ target = None
430
+ if self.tgt_texts is not None:
431
+ tokenized = self.get_tokenized_tgt_text(indices if has_concat else index)
432
+ target = self.tgt_dict.encode_line(
433
+
434
+ # ... rest of function
435
+ ```
fairseq/examples/speech_to_speech/docs/direct_s2st_discrete_units.md ADDED
@@ -0,0 +1,181 @@
1
+ # Direct speech-to-speech translation with discrete units
2
+
3
+ We provide the implementation for speech-to-unit translation (S2UT) proposed in "[Direct speech-to-speech translation with discrete units (Lee et al. 2021)](https://arxiv.org/abs/2107.05604)" and also the transformer-based implementation of the speech-to-spectrogram translation (S2SPECT, or transformer-based [Translatotron](https://arxiv.org/abs/1904.06037)) baseline in the paper.
4
+
5
+ ## Pretrained Models
6
+
7
+ ### Unit-based HiFi-GAN Vocoder
8
+ Unit config | Unit size | Vocoder dataset | Model
9
+ |---|---|---|---
10
+ [HuBERT Base, Librispeech](https://github.com/fairinternal/fairseq-py/tree/main/examples/hubert), layer 6 | 100 | [LJSpeech](https://keithito.com/LJ-Speech-Dataset/) | [ckpt](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/vocoder/code_hifigan/hubert_base_100_lj/g_00500000), [config](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/vocoder/code_hifigan/hubert_base_100_lj/config.json)
11
+
12
+
13
+ ## Data preparation
14
+ ### Target speech
15
+ 0. (optional) To prepare S2S data from a speech-to-text translation (ST) dataset, see [fairseq-S^2](https://github.com/pytorch/fairseq/tree/main/examples/speech_synthesis) for pre-trained TTS models and instructions on how to train and decode TTS models.
16
+ 1. Prepare two folders, `$SRC_AUDIO` and `$TGT_AUDIO`, with `${SPLIT}/${SAMPLE_ID}.wav` for source and target speech under each folder, separately. Note that for S2UT experiments, the target audio sampling rate should be 16,000 Hz, and for S2SPECT experiments, the target audio sampling rate is recommended to be 22,050 Hz.
17
+ 2. To prepare target discrete units for S2UT model training, see [Generative Spoken Language Modeling (speech2unit)](https://github.com/pytorch/fairseq/tree/main/examples/textless_nlp/gslm/speech2unit) for pre-trained k-means models, checkpoints, and instructions on how to decode units from speech. Set the output target unit files (`--out_quantized_file_path`) as `${TGT_AUDIO}/${SPLIT}.txt`. In [Lee et al. 2021](https://arxiv.org/abs/2107.05604), we use 100 units from the sixth layer (`--layer 6`) of the HuBERT Base model.
18
+
19
+ ### Formatting data
20
+ **Speech-to-speech data**
21
+
22
+ _S2UT_
23
+ * Set `--reduce-unit` for training S2UT _reduced_ model
24
+ * Pre-trained vocoder and config (`$VOCODER_CKPT`, `$VOCODER_CFG`) can be downloaded from the **Pretrained Models** section. They are not required if `--eval-inference` is not going to be set during model training.
25
+ ```
26
+ # $SPLIT1, $SPLIT2, etc. are split names such as train, dev, test, etc.
27
+
28
+ python examples/speech_to_speech/preprocessing/prep_s2ut_data.py \
29
+ --source-dir $SRC_AUDIO --target-dir $TGT_AUDIO --data-split $SPLIT1 $SPLIT2 \
30
+ --output-root $DATA_ROOT --reduce-unit \
31
+ --vocoder-checkpoint $VOCODER_CKPT --vocoder-cfg $VOCODER_CFG
32
+ ```
33
+
34
+ _S2SPECT_
35
+ ```
36
+ # $SPLIT1, $SPLIT2, etc. are split names such as train, dev, test, etc.
37
+
38
+ python examples/speech_to_speech/preprocessing/prep_s2spect_data.py \
39
+ --source-dir $SRC_AUDIO --target-dir $TGT_AUDIO --data-split $SPLIT1 $SPLIT2 \
40
+ --output-root $DATA_ROOT
41
+ ```
42
+
43
+ **Multitask data**
44
+ * For each multitask `$TASK_NAME`, prepare `${DATA_ROOT}/${TASK_NAME}/${SPLIT}.tsv` files for each split following the format below: (Two tab separated columns. The sample_ids should match with the sample_ids for the speech-to-speech data in `${DATA_ROOT}/${SPLIT}.tsv`.)
45
+ ```
46
+ id tgt_text
47
+ sample_id_0 token1 token2 token3 ...
48
+ sample_id_1 token1 token2 token3 ...
49
+ ...
50
+ ```
51
+ * For each multitask `$TASK_NAME`, prepare `${DATA_ROOT}/${TASK_NAME}/dict.txt`, a dictionary in fairseq format with all tokens for the targets for `$TASK_NAME`.
52
+ * Create `config_multitask.yaml`. Below is an example of the config used for S2UT _reduced_ with Fisher experiments including two encoder multitasks (`source_letter`, `target_letter`) and one decoder CTC task (`decoder_target_ctc`).
53
+ ```
54
+ source_letter: # $TASK_NAME
55
+ decoder_type: transformer
56
+ dict: ${DATA_ROOT}/source_letter/dict.txt
57
+ data: ${DATA_ROOT}/source_letter
58
+ encoder_layer: 6
59
+ loss_weight: 8.0
60
+ target_letter:
61
+ decoder_type: transformer
62
+ dict: ${DATA_ROOT}/target_letter/dict.txt
63
+ data: ${DATA_ROOT}/target_letter
64
+ encoder_layer: 8
65
+ loss_weight: 8.0
66
+ decoder_target_ctc:
67
+ decoder_type: ctc
68
+ dict: ${DATA_ROOT}/decoder_target_ctc/dict.txt
69
+ data: ${DATA_ROOT}/decoder_target_ctc
70
+ decoder_layer: 3
71
+ loss_weight: 1.6
72
+ ```
73
+
74
+
75
+ ## Training
76
+
77
+ **Speech-to-unit translation (S2UT)**
78
+
79
+ Here's an example for training Fisher S2UT models with 100 discrete units as target:
80
+ ```
81
+ fairseq-train $DATA_ROOT \
82
+ --config-yaml config.yaml --multitask-config-yaml config_multitask.yaml \
83
+ --task speech_to_speech --target-is-code --target-code-size 100 --vocoder code_hifigan \
84
+ --criterion speech_to_unit --label-smoothing 0.2 \
85
+ --arch s2ut_transformer_fisher --share-decoder-input-output-embed \
86
+ --dropout 0.1 --attention-dropout 0.1 --relu-dropout 0.1 \
87
+ --train-subset train --valid-subset dev \
88
+ --save-dir ${MODEL_DIR} \
89
+ --lr 0.0005 --lr-scheduler inverse_sqrt --warmup-init-lr 1e-7 --warmup-updates 10000 \
90
+ --optimizer adam --adam-betas "(0.9,0.98)" --clip-norm 10.0 \
91
+ --max-update 400000 --max-tokens 20000 --max-target-positions 3000 --update-freq 4 \
92
+ --seed 1 --fp16 --num-workers 8
93
+ ```
94
+ * Adjust `--update-freq` accordingly for different #GPUs. In the above we set `--update-freq 4` to simulate training with 4 GPUs.
95
+ * Set `--n-frames-per-step 5` to train an S2UT _stacked_ system with reduction ratio r=5. (Use `$DATA_ROOT` prepared without `--reduce-unit`.)
96
+ * (optional) one can turn on tracking MCD loss during training for checkpoint selection by setting `--eval-inference --eval-args '{"beam": 1, "max_len_a": 1}' --best-checkpoint-metric mcd_loss`. It is recommended to sample a smaller subset as the validation set as MCD loss computation is time-consuming.
97
+
98
+ **Speech-to-spectrogram translation (S2SPECT)**
99
+
100
+ Here's an example for training Fisher S2SPECT models with reduction ratio r=5:
101
+ ```
102
+ fairseq-train $DATA_ROOT \
103
+ --config-yaml config.yaml --multitask-config-yaml config_multitask.yaml \
104
+ --task speech_to_speech --n-frames-per-step 5 \
105
+ --criterion speech_to_spectrogram \
106
+ --arch s2spect_transformer_fisher --decoder-normalize-before \
107
+ --dropout 0.1 --attention-dropout 0.1 --relu-dropout 0.1 \
108
+ --train-subset train --valid-subset dev \
109
+ --save-dir ${MODEL_DIR} \
110
+ --eval-inference --best-checkpoint-metric mcd_loss \
111
+ --lr 0.0005 --lr-scheduler inverse_sqrt --warmup-init-lr 1e-7 --warmup-updates 10000 \
112
+ --optimizer adam --adam-betas "(0.9,0.98)" --clip-norm 10.0 --weight-decay 1e-6 \
113
+ --max-update 400000 --max-tokens 80000 --max-tokens-valid 30000 --required-batch-size-multiple 1 \
114
+ --max-target-positions 3000 --update-freq 16 \
115
+ --seed 1 --fp16 --num-workers 8
116
+ ```
117
+ * Adjust `--update-freq` accordingly for different #GPUs. In the above we set `--update-freq 16` to simulate training with 16 GPUs.
118
+ * We recommend turning on MCD loss during training for the best checkpoint selection.
119
+
120
+ **Unit-based HiFi-GAN vocoder**
121
+
122
+ The vocoder is trained with the [speech-resynthesis repo](https://github.com/facebookresearch/speech-resynthesis). See [here](https://github.com/facebookresearch/speech-resynthesis/tree/main/examples/speech_to_speech_translation) for instructions on how to train the unit-based HiFi-GAN vocoder with duration prediction. The same vocoder can support waveform generation for both _reduced_ unit sequences (with `--dur-prediction` set during inference) and original unit sequences.
123
+
124
+ ## Inference
125
+
126
+ **Speech-to-unit translation (S2UT)**
127
+
128
+ 1. Follow the same inference process as in [fairseq-S2T](https://github.com/pytorch/fairseq/tree/main/examples/speech_to_text) to generate unit sequences (`${RESULTS_PATH}/generate-${GEN_SUBSET}.txt`).
129
+ ```
130
+ fairseq-generate $DATA_ROOT \
131
+ --config-yaml config.yaml --multitask-config-yaml config_multitask.yaml \
132
+ --task speech_to_speech --target-is-code --target-code-size 100 --vocoder code_hifigan \
133
+ --path $MODEL_DIR/checkpoint_best.pt --gen-subset $GEN_SUBSET \
134
+ --max-tokens 50000 \
135
+ --beam 10 --max-len-a 1 \
136
+ --results-path ${RESULTS_PATH}
137
+ ```
138
+ * Set `--beam 1 --n-frames-per-step $r` for decoding with S2UT _stacked_ models.
139
+
140
+ 2. Convert unit sequences to waveform.
141
+ ```
142
+ grep "^D\-" ${RESULTS_PATH}/generate-${GEN_SUBSET}.txt | \
143
+ sed 's/^D-//ig' | sort -nk1 | cut -f3 \
144
+ > ${RESULTS_PATH}/generate-${GEN_SUBSET}.unit
145
+
146
+ python examples/speech_to_speech/generate_waveform_from_code.py \
147
+ --in-code-file ${RESULTS_PATH}/generate-${GEN_SUBSET}.unit \
148
+ --vocoder $VOCODER_CKPT --vocoder-cfg $VOCODER_CFG \
149
+ --results-path ${RESULTS_PATH} --dur-prediction
150
+ ```
151
+ * Set `--dur-prediction` for generating audio for S2UT _reduced_ models.
152
+
153
+
154
+ **Speech-to-spectrogram translation (S2SPECT)**
155
+
156
+ Follow the same inference process as in [fairseq-S^2](https://github.com/pytorch/fairseq/tree/main/examples/speech_synthesis) to generate waveform.
157
+
158
+ ```
159
+ # assume using a default Griffin-Lim vocoder
160
+
161
+ python examples/speech_synthesis/generate_waveform.py $DATA_ROOT \
162
+ --config-yaml config.yaml --multitask-config-yaml config_multitask.yaml \
163
+ --task speech_to_speech --n-frames-per-step 5 \
164
+ --path $MODEL_DIR/checkpoint_best.pt --gen-subset $GEN_SUBSET \
165
+ --max-tokens 50000 \
166
+ --results-path ${RESULTS_PATH} --dump-waveforms --output-sample-rate 16000
167
+ ```
168
+
169
+ In addition to using the default Griffin-Lim vocoder, one can also finetune a HiFi-GAN vocoder for the S2SPECT model by following the instructions in the [HiFi-GAN repo](https://github.com/jik876/hifi-gan).
170
+
171
+ **Multitask decoding**
172
+
173
+ Coming soon.
174
+
175
+ ## Evaluation
176
+
177
+ To evaluate speech translation output, we first apply ASR to the speech output and then compute the BLEU score between the ASR-decoded text and the references using sacreBLEU.
178
+
179
+ **En**
180
+ * ASR: We use the "[Wav2Vec 2.0 Large (LV-60) + Self Training / 960 hours / Libri-Light + Librispeech](https://dl.fbaipublicfiles.com/fairseq/wav2vec/wav2vec_vox_960h_pl.pt)" En ASR model open-sourced by the [wav2vec](https://github.com/pytorch/fairseq/tree/main/examples/wav2vec) project. See [instructions](https://github.com/pytorch/fairseq/tree/main/examples/wav2vec#evaluating-a-ctc-model) on how to run inference with a wav2vec-based ASR model. The model is also available on [Hugging Face](https://huggingface.co/facebook/wav2vec2-large-960h-lv60-self).
181
+ * Text normalization: We use the text cleaner at [https://github.com/keithito/tacotron](https://github.com/keithito/tacotron) for pre-processing reference English text for ASR BLEU evaluation.
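+
+ A minimal sketch of the final scoring step, assuming `asr_hyps` holds the (normalized) ASR transcripts of the generated speech and `refs` the corresponding reference translations:
+ ```python
+ import sacrebleu  # pip install sacrebleu
+
+ asr_hyps = ["hello how are you", "see you tomorrow"]      # placeholder ASR outputs
+ refs = ["hello how are you", "see you tomorrow morning"]  # placeholder references
+
+ # sacreBLEU takes a list of hypotheses and a list of reference streams.
+ bleu = sacrebleu.corpus_bleu(asr_hyps, [refs])
+ print(bleu.score)
+ ```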
fairseq/examples/speech_to_speech/docs/enhanced_direct_s2st_discrete_units.md ADDED
@@ -0,0 +1,125 @@
1
+ # Speech to speech translation (S2ST)
2
+
3
+ We provide the implementation for speech-to-unit translation (S2UT) proposed in [Enhanced Direct Speech-to-Speech Translation Using Self-supervised Pre-training and Data Augmentation (Popuri et al. 2022)](https://arxiv.org/abs/2204.02967) and the various pretrained models used.
4
+
5
+ ## Pretrained Models
6
+
7
+ ### Unit extraction
8
+
9
+ We used the multilingual HuBERT model open sourced in [Textless S2ST with Real Data](textless_s2st_real_data.md)
10
+
11
+ ### Wav2vec 2.0
12
+
13
+ Language | Block type | Model size | Dataset | Model |
14
+ --- | --- | --- | --- | --- |
15
+ Es | Transformer | BASE | Voxpopuli | [ckpt](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/s2st_finetuning/w2v2/es/transformer_B.pt) |
16
+ Es | Transformer | LARGE | Voxpopuli | [ckpt](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/s2st_finetuning/w2v2/es/transformer_L.pt) |
17
+ Es | Conformer | LARGE | Voxpopuli | [ckpt](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/s2st_finetuning/w2v2/es/conformer_L.pt) |
18
+ En | Transformer | BASE | Librilight| [ckpt](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/s2st_finetuning/w2v2/en/transformer_B.pt) |
19
+ En | Conformer | LARGE | Librilight | [ckpt](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/s2st_finetuning/w2v2/en/conformer_L.pt) |
20
+
21
+ ### Unit mBART
22
+
23
+ Unit size | Dataset | Unit config | Model |
24
+ --- | --- | --- | --- |
25
+ 1000 | [Voxpopuli](https://aclanthology.org/2021.acl-long.80) En, Es unlabelled speech | [mbart_large](https://github.com/pytorch/fairseq/blob/f591cc94caa85098ccf125a4782f91125b6a086d/fairseq/models/bart/model.py#L368) |[ckpt](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/s2st_finetuning/unit_mBART/checkpoint.pt) |
26
+
27
+ ## Data preparation
28
+
29
+ 1. To prepare data for S2UT finetuning, follow the steps from [Direct S2ST with Discrete Units](./direct_s2st_discrete_units.md) and format the data in the _S2UT_ format. Note that we use 1000 units from the eleventh layer (`--layer 11`) of the multilingual HuBERT model linked above, instead of the 100 units from layer 6 used in that document.
30
+ 2. Run the following to rewrite the TSV header so the column names match the manifest format expected by the `speech_to_text` task:
31
+
32
+ ```
33
+ var="id\taudio\tn_frames\ttgt_text\ttgt_n_frames"
34
+ sed -i "1s/.*/$var/" ${SPLIT}.tsv
35
+ ```
36
+
37
+ ## Training
38
+
39
+ **Speech-to-unit translation (S2UT)**
40
+
41
+ Here is an example of finetuning an S2UT model with 1000 discrete units as the target. Sample [config](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/s2st_finetuning/config.yaml) and [vocabulary](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/s2st_finetuning/dict.txt) files for Es-En are available for download:
42
+
43
+ ```
44
+ fairseq-train $DATA_ROOT \
45
+ --config-yaml config.yaml \
46
+ --task speech_to_text --arch xm_transformer\
47
+ --criterion label_smoothed_cross_entropy --label-smoothing 0.2 \
48
+ --share-decoder-input-output-embed --adaptor-n-layers 1 --normalize\
49
+ --dropout 0.1 --attention-dropout 0.1 --relu-dropout 0.1 \
50
+ --train-subset train --valid-subset dev \
51
+ --load-pretrained-decoder-from ${unit_mBART} --w2v-path ${wav2vec2.0} \
52
+ --mask-prob 0.3 --mask-channel-length 32 --mask-channel-prob 0.25\
53
+ --save-dir ${MODEL_DIR} --checkpoint-activations --encoder-proj \
54
+ --lr 0.0005 --lr-scheduler inverse_sqrt \
55
+ --warmup-init-lr 1e-7 --warmup-updates 10000 \
56
+ --optimizer adam --adam-betas "(0.9,0.98)" --clip-norm 10.0 \
57
+ --max-update 20000 --max-tokens 4000 --max-tokens-valid 4000 --max-source-positions 4000 \
58
+ --max-target-positions 4000 --update-freq 120 \
59
+ --seed 1 --fp16 --num-workers 1
60
+ ```
61
+
62
+ * Adjust `--update-freq` according to the number of GPUs available so that #GPUs x `--update-freq` matches the effective setup above (e.g. `--update-freq 120` on 1 GPU, or `--update-freq 15` on 8 GPUs, simulates training with 120 GPUs).
63
+ * In the above setting we finetune the model end to end, corresponding to the full setup in the paper.
64
+ * To apply LNA-E partial finetuning, add `--finetune-w2v-params layer_norm,self_attn`
65
+ * For LNA-D partial finetuning, add `--finetune-decoder-params encoder_attn,layer_norm,self_attn`. To optionally freeze the encoder for the first `${K}` updates, use `--freeze-finetune-updates ${K}`.
66
+ * For LNA-E,D partial finetuning add both the above options.
67
+
68
+ **Unit-based HiFi-GAN vocoder**
69
+
70
+ We apply the unit-based HiFi-GAN vocoders open-sourced in [Textless S2ST with Real Data](textless_s2st_real_data.md) to convert the predicted unit sequences to waveform.
71
+
72
+ ## Inference
73
+
74
+ **Speech-to-unit translation (S2UT)**
75
+
76
+ 1. Follow the same inference process as in [fairseq-S2T](https://github.com/pytorch/fairseq/tree/main/examples/speech_to_text) to generate unit sequences (`${RESULTS_PATH}/generate-${GEN_SUBSET}.txt`).
77
+
78
+ ```
79
+ fairseq-generate $DATA_ROOT \
80
+ --config-yaml config.yaml \
81
+ --task speech_to_text \
82
+ --path $MODEL_DIR/checkpoint_best.pt --gen-subset $GEN_SUBSET \
83
+ --max-tokens 10000 --max-source-positions 10000 --max-target-positions 10000\
84
+ --beam 10 --max-len-a 1 --max-len-b 200 \
85
+ --results-path ${RESULTS_PATH}
86
+ ```
87
+
88
+ 2. Convert unit sequences to waveform.
89
+
90
+ ```
91
+ grep "^D\-" ${RESULTS_PATH}/generate-${GEN_SUBSET}.txt | \
92
+ sed 's/^D-//ig' | sort -nk1 | cut -f3 \
93
+ > ${RESULTS_PATH}/generate-${GEN_SUBSET}.unit
94
+
95
+ python examples/speech_to_speech/generate_waveform_from_code.py \
96
+ --in-code-file ${RESULTS_PATH}/generate-${GEN_SUBSET}.unit \
97
+ --vocoder $VOCODER_CKPT --vocoder-cfg $VOCODER_CFG \
98
+ --results-path ${RESULTS_PATH} --dur-prediction
99
+ ```
100
+
101
+ ## Evaluation
102
+
103
+ To evaluate speech translation output, we first apply ASR to the speech output and then compute the BLEU score between the ASR-decoded text and the references using sacreBLEU.
104
+
105
+ * Text normalization: We use the text cleaner at [https://github.com/keithito/tacotron](https://github.com/keithito/tacotron) for pre-processing the reference English text for ASR BLEU evaluation (a minimal normalization sketch follows this list). The text cleaner used for Spanish text normalization will be added here shortly.
106
+ * En ASR: We use the "[Wav2Vec 2.0 Large (LV-60) + Self Training / 960 hours / Libri-Light + Librispeech](https://dl.fbaipublicfiles.com/fairseq/wav2vec/wav2vec_vox_960h_pl.pt)" En ASR model open-sourced by the [wav2vec](https://github.com/pytorch/fairseq/tree/main/examples/wav2vec) project. The model is also available on [Hugging Face](https://huggingface.co/facebook/wav2vec2-large-960h-lv60-self).
107
+ * Es ASR: We use the [Wav2Vec2-Large-XLSR-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53) model finetuned on Spanish Common Voice, open-sourced by jonatasgrosman on [Hugging Face](https://huggingface.co/jonatasgrosman/wav2vec2-large-xlsr-53-spanish).
108
+ * See [instructions](https://github.com/pytorch/fairseq/tree/main/examples/wav2vec#evaluating-a-ctc-model) on how to run inference with a wav2vec-based ASR model.
109
+
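+ As a rough sketch of the English reference pre-processing mentioned above, the snippet below applies the `english_cleaners` function from the keithito/tacotron repository to each reference line before scoring with sacreBLEU. It assumes that repository is cloned and its `text` package (plus its dependencies) is importable, and the file names are placeholders; the exact cleaning used for the reported numbers may differ slightly.
+
+ ```python
+ # Reference text normalization sketch using the keithito/tacotron cleaner.
+ # Assumes https://github.com/keithito/tacotron is on PYTHONPATH.
+ from text.cleaners import english_cleaners  # lowercases, expands numbers/abbreviations
+
+ with open("ref.en.txt") as f:            # raw English references (placeholder path)
+     normalized = [english_cleaners(line.strip()) for line in f]
+
+ with open("ref.norm.en.txt", "w") as f:  # normalized references, fed to sacreBLEU
+     f.write("\n".join(normalized) + "\n")
+ ```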
110
+
111
+ ## Finetuned Model Checkpoints
112
+
113
+ ID | En - Es | Es - En |
114
+ --- | --- | --- |
115
+ **S2UT systems without pre-training** | | |
116
+ S2UT with multitask | [checkpoint](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/s2st_finetuning/en_es//S2UT_w_multitask.pt) | [checkpoint](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/s2st_finetuning/es_en//S2UT_w_multitask.pt) |
117
+ **S2UT systems with model pre-training** | | |
118
+ w2v2-L | [checkpoint](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/s2st_finetuning/en_es//w2v2_only.pt ) | [checkpoint](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/s2st_finetuning/es_en//w2v2_only.pt) |
119
+ w2v2-L + mBART (LNA-E) | [checkpoint](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/s2st_finetuning/en_es//w2v2_mbart_LNE.pt) | [checkpoint](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/s2st_finetuning/es_en//w2v2_mbart_LNE.pt) |
120
+ w2v2-L + mBART (LNA-D) | [checkpoint](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/s2st_finetuning/en_es//w2v2_mbart_LND.pt) | [checkpoint](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/s2st_finetuning/es_en//w2v2_mbart_LND.pt) |
121
+ w2v2-L + mBART (LNA-E,D) | [checkpoint](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/s2st_finetuning/en_es//w2v2_mbart_LNED.pt) | [checkpoint](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/s2st_finetuning/es_en//w2v2_mbart_LNED.pt) |
122
+ **S2UT systems with model pre-training and data augmentation** | | |
123
+ w2v2-L + mBART (LNA-D) | [checkpoint](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/s2st_finetuning/en_es//w2v2_mbart_LND_w_ASR.pt) | [checkpoint](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/s2st_finetuning/es_en//w2v2_mbart_LND_w_ASR.pt) |
124
+
125
+ Note: some of these checkpoints were trained with the `speech_to_text_sharded` task, which has not been open-sourced yet, so make sure to override the task to `speech_to_text` when using those models.
fairseq/examples/speech_to_speech/docs/textless_s2st_real_data.md ADDED
@@ -0,0 +1,89 @@
1
+ # Textless Speech-to-Speech Translation (S2ST) on Real Data
2
+
3
+ We provide instructions and pre-trained models for the work "[Textless Speech-to-Speech Translation on Real Data (Lee et al. 2021)](https://arxiv.org/abs/2112.08352)".
4
+
5
+ ## Pre-trained Models
6
+
7
+ ### HuBERT
8
+ Model | Pretraining data | Checkpoint | Quantizer
9
+ |---|---|---|---
10
+ mHuBERT Base | [VoxPopuli](https://github.com/facebookresearch/voxpopuli) En, Es, Fr speech from the 100k subset | [download](https://dl.fbaipublicfiles.com/hubert/mhubert_base_vp_en_es_fr_it3.pt) | [L11 km1000](https://dl.fbaipublicfiles.com/hubert/mhubert_base_vp_en_es_fr_it3_L11_km1000.bin)
11
+
12
+
13
+ ### Unit-based HiFi-GAN vocoder
14
+ Unit config | Unit size | Vocoder language | Dataset | Model
15
+ |---|---|---|---|---
16
+ mHuBERT, layer 11 | 1000 | En | [LJSpeech](https://keithito.com/LJ-Speech-Dataset/) | [ckpt](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/vocoder/code_hifigan/mhubert_vp_en_es_fr_it3_400k_layer11_km1000_lj/g_00500000), [config](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/vocoder/code_hifigan/mhubert_vp_en_es_fr_it3_400k_layer11_km1000_lj/config.json)
17
+ mHuBERT, layer 11 | 1000 | Es | [CSS10](https://github.com/Kyubyong/css10) | [ckpt](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/vocoder/code_hifigan/mhubert_vp_en_es_fr_it3_400k_layer11_km1000_es_css10/g_00500000), [config](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/vocoder/code_hifigan/mhubert_vp_en_es_fr_it3_400k_layer11_km1000_es_css10/config.json)
18
+ mHuBERT, layer 11 | 1000 | Fr | [CSS10](https://github.com/Kyubyong/css10) | [ckpt](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/vocoder/code_hifigan/mhubert_vp_en_es_fr_it3_400k_layer11_km1000_fr_css10/g_00500000), [config](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/vocoder/code_hifigan/mhubert_vp_en_es_fr_it3_400k_layer11_km1000_fr_css10/config.json)
19
+
20
+
21
+ ### Speech normalizer
22
+ Language | Training data | Target unit config | Model
23
+ |---|---|---|---
24
+ En | 10 mins | mHuBERT, layer 11, km1000 | [download](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/speech_normalizer/en/en_10min.tar.gz)
25
+ En | 1 hr | mHuBERT, layer 11, km1000 | [download](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/speech_normalizer/en/en_1h.tar.gz)
26
+ En | 10 hrs | mHuBERT, layer 11, km1000 | [download](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/speech_normalizer/en/en_10h.tar.gz)
27
+ Es | 10 mins | mHuBERT, layer 11, km1000 | [download](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/speech_normalizer/es/es_10min.tar.gz)
28
+ Es | 1 hr | mHuBERT, layer 11, km1000 | [download](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/speech_normalizer/es/es_1h.tar.gz)
29
+ Es | 10 hrs | mHuBERT, layer 11, km1000 | [download](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/speech_normalizer/es/es_10h.tar.gz)
30
+ Fr | 10 mins | mHuBERT, layer 11, km1000 | [download](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/speech_normalizer/fr/fr_10min.tar.gz)
31
+ Fr | 1 hr | mHuBERT, layer 11, km1000 | [download](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/speech_normalizer/fr/fr_1h.tar.gz)
32
+ Fr | 10 hrs | mHuBERT, layer 11, km1000 | [download](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/speech_normalizer/fr/fr_10h.tar.gz)
33
+
34
+ * Refer to the paper for the details of the training data.
35
+
36
+ ## Inference with Pre-trained Models
37
+
38
+ ### Speech normalizer
39
+ 1. Download the pre-trained models, including the dictionary, to `DATA_DIR`.
40
+ 2. Format the audio data.
41
+ ```bash
42
+ # AUDIO_EXT: audio extension, e.g. wav, flac, etc.
43
+ # Assume all audio files are at ${AUDIO_DIR}/*.${AUDIO_EXT}
44
+
45
+ python examples/speech_to_speech/preprocessing/prep_sn_data.py \
46
+ --audio-dir ${AUDIO_DIR} --ext ${AUDIO_EXT} \
47
+ --data-name ${GEN_SUBSET} --output-dir ${DATA_DIR} \
48
+ --for-inference
49
+ ```
50
+
51
+ 3. Run the speech normalizer and post-process the output.
52
+ ```bash
53
+ mkdir -p ${RESULTS_PATH}
54
+
55
+ python examples/speech_recognition/new/infer.py \
56
+ --config-dir examples/hubert/config/decode/ \
57
+ --config-name infer_viterbi \
58
+ task.data=${DATA_DIR} \
59
+ task.normalize=false \
60
+ common_eval.results_path=${RESULTS_PATH}/log \
61
+ common_eval.path=${DATA_DIR}/checkpoint_best.pt \
62
+ dataset.gen_subset=${GEN_SUBSET} \
63
+ '+task.labels=["unit"]' \
64
+ +decoding.results_path=${RESULTS_PATH} \
65
+ common_eval.post_process=none \
66
+ +dataset.batch_size=1 \
67
+ common_eval.quiet=True
68
+
69
+ # Post-process and generate output at ${RESULTS_PATH}/${GEN_SUBSET}.txt
70
+ python examples/speech_to_speech/preprocessing/prep_sn_output_data.py \
71
+ --in-unit ${RESULTS_PATH}/hypo.units \
72
+ --in-audio ${DATA_DIR}/${GEN_SUBSET}.tsv \
73
+ --output-root ${RESULTS_PATH}
74
+ ```
75
+
76
+
77
+ ### Unit-to-waveform conversion with unit vocoder
78
+ The pre-trained vocoders can generate audio for both full unit sequences and reduced unit sequences (i.e. with duplicate consecutive units removed; see the short sketch after the command below). Set `--dur-prediction` to generate audio from reduced unit sequences.
79
+ ```bash
80
+ # IN_CODE_FILE contains one unit sequence per line. Units are separated by space.
81
+
82
+ python examples/speech_to_speech/generate_waveform_from_code.py \
83
+ --in-code-file ${IN_CODE_FILE} \
84
+ --vocoder ${VOCODER_CKPT} --vocoder-cfg ${VOCODER_CFG} \
85
+ --results-path ${RESULTS_PATH} --dur-prediction
86
+ ```
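+
+ For reference, "reducing" a unit sequence simply collapses consecutive duplicate units, matching `process_units(..., reduce=True)` in `examples/speech_to_speech/preprocessing/data_utils.py`. A minimal sketch:
+
+ ```python
+ # Collapse consecutive duplicate units, e.g. "1 1 1 2 2 1" -> "1 2 1".
+ # Mirrors process_units(units, reduce=True) in
+ # examples/speech_to_speech/preprocessing/data_utils.py.
+ def reduce_units(units):
+     return [u for i, u in enumerate(units) if i == 0 or u != units[i - 1]]
+
+ assert reduce_units("1 1 1 2 2 1".split()) == ["1", "2", "1"]
+ ```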
87
+
88
+ ## Training new models
89
+ To be updated.
fairseq/examples/speech_to_speech/generate_waveform_from_code.py ADDED
@@ -0,0 +1,116 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import argparse
7
+ import json
8
+ import logging
9
+ from pathlib import Path
10
+ import random
11
+ import soundfile as sf
12
+ import torch
13
+
14
+ from tqdm import tqdm
15
+
16
+ from fairseq import utils
17
+ from fairseq.models.text_to_speech.vocoder import CodeHiFiGANVocoder
18
+
19
+
20
+ logging.basicConfig()
21
+ logging.root.setLevel(logging.INFO)
22
+ logging.basicConfig(level=logging.INFO)
23
+ logger = logging.getLogger(__name__)
24
+
25
+
26
+ def dump_result(args, sample_id, pred_wav, suffix=""):
27
+ sf.write(
28
+ f"{args.results_path}/{sample_id}{suffix}_pred.wav",
29
+ pred_wav.detach().cpu().numpy(),
30
+ 16000,
31
+ )
32
+
33
+
34
+ def load_code(in_file):
35
+ with open(in_file) as f:
36
+ out = [list(map(int, line.strip().split())) for line in f]
37
+ return out
38
+
39
+
40
+ def main(args):
41
+ logger.info(args)
42
+
43
+ use_cuda = torch.cuda.is_available() and not args.cpu
44
+
45
+ with open(args.vocoder_cfg) as f:
46
+ vocoder_cfg = json.load(f)
47
+ vocoder = CodeHiFiGANVocoder(args.vocoder, vocoder_cfg)
48
+ if use_cuda:
49
+ vocoder = vocoder.cuda()
50
+
51
+ multispkr = vocoder.model.multispkr
52
+ if multispkr:
53
+ logger.info("multi-speaker vocoder")
54
+ num_speakers = vocoder_cfg.get(
55
+ "num_speakers", 200
56
+ ) # following the default in codehifigan to set to 200
57
+ assert (
58
+ args.speaker_id < num_speakers
59
+ ), f"invalid --speaker-id ({args.speaker_id}) with total #speakers = {num_speakers}"
60
+
61
+ data = load_code(args.in_code_file)
62
+ Path(args.results_path).mkdir(exist_ok=True, parents=True)
63
+ for i, d in tqdm(enumerate(data), total=len(data)):
64
+ x = {
65
+ "code": torch.LongTensor(d).view(1, -1),
66
+ }
67
+ suffix = ""
68
+ if multispkr:
69
+ spk = (
70
+ random.randint(0, num_speakers - 1)
71
+ if args.speaker_id == -1
72
+ else args.speaker_id
73
+ )
74
+ suffix = f"_spk{spk}"
75
+ x["spkr"] = torch.LongTensor([spk]).view(1, 1)
76
+
77
+ x = utils.move_to_cuda(x) if use_cuda else x
78
+ wav = vocoder(x, args.dur_prediction)
79
+ dump_result(args, i, wav, suffix=suffix)
80
+
81
+
82
+ def cli_main():
83
+ parser = argparse.ArgumentParser()
84
+ parser.add_argument(
85
+ "--in-code-file", type=str, required=True, help="one unit sequence per line"
86
+ )
87
+ parser.add_argument(
88
+ "--vocoder", type=str, required=True, help="path to the CodeHiFiGAN vocoder"
89
+ )
90
+ parser.add_argument(
91
+ "--vocoder-cfg",
92
+ type=str,
93
+ required=True,
94
+ help="path to the CodeHiFiGAN vocoder config",
95
+ )
96
+ parser.add_argument("--results-path", type=str, required=True)
97
+ parser.add_argument(
98
+ "--dur-prediction",
99
+ action="store_true",
100
+ help="enable duration prediction (for reduced/unique code sequences)",
101
+ )
102
+ parser.add_argument(
103
+ "--speaker-id",
104
+ type=int,
105
+ default=-1,
106
+ help="Speaker id (for vocoder that supports multispeaker). Set to -1 to randomly sample speakers.",
107
+ )
108
+ parser.add_argument("--cpu", action="store_true", help="run on CPU")
109
+
110
+ args = parser.parse_args()
111
+
112
+ main(args)
113
+
114
+
115
+ if __name__ == "__main__":
116
+ cli_main()
fairseq/examples/speech_to_speech/preprocessing/__init__.py ADDED
@@ -0,0 +1,4 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
fairseq/examples/speech_to_speech/preprocessing/data_utils.py ADDED
@@ -0,0 +1,88 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ from pathlib import Path
7
+ from typing import List, Optional
8
+
9
+ from examples.speech_to_text.data_utils import S2TDataConfigWriter
10
+
11
+
12
+ def gen_config_yaml(
13
+ manifest_root: Path,
14
+ yaml_filename: str = "config.yaml",
15
+ specaugment_policy: Optional[str] = "lb",
16
+ feature_transform: Optional[List[str]] = None,
17
+ input_channels: Optional[int] = 1,
18
+ input_feat_per_channel: Optional[int] = 80,
19
+ audio_root: str = "",
20
+ vocoder_type: Optional[str] = None,
21
+ vocoder_checkpoint: Optional[str] = None,
22
+ vocoder_cfg: Optional[str] = None,
23
+ extra=None,
24
+ ):
25
+ manifest_root = manifest_root.absolute()
26
+ writer = S2TDataConfigWriter(manifest_root / yaml_filename)
27
+
28
+ if input_channels is not None:
29
+ writer.set_input_channels(input_channels)
30
+ if input_feat_per_channel is not None:
31
+ writer.set_input_feat_per_channel(input_feat_per_channel)
32
+ specaugment_setters = {
33
+ "lb": writer.set_specaugment_lb_policy,
34
+ "ld": writer.set_specaugment_ld_policy,
35
+ "sm": writer.set_specaugment_sm_policy,
36
+ "ss": writer.set_specaugment_ss_policy,
37
+ }
38
+ specaugment_setter = specaugment_setters.get(specaugment_policy, None)
39
+ if specaugment_setter is not None:
40
+ specaugment_setter()
41
+
42
+ if feature_transform is None:
43
+ feature_transform = []
44
+ else:
45
+ writer.set_feature_transforms("*", feature_transform)
46
+
47
+ if specaugment_policy is not None:
48
+ writer.set_feature_transforms("_train", feature_transform + ["specaugment"])
49
+
50
+ if len(audio_root) > 0:
51
+ writer.set_audio_root(audio_root)
52
+
53
+ if (
54
+ vocoder_type is not None
55
+ and vocoder_checkpoint is not None
56
+ and vocoder_cfg is not None
57
+ ):
58
+ writer.set_extra(
59
+ {
60
+ "vocoder": {
61
+ "type": vocoder_type,
62
+ "config": vocoder_cfg,
63
+ "checkpoint": vocoder_checkpoint,
64
+ }
65
+ }
66
+ )
67
+
68
+ if extra is not None:
69
+ writer.set_extra(extra)
70
+ writer.flush()
71
+
72
+
73
+ def load_units(in_file):
74
+ out = {}
75
+ with open(in_file) as f:
76
+ for line in f:
77
+ sample_id, units = line.strip().split("|", 1)
78
+ out[sample_id] = units.split()
79
+
80
+ return out
81
+
82
+
83
+ def process_units(units, reduce=False):
84
+ if not reduce:
85
+ return units
86
+
87
+ out = [u for i, u in enumerate(units) if i == 0 or u != units[i - 1]]
88
+ return out
fairseq/examples/speech_to_speech/preprocessing/prep_s2spect_data.py ADDED
@@ -0,0 +1,169 @@
1
+ #!/usr/bin/env python3
2
+ # Copyright (c) Facebook, Inc. and its affiliates.
3
+ #
4
+ # This source code is licensed under the MIT license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import argparse
8
+ import logging
9
+ import os
10
+ from pathlib import Path
11
+ import shutil
12
+ import torchaudio
13
+
14
+ import soundfile as sf
15
+ from tqdm import tqdm
16
+ import pandas as pd
17
+
18
+ from examples.speech_synthesis.data_utils import extract_logmel_spectrogram
19
+ from examples.speech_to_speech.preprocessing.data_utils import gen_config_yaml
20
+ from examples.speech_to_text.data_utils import create_zip, get_zip_manifest, save_df_to_tsv
21
+ from fairseq.data.audio.audio_utils import convert_waveform
22
+
23
+
24
+ logger = logging.getLogger(__name__)
25
+
26
+ MANIFEST_COLUMNS = ["id", "src_audio", "src_n_frames", "tgt_audio", "tgt_n_frames"]
27
+
28
+
29
+ def prepare_target_data(args, tgt_audios):
30
+ feature_name = "logmelspec80"
31
+ zip_path = args.output_root / f"{feature_name}.zip"
32
+ if zip_path.exists():
33
+ print(f"{zip_path} exists.")
34
+ return zip_path
35
+
36
+ feature_root = args.output_root / feature_name
37
+ feature_root.mkdir(exist_ok=True)
38
+
39
+ print("Extracting Mel spectrogram features...")
40
+ for tgt_audio in tqdm(tgt_audios):
41
+ sample_id = tgt_audio.stem
42
+ waveform, sample_rate = torchaudio.load(tgt_audio.as_posix())
43
+ waveform, sample_rate = convert_waveform(
44
+ waveform, sample_rate, normalize_volume=args.normalize_volume,
45
+ to_sample_rate=args.sample_rate
46
+ )
47
+ extract_logmel_spectrogram(
48
+ waveform, sample_rate, feature_root / f"{sample_id}.npy",
49
+ win_length=args.win_length, hop_length=args.hop_length,
50
+ n_fft=args.n_fft, n_mels=args.n_mels, f_min=args.f_min,
51
+ f_max=args.f_max
52
+ )
53
+ print("ZIPing features...")
54
+ create_zip(feature_root, zip_path)
55
+ shutil.rmtree(feature_root)
56
+
57
+ return zip_path
58
+
59
+
60
+ def process(args):
61
+ os.makedirs(args.output_root, exist_ok=True)
62
+
63
+ manifest = {}
64
+ tgt_audios = []
65
+ for split in args.data_split:
66
+ print(f"Processing {split}...")
67
+
68
+ manifest[split] = {c: [] for c in MANIFEST_COLUMNS}
69
+ missing_tgt_audios = []
70
+ src_audios = list(args.source_dir.glob(f"{split}/*.wav"))
71
+ for src_audio in tqdm(src_audios):
72
+ sample_id = src_audio.stem
73
+
74
+ tgt_audio = args.target_dir / split / f"{sample_id}.wav"
75
+ if not tgt_audio.is_file():
76
+ missing_tgt_audios.append(sample_id)
77
+ continue
78
+
79
+ tgt_audios.append(tgt_audio)
80
+
81
+ src_n_frames = sf.info(src_audio.as_posix()).frames
82
+ manifest[split]["id"].append(sample_id)
83
+ manifest[split]["src_audio"].append(src_audio.as_posix())
84
+ manifest[split]["src_n_frames"].append(
85
+ src_n_frames // 160
86
+ ) # estimation of 10-ms frame for 16kHz audio
87
+
88
+ print(f"Processed {len(manifest[split]['id'])} samples")
89
+ if len(missing_tgt_audios) > 0:
90
+ print(
91
+ f"{len(missing_tgt_audios)} with missing target data (first 3 examples: {', '.join(missing_tgt_audios[:3])})"
92
+ )
93
+
94
+ # Extract features and pack features into ZIP
95
+ zip_path = prepare_target_data(args, tgt_audios)
96
+
97
+ print("Fetching ZIP manifest...")
98
+ tgt_audio_paths, tgt_audio_lengths = get_zip_manifest(zip_path)
99
+
100
+ print("Generating manifest...")
101
+ for split in args.data_split:
102
+ print(f"Processing {split}...")
103
+
104
+ for sample_id in tqdm(manifest[split]["id"]):
105
+ manifest[split]["tgt_audio"].append(tgt_audio_paths[sample_id])
106
+ manifest[split]["tgt_n_frames"].append(tgt_audio_lengths[sample_id])
107
+
108
+ out_manifest = args.output_root / f"{split}.tsv"
109
+ print(f"Writing manifest to {out_manifest}...")
110
+ save_df_to_tsv(pd.DataFrame.from_dict(manifest[split]), out_manifest)
111
+
112
+ # Generate config YAML
113
+ win_len_t = args.win_length / args.sample_rate
114
+ hop_len_t = args.hop_length / args.sample_rate
115
+ extra = {
116
+ "features": {
117
+ "type": "spectrogram+melscale+log",
118
+ "sample_rate": args.sample_rate,
119
+ "eps": 1e-5, "n_mels": args.n_mels, "n_fft": args.n_fft,
120
+ "window_fn": "hann", "win_length": args.win_length,
121
+ "hop_length": args.hop_length,
122
+ "win_len_t": win_len_t, "hop_len_t": hop_len_t,
123
+ "f_min": args.f_min, "f_max": args.f_max,
124
+ "n_stft": args.n_fft // 2 + 1
125
+ }
126
+ }
127
+ gen_config_yaml(
128
+ args.output_root,
129
+ audio_root=args.output_root.as_posix(),
130
+ specaugment_policy="lb",
131
+ feature_transform=["utterance_cmvn", "delta_deltas"],
132
+ extra=extra,
133
+ )
134
+
135
+
136
+ def main():
137
+ parser = argparse.ArgumentParser()
138
+ parser.add_argument(
139
+ "--source-dir", required=True, type=Path, help="source audio directory"
140
+ )
141
+ parser.add_argument(
142
+ "--target-dir", required=True, type=Path, help="target audio directory"
143
+ )
144
+ parser.add_argument(
145
+ "--data-split",
146
+ default=["train", "valid", "test"],
147
+ nargs="+",
148
+ help="data split names",
149
+ )
150
+ parser.add_argument(
151
+ "--output-root", required=True, type=Path, help="output directory"
152
+ )
153
+ # target feature related
154
+ parser.add_argument("--win-length", type=int, default=1024)
155
+ parser.add_argument("--hop-length", type=int, default=256)
156
+ parser.add_argument("--n-fft", type=int, default=1024)
157
+ parser.add_argument("--n-mels", type=int, default=80)
158
+ parser.add_argument("--f-min", type=int, default=20)
159
+ parser.add_argument("--f-max", type=int, default=8000)
160
+ parser.add_argument("--sample-rate", type=int, default=22050)
161
+ parser.add_argument("--normalize-volume", "-n", action="store_true")
162
+
163
+ args = parser.parse_args()
164
+
165
+ process(args)
166
+
167
+
168
+ if __name__ == "__main__":
169
+ main()
fairseq/examples/speech_to_speech/preprocessing/prep_s2ut_data.py ADDED
@@ -0,0 +1,114 @@
1
+ #!/usr/bin/env python3
2
+ # Copyright (c) Facebook, Inc. and its affiliates.
3
+ #
4
+ # This source code is licensed under the MIT license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import argparse
8
+ import logging
9
+ from pathlib import Path
10
+
11
+ import soundfile as sf
12
+ from tqdm import tqdm
13
+ import pandas as pd
14
+
15
+ from examples.speech_to_speech.preprocessing.data_utils import (
16
+ gen_config_yaml,
17
+ load_units,
18
+ process_units,
19
+ )
20
+ from examples.speech_to_text.data_utils import save_df_to_tsv
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+ MANIFEST_COLUMNS = ["id", "src_audio", "src_n_frames", "tgt_audio", "tgt_n_frames"]
25
+
26
+
27
+ def process(args):
28
+ args.output_root.mkdir(exist_ok=True)
29
+
30
+ print("Generating manifest...")
31
+ for split in args.data_split:
32
+ print(f"Processing {split}")
33
+
34
+ # load target units
35
+ target_unit_data = load_units(args.target_dir / f"{split}.txt")
36
+
37
+ manifest = {c: [] for c in MANIFEST_COLUMNS}
38
+ missing_tgt_audios = []
39
+ src_audios = list(args.source_dir.glob(f"{split}/*.wav"))
40
+ for src_audio in tqdm(src_audios):
41
+ sample_id = src_audio.stem
42
+
43
+ if sample_id not in target_unit_data:
44
+ missing_tgt_audios.append(sample_id)
45
+ continue
46
+
47
+ src_n_frames = sf.info(src_audio.as_posix()).frames
48
+ manifest["id"].append(sample_id)
49
+ manifest["src_audio"].append(src_audio.as_posix())
50
+ manifest["src_n_frames"].append(
51
+ src_n_frames // 160
52
+ ) # estimation of 10-ms frame for 16kHz audio
53
+
54
+ target_units = process_units(target_unit_data[sample_id], args.reduce_unit)
55
+ manifest["tgt_audio"].append(" ".join(target_units))
56
+ manifest["tgt_n_frames"].append(len(target_units))
57
+
58
+ print(f"Processed {len(manifest['id'])} samples")
59
+ if len(missing_tgt_audios) > 0:
60
+ print(
61
+ f"{len(missing_tgt_audios)} with missing target data (first 3 examples: {', '.join(missing_tgt_audios[:3])})"
62
+ )
63
+
64
+ out_manifest = args.output_root / f"{split}.tsv"
65
+ print(f"Writing manifest to {out_manifest}...")
66
+ save_df_to_tsv(pd.DataFrame.from_dict(manifest), out_manifest)
67
+
68
+ # Generate config YAML
69
+ gen_config_yaml(
70
+ args.output_root,
71
+ specaugment_policy="lb",
72
+ feature_transform=["utterance_cmvn"],
73
+ vocoder_type="code_hifigan",
74
+ vocoder_checkpoint=args.vocoder_checkpoint,
75
+ vocoder_cfg=args.vocoder_cfg,
76
+ )
77
+
78
+
79
+ def main():
80
+ parser = argparse.ArgumentParser()
81
+ parser.add_argument(
82
+ "--source-dir", required=True, type=Path, help="source audio directory"
83
+ )
84
+ parser.add_argument(
85
+ "--target-dir", required=True, type=Path, help="target audio directory"
86
+ )
87
+ parser.add_argument(
88
+ "--data-split",
89
+ default=["train", "valid", "test"],
90
+ nargs="+",
91
+ help="data split names",
92
+ )
93
+ parser.add_argument(
94
+ "--output-root", required=True, type=Path, help="output directory"
95
+ )
96
+ parser.add_argument(
97
+ "--reduce-unit",
98
+ action="store_true",
99
+ help="reduce a target unit sequence to a unique unit sequence, i.e. '1 1 1 2 2' -> '1 2'",
100
+ )
101
+ parser.add_argument(
102
+ "--vocoder-checkpoint", default=None, type=str, help="vocoder checkpoint"
103
+ )
104
+ parser.add_argument(
105
+ "--vocoder-cfg", default=None, type=str, help="vocoder config file"
106
+ )
107
+
108
+ args = parser.parse_args()
109
+
110
+ process(args)
111
+
112
+
113
+ if __name__ == "__main__":
114
+ main()
fairseq/examples/speech_to_speech/preprocessing/prep_sn_data.py ADDED
@@ -0,0 +1,88 @@
1
+ #!/usr/bin/env python3
2
+ # Copyright (c) Facebook, Inc. and its affiliates.
3
+ #
4
+ # This source code is licensed under the MIT license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+ #
7
+ # Adapted from examples/wav2vec/wav2vec_manifest.py
8
+ """
9
+ Data preparation for the speech normalizer
10
+ """
11
+
12
+ import argparse
13
+ import glob
14
+ import os
15
+
16
+ import soundfile
17
+
18
+ from examples.speech_to_speech.preprocessing.data_utils import load_units, process_units
19
+
20
+
21
+ def process(args):
22
+ assert (
23
+ args.for_inference or args.target_unit is not None
24
+ ), "missing --target-unit or --for-inference"
25
+
26
+ if not os.path.exists(args.output_dir):
27
+ os.makedirs(args.output_dir)
28
+
29
+ dir_path = os.path.realpath(args.audio_dir)
30
+ search_path = os.path.join(dir_path, "**/*." + args.ext)
31
+
32
+ if args.target_unit:
33
+ unit_data = load_units(args.target_unit)
34
+
35
+ with open(os.path.join(args.output_dir, f"{args.data_name}.tsv"), "w") as o_t, open(
36
+ os.path.join(args.output_dir, f"{args.data_name}.unit"), "w"
37
+ ) as o_u:
38
+ print(dir_path, file=o_t)
39
+ for fname in glob.iglob(search_path, recursive=True):
40
+ file_path = os.path.realpath(fname)
41
+ frames = soundfile.info(fname).frames
42
+ print(
43
+ "{}\t{}".format(os.path.relpath(file_path, dir_path), frames), file=o_t
44
+ )
45
+
46
+ if args.for_inference:
47
+ print("0", file=o_u)
48
+ else:
49
+ sample_id = os.path.basename(file_path)[: -len(args.ext) - 1]
50
+ assert (
51
+ sample_id in unit_data
52
+ ), f'{fname} does not have unit data in {args.target_unit}. Expecting sample_id "{sample_id}".'
53
+ target_units = process_units(unit_data[sample_id], reduce=True)
54
+ print(" ".join(target_units), file=o_u)
55
+
56
+
57
+ def main():
58
+ parser = argparse.ArgumentParser()
59
+ parser.add_argument("--audio-dir", required=True, type=str, help="audio directory")
60
+ parser.add_argument("--ext", default="flac", type=str, help="audio extension")
61
+ parser.add_argument(
62
+ "--data-name",
63
+ required=True,
64
+ type=str,
65
+ help="dataset name",
66
+ )
67
+ parser.add_argument(
68
+ "--output-dir", required=True, type=str, help="output directory"
69
+ )
70
+ parser.add_argument(
71
+ "--for-inference",
72
+ action="store_true",
73
+ help="set this if preparing data for running inference with a speech normalizer",
74
+ )
75
+ parser.add_argument(
76
+ "--target-unit",
77
+ default=None,
78
+ type=str,
79
+ help="a file containing unit sequences in the format: sample_id|u1 u2 ...",
80
+ )
81
+
82
+ args = parser.parse_args()
83
+
84
+ process(args)
85
+
86
+
87
+ if __name__ == "__main__":
88
+ main()
fairseq/examples/speech_to_speech/preprocessing/prep_sn_output_data.py ADDED
@@ -0,0 +1,58 @@
1
+ #!/usr/bin/env python3
2
+ # Copyright (c) Facebook, Inc. and its affiliates.
3
+ #
4
+ # This source code is licensed under the MIT license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import argparse
8
+ from pathlib import Path
9
+
10
+ from tqdm import tqdm
11
+
12
+
13
+ def process(args):
14
+ args.output_root.mkdir(exist_ok=True)
15
+
16
+ # load units
17
+ units = {}
18
+ with open(args.in_unit) as f:
19
+ for line in f:
20
+ unit_seq, utt_id = line.strip().rsplit(" ", 1)
21
+ utt_id = int(utt_id[6:-1])  # strip the "(None-" prefix and the trailing ")"
22
+ units[utt_id] = unit_seq
23
+
24
+ with open(args.in_audio) as f, open(
25
+ args.output_root / f"{args.in_audio.stem}.txt", "w"
26
+ ) as o:
27
+ f.readline()
28
+ for i, line in enumerate(tqdm(f.readlines())):
29
+ audio, _ = line.strip().split("\t", 1)
30
+ sample_id = Path(audio).stem
31
+ o.write(f"{sample_id}|{units[i]}\n")
32
+
33
+
34
+ def main():
35
+ parser = argparse.ArgumentParser()
36
+ parser.add_argument(
37
+ "--in-unit",
38
+ required=True,
39
+ type=Path,
40
+ help="unit file (output from the speech normalizer)",
41
+ )
42
+ parser.add_argument(
43
+ "--in-audio",
44
+ required=True,
45
+ type=Path,
46
+ help="tsv file (input to the normalizer)",
47
+ )
48
+ parser.add_argument(
49
+ "--output-root", required=True, type=Path, help="output directory"
50
+ )
51
+
52
+ args = parser.parse_args()
53
+
54
+ process(args)
55
+
56
+
57
+ if __name__ == "__main__":
58
+ main()
fairseq/examples/speech_to_speech/unity/__init__.py ADDED
@@ -0,0 +1,7 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ from . import sequence_generator # noqa
7
+ from . import sequence_generator_multi_decoder # noqa
fairseq/examples/speech_to_speech/unity/sequence_generator.py ADDED
@@ -0,0 +1,626 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import math
7
+ import sys
8
+ from typing import Dict, List, Optional
9
+
10
+ import torch
11
+ from torch import Tensor
12
+
13
+ from fairseq.sequence_generator import EnsembleModel as EnsembleModelBase
14
+ from fairseq.sequence_generator import SequenceGenerator as SequenceGeneratorBase
15
+
16
+
17
+ class SequenceGenerator(SequenceGeneratorBase):
18
+ def __init__(
19
+ self,
20
+ models,
21
+ tgt_dict,
22
+ beam_size=1,
23
+ max_len_a=0,
24
+ max_len_b=200,
25
+ max_len=0,
26
+ min_len=1,
27
+ normalize_scores=True,
28
+ len_penalty=1.0,
29
+ unk_penalty=0.0,
30
+ temperature=1.0,
31
+ match_source_len=False,
32
+ no_repeat_ngram_size=0,
33
+ search_strategy=None,
34
+ eos=None,
35
+ symbols_to_strip_from_output=None,
36
+ lm_model=None,
37
+ lm_weight=1.0,
38
+ tokens_to_suppress=(),
39
+ ):
40
+ """Generates translations of a given source sentence.
41
+
42
+ Args:
43
+ models (List[~fairseq.models.FairseqModel]): ensemble of models,
44
+ currently support fairseq.models.TransformerModel for scripting
45
+ beam_size (int, optional): beam width (default: 1)
46
+ max_len_a/b (int, optional): generate sequences of maximum length
47
+ ax + b, where x is the source length
48
+ max_len (int, optional): the maximum length of the generated output
49
+ (not including end-of-sentence)
50
+ min_len (int, optional): the minimum length of the generated output
51
+ (not including end-of-sentence)
52
+ normalize_scores (bool, optional): normalize scores by the length
53
+ of the output (default: True)
54
+ len_penalty (float, optional): length penalty, where <1.0 favors
55
+ shorter, >1.0 favors longer sentences (default: 1.0)
56
+ unk_penalty (float, optional): unknown word penalty, where <0
57
+ produces more unks, >0 produces fewer (default: 0.0)
58
+ temperature (float, optional): temperature, where values
59
+ >1.0 produce more uniform samples and values <1.0 produce
60
+ sharper samples (default: 1.0)
61
+ match_source_len (bool, optional): outputs should match the source
62
+ length (default: False)
63
+ """
64
+ super().__init__(
65
+ models=models,
66
+ tgt_dict=tgt_dict,
67
+ beam_size=beam_size,
68
+ max_len_a=max_len_a,
69
+ max_len_b=max_len_b,
70
+ max_len=max_len,
71
+ min_len=min_len,
72
+ normalize_scores=normalize_scores,
73
+ len_penalty=len_penalty,
74
+ unk_penalty=unk_penalty,
75
+ temperature=temperature,
76
+ match_source_len=match_source_len,
77
+ no_repeat_ngram_size=no_repeat_ngram_size,
78
+ search_strategy=search_strategy,
79
+ eos=eos,
80
+ symbols_to_strip_from_output=symbols_to_strip_from_output,
81
+ lm_model=lm_model,
82
+ lm_weight=lm_weight,
83
+ tokens_to_suppress=tokens_to_suppress,
84
+ )
85
+
86
+ if isinstance(models, EnsembleModel):
87
+ self.model = models
88
+ else:
89
+ self.model = EnsembleModel(models)
90
+
91
+ self.model.set_decoder_beam_size(self.beam_size)
92
+ self.model.eval()
93
+
94
+ def _generate(
95
+ self,
96
+ sample: Dict[str, Dict[str, Tensor]],
97
+ prefix_tokens: Optional[Tensor] = None,
98
+ constraints: Optional[Tensor] = None,
99
+ bos_token: Optional[int] = None,
100
+ ):
101
+ net_input = sample["net_input"]
102
+
103
+ if "src_tokens" in net_input:
104
+ src_tokens = net_input["src_tokens"]
105
+ # length of the source text, i.e. the number of source tokens excluding end-of-sentence and padding
106
+ # if src_lengths exists in net_input (speech_to_text dataset case), then use it
107
+ if "src_lengths" in net_input:
108
+ src_lengths = net_input["src_lengths"]
109
+ else:
110
+ src_lengths = (
111
+ (src_tokens.ne(self.eos) & src_tokens.ne(self.pad))
112
+ .long()
113
+ .sum(dim=1)
114
+ )
115
+ elif "source" in net_input:
116
+ src_tokens = net_input["source"]
117
+ src_lengths = (
118
+ net_input["padding_mask"].size(-1) - net_input["padding_mask"].sum(-1)
119
+ if net_input["padding_mask"] is not None
120
+ else torch.tensor(src_tokens.size(-1)).to(src_tokens)
121
+ )
122
+ elif "features" in net_input:
123
+ src_tokens = net_input["features"]
124
+ src_lengths = (
125
+ net_input["padding_mask"].size(-1) - net_input["padding_mask"].sum(-1)
126
+ if net_input["padding_mask"] is not None
127
+ else torch.tensor(src_tokens.size(-1)).to(src_tokens)
128
+ )
129
+ else:
130
+ raise Exception(
131
+ "expected src_tokens or source in net input. input keys: "
132
+ + str(net_input.keys())
133
+ )
134
+
135
+ if constraints is not None and not self.search.supports_constraints:
136
+ raise NotImplementedError(
137
+ "Target-side constraints were provided, but search method doesn't support them"
138
+ )
139
+
140
+ # Initialize constraints, when active
141
+ self.search.init_constraints(constraints, self.beam_size)
142
+
143
+ # compute the encoder output for each beam
144
+ with torch.autograd.profiler.record_function("EnsembleModel: forward_encoder"):
145
+ encoder_outs = self.model.forward_encoder(net_input)
146
+
147
+ finalized = self.generate_decoder(
148
+ encoder_outs,
149
+ src_tokens,
150
+ src_lengths,
151
+ sample,
152
+ prefix_tokens,
153
+ constraints,
154
+ bos_token,
155
+ )
156
+ return finalized
157
+
158
+ def generate_decoder(
159
+ self,
160
+ encoder_outs,
161
+ src_tokens,
162
+ src_lengths,
163
+ sample: Dict[str, Dict[str, Tensor]],
164
+ prefix_tokens: Optional[Tensor] = None,
165
+ constraints: Optional[Tensor] = None,
166
+ bos_token: Optional[int] = None,
167
+ aux_task_name="",
168
+ encoder_outs_aug: Optional[
169
+ Tensor
170
+ ] = None, # an additional/augmented encoder_outs
171
+ ):
172
+ incremental_states = torch.jit.annotate(
173
+ List[Dict[str, Dict[str, Optional[Tensor]]]],
174
+ [
175
+ torch.jit.annotate(Dict[str, Dict[str, Optional[Tensor]]], {})
176
+ for i in range(self.model.models_size)
177
+ ],
178
+ )
179
+
180
+ # bsz: total number of sentences in beam
181
+ # Note that src_tokens may have more than 2 dimensions (i.e. audio features)
182
+ bsz, src_len = src_tokens.size()[:2]
183
+ beam_size = self.beam_size
184
+
185
+ decoder_name = f"{aux_task_name}_decoder" if aux_task_name else "decoder"
186
+
187
+ max_len: int = -1
188
+ if self.match_source_len:
189
+ max_len = src_lengths.max().item()
190
+ else:
191
+ max_len = min(
192
+ int(self.max_len_a * src_len + self.max_len_b),
193
+ self.max_len - 1,
194
+ )
195
+ assert (
196
+ self.min_len <= max_len
197
+ ), "min_len cannot be larger than max_len, please adjust these!"
198
+
199
+ # placeholder of indices for bsz * beam_size to hold tokens and accumulative scores
200
+ new_order = torch.arange(bsz).view(-1, 1).repeat(1, beam_size).view(-1)
201
+ new_order = new_order.to(src_tokens.device).long()
202
+ encoder_outs = self.model.reorder_encoder_out(encoder_outs, new_order)
203
+ # ensure encoder_outs is a List.
204
+ assert encoder_outs is not None
205
+ if encoder_outs_aug is not None:
206
+ encoder_outs_aug = self.model.reorder_encoder_out(
207
+ encoder_outs_aug, new_order
208
+ )
209
+
210
+ # initialize buffers
211
+ scores = (
212
+ torch.zeros(bsz * beam_size, max_len + 1).to(src_tokens).float()
213
+ ) # +1 for eos; pad is never chosen for scoring
214
+ tokens = (
215
+ torch.zeros(bsz * beam_size, max_len + 2)
216
+ .to(src_tokens)
217
+ .long()
218
+ .fill_(self.pad)
219
+ ) # +2 for eos and pad
220
+ tokens[:, 0] = self.eos if bos_token is None else bos_token
221
+ attn: Optional[Tensor] = None
222
+
223
+ # A list that indicates candidates that should be ignored.
224
+ # For example, suppose we're sampling and have already finalized 2/5
225
+ # samples. Then cands_to_ignore would mark 2 positions as being ignored,
226
+ # so that we only finalize the remaining 3 samples.
227
+ cands_to_ignore = (
228
+ torch.zeros(bsz, beam_size).to(src_tokens).eq(-1)
229
+ ) # forward and backward-compatible False mask
230
+
231
+ # list of completed sentences
232
+ finalized = torch.jit.annotate(
233
+ List[List[Dict[str, Tensor]]],
234
+ [torch.jit.annotate(List[Dict[str, Tensor]], []) for i in range(bsz)],
235
+ ) # contains lists of dictionaries of information about the hypothesis being finalized at each step
236
+
237
+ # a boolean array indicating if the sentence at the index is finished or not
238
+ finished = [False for i in range(bsz)]
239
+ num_remaining_sent = bsz # number of sentences remaining
240
+
241
+ # number of candidate hypos per step
242
+ cand_size = 2 * beam_size # 2 x beam size in case half are EOS
243
+
244
+ # offset arrays for converting between different indexing schemes
245
+ bbsz_offsets = (
246
+ (torch.arange(0, bsz) * beam_size)
247
+ .unsqueeze(1)
248
+ .type_as(tokens)
249
+ .to(src_tokens.device)
250
+ )
251
+ cand_offsets = torch.arange(0, cand_size).type_as(tokens).to(src_tokens.device)
252
+
253
+ reorder_state: Optional[Tensor] = None
254
+ batch_idxs: Optional[Tensor] = None
255
+
256
+ original_batch_idxs: Optional[Tensor] = None
257
+ if "id" in sample and isinstance(sample["id"], Tensor):
258
+ original_batch_idxs = sample["id"]
259
+ else:
260
+ original_batch_idxs = torch.arange(0, bsz).type_as(tokens)
261
+
262
+ for step in range(max_len + 1): # one extra step for EOS marker
263
+ # reorder decoder internal states based on the prev choice of beams
264
+ if reorder_state is not None:
265
+ if batch_idxs is not None:
266
+ # update beam indices to take into account removed sentences
267
+ corr = batch_idxs - torch.arange(batch_idxs.numel()).type_as(
268
+ batch_idxs
269
+ )
270
+ reorder_state.view(-1, beam_size).add_(
271
+ corr.unsqueeze(-1) * beam_size
272
+ )
273
+ original_batch_idxs = original_batch_idxs[batch_idxs]
274
+ self.model.reorder_incremental_state(
275
+ incremental_states, reorder_state, decoder_name
276
+ )
277
+ encoder_outs = self.model.reorder_encoder_out(
278
+ encoder_outs, reorder_state
279
+ )
280
+ if encoder_outs_aug is not None:
281
+ encoder_outs_aug = self.model.reorder_encoder_out(
282
+ encoder_outs_aug, reorder_state
283
+ )
284
+ with torch.autograd.profiler.record_function(
285
+ "EnsembleModel: forward_decoder"
286
+ ):
287
+ lprobs, avg_attn_scores = self.model.forward_decoder(
288
+ tokens[:, : step + 1],
289
+ encoder_outs,
290
+ incremental_states,
291
+ self.temperature,
292
+ decoder_name=decoder_name,
293
+ encoder_outs_aug=encoder_outs_aug,
294
+ )
295
+
296
+ if self.lm_model is not None and not aux_task_name:
297
+ lm_out = self.lm_model(tokens[:, : step + 1])
298
+ probs = self.lm_model.get_normalized_probs(
299
+ lm_out, log_probs=True, sample=None
300
+ )
301
+ probs = probs[:, -1, :] * self.lm_weight
302
+ lprobs += probs
303
+
304
+ lprobs[lprobs != lprobs] = torch.tensor(-math.inf).to(lprobs)
305
+
306
+ lprobs[:, self.pad] = -math.inf # never select pad
307
+ lprobs[:, self.unk] -= self.unk_penalty # apply unk penalty
308
+
309
+ # handle max length constraint
310
+ if step >= max_len:
311
+ lprobs[:, : self.eos] = -math.inf
312
+ lprobs[:, self.eos + 1 :] = -math.inf
313
+
314
+ # handle prefix tokens (possibly with different lengths)
315
+ if (
316
+ prefix_tokens is not None
317
+ and step < prefix_tokens.size(1)
318
+ and step < max_len
319
+ ):
320
+ lprobs, tokens, scores = self._prefix_tokens(
321
+ step, lprobs, scores, tokens, prefix_tokens, beam_size
322
+ )
323
+ else:
324
+ if step < self.min_len:
325
+ # minimum length constraint (does not apply if using prefix_tokens)
326
+ lprobs[:, self.eos] = -math.inf
327
+
328
+ if self.token_indices_to_suppress is not None:
329
+ lprobs[:, self.token_indices_to_suppress] = -math.inf
330
+
331
+ # Record attention scores, only support avg_attn_scores is a Tensor
332
+ if avg_attn_scores is not None:
333
+ if attn is None:
334
+ attn = torch.empty(
335
+ bsz * beam_size, avg_attn_scores.size(1), max_len + 2
336
+ ).to(scores)
337
+ attn[:, :, step + 1].copy_(avg_attn_scores)
338
+
339
+ scores = scores.type_as(lprobs)
340
+ eos_bbsz_idx = torch.empty(0).to(
341
+ tokens
342
+ ) # indices of hypothesis ending with eos (finished sentences)
343
+ eos_scores = torch.empty(0).to(
344
+ scores
345
+ ) # scores of hypothesis ending with eos (finished sentences)
346
+
347
+ if self.should_set_src_lengths:
348
+ self.search.set_src_lengths(src_lengths)
349
+
350
+ if self.repeat_ngram_blocker is not None:
351
+ lprobs = self.repeat_ngram_blocker(tokens, lprobs, bsz, beam_size, step)
352
+
353
+ # Shape: (batch, cand_size)
354
+ cand_scores, cand_indices, cand_beams = self.search.step(
355
+ step,
356
+ lprobs.view(bsz, -1, self.vocab_size),
357
+ scores.view(bsz, beam_size, -1)[:, :, :step],
358
+ tokens[:, : step + 1],
359
+ original_batch_idxs,
360
+ )
361
+
362
+ # cand_bbsz_idx contains beam indices for the top candidate
363
+ # hypotheses, with a range of values: [0, bsz*beam_size),
364
+ # and dimensions: [bsz, cand_size]
365
+ cand_bbsz_idx = cand_beams.add(bbsz_offsets)
366
+
367
+ # finalize hypotheses that end in eos
368
+ # Shape of eos_mask: (batch size, beam size)
369
+ eos_mask = cand_indices.eq(self.eos) & cand_scores.ne(-math.inf)
370
+ eos_mask[:, :beam_size][cands_to_ignore] = torch.tensor(0).to(eos_mask)
371
+
372
+ # only consider eos when it's among the top beam_size indices
373
+ # Now we know what beam item(s) to finish
374
+ # Shape: 1d list of absolute-numbered
375
+ eos_bbsz_idx = torch.masked_select(
376
+ cand_bbsz_idx[:, :beam_size], mask=eos_mask[:, :beam_size]
377
+ )
378
+
379
+ finalized_sents: List[int] = []
380
+ if eos_bbsz_idx.numel() > 0:
381
+ eos_scores = torch.masked_select(
382
+ cand_scores[:, :beam_size], mask=eos_mask[:, :beam_size]
383
+ )
384
+
385
+ finalized_sents = self.finalize_hypos(
386
+ step,
387
+ eos_bbsz_idx,
388
+ eos_scores,
389
+ tokens,
390
+ scores,
391
+ finalized,
392
+ finished,
393
+ beam_size,
394
+ attn,
395
+ src_lengths,
396
+ max_len,
397
+ )
398
+ num_remaining_sent -= len(finalized_sents)
399
+
400
+ assert num_remaining_sent >= 0
401
+ if num_remaining_sent == 0:
402
+ break
403
+ if self.search.stop_on_max_len and step >= max_len:
404
+ break
405
+ assert step < max_len, f"{step} < {max_len}"
406
+
407
+ # Remove finalized sentences (ones for which {beam_size}
408
+ # finished hypotheses have been generated) from the batch.
409
+ if len(finalized_sents) > 0:
410
+ new_bsz = bsz - len(finalized_sents)
411
+
412
+ # construct batch_idxs which holds indices of batches to keep for the next pass
413
+ batch_mask = torch.ones(
414
+ bsz, dtype=torch.bool, device=cand_indices.device
415
+ )
416
+ batch_mask[finalized_sents] = False
417
+ # TODO replace `nonzero(as_tuple=False)` after TorchScript supports it
418
+ batch_idxs = torch.arange(
419
+ bsz, device=cand_indices.device
420
+ ).masked_select(batch_mask)
421
+
422
+ # Choose the subset of the hypothesized constraints that will continue
423
+ self.search.prune_sentences(batch_idxs)
424
+
425
+ eos_mask = eos_mask[batch_idxs]
426
+ cand_beams = cand_beams[batch_idxs]
427
+ bbsz_offsets.resize_(new_bsz, 1)
428
+ cand_bbsz_idx = cand_beams.add(bbsz_offsets)
429
+ cand_scores = cand_scores[batch_idxs]
430
+ cand_indices = cand_indices[batch_idxs]
431
+
432
+ if prefix_tokens is not None:
433
+ prefix_tokens = prefix_tokens[batch_idxs]
434
+ src_lengths = src_lengths[batch_idxs]
435
+ cands_to_ignore = cands_to_ignore[batch_idxs]
436
+
437
+ scores = scores.view(bsz, -1)[batch_idxs].view(new_bsz * beam_size, -1)
438
+ tokens = tokens.view(bsz, -1)[batch_idxs].view(new_bsz * beam_size, -1)
439
+ if attn is not None:
440
+ attn = attn.view(bsz, -1)[batch_idxs].view(
441
+ new_bsz * beam_size, attn.size(1), -1
442
+ )
443
+ bsz = new_bsz
444
+ else:
445
+ batch_idxs = None
446
+
447
+ # Set active_mask so that values > cand_size indicate eos hypos
448
+ # and values < cand_size indicate candidate active hypos.
449
+ # After, the min values per row are the top candidate active hypos
450
+
451
+ # Rewrite the operator since the element wise or is not supported in torchscript.
452
+
453
+ eos_mask[:, :beam_size] = ~((~cands_to_ignore) & (~eos_mask[:, :beam_size]))
454
+ active_mask = torch.add(
455
+ eos_mask.type_as(cand_offsets) * cand_size,
456
+ cand_offsets[: eos_mask.size(1)],
457
+ )
458
+
459
+ # get the top beam_size active hypotheses, which are just
460
+ # the hypos with the smallest values in active_mask.
461
+ # {active_hypos} indicates which {beam_size} hypotheses
462
+ # from the list of {2 * beam_size} candidates were
463
+ # selected. Shapes: (batch size, beam size)
464
+ new_cands_to_ignore, active_hypos = torch.topk(
465
+ active_mask, k=beam_size, dim=1, largest=False
466
+ )
467
+
468
+ # update cands_to_ignore to ignore any finalized hypos.
469
+ cands_to_ignore = new_cands_to_ignore.ge(cand_size)[:, :beam_size]
470
+ # Make sure there is at least one active item for each sentence in the batch.
471
+ assert (~cands_to_ignore).any(dim=1).all()
472
+
473
+ # update cands_to_ignore to ignore any finalized hypos
474
+
475
+ # {active_bbsz_idx} denotes which beam number is continued for each new hypothesis (a beam
476
+ # can be selected more than once).
477
+ active_bbsz_idx = torch.gather(cand_bbsz_idx, dim=1, index=active_hypos)
478
+ active_scores = torch.gather(cand_scores, dim=1, index=active_hypos)
479
+
480
+ active_bbsz_idx = active_bbsz_idx.view(-1)
481
+ active_scores = active_scores.view(-1)
482
+
483
+ # copy tokens and scores for active hypotheses
484
+
485
+ # Set the tokens for each beam (can select the same row more than once)
486
+ tokens[:, : step + 1] = torch.index_select(
487
+ tokens[:, : step + 1], dim=0, index=active_bbsz_idx
488
+ )
489
+ # Select the next token for each of them
490
+ tokens.view(bsz, beam_size, -1)[:, :, step + 1] = torch.gather(
491
+ cand_indices, dim=1, index=active_hypos
492
+ )
493
+ if step > 0:
494
+ scores[:, :step] = torch.index_select(
495
+ scores[:, :step], dim=0, index=active_bbsz_idx
496
+ )
497
+ scores.view(bsz, beam_size, -1)[:, :, step] = torch.gather(
498
+ cand_scores, dim=1, index=active_hypos
499
+ )
500
+
501
+ # Update constraints based on which candidates were selected for the next beam
502
+ self.search.update_constraints(active_hypos)
503
+
504
+ # copy attention for active hypotheses
505
+ if attn is not None:
506
+ attn[:, :, : step + 2] = torch.index_select(
507
+ attn[:, :, : step + 2], dim=0, index=active_bbsz_idx
508
+ )
509
+
510
+ # reorder incremental state in decoder
511
+ reorder_state = active_bbsz_idx
512
+
513
+ # sort by score descending
514
+ for sent in range(len(finalized)):
515
+ scores = torch.tensor(
516
+ [float(elem["score"].item()) for elem in finalized[sent]]
517
+ )
518
+ _, sorted_scores_indices = torch.sort(scores, descending=True)
519
+ finalized[sent] = [finalized[sent][ssi] for ssi in sorted_scores_indices]
520
+ finalized[sent] = torch.jit.annotate(
521
+ List[Dict[str, Tensor]], finalized[sent]
522
+ )
523
+ return finalized
524
+
525
+
526
+ class EnsembleModel(EnsembleModelBase):
527
+ """A wrapper around an ensemble of models."""
528
+
529
+ def __init__(self, models):
530
+ super().__init__(models)
531
+
532
+ @torch.jit.export
533
+ def forward_decoder(
534
+ self,
535
+ tokens,
536
+ encoder_outs: List[Dict[str, List[Tensor]]],
537
+ incremental_states: List[Dict[str, Dict[str, Optional[Tensor]]]],
538
+ temperature: float = 1.0,
539
+ decoder_name="decoder",
540
+ encoder_outs_aug: List[Dict[str, List[Tensor]]] = None,
541
+ ):
542
+ log_probs = []
543
+ avg_attn: Optional[Tensor] = None
544
+ encoder_out: Optional[Dict[str, List[Tensor]]] = None
545
+ encoder_out_aug: Optional[Dict[str, List[Tensor]]] = None
546
+ for i, model in enumerate(self.models):
547
+ if self.has_encoder():
548
+ encoder_out = encoder_outs[i]
549
+ if encoder_outs_aug is not None:
550
+ encoder_out_aug = encoder_outs_aug[i]
551
+ # decode each model
552
+ if self.has_incremental_states():
553
+ if encoder_out_aug is not None:
554
+ decoder_out = getattr(model, decoder_name).forward(
555
+ tokens,
556
+ encoder_out=encoder_out,
557
+ encoder_out_aug=encoder_out_aug,
558
+ incremental_state=incremental_states[i],
559
+ )
560
+ else:
561
+ decoder_out = getattr(model, decoder_name).forward(
562
+ tokens,
563
+ encoder_out=encoder_out,
564
+ incremental_state=incremental_states[i],
565
+ )
566
+ else:
567
+ if hasattr(model, decoder_name):
568
+ decoder_out = getattr(model, decoder_name).forward(
569
+ tokens, encoder_out=encoder_out
570
+ )
571
+ else:
572
+ decoder_out = model.forward(tokens)
573
+
574
+ attn: Optional[Tensor] = None
575
+ decoder_len = len(decoder_out)
576
+ if decoder_len > 1 and decoder_out[1] is not None:
577
+ if isinstance(decoder_out[1], Tensor):
578
+ attn = decoder_out[1]
579
+ else:
580
+ attn_holder = decoder_out[1]["attn"]
581
+ if isinstance(attn_holder, Tensor):
582
+ attn = attn_holder
583
+ elif attn_holder is not None:
584
+ attn = attn_holder[0]
585
+ if attn is not None:
586
+ attn = attn[:, -1, :]
587
+
588
+ decoder_out_tuple = (
589
+ decoder_out[0][:, -1:, :].div_(temperature),
590
+ None if decoder_len <= 1 else decoder_out[1],
591
+ )
592
+ probs = getattr(model, decoder_name).get_normalized_probs(
593
+ decoder_out_tuple, log_probs=True, sample=None
594
+ )
595
+ probs = probs[:, -1, :]
596
+ if self.models_size == 1:
597
+ return probs, attn
598
+
599
+ log_probs.append(probs)
600
+ if attn is not None:
601
+ if avg_attn is None:
602
+ avg_attn = attn
603
+ else:
604
+ avg_attn.add_(attn)
605
+
606
+ avg_probs = torch.logsumexp(torch.stack(log_probs, dim=0), dim=0) - math.log(
607
+ self.models_size
608
+ )
609
+
610
+ if avg_attn is not None:
611
+ avg_attn.div_(self.models_size)
612
+ return avg_probs, avg_attn
613
+
614
+ @torch.jit.export
615
+ def reorder_incremental_state(
616
+ self,
617
+ incremental_states: List[Dict[str, Dict[str, Optional[Tensor]]]],
618
+ new_order,
619
+ decoder_name="decoder",
620
+ ):
621
+ if not self.has_incremental_states():
622
+ return
623
+ for i, model in enumerate(self.models):
624
+ getattr(model, decoder_name).reorder_incremental_state_scripting(
625
+ incremental_states[i], new_order
626
+ )
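A note on the ensemble averaging in `forward_decoder` above: combining the per-model log-probabilities with `torch.logsumexp(...) - math.log(models_size)` is exactly the log of the arithmetic mean of the probabilities. A minimal, self-contained check (illustrative only, not part of fairseq):

```python
import math

import torch

# Average per-model log-probs the way forward_decoder does, then confirm that it
# equals the log of the mean probability across the ensemble.
log_probs = [torch.log_softmax(torch.randn(2, 8), dim=-1) for _ in range(3)]
avg_log_probs = torch.logsumexp(torch.stack(log_probs, dim=0), dim=0) - math.log(len(log_probs))
mean_probs = torch.stack([lp.exp() for lp in log_probs], dim=0).mean(dim=0)
assert torch.allclose(avg_log_probs.exp(), mean_probs, atol=1e-6)
```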
fairseq/examples/speech_to_speech/unity/sequence_generator_multi_decoder.py ADDED
@@ -0,0 +1,267 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ from typing import Dict, List, Optional
7
+
8
+ import torch
9
+ import torch.nn as nn
10
+ from torch import Tensor
11
+
12
+ from fairseq import search
13
+
14
+
15
+ class MultiDecoderSequenceGenerator(nn.Module):
16
+ def __init__(
17
+ self,
18
+ models,
19
+ tgt_dict,
20
+ tgt_dict_mt,
21
+ beam_size=1,
22
+ beam_size_mt=1,
23
+ max_len_a=0,
24
+ max_len_b=200,
25
+ max_len_a_mt=0,
26
+ max_len_b_mt=200,
27
+ max_len=0,
28
+ min_len=1,
29
+ normalize_scores=True,
30
+ len_penalty=1.0,
31
+ len_penalty_mt=1.0,
32
+ unk_penalty=0.0,
33
+ temperature=1.0,
34
+ match_source_len=False,
35
+ no_repeat_ngram_size=0,
36
+ eos=None,
37
+ eos_mt=None,
38
+ symbols_to_strip_from_output=None,
39
+ lm_model=None,
40
+ lm_weight=1.0,
41
+ ):
42
+ """Generates translations of a given source sentence.
43
+
44
+ Args:
45
+ models (List[~fairseq.models.FairseqModel]): ensemble of models,
46
+ currently supports fairseq.models.TransformerModel for scripting
47
+ beam_size (int, optional): beam width (default: 1)
48
+ max_len_a/b (int, optional): generate sequences of maximum length
49
+ ax + b, where x is the source length for the second pass
50
+ max_len_a_mt/b_mt (int, optional): generate sequences of maximum length
51
+ ax + b, where x is the source length for the first pass
52
+ max_len (int, optional): the maximum length of the generated output
53
+ (not including end-of-sentence)
54
+ min_len (int, optional): the minimum length of the generated output
55
+ (not including end-of-sentence)
56
+ normalize_scores (bool, optional): normalize scores by the length
57
+ of the output (default: True)
58
+ len_penalty (float, optional): length penalty in the second pass, where <1.0 favors
59
+ shorter, >1.0 favors longer sentences (default: 1.0)
60
+ len_penalty_mt (float, optional): length penalty in the first pass, where <1.0 favors
61
+ shorter, >1.0 favors longer sentences (default: 1.0)
62
+ unk_penalty (float, optional): unknown word penalty, where <0
63
+ produces more unks, >0 produces fewer (default: 0.0)
64
+ temperature (float, optional): temperature, where values
65
+ >1.0 produce more uniform samples and values <1.0 produce
66
+ sharper samples (default: 1.0)
67
+ match_source_len (bool, optional): outputs should match the source
68
+ length (default: False)
69
+ """
70
+ super().__init__()
71
+
72
+ from examples.speech_to_speech.unity.sequence_generator import SequenceGenerator
73
+
74
+ self.generator = SequenceGenerator(
75
+ models,
76
+ tgt_dict,
77
+ beam_size=beam_size,
78
+ max_len_a=max_len_a,
79
+ max_len_b=max_len_b,
80
+ max_len=max_len,
81
+ min_len=min_len,
82
+ normalize_scores=normalize_scores,
83
+ len_penalty=len_penalty,
84
+ unk_penalty=unk_penalty,
85
+ temperature=temperature,
86
+ match_source_len=match_source_len,
87
+ no_repeat_ngram_size=no_repeat_ngram_size,
88
+ search_strategy=search.BeamSearch(tgt_dict),
89
+ eos=eos,
90
+ symbols_to_strip_from_output=symbols_to_strip_from_output,
91
+ lm_model=lm_model,
92
+ lm_weight=lm_weight,
93
+ )
94
+ self.eos = self.generator.eos
95
+
96
+ self.generator_mt = SequenceGenerator(
97
+ models,
98
+ tgt_dict_mt,
99
+ beam_size=beam_size_mt,
100
+ max_len_a=max_len_a_mt,
101
+ max_len_b=max_len_b_mt,
102
+ max_len=max_len,
103
+ min_len=min_len,
104
+ normalize_scores=normalize_scores,
105
+ len_penalty=len_penalty_mt,
106
+ unk_penalty=unk_penalty,
107
+ temperature=temperature,
108
+ match_source_len=match_source_len,
109
+ no_repeat_ngram_size=no_repeat_ngram_size,
110
+ search_strategy=search.BeamSearch(tgt_dict_mt),
111
+ eos=eos_mt,
112
+ symbols_to_strip_from_output=symbols_to_strip_from_output,
113
+ )
114
+
115
+ @torch.no_grad()
116
+ def generate(
117
+ self, models, sample: Dict[str, Dict[str, Tensor]], **kwargs
118
+ ) -> List[List[Dict[str, Tensor]]]:
119
+ """Generate translations. Matches the API of other fairseq generators.
120
+
121
+ Args:
122
+ models (List[~fairseq.models.FairseqModel]): ensemble of models
123
+ sample (dict): batch
124
+ prefix_tokens (torch.LongTensor, optional): force decoder to begin
125
+ with these tokens
126
+ constraints (torch.LongTensor, optional): force decoder to include
127
+ the list of constraints
128
+ bos_token (int, optional): beginning of sentence token
129
+ (default: self.eos)
130
+ """
131
+ return self._generate(sample, **kwargs)
132
+
133
+ def _generate(
134
+ self,
135
+ sample: Dict[str, Dict[str, Tensor]],
136
+ prefix_tokens: Optional[Tensor] = None,
137
+ constraints: Optional[Tensor] = None,
138
+ bos_token: Optional[int] = None,
139
+ ):
140
+ net_input = sample["net_input"]
141
+
142
+ if "src_tokens" in net_input:
143
+ src_tokens = net_input["src_tokens"]
144
+ # length of the source text: the number of tokens, excluding end-of-sentence and pad
145
+ # if src_lengths exists in net_input (speech_to_text dataset case), then use it
146
+ if "src_lengths" in net_input:
147
+ src_lengths = net_input["src_lengths"]
148
+ else:
149
+ src_lengths = (
150
+ (
151
+ src_tokens.ne(self.generator.eos)
152
+ & src_tokens.ne(self.generator.pad)
153
+ )
154
+ .long()
155
+ .sum(dim=1)
156
+ )
157
+ else:
158
+ raise Exception(
159
+ "expected src_tokens or source in net input. input keys: "
160
+ + str(net_input.keys())
161
+ )
162
+
163
+ if constraints is not None and not self.generator.search.supports_constraints:
164
+ raise NotImplementedError(
165
+ "Target-side constraints were provided, but search method doesn't support them"
166
+ )
167
+
168
+ # Initialize constraints, when active
169
+ self.generator.search.init_constraints(constraints, self.generator.beam_size)
170
+ self.generator_mt.search.init_constraints(
171
+ constraints, self.generator_mt.beam_size
172
+ )
173
+
174
+ # compute the encoder output for each beam
175
+ with torch.autograd.profiler.record_function("EnsembleModel: forward_encoder"):
176
+ encoder_outs = self.generator.model.forward_encoder(net_input)
177
+
178
+ single_model = self.generator.model.single_model
179
+ mt_decoder = getattr(single_model, f"{single_model.mt_task_name}_decoder")
180
+
181
+ # 1. MT decoder
182
+ finalized_mt = self.generator_mt.generate_decoder(
183
+ encoder_outs,
184
+ src_tokens,
185
+ src_lengths,
186
+ sample,
187
+ prefix_tokens,
188
+ constraints,
189
+ bos_token,
190
+ aux_task_name=single_model.mt_task_name,
191
+ )
192
+
193
+ # extract decoder output corresponding to the best hypothesis
194
+ max_tgt_len = max([len(hypo[0]["tokens"]) for hypo in finalized_mt])
195
+ prev_output_tokens_mt = (
196
+ src_tokens.new_zeros(src_tokens.shape[0], max_tgt_len)
197
+ .fill_(mt_decoder.padding_idx)
198
+ .int()
199
+ ) # B x T
200
+ for i, hypo in enumerate(finalized_mt):
201
+ i_beam = 0
202
+ tmp = hypo[i_beam]["tokens"].int() # hyp + eos
203
+ prev_output_tokens_mt[i, 0] = self.generator_mt.eos
204
+ if tmp[-1] == self.generator_mt.eos:
205
+ tmp = tmp[:-1]
206
+ prev_output_tokens_mt[i, 1 : len(tmp) + 1] = tmp
207
+
208
+ text = "".join([self.generator_mt.tgt_dict[c] for c in tmp])
209
+ text = text.replace("_", " ")
210
+ text = text.replace("▁", " ")
211
+ text = text.replace("<unk>", " ")
212
+ text = text.replace("<s>", "")
213
+ text = text.replace("</s>", "")
214
+ if len(text) > 0 and text[0] == " ":
215
+ text = text[1:]
216
+ sample_id = sample["id"].tolist()[i]
217
+ print("{} (None-{})".format(text, sample_id))
218
+
219
+ x = mt_decoder(
220
+ prev_output_tokens_mt,
221
+ encoder_out=encoder_outs[0],
222
+ features_only=True,
223
+ )[0].transpose(0, 1)
224
+
225
+ if getattr(single_model, "proj", None) is not None:
226
+ x = single_model.proj(x)
227
+
228
+ mt_decoder_padding_mask = None
229
+ if prev_output_tokens_mt.eq(mt_decoder.padding_idx).any():
230
+ mt_decoder_padding_mask = prev_output_tokens_mt.eq(mt_decoder.padding_idx)
231
+
232
+ # 2. T2U encoder
233
+ if getattr(single_model, "synthesizer_encoder", None) is not None:
234
+ t2u_encoder_out = single_model.synthesizer_encoder(
235
+ x,
236
+ mt_decoder_padding_mask,
237
+ )
238
+ else:
239
+ t2u_encoder_out = {
240
+ "encoder_out": [x], # T x B x C
241
+ "encoder_padding_mask": [mt_decoder_padding_mask]
242
+ if mt_decoder_padding_mask is not None
243
+ else [], # B x T
244
+ "encoder_embedding": [],
245
+ "encoder_states": [],
246
+ "src_tokens": [],
247
+ "src_lengths": [],
248
+ }
249
+
250
+ if getattr(single_model, "t2u_augmented_cross_attn", False):
251
+ encoder_outs_aug = [t2u_encoder_out]
252
+ else:
253
+ encoder_outs = [t2u_encoder_out]
254
+ encoder_outs_aug = None
255
+
256
+ # 3. T2U decoder
257
+ finalized = self.generator.generate_decoder(
258
+ encoder_outs,
259
+ src_tokens,
260
+ src_lengths,
261
+ sample,
262
+ prefix_tokens,
263
+ constraints,
264
+ bos_token,
265
+ encoder_outs_aug=encoder_outs_aug,
266
+ )
267
+ return finalized
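To make the first-pass/second-pass handoff in `_generate` above concrete, here is a self-contained toy rendering of how the best MT hypotheses are turned into `prev_output_tokens_mt`, which is then fed back through the MT decoder (with `features_only=True`) to produce inputs for the text-to-unit pass: each hypothesis is prefixed with EOS (acting as BOS), its trailing EOS is stripped, and the batch is right-padded. The token values and the EOS/pad indices below are made up for illustration.

```python
import torch

eos, pad = 2, 1  # toy indices; in practice they come from the MT target dictionary
best_hypos = [torch.tensor([5, 6, 7, eos]), torch.tensor([8, 9, eos])]  # best beam per sentence

max_tgt_len = max(len(h) for h in best_hypos)
prev_output_tokens_mt = torch.full((len(best_hypos), max_tgt_len), pad, dtype=torch.int)
for i, tmp in enumerate(best_hypos):
    prev_output_tokens_mt[i, 0] = eos      # EOS doubles as the BOS of the second pass
    if tmp[-1] == eos:
        tmp = tmp[:-1]                     # drop the trailing EOS
    prev_output_tokens_mt[i, 1 : len(tmp) + 1] = tmp

print(prev_output_tokens_mt)
# tensor([[2, 5, 6, 7],
#         [2, 8, 9, 1]], dtype=torch.int32)
```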
fairseq/examples/speech_to_text/README.md ADDED
@@ -0,0 +1,77 @@
1
+ # Speech-to-Text (S2T) Modeling
2
+
3
+ [https://www.aclweb.org/anthology/2020.aacl-demo.6](https://www.aclweb.org/anthology/2020.aacl-demo.6.pdf)
4
+
5
+ Speech recognition (ASR) and speech-to-text translation (ST) with fairseq.
6
+
7
+ ## Data Preparation
8
+ S2T modeling data consists of source speech features, target text and other optional information
9
+ (source text, speaker id, etc.). Fairseq S2T uses per-dataset-split TSV manifest files
10
+ to store this information. Each data field is represented by a column in the TSV file.
11
+
12
+ Unlike text token embeddings, speech features (e.g. log mel-scale filter banks) are usually fixed
13
+ during model training and can be pre-computed. The manifest file contains the path to
14
+ either the feature file in NumPy format or the WAV/FLAC audio file. For the latter,
15
+ features will be extracted on-the-fly by fairseq S2T. Optionally, feature/audio files can be packed
16
+ into uncompressed ZIP files (then accessed via byte offset and length) to improve I/O performance.
17
+
18
+ Fairseq S2T also employs a YAML file for data related configurations: tokenizer type and dictionary path
19
+ for the target text, feature transforms such as CMVN (cepstral mean and variance normalization) and SpecAugment,
20
+ temperature-based resampling, etc.
21
+
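For concreteness, here is a minimal sketch of a single manifest row, written with the `save_df_to_tsv` helper from `examples/speech_to_text/data_utils.py` (added later in this commit). The exact column set depends on the prep script; `audio`, `n_frames` and `tgt_text` are the columns the filtering utilities expect, while `id` and `speaker` are typical additional fields.

```python
import pandas as pd

from examples.speech_to_text.data_utils import save_df_to_tsv

# One row per utterance: a feature/audio path (ZIP entries are addressed as
# "archive.zip:<byte offset>:<length>"), the number of frames, and the target text.
manifest = {
    "id": ["utt_0001"],
    "audio": ["fbank80.zip:1024:61440"],
    "n_frames": [384],
    "tgt_text": ["hello world"],
    "speaker": ["spk_01"],
}
save_df_to_tsv(pd.DataFrame.from_dict(manifest), "train_example.tsv")
```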
22
+ ## Model Training
23
+ Fairseq S2T uses the unified `fairseq-train` interface for model training. It requires arguments `--task speech_to_text`,
24
+ `--arch <model architecture in fairseq.models.speech_to_text.*>` and `--config-yaml <config YAML filename>`.
25
+
26
+ ## Inference & Evaluation
27
+ Fairseq S2T uses the unified `fairseq-generate`/`fairseq-interactive` interface for inference and evaluation. It
28
+ requires arguments `--task speech_to_text` and `--config-yaml <config YAML filename>`. The interactive console takes
29
+ audio paths (one per line) as inputs.
30
+
31
+
32
+ ## Examples
33
+ - [Speech Recognition (ASR) on LibriSpeech](docs/librispeech_example.md)
34
+
35
+ - [Speech-to-Text Translation (ST) on MuST-C](docs/mustc_example.md)
36
+
37
+ - [Speech-to-Text Translation (ST) on CoVoST 2](docs/covost_example.md)
38
+
39
+ - [Speech-to-Text Translation (ST) on Multilingual TEDx](docs/mtedx_example.md)
40
+ - [Simultaneous Speech-to-Text Translation (SimulST) on MuST-C](docs/simulst_mustc_example.md)
41
+
42
+ ## Updates
43
+ - 02/04/2021: Added interactive decoding (`fairseq-interactive`) support. Examples:
44
+ [ASR (LibriSpeech)](docs/librispeech_example.md#interactive-decoding)
45
+ and [ST (CoVoST 2)](docs/covost_example.md#interactive-decoding).
46
+ - 01/08/2021: Several fixes for S2T Transformer model, inference-time de-tokenization, scorer configuration and data
47
+ preparation scripts. We also add pre-trained models to the examples and revise the instructions.
48
+ Breaking changes: the data preparation scripts now extract filterbank features without CMVN. CMVN is instead applied
49
+ on-the-fly (defined in the config YAML).
50
+
51
+ ## What's Next
52
+ - We are migrating the old fairseq [ASR example](../speech_recognition) into this S2T framework and
53
+ merging the features from both sides.
54
+ - The following papers also base their experiments on fairseq S2T. We are adding more examples for replication.
55
+ - [Improving Cross-Lingual Transfer Learning for End-to-End Speech Recognition with Speech Translation (Wang et al., 2020)](https://arxiv.org/abs/2006.05474)
56
+ - [Self-Supervised Representations Improve End-to-End Speech Translation (Wu et al., 2020)](https://arxiv.org/abs/2006.12124)
57
+ - [Self-Training for End-to-End Speech Translation (Pino et al., 2020)](https://arxiv.org/abs/2006.02490)
58
+ - [CoVoST: A Diverse Multilingual Speech-To-Text Translation Corpus (Wang et al., 2020)](https://arxiv.org/abs/2002.01320)
59
+ - [Harnessing Indirect Training Data for End-to-End Automatic Speech Translation: Tricks of the Trade (Pino et al., 2019)](https://arxiv.org/abs/1909.06515)
60
+
61
+ ## Citation
62
+ Please cite as:
63
+ ```
64
+ @inproceedings{wang2020fairseqs2t,
65
+ title = {fairseq S2T: Fast Speech-to-Text Modeling with fairseq},
66
+ author = {Changhan Wang and Yun Tang and Xutai Ma and Anne Wu and Dmytro Okhonko and Juan Pino},
67
+ booktitle = {Proceedings of the 2020 Conference of the Asian Chapter of the Association for Computational Linguistics (AACL): System Demonstrations},
68
+ year = {2020},
69
+ }
70
+
71
+ @inproceedings{ott2019fairseq,
72
+ title = {fairseq: A Fast, Extensible Toolkit for Sequence Modeling},
73
+ author = {Myle Ott and Sergey Edunov and Alexei Baevski and Angela Fan and Sam Gross and Nathan Ng and David Grangier and Michael Auli},
74
+ booktitle = {Proceedings of NAACL-HLT 2019: Demonstrations},
75
+ year = {2019},
76
+ }
77
+ ```
fairseq/examples/speech_to_text/data_utils.py ADDED
@@ -0,0 +1,383 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import csv
7
+ from pathlib import Path
8
+ import zipfile
9
+ from functools import reduce
10
+ from multiprocessing import cpu_count
11
+ from typing import Any, Dict, List, Optional, Union
12
+ import io
13
+
14
+ import numpy as np
15
+ import pandas as pd
16
+ import sentencepiece as sp
17
+ from fairseq.data.audio.audio_utils import (
18
+ convert_waveform, _get_kaldi_fbank, _get_torchaudio_fbank, is_npy_data,
19
+ is_sf_audio_data
20
+ )
21
+ import torch
22
+ import soundfile as sf
23
+ from tqdm import tqdm
24
+
25
+
26
+ UNK_TOKEN, UNK_TOKEN_ID = "<unk>", 3
27
+ BOS_TOKEN, BOS_TOKEN_ID = "<s>", 0
28
+ EOS_TOKEN, EOS_TOKEN_ID = "</s>", 2
29
+ PAD_TOKEN, PAD_TOKEN_ID = "<pad>", 1
30
+
31
+
32
+ def gen_vocab(
33
+ input_path: Path, output_path_prefix: Path, model_type="bpe",
34
+ vocab_size=1000, special_symbols: Optional[List[str]] = None
35
+ ):
36
+ # Train SentencePiece Model
37
+ arguments = [
38
+ f"--input={input_path.as_posix()}",
39
+ f"--model_prefix={output_path_prefix.as_posix()}",
40
+ f"--model_type={model_type}",
41
+ f"--vocab_size={vocab_size}",
42
+ "--character_coverage=1.0",
43
+ f"--num_threads={cpu_count()}",
44
+ f"--unk_id={UNK_TOKEN_ID}",
45
+ f"--bos_id={BOS_TOKEN_ID}",
46
+ f"--eos_id={EOS_TOKEN_ID}",
47
+ f"--pad_id={PAD_TOKEN_ID}",
48
+ ]
49
+ if special_symbols is not None:
50
+ _special_symbols = ",".join(special_symbols)
51
+ arguments.append(f"--user_defined_symbols={_special_symbols}")
52
+ sp.SentencePieceTrainer.Train(" ".join(arguments))
53
+ # Export fairseq dictionary
54
+ spm = sp.SentencePieceProcessor()
55
+ spm.Load(output_path_prefix.as_posix() + ".model")
56
+ vocab = {i: spm.IdToPiece(i) for i in range(spm.GetPieceSize())}
57
+ assert (
58
+ vocab.get(UNK_TOKEN_ID) == UNK_TOKEN
59
+ and vocab.get(PAD_TOKEN_ID) == PAD_TOKEN
60
+ and vocab.get(BOS_TOKEN_ID) == BOS_TOKEN
61
+ and vocab.get(EOS_TOKEN_ID) == EOS_TOKEN
62
+ )
63
+ vocab = {
64
+ i: s
65
+ for i, s in vocab.items()
66
+ if s not in {UNK_TOKEN, BOS_TOKEN, EOS_TOKEN, PAD_TOKEN}
67
+ }
68
+ with open(output_path_prefix.as_posix() + ".txt", "w") as f_out:
69
+ for _, s in sorted(vocab.items(), key=lambda x: x[0]):
70
+ f_out.write(f"{s} 1\n")
71
+
72
+
73
+ def extract_fbank_features(
74
+ waveform: torch.FloatTensor,
75
+ sample_rate: int,
76
+ output_path: Optional[Path] = None,
77
+ n_mel_bins: int = 80,
78
+ overwrite: bool = False,
79
+ ):
80
+ if output_path is not None and output_path.is_file() and not overwrite:
81
+ return
82
+
83
+ _waveform, _ = convert_waveform(waveform, sample_rate, to_mono=True)
84
+ # Kaldi compliance: 16-bit signed integers
85
+ _waveform = _waveform * (2 ** 15)
86
+ _waveform = _waveform.numpy()
87
+
88
+ features = _get_kaldi_fbank(_waveform, sample_rate, n_mel_bins)
89
+ if features is None:
90
+ features = _get_torchaudio_fbank(_waveform, sample_rate, n_mel_bins)
91
+ if features is None:
92
+ raise ImportError(
93
+ "Please install pyKaldi or torchaudio to enable fbank feature extraction"
94
+ )
95
+
96
+ if output_path is not None:
97
+ np.save(output_path.as_posix(), features)
98
+ return features
99
+
100
+
101
+ def create_zip(data_root: Path, zip_path: Path):
102
+ paths = list(data_root.glob("*.npy"))
103
+ paths.extend(data_root.glob("*.flac"))
104
+ with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_STORED) as f:
105
+ for path in tqdm(paths):
106
+ f.write(path, arcname=path.name)
107
+
108
+
109
+ def get_zip_manifest(
110
+ zip_path: Path, zip_root: Optional[Path] = None, is_audio=False
111
+ ):
112
+ _zip_path = Path.joinpath(zip_root or Path(""), zip_path)
113
+ with zipfile.ZipFile(_zip_path, mode="r") as f:
114
+ info = f.infolist()
115
+ paths, lengths = {}, {}
116
+ for i in tqdm(info):
117
+ utt_id = Path(i.filename).stem
118
+ offset, file_size = i.header_offset + 30 + len(i.filename), i.file_size
119
+ paths[utt_id] = f"{zip_path.as_posix()}:{offset}:{file_size}"
120
+ with open(_zip_path, "rb") as f:
121
+ f.seek(offset)
122
+ byte_data = f.read(file_size)
123
+ assert len(byte_data) > 1
124
+ if is_audio:
125
+ assert is_sf_audio_data(byte_data), i
126
+ else:
127
+ assert is_npy_data(byte_data), i
128
+ byte_data_fp = io.BytesIO(byte_data)
129
+ if is_audio:
130
+ lengths[utt_id] = sf.info(byte_data_fp).frames
131
+ else:
132
+ lengths[utt_id] = np.load(byte_data_fp).shape[0]
133
+ return paths, lengths
134
+
135
+
136
+ def gen_config_yaml(
137
+ manifest_root: Path,
138
+ spm_filename: Optional[str] = None,
139
+ vocab_name: Optional[str] = None,
140
+ yaml_filename: str = "config.yaml",
141
+ specaugment_policy: Optional[str] = "lb",
142
+ prepend_tgt_lang_tag: bool = False,
143
+ sampling_alpha: Optional[float] = None,
144
+ input_channels: Optional[int] = 1,
145
+ input_feat_per_channel: Optional[int] = 80,
146
+ audio_root: str = "",
147
+ cmvn_type: str = "utterance",
148
+ gcmvn_path: Optional[Path] = None,
149
+ extra=None
150
+ ):
151
+ manifest_root = manifest_root.absolute()
152
+ writer = S2TDataConfigWriter(manifest_root / yaml_filename)
153
+ assert spm_filename is not None or vocab_name is not None
154
+ vocab_name = spm_filename.replace(".model", ".txt") if vocab_name is None \
155
+ else vocab_name
156
+ writer.set_vocab_filename(vocab_name)
157
+ if input_channels is not None:
158
+ writer.set_input_channels(input_channels)
159
+ if input_feat_per_channel is not None:
160
+ writer.set_input_feat_per_channel(input_feat_per_channel)
161
+ specaugment_setters = {
162
+ "lb": writer.set_specaugment_lb_policy,
163
+ "ld": writer.set_specaugment_ld_policy,
164
+ "sm": writer.set_specaugment_sm_policy,
165
+ "ss": writer.set_specaugment_ss_policy,
166
+ }
167
+ specaugment_setter = specaugment_setters.get(specaugment_policy, None)
168
+ if specaugment_setter is not None:
169
+ specaugment_setter()
170
+ if spm_filename is not None:
171
+ writer.set_bpe_tokenizer(
172
+ {
173
+ "bpe": "sentencepiece",
174
+ "sentencepiece_model": (manifest_root / spm_filename).as_posix(),
175
+ }
176
+ )
177
+ if prepend_tgt_lang_tag:
178
+ writer.set_prepend_tgt_lang_tag(True)
179
+ if sampling_alpha is not None:
180
+ writer.set_sampling_alpha(sampling_alpha)
181
+
182
+ if cmvn_type not in ["global", "utterance"]:
183
+ raise NotImplementedError
184
+
185
+ if specaugment_policy is not None:
186
+ writer.set_feature_transforms(
187
+ "_train", [f"{cmvn_type}_cmvn", "specaugment"]
188
+ )
189
+ writer.set_feature_transforms("*", [f"{cmvn_type}_cmvn"])
190
+
191
+ if cmvn_type == "global":
192
+ if gcmvn_path is None:
193
+ raise ValueError("Please provide path of global cmvn file.")
194
+ else:
195
+ writer.set_global_cmvn(gcmvn_path.as_posix())
196
+
197
+ if len(audio_root) > 0:
198
+ writer.set_audio_root(audio_root)
199
+
200
+ if extra is not None:
201
+ writer.set_extra(extra)
202
+ writer.flush()
203
+
204
+
205
+ def load_df_from_tsv(path: Union[str, Path]) -> pd.DataFrame:
206
+ _path = path if isinstance(path, str) else path.as_posix()
207
+ return pd.read_csv(
208
+ _path,
209
+ sep="\t",
210
+ header=0,
211
+ encoding="utf-8",
212
+ escapechar="\\",
213
+ quoting=csv.QUOTE_NONE,
214
+ na_filter=False,
215
+ )
216
+
217
+
218
+ def save_df_to_tsv(dataframe, path: Union[str, Path]):
219
+ _path = path if isinstance(path, str) else path.as_posix()
220
+ dataframe.to_csv(
221
+ _path,
222
+ sep="\t",
223
+ header=True,
224
+ index=False,
225
+ encoding="utf-8",
226
+ escapechar="\\",
227
+ quoting=csv.QUOTE_NONE,
228
+ )
229
+
230
+
231
+ def load_tsv_to_dicts(path: Union[str, Path]) -> List[dict]:
232
+ with open(path, "r") as f:
233
+ reader = csv.DictReader(
234
+ f,
235
+ delimiter="\t",
236
+ quotechar=None,
237
+ doublequote=False,
238
+ lineterminator="\n",
239
+ quoting=csv.QUOTE_NONE,
240
+ )
241
+ rows = [dict(e) for e in reader]
242
+ return rows
243
+
244
+
245
+ def filter_manifest_df(
246
+ df, is_train_split=False, extra_filters=None, min_n_frames=5, max_n_frames=3000
247
+ ):
248
+ filters = {
249
+ "no speech": df["audio"] == "",
250
+ f"short speech (<{min_n_frames} frames)": df["n_frames"] < min_n_frames,
251
+ "empty sentence": df["tgt_text"] == "",
252
+ }
253
+ if is_train_split:
254
+ filters[f"long speech (>{max_n_frames} frames)"] = df["n_frames"] > max_n_frames
255
+ if extra_filters is not None:
256
+ filters.update(extra_filters)
257
+ invalid = reduce(lambda x, y: x | y, filters.values())
258
+ valid = ~invalid
259
+ print(
260
+ "| "
261
+ + ", ".join(f"{n}: {f.sum()}" for n, f in filters.items())
262
+ + f", total {invalid.sum()} filtered, {valid.sum()} remained."
263
+ )
264
+ return df[valid]
265
+
266
+
267
+ def cal_gcmvn_stats(features_list):
268
+ features = np.concatenate(features_list)
269
+ square_sums = (features ** 2).sum(axis=0)
270
+ mean = features.mean(axis=0)
271
+ features = np.subtract(features, mean)
272
+ var = square_sums / features.shape[0] - mean ** 2
273
+ std = np.sqrt(np.maximum(var, 1e-8))
274
+ return {"mean": mean.astype("float32"), "std": std.astype("float32")}
275
+
276
+
277
+ class S2TDataConfigWriter(object):
278
+ DEFAULT_VOCAB_FILENAME = "dict.txt"
279
+ DEFAULT_INPUT_FEAT_PER_CHANNEL = 80
280
+ DEFAULT_INPUT_CHANNELS = 1
281
+
282
+ def __init__(self, yaml_path: Path):
283
+ try:
284
+ import yaml
285
+ except ImportError:
286
+ print("Please install PyYAML for S2T data config YAML files")
287
+ self.yaml = yaml
288
+ self.yaml_path = yaml_path
289
+ self.config = {}
290
+
291
+ def flush(self):
292
+ with open(self.yaml_path, "w") as f:
293
+ self.yaml.dump(self.config, f)
294
+
295
+ def set_audio_root(self, audio_root=""):
296
+ self.config["audio_root"] = audio_root
297
+
298
+ def set_vocab_filename(self, vocab_filename: str = "dict.txt"):
299
+ self.config["vocab_filename"] = vocab_filename
300
+
301
+ def set_specaugment(
302
+ self,
303
+ time_wrap_w: int,
304
+ freq_mask_n: int,
305
+ freq_mask_f: int,
306
+ time_mask_n: int,
307
+ time_mask_t: int,
308
+ time_mask_p: float,
309
+ ):
310
+ self.config["specaugment"] = {
311
+ "time_wrap_W": time_wrap_w,
312
+ "freq_mask_N": freq_mask_n,
313
+ "freq_mask_F": freq_mask_f,
314
+ "time_mask_N": time_mask_n,
315
+ "time_mask_T": time_mask_t,
316
+ "time_mask_p": time_mask_p,
317
+ }
318
+
319
+ def set_specaugment_lb_policy(self):
320
+ self.set_specaugment(
321
+ time_wrap_w=0,
322
+ freq_mask_n=1,
323
+ freq_mask_f=27,
324
+ time_mask_n=1,
325
+ time_mask_t=100,
326
+ time_mask_p=1.0,
327
+ )
328
+
329
+ def set_specaugment_ld_policy(self):
330
+ self.set_specaugment(
331
+ time_wrap_w=0,
332
+ freq_mask_n=2,
333
+ freq_mask_f=27,
334
+ time_mask_n=2,
335
+ time_mask_t=100,
336
+ time_mask_p=1.0,
337
+ )
338
+
339
+ def set_specaugment_sm_policy(self):
340
+ self.set_specaugment(
341
+ time_wrap_w=0,
342
+ freq_mask_n=2,
343
+ freq_mask_f=15,
344
+ time_mask_n=2,
345
+ time_mask_t=70,
346
+ time_mask_p=0.2,
347
+ )
348
+
349
+ def set_specaugment_ss_policy(self):
350
+ self.set_specaugment(
351
+ time_wrap_w=0,
352
+ freq_mask_n=2,
353
+ freq_mask_f=27,
354
+ time_mask_n=2,
355
+ time_mask_t=70,
356
+ time_mask_p=0.2,
357
+ )
358
+
359
+ def set_input_channels(self, input_channels: int = 1):
360
+ self.config["input_channels"] = input_channels
361
+
362
+ def set_input_feat_per_channel(self, input_feat_per_channel: int = 80):
363
+ self.config["input_feat_per_channel"] = input_feat_per_channel
364
+
365
+ def set_bpe_tokenizer(self, bpe_tokenizer: Dict[str, Any]):
366
+ self.config["bpe_tokenizer"] = bpe_tokenizer
367
+
368
+ def set_global_cmvn(self, stats_npz_path: str):
369
+ self.config["global_cmvn"] = {"stats_npz_path": stats_npz_path}
370
+
371
+ def set_feature_transforms(self, split: str, transforms: List[str]):
372
+ if "transforms" not in self.config:
373
+ self.config["transforms"] = {}
374
+ self.config["transforms"][split] = transforms
375
+
376
+ def set_prepend_tgt_lang_tag(self, flag: bool = True):
377
+ self.config["prepend_tgt_lang_tag"] = flag
378
+
379
+ def set_sampling_alpha(self, sampling_alpha: float = 1.0):
380
+ self.config["sampling_alpha"] = sampling_alpha
381
+
382
+ def set_extra(self, data):
383
+ self.config.update(data)
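An illustrative sketch of how the helpers above are typically chained by the prep scripts in this directory (the paths, vocabulary size and SpecAugment policy below are placeholder choices, not a prescribed recipe): train a SentencePiece model on the target text with `gen_vocab`, then write the `config.yaml` that `--config-yaml` points to with `gen_config_yaml`.

```python
from pathlib import Path

from examples.speech_to_text.data_utils import gen_config_yaml, gen_vocab

root = Path("/path/to/manifest_root")  # placeholder: where the TSV manifests live

# 1) Learn a target-text vocabulary and export the fairseq dictionary (*.txt).
gen_vocab(
    root / "train_text.txt",       # one target sentence per line
    root / "spm_unigram1000",      # output prefix for the .model/.txt files
    model_type="unigram",
    vocab_size=1000,
)

# 2) Emit the data config consumed via --config-yaml (vocab, tokenizer, SpecAugment, CMVN).
gen_config_yaml(
    root,
    spm_filename="spm_unigram1000.model",
    yaml_filename="config.yaml",
    specaugment_policy="lb",
)
```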
fairseq/examples/speech_to_text/docs/covost_example.md ADDED
@@ -0,0 +1,140 @@
1
+ [[Back]](..)
2
+
3
+ # S2T Example: ST on CoVoST
4
+
5
+ We replicate the experiments in
6
+ [CoVoST 2 and Massively Multilingual Speech-to-Text Translation (Wang et al., 2020)](https://arxiv.org/abs/2007.10310).
7
+
8
+ ## Data Preparation
9
+
10
+ [Download](https://commonvoice.mozilla.org/en/datasets) and unpack Common Voice v4 to a path
11
+ `${COVOST_ROOT}/${SOURCE_LANG_ID}`, then preprocess it with
12
+
13
+ ```bash
14
+ # additional Python packages for S2T data processing/model training
15
+ pip install pandas torchaudio sentencepiece
16
+
17
+ # En ASR
18
+ python examples/speech_to_text/prep_covost_data.py \
19
+ --data-root ${COVOST_ROOT} --vocab-type char --src-lang en
20
+ # ST
21
+ python examples/speech_to_text/prep_covost_data.py \
22
+ --data-root ${COVOST_ROOT} --vocab-type char \
23
+ --src-lang fr --tgt-lang en
24
+ ```
25
+
26
+ The generated files (manifest, features, vocabulary and data configuration) will be added to
27
+ `${COVOST_ROOT}/${SOURCE_LANG_ID}`.
28
+
29
+ Download our vocabulary files if you want to use our pre-trained models:
30
+
31
+ - ASR: [En](https://dl.fbaipublicfiles.com/fairseq/s2t/covost2_en_asr_vocab_char.zip)
32
+ - ST: [Fr-En](https://dl.fbaipublicfiles.com/fairseq/s2t/covost2_fr_en_st_vocab_char.zip), [De-En](https://dl.fbaipublicfiles.com/fairseq/s2t/covost2_de_en_st_vocab_char.zip), [Es-En](https://dl.fbaipublicfiles.com/fairseq/s2t/covost2_es_en_st_vocab_char.zip), [Ca-En](https://dl.fbaipublicfiles.com/fairseq/s2t/covost2_ca_en_st_vocab_char.zip), [En-De](https://dl.fbaipublicfiles.com/fairseq/s2t/covost2_en_de_st_vocab_char.zip), [En-Ca](https://dl.fbaipublicfiles.com/fairseq/s2t/covost2_en_ca_st_vocab_char.zip), [En-Fa](https://dl.fbaipublicfiles.com/fairseq/s2t/covost2_en_fa_st_vocab_char.zip), [En-Et](https://dl.fbaipublicfiles.com/fairseq/s2t/covost2_en_et_st_vocab_char.zip)
33
+
34
+ ## ASR
35
+
36
+ #### Training
37
+
38
+ We train an En ASR model that is used to pre-train the encoder of some of the ST models.
39
+
40
+ ```bash
41
+ fairseq-train ${COVOST_ROOT}/en \
42
+ --config-yaml config_asr_en.yaml --train-subset train_asr_en --valid-subset dev_asr_en \
43
+ --save-dir ${ASR_SAVE_DIR} --num-workers 4 --max-tokens 50000 --max-update 60000 \
44
+ --task speech_to_text --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \
45
+ --report-accuracy --arch s2t_transformer_s --dropout 0.15 --optimizer adam --lr 2e-3 \
46
+ --lr-scheduler inverse_sqrt --warmup-updates 10000 --clip-norm 10.0 --seed 1 --update-freq 8 \
47
+ --attn-type None --pos-enc-type ${POS_ENC_TYPE}
48
+ ```
49
+
50
+ where `ASR_SAVE_DIR` is the checkpoint root path and `POS_ENC_TYPE` is the positional encoding used in the conformer encoder.
51
+ Set it to `abs`, `rope` or `rel_pos` for absolute, rotary or relative positional encoding in the conformer layers, respectively.
52
+ The transformer encoder only supports absolute positional encoding and is used by default.
53
+ To switch to the conformer encoder, set `--attn-type espnet` and `--pos-enc-type ${POS_ENC_TYPE}`. We set `--update-freq 8` to simulate 8 GPUs with 1 GPU. You may want to adjust it accordingly when using more than one GPU.
54
+
55
+ #### Inference & Evaluation
56
+
57
+ ```bash
58
+ CHECKPOINT_FILENAME=avg_last_10_checkpoint.pt
59
+ python scripts/average_checkpoints.py \
60
+ --inputs ${ASR_SAVE_DIR} --num-epoch-checkpoints 10 \
61
+ --output "${ASR_SAVE_DIR}/${CHECKPOINT_FILENAME}"
62
+ fairseq-generate ${COVOST_ROOT}/en \
63
+ --config-yaml config_asr_en.yaml --gen-subset test_asr_en --task speech_to_text \
64
+ --path ${ASR_SAVE_DIR}/${CHECKPOINT_FILENAME} --max-tokens 50000 --beam 5 \
65
+ --scoring wer --wer-tokenizer 13a --wer-lowercase --wer-remove-punct
66
+ ```
67
+
68
+ #### Results
69
+
70
+ | --arch | --pos-enc-type | Params | En | Model |
71
+ |---|---|---|---|---|
72
+ | s2t_transformer_s | - | 31M | 25.6 | [Download](https://dl.fbaipublicfiles.com/fairseq/s2t/covost2_en_asr_transformer_s.pt) |
73
+ | s2t_conformer | rel_pos | 42.9M | 23.18| [Download](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/en_asr/rel_pos_asr_checkpoint_best.pt) |
74
+ | s2t_conformer | rope | 42.1M | 23.8| [Download](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/en_asr/rope_pos_asr_checkpoint_best.pt) |
75
+ | s2t_conformer | abs | 42.1M | 23.8| [Download](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/en_asr/abs_asr_checkpoint_best.pt) |
76
+
77
+ ## ST
78
+
79
+ #### Training
80
+
81
+ Fr-En as example:
82
+
83
+ ```bash
84
+ fairseq-train ${COVOST_ROOT}/fr \
85
+ --config-yaml config_st_fr_en.yaml --train-subset train_st_fr_en --valid-subset dev_st_fr_en \
86
+ --save-dir ${ST_SAVE_DIR} --num-workers 4 --max-update 30000 --max-tokens 40000 \ # --max-tokens 50000 for en-*
87
+ --task speech_to_text --criterion label_smoothed_cross_entropy --label-smoothing 0.1 --report-accuracy \
88
+ --arch s2t_transformer_s --encoder-freezing-updates 1000 --optimizer adam --lr 2e-3 \
89
+ --lr-scheduler inverse_sqrt --warmup-updates 10000 --clip-norm 10.0 --seed 1 --update-freq 8 \
90
+ --attn-type None --pos-enc-type ${POS_ENC_TYPE} \
91
+ --load-pretrained-encoder-from ${ASR_SAVE_DIR}/${CHECKPOINT_FILENAME}
92
+ ```
93
+
94
+ where `ST_SAVE_DIR` is the checkpoint root path and `POS_ENC_TYPE` is the positional encoding used in the conformer encoder.
95
+ Set it to `abs`, `rope` or `rel_pos` for absolute, rotary or relative positional encoding in the conformer layers, respectively.
96
+ The transformer encoder only supports absolute positional encoding and is used by default.
97
+ To switch to the conformer encoder, set `--attn-type espnet` and `--pos-enc-type ${POS_ENC_TYPE}`. Optionally, load the pre-trained En ASR encoder for faster training and better
98
+ performance: `--load-pretrained-encoder-from <ASR checkpoint path>`. We set `--update-freq 8` to simulate 8 GPUs with 1 GPU.
99
+ You may want to adjust it accordingly when using more than one GPU.
100
+
101
+ #### Inference & Evaluation
102
+
103
+ Average the last 10 checkpoints and evaluate on test split:
104
+
105
+ ```bash
106
+ CHECKPOINT_FILENAME=avg_last_10_checkpoint.pt
107
+ python scripts/average_checkpoints.py \
108
+ --inputs ${ST_SAVE_DIR} --num-epoch-checkpoints 10 \
109
+ --output "${ST_SAVE_DIR}/${CHECKPOINT_FILENAME}"
110
+ fairseq-generate ${COVOST_ROOT}/fr \
111
+ --config-yaml config_st_fr_en.yaml --gen-subset test_st_fr_en --task speech_to_text \
112
+ --path ${ST_SAVE_DIR}/${CHECKPOINT_FILENAME} \
113
+ --max-tokens 50000 --beam 5 --scoring sacrebleu
114
+ ```
115
+
116
+ ## Interactive Decoding
117
+
118
+ Launch the interactive console via
119
+
120
+ ```bash
121
+ fairseq-interactive ${COVOST_ROOT}/fr --config-yaml config_st_fr_en.yaml \
122
+ --task speech_to_text --path ${SAVE_DIR}/${CHECKPOINT_FILENAME} \
123
+ --max-tokens 50000 --beam 5
124
+ ```
125
+
126
+ Type in WAV/FLAC/OGG audio paths (one per line) after the prompt.
127
+
128
+ #### Results
129
+
130
+ | --arch | --pos-enc-type | Params | ASR PT | Fr-En | De-En | Es-En | Ca-En | En-De | En-Ca | En-Fa | En-Et | Model |
131
+ |---|---|---|---|---|---|---|---|---|---|---|---|---|
132
+ | s2t_transformer | - | 31M | Yes | [27.2](https://dl.fbaipublicfiles.com/fairseq/s2t/covost2_fr_en_st_transformer_s.pt) | [17.7](https://dl.fbaipublicfiles.com/fairseq/s2t/covost2_de_en_st_transformer_s.pt) | [23.1](https://dl.fbaipublicfiles.com/fairseq/s2t/covost2_es_en_st_transformer_s.pt) | [19.3](https://dl.fbaipublicfiles.com/fairseq/s2t/covost2_ca_en_st_transformer_s.pt) | [16.1](https://dl.fbaipublicfiles.com/fairseq/s2t/covost2_en_de_st_transformer_s.pt) | [21.6](https://dl.fbaipublicfiles.com/fairseq/s2t/covost2_en_ca_st_transformer_s.pt) | [12.9](https://dl.fbaipublicfiles.com/fairseq/s2t/covost2_en_fa_st_transformer_s.pt) | [12.8](https://dl.fbaipublicfiles.com/fairseq/s2t/covost2_en_et_st_transformer_s.pt) | (<-Download) |
133
+ | s2t_conformer | rel_pos | 42.9M | No | [28.32](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/fr_en/rel_pos_from_scratch_avg_last_10_checkpoint.pt) | [18.21](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/de_en/rel_pos_from_scratch_avg_last_10_checkpoint.pt) | [25.98](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/es_en/rel_pos_from_scratch_avg_last_10_checkpoint.pt) | [21.13](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/ca_en/rel_pos_from_scratch_avg_last_10_checkpoint.pt) | [20.37](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/en_de/rel_pos_from_scratch_avg_last_10_checkpoint.pt) | [25.89](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/en_ca/rel_pos_from_scratch_avg_last_10_checkpoint.pt) | [15.59](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/en_fa/rel_pos_from_scratch_avg_last_10_checkpoint.pt) | [14.49](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/en_et/rel_pos_from_scratch_avg_last_10_checkpoint.pt) | (<-Download) |
134
+ | s2t_conformer | rel_pos | 42.9M | Yes| [27.15](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/fr_en/rel_pos_asr_pt_avg_last_10_checkpoint.pt) | [18.22](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/de_en/rel_pos_asr_pt_avg_last_10_checkpoint.pt) | [25.14](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/es_en/rel_pos_asr_pt_avg_last_10_checkpoint.pt) | [21.68](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/ca_en/rel_pos_asr_pt_avg_last_10_checkpoint.pt) | [20.35](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/en_de/rel_pos_asr_pt_avg_last_10_checkpoint.pt) | [25.92](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/en_ca/rel_pos_asr_pt_avg_last_10_checkpoint.pt) | [15.76](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/en_fa/rel_pos_asr_pt_avg_last_10_checkpoint.pt) | [16.52](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/en_et/rel_pos_asr_pt_avg_last_10_checkpoint.pt) | (<-Download) |
135
+ | s2t_conformer | rope | 42.1M | No | [27.61](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/fr_en/rope_from_scratch_avg_last_10_checkpoint.pt) | [17.6](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/de_en/rope_from_scratch_avg_last_10_checkpoint.pt) | [24.91](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/es_en/rope_from_scratch_avg_last_10_checkpoint.pt) | [20.78](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/ca_en/rope_from_scratch_avg_last_10_checkpoint.pt) | [19.7](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/en_de/rope_from_scratch_avg_last_10_checkpoint.pt) | [25.13](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/en_ca/rope_from_scratch_avg_last_10_checkpoint.pt) | [15.22](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/en_fa/rope_from_scratch_avg_last_10_checkpoint.pt) | [15.87](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/en_et/rope_from_scratch_avg_last_10_checkpoint.pt) | (<-Download) |
136
+ | s2t_conformer | rope | 42.1M | Yes | [26.99](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/fr_en/rope_asr_pt_avg_last_10_checkpoint.pt) | [17.71](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/de_en/rope_asr_pt_avg_last_10_checkpoint.pt) | [24.24](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/es_en/rope_asr_pt_avg_last_10_checkpoint.pt) | [21.24](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/ca_en/rope_asr_pt_avg_last_10_checkpoint.pt) | [19.9](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/en_de/rope_asr_pt_avg_last_10_checkpoint.pt) | [25.25](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/en_ca/rope_asr_pt_avg_last_10_checkpoint.pt) | [15.58](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/en_fa/rope_asr_pt_avg_last_10_checkpoint.pt) | [15.97](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/en_et/rope_asr_pt_avg_last_10_checkpoint.pt) | (<-Download) |
137
+ | s2t_conformer | abs | 42.1M | No | [27.45](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/fr_en/abs_from_scratch_avg_last_10_checkpoint.pt) | [17.25](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/de_en/abs_from_scratch_avg_last_10_checkpoint.pt) | [25.01](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/es_en/abs_from_scratch_avg_last_10_checkpoint.pt) | [20.26](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/ca_en/abs_from_scratch_avg_last_10_checkpoint.pt) | [19.86](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/en_de/abs_from_scratch_avg_last_10_checkpoint.pt) | [25.25](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/en_ca/abs_from_scratch_avg_last_10_checkpoint.pt) | [15.46](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/en_fa/abs_from_scratch_avg_last_10_checkpoint.pt) | [15.81](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/en_et/abs_from_scratch_avg_last_10_checkpoint.pt) | (<-Download) |
138
+ | s2t_conformer | abs | 42.1M | Yes | [26.52](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/fr_en/abs_asr_pt_avg_last_10_checkpoint.pt) | [17.37](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/de_en/abs_asr_pt_avg_last_10_checkpoint.pt) | [25.40](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/es_en/abs_asr_pt_avg_last_10_checkpoint.pt) | [20.45](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/ca_en/abs_asr_pt_avg_last_10_checkpoint.pt) | [19.57](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/en_de/abs_asr_pt_avg_last_10_checkpoint.pt) | [25.40](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/en_ca/abs_asr_pt_avg_last_10_checkpoint.pt) | [15.17](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/en_fa/abs_asr_pt_avg_last_10_checkpoint.pt) | [15.83](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/en_et/abs_asr_pt_avg_last_10_checkpoint.pt) | (<-Download) |
139
+
140
+ [[Back]](..)