Leon299 committed
Commit ec0bc9b · verified · 1 parent: 8337fa0

Add files using upload-large-folder tool

This view is limited to 50 files because the commit contains too many changes; see the raw diff for the full list.
Files changed (50)
  1. MuCodec/libs/rvq/__pycache__/descript_quantize3.cpython-312.pyc +0 -0
  2. MuCodec/models/__pycache__/attention.cpython-310.pyc +0 -0
  3. MuCodec/models/__pycache__/attention.cpython-312.pyc +0 -0
  4. MuCodec/models/__pycache__/transformer_2d_flow.cpython-310.pyc +0 -0
  5. MuCodec/models/__pycache__/transformer_2d_flow.cpython-312.pyc +0 -0
  6. MuCodec/muq_dev/__pycache__/test.cpython-310.pyc +0 -0
  7. MuCodec/muq_dev/__pycache__/test.cpython-312.pyc +0 -0
  8. MuCodec/muq_dev/muq_fairseq/data/__init__.py +1 -0
  9. MuCodec/muq_dev/muq_fairseq/data/__pycache__/__init__.cpython-310.pyc +0 -0
  10. MuCodec/muq_dev/muq_fairseq/data/__pycache__/ark_dataset.cpython-310.pyc +0 -0
  11. MuCodec/muq_dev/muq_fairseq/data/__pycache__/mert_dataset.cpython-310.pyc +0 -0
  12. MuCodec/muq_dev/muq_fairseq/data/ark_dataset.py +71 -0
  13. MuCodec/muq_dev/muq_fairseq/data/mert_dataset.py +295 -0
  14. MuCodec/muq_dev/muq_fairseq/data/utils/data_utils.py +535 -0
  15. MuCodec/muq_dev/muq_fairseq/models/muq/__init__.py +1 -0
  16. MuCodec/muq_dev/muq_fairseq/models/muq/__pycache__/__init__.cpython-310.pyc +0 -0
  17. MuCodec/muq_dev/muq_fairseq/models/muq/__pycache__/muq_model.cpython-310.pyc +0 -0
  18. MuCodec/muq_dev/muq_fairseq/models/muq/model/__init__.py +2 -0
  19. MuCodec/muq_dev/muq_fairseq/models/muq/model/__pycache__/__init__.cpython-310.pyc +0 -0
  20. MuCodec/muq_dev/muq_fairseq/models/muq/model/__pycache__/muq.cpython-310.pyc +0 -0
  21. MuCodec/muq_dev/muq_fairseq/models/muq/model/__pycache__/rvq.cpython-310.pyc +0 -0
  22. MuCodec/muq_dev/muq_fairseq/models/muq/model/__pycache__/rvq_muq.cpython-310.pyc +0 -0
  23. MuCodec/muq_dev/muq_fairseq/models/muq/model/muq.py +520 -0
  24. MuCodec/muq_dev/muq_fairseq/models/muq/model/pred_ark_target_with_model.py +151 -0
  25. MuCodec/muq_dev/muq_fairseq/models/muq/model/rvq.py +459 -0
  26. MuCodec/muq_dev/muq_fairseq/models/muq/model/rvq_muq.py +394 -0
  27. MuCodec/muq_dev/muq_fairseq/models/muq/model/w2v2_config.json +113 -0
  28. MuCodec/muq_dev/muq_fairseq/models/muq/modules/__init__.py +2 -0
  29. MuCodec/muq_dev/muq_fairseq/models/muq/modules/__pycache__/__init__.cpython-310.pyc +0 -0
  30. MuCodec/muq_dev/muq_fairseq/models/muq/modules/__pycache__/conv.cpython-310.pyc +0 -0
  31. MuCodec/muq_dev/muq_fairseq/models/muq/modules/__pycache__/features.cpython-310.pyc +0 -0
  32. MuCodec/muq_dev/muq_fairseq/models/muq/modules/__pycache__/random_quantizer.cpython-310.pyc +0 -0
  33. MuCodec/muq_dev/muq_fairseq/models/muq/modules/conv.py +77 -0
  34. MuCodec/muq_dev/muq_fairseq/models/muq/modules/features.py +67 -0
  35. MuCodec/muq_dev/muq_fairseq/models/muq/modules/flash_conformer.py +2114 -0
  36. MuCodec/muq_dev/muq_fairseq/models/muq/modules/random_quantizer.py +68 -0
  37. MuCodec/muq_dev/muq_fairseq/models/muq/muq_model.py +139 -0
  38. MuCodec/muq_dev/muq_fairseq/tasks/__pycache__/muq_pretraining.cpython-310.pyc +0 -0
  39. MuCodec/muq_dev/muq_fairseq/tasks/muq_pretraining.py +354 -0
  40. MuCodec/tools/__pycache__/get_melvaehifigan48k.cpython-310.pyc +0 -0
  41. MuCodec/tools/__pycache__/torch_tools.cpython-310.pyc +0 -0
  42. MuCodec/tools/__pycache__/torch_tools.cpython-312.pyc +0 -0
  43. checkpoints/Qwen3-0.6B/.gitattributes +36 -0
  44. checkpoints/Qwen3-0.6B/LICENSE +202 -0
  45. checkpoints/Qwen3-0.6B/README.md +301 -0
  46. checkpoints/Qwen3-0.6B/config.json +33 -0
  47. checkpoints/Qwen3-0.6B/generation_config.json +13 -0
  48. checkpoints/Qwen3-0.6B/merges.txt +0 -0
  49. checkpoints/Qwen3-0.6B/tokenizer_config.json +239 -0
  50. checkpoints/Qwen3-0.6B/vocab.json +0 -0
MuCodec/libs/rvq/__pycache__/descript_quantize3.cpython-312.pyc ADDED
Binary file (16.1 kB).
 
MuCodec/models/__pycache__/attention.cpython-310.pyc ADDED
Binary file (16.3 kB).
 
MuCodec/models/__pycache__/attention.cpython-312.pyc ADDED
Binary file (25.6 kB).
 
MuCodec/models/__pycache__/transformer_2d_flow.cpython-310.pyc ADDED
Binary file (17.9 kB).
 
MuCodec/models/__pycache__/transformer_2d_flow.cpython-312.pyc ADDED
Binary file (26.9 kB).
 
MuCodec/muq_dev/__pycache__/test.cpython-310.pyc ADDED
Binary file (866 Bytes).
 
MuCodec/muq_dev/__pycache__/test.cpython-312.pyc ADDED
Binary file (1.19 kB).
 
MuCodec/muq_dev/muq_fairseq/data/__init__.py ADDED
@@ -0,0 +1 @@
+ from .mert_dataset import MERTDataset
MuCodec/muq_dev/muq_fairseq/data/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (219 Bytes).
 
MuCodec/muq_dev/muq_fairseq/data/__pycache__/ark_dataset.cpython-310.pyc ADDED
Binary file (2.35 kB).
 
MuCodec/muq_dev/muq_fairseq/data/__pycache__/mert_dataset.cpython-310.pyc ADDED
Binary file (9.85 kB).
 
MuCodec/muq_dev/muq_fairseq/data/ark_dataset.py ADDED
@@ -0,0 +1,71 @@
+ import logging
+ import torch
+ import torch.nn.functional as F
+ from fairseq.data.audio.raw_audio_dataset import RawAudioDataset
+ from typing import Tuple
+ try:
+     import kaldiio
+ except:
+     kaldiio = None
+ import warnings
+
+ logger = logging.getLogger(__name__)
+
+
+ class ArkDataset(RawAudioDataset):
+     def __init__(
+         self,
+         wav_scp,
+         dur_scp,
+         sr = 24000,
+         max_dur = 20,
+         num_buckets=0,
+         normalize=False,
+     ):
+         super().__init__(
+             sample_rate=sr,
+             max_sample_size=max_dur*sr,
+             min_sample_size=1200,
+             shuffle=True,
+             pad=True,
+             normalize=normalize,
+             compute_mask=False,
+         )
+         self.sr = sr
+         self.max_dur = max_dur
+         self.normalize = normalize
+
+         logger.info("Loading Kaldi scp files from {}".format(wav_scp))
+
+         self.wav_data = kaldiio.load_scp(wav_scp)
+         self.keys = list(self.wav_data.keys())
+         dur_data = {}
+         keys_set = set(self.keys)
+
+         with open(dur_scp, 'r') as f:
+             for line in f:
+                 line = line.strip().split()
+                 if line[0] in keys_set:
+                     dur_data[line[0]] = float(line[-1])
+         self.sizes = [int(dur_data[k]*self.sr/100) for k in self.keys]
+
+         logger.info("Loading Kaldi scp files done")
+
+         self.dataset_len = len(self.keys)
+         self.set_bucket_info(num_buckets)
+
+     def __len__(self):
+         return self.dataset_len
+
+     def __getitem__(self, idx):
+         pass
+
+     def size(self, idx):
+         pass
+
+     def postprocess(self, wav):
+         pass
+
+     def collater(self, samples):
+         pass
+
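Note on the `sizes` line above: `ArkDataset` sizes each utterance as `int(dur * sr / 100)`, which only lines up if the duration column of `dur_scp` counts 10 ms frames (100 per second) rather than seconds; treat that unit as an inference from the code. A minimal sketch of the bookkeeping, using a hypothetical utterance id and duration:

sr = 24000

# dur_scp holds one "<utt-id> ... <duration>" line per utterance; the formula
# above (dur * sr / 100) implies the last field counts 10 ms frames.
dur_line = "utt_0001 532"            # hypothetical: 532 frames * 10 ms = 5.32 s
utt_id, dur = dur_line.split()[0], float(dur_line.split()[-1])

num_samples = int(dur * sr / 100)    # 5.32 s * 24000 Hz = 127680 samples
print(utt_id, num_samples)           # utt_0001 127680
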
MuCodec/muq_dev/muq_fairseq/data/mert_dataset.py ADDED
@@ -0,0 +1,295 @@
+ # Copyright (c) Facebook, Inc. and its affiliates.
+ #
+ # This source code is licensed under the MIT license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ import itertools
+ import logging
+ import os
+ import sys
+ from typing import Any, List, Optional, Union
+
+ import numpy as np
+ from typing import Tuple
+ import torch
+ import torch.nn.functional as F
+ from fairseq.data import data_utils
+ from fairseq.data.fairseq_dataset import FairseqDataset
+ from fairseq.data.audio.audio_utils import (
+     parse_path,
+     read_from_stored_zip,
+ )
+
+ import math
+ import io
+ import torchaudio
+ # this is in the user_dir
+ from nnAudio import features as nnAudioFeatures
+
+ # from tqdm import tqdm
+ import tqdm
+ import json
+ import random
+ import traceback
+ from einops import rearrange
+ # from scripts.prepare_codecs_from_manifest import *
+
+ logger = logging.getLogger(__name__)
+
+ class model_cqt_pred(torch.nn.Module):
+     def __init__(self, n_bins=84, sr=16000, freq=50):
+         super().__init__()
+         self.epsilon=1e-10
+         # Getting Mel Spectrogram on the fly
+         self.spec_layer = nnAudioFeatures.cqt.CQT(sr=sr, hop_length=sr//freq, fmin=32.7,
+                 fmax=None, n_bins=n_bins, bins_per_octave=n_bins//7,
+                 filter_scale=1, norm=1, window='hann', center=True,
+                 pad_mode='constant', trainable=False,
+                 output_format='Magnitude', verbose=True)
+
+         # self.fc = nn.Linear(input_dim, n_bins)
+
+         # self.criterion = nn.MSELoss()
+         self.forward_dict = {
+             # 'masked_transformer_output': self.plain_forward
+             'compute_cqt': self.compute_cqt
+         }
+     def compute_cqt(self, x):
+         '''
+         convert waveform to CQT -> [batch, bins, len] -> transpose
+         '''
+         # align with the padding of HuBERT model,
+         # the truncation is calculated by bruteforce search since the nnAudio padding strategy and fairseq models are different
+         # x = x[..., :-560]
+         return torch.transpose(self.spec_layer(x), -1, -2)
+
+     def forward(self, x, forward_type='masked_transformer_output'):
+         '''
+         take input from transformer hidden states: [batch, len_seq, channel]
+         output: [batch, len_seq, n_bins]
+         '''
+
+         return self.forward_dict[forward_type](x)
+
+ def load_audio_by_json(json_path, max_keep, min_keep, tgt_sample_rate, clip_secs=5):
+     # read json file
+     print(json_path)
+     datas = []
+     inds = []
+     sizes = []
+     with open(json_path) as fp:
+         for ind,line in enumerate(fp):
+             data = json.loads(line)
+             if 'duration' in data and min_keep is not None and tgt_sample_rate*data['duration'] < min_keep:
+                 continue
+             datas.append(data)
+             inds.append(ind)
+             # sz = int(data['duration'] * data['sample_rate'])
+             if clip_secs > 0:
+                 sz = int(tgt_sample_rate * clip_secs)
+             else:
+                 sz = int(tgt_sample_rate * data['duration'])
+             sizes.append(sz)
+     tot = ind + 1
+     return datas,inds,tot,sizes
+ def load_audio(manifest_path, max_keep, min_keep):
+     pass
+
+
+ def load_label(label_path, inds, tot):
+     pass
+
+ def load_numpy_label(label_path, inds, tot):
+     labels = np.load(label_path, mmap_mode='r')
+     assert (labels.shape[0] == tot), f"number of labels does not match ({labels.shape[0]} != {tot})"
+     return labels
+
+ def verify_label_lengths(
+     audio_sizes,
+     audio_rate,
+     label_path,
+     label_rate,
+     inds,
+     tot,
+     tol=0.1,  # tolerance in seconds
+ ):
+     pass
+
+ class Read_and_PadCrop_Normalized_T(torch.nn.Module):
+     def __init__(self, n_samples: int, sample_rate: int, randomize: bool = True):
+
+         super().__init__()
+
+         self.n_samples = n_samples
+         self.sample_rate = sample_rate
+         self.randomize = randomize
+
+
+     def __call__(self, filename: str, duration: float, cur_sample_rate: int, fixed_offset_duration=None) -> Tuple[torch.Tensor, float, float, int, int]:
+         pass
+
+
+ class MERTDataset(FairseqDataset):
+     def __init__(
+         self,
+         manifest_path: str,
+         sample_rate: float,
+         label_paths: List[str],
+         label_rates: Union[List[float], float],  # -1 for sequence labels
+         pad_list: List[str],
+         eos_list: List[str],
+         label_scp_path: Optional[str] = None,
+         label_scp_clip_duration: float = -1,
+         label_processors: Optional[List[Any]] = None,
+         max_keep_sample_size: Optional[int] = None,
+         min_keep_sample_size: Optional[int] = None,
+         max_sample_size: Optional[int] = None,
+         shuffle: bool = True,
+         pad_audio: bool = False,
+         normalize: bool = False,
+         store_labels: bool = True,
+         npmemmap: bool = False,
+         random_crop: bool = False,
+         single_target: bool = False,
+         augmentation_effects: List[str] = [],
+         augmentation_probs: List[float] = [],
+         inbatch_noise_augment_len_range: List[int] = [8000, 24000],
+         inbatch_noise_augment_number_range: List[int] = [1, 3],
+         inbatch_noise_augment_volume: float = 1.0,
+         cqt_prediction_bin: int = -1,
+         dataset_len:int = 128*3000,
+         clip_secs = 5,
+     ):
+         self.sample_rate = sample_rate
+         self.shuffle = shuffle
+         self.random_crop = random_crop
+         self.datas,inds,tot,self.sizes = load_audio_by_json(manifest_path,max_keep_sample_size,min_keep_sample_size, self.sample_rate, clip_secs)
+         self.inds = inds
+
+         self.num_labels = len(label_paths)
+         self.pad_list = pad_list
+         self.eos_list = eos_list
+         self.label_processors = label_processors
+         self.single_target = single_target
+         self.label_rates = (
+             [label_rates for _ in range(len(label_paths))]
+             if isinstance(label_rates, float)
+             else label_rates
+         )
+         self.store_labels = store_labels
+         self.npmemmap = npmemmap
+         self.label_scp_path = label_scp_path
+         self.label_scp_clip_duration = label_scp_clip_duration
+
+
+         if self.label_scp_path is not None:
+             from kaldiio import load_scp
+             self.label_scp = load_scp(self.label_scp_path)
+
+         # self.dataset_len = dataset_len
+         self.dataset_len = len(self.datas)
+         logger.info('preparing labels')
+         logger.info('========dataset len: {}=========='.format(self.dataset_len))
+         if store_labels:
+             if self.npmemmap:
+                 self.label_list = [load_numpy_label(p+'.npy', inds, tot) for p in label_paths]
+             else:
+                 self.label_list = [load_label(p, inds, tot) for p in label_paths]
+         else:
+             self.label_paths = label_paths
+             # self.label_offsets_list = [
+             #     load_label_offset(p, inds, tot) for p in label_paths
+             # ]
+         assert label_processors is None or len(label_processors) == self.num_labels
+
+
+         self.max_sample_size = (
+             max_sample_size if max_sample_size is not None else sys.maxsize
+         )
+         self.pad_audio = pad_audio
+         self.normalize = normalize
+         logger.info(
+             f"pad_audio={pad_audio}, random_crop={random_crop}, "
+             f"normalize={normalize}, max_sample_size={self.max_sample_size}"
+         )
+
+         self.augmentation_effects = augmentation_effects
+         self.augmentation_probs = augmentation_probs
+
+
+         self.inbatch_noise_augment_len_range = inbatch_noise_augment_len_range
+         self.inbatch_noise_augment_number_range = inbatch_noise_augment_number_range
+         self.inbatch_noise_augment_volume = inbatch_noise_augment_volume
+
+
+         self.cqt_prediction_bin = cqt_prediction_bin
+         if self.cqt_prediction_bin > 0:
+             self.encoder_cqt_model = model_cqt_pred(n_bins=self.cqt_prediction_bin)
+             logger.info('preparing cqt loss objective in dataloader with cpu')
+
+         self.epoch = -1
+
+         self.reader = Read_and_PadCrop_Normalized_T(n_samples=clip_secs*sample_rate if clip_secs>0 else None, sample_rate = self.sample_rate)
+
+
+
+     @property
+     def can_reuse_epoch_itr_across_epochs(self):
+         pass
+     def set_epoch(self, epoch):
+         pass
+
+     def inbatch_noise_augment(self,
+                               target_audio: torch.Tensor, target_audio_idx: int,
+                               batch_audios: torch.Tensor,  # [bsz, audio_lengths]
+                               noise_len_min: int, noise_len_max: int,
+                               n_noise_min: int, n_noise_max: int,
+                               noise_vol: float = 1.0):
+         pass
+
+     def get_audio_by_slice(self,index):
+         pass
+     def get_audio(self, index):
+         pass
+
+     def get_label(self, index, label_idx):
+         pass
+
+     def get_labels(self, index):
+         pass
+
+     def __getitem__(self, i):
+         pass
+
+     def __len__(self):
+         return self.dataset_len
+
+     def crop_to_max_size(self, wav, target_size):
+         pass
+
+     def collater(self, samples):
+         pass
+
+     def collater_audio(self, audios, audio_size):
+         pass
+
+     def collater_frm_label(self, targets, audio_size, audio_starts, label_rate, pad):
+         pass
+
+     def collater_seq_label(self, targets, pad):
+         pass
+
+     def collater_label(self, targets_by_label, audio_size, audio_starts):
+         pass
+
+     def num_tokens(self, index):
+         pass
+
+     def size(self, index):
+         pass
+
+     def ordered_indices(self):
+         pass
+
+     def postprocess(self, wav, cur_sample_rate):
+         pass
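`load_audio_by_json` above expects a JSONL manifest with one object per line; the loader only reads the 'duration' key (in seconds), so the other fields in this sketch ('path', 'sample_rate') are illustrative assumptions. A self-contained sketch of its filtering and sizing behavior:

import json

# Hypothetical manifest entries in the JSONL format load_audio_by_json parses.
manifest = [
    {"path": "audio/track_001.wav", "duration": 212.4, "sample_rate": 24000},
    {"path": "audio/track_002.wav", "duration": 3.1, "sample_rate": 24000},
]

tgt_sample_rate, min_keep, clip_secs = 24000, 120000, 5
datas, sizes = [], []
for line in (json.dumps(d) for d in manifest):
    data = json.loads(line)
    # entries shorter than min_keep samples are skipped, mirroring the loader's filter
    if tgt_sample_rate * data["duration"] < min_keep:
        continue
    datas.append(data)
    # with clip_secs > 0, every kept item is sized as a fixed-length crop
    sizes.append(int(tgt_sample_rate * clip_secs))

print(len(datas), sizes)  # 1 [120000]
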
MuCodec/muq_dev/muq_fairseq/data/utils/data_utils.py ADDED
@@ -0,0 +1,535 @@
+ # Copyright (c) Facebook, Inc. and its affiliates.
+ #
+ # This source code is licensed under the MIT license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ import logging
+ import math
+ import numpy as np
+ import torch
+
+ from typing import Optional, Tuple
+
+
+
+ logger = logging.getLogger(__name__)
+
+
+
+ def compute_mask_indices(
+     shape: Tuple[int, int],
+     padding_mask: Optional[torch.Tensor],
+     mask_prob: float,
+     mask_length: int,
+     mask_type: str = "static",
+     mask_other: float = 0.0,
+     min_masks: int = 0,
+     no_overlap: bool = False,
+     min_space: int = 0,
+     require_same_masks: bool = True,
+     mask_dropout: float = 0.0,
+     add_masks: bool = False,
+     seed: Optional[int] = None,
+     epoch: Optional[int] = None,
+     indices: Optional[torch.Tensor] = None,
+     idc_select_ver: int = 1,  # 2 to reproduce mask_tokens_dataset
+     num_mask_ver: int = 2,  # 2 to reproduce mask_tokens_dataset
+ ) -> np.ndarray:
+     """
+     Computes random mask spans for a given shape
+
+     Args:
+         shape: the shape for which to compute masks.
+             should be of size 2 where first element is batch size and 2nd is timesteps
+         padding_mask: optional padding mask of the same size as shape, which will prevent masking padded elements
+         mask_prob: probability for each token to be chosen as start of the span to be masked. this will be multiplied by
+             number of timesteps divided by length of mask span to mask approximately this percentage of all elements.
+             however due to overlaps, the actual number will be smaller (unless no_overlap is True)
+         mask_type: how to compute mask lengths
+             static = fixed size
+             uniform = sample from uniform distribution [mask_other, mask_length*2]
+             normal = sample from normal distribution with mean mask_length and stdev mask_other. mask is min 1 element
+             poisson = sample from poisson distribution with lambda = mask length
+         min_masks: minimum number of masked spans
+         no_overlap: if true, will switch to an alternative recursive algorithm that prevents spans from overlapping
+         min_space: only used if no_overlap is True, this is how many elements to keep unmasked between spans
+         require_same_masks: if true, will randomly drop out masks until same amount of masks remains in each sample
+         mask_dropout: randomly dropout this percentage of masks in each example
+     """
+
+     bsz, all_sz = shape
+     mask = np.full((bsz, all_sz), False)
+
+     if num_mask_ver == 1:
+         all_num_mask = int(
+             # add a random number for probabilistic rounding
+             mask_prob * all_sz / float(mask_length)
+             + np.random.rand()
+         )
+         all_num_mask = max(min_masks, all_num_mask)
+
+     mask_idcs = []
+     for i in range(bsz):
+         if seed is not None and epoch is not None and indices is not None:
+             seed_i = int(hash((seed, epoch, indices[i].item())) % 1e6)
+         else:
+             seed_i = None
+
+         rng = np.random.default_rng(seed_i)
+
+         if padding_mask is not None:
+             sz = all_sz - padding_mask[i].long().sum().item()
+             assert sz >= 0, sz
+         else:
+             sz = all_sz
+
+         if num_mask_ver == 1:
+             if padding_mask is not None:
+                 num_mask = int(
+                     # add a random number for probabilistic rounding
+                     mask_prob * sz / float(mask_length)
+                     + np.random.rand()
+                 )
+                 num_mask = max(min_masks, num_mask)
+             else:
+                 num_mask = all_num_mask
+         elif num_mask_ver == 2:
+             num_mask = int(
+                 # add a random number for probabilistic rounding
+                 mask_prob * sz / float(mask_length)
+                 + rng.random()
+             )
+             num_mask = max(min_masks, num_mask)
+         else:
+             raise ValueError()
+
+         if mask_type == "static":
+             lengths = np.full(num_mask, mask_length)
+         elif mask_type == "uniform":
+             lengths = rng.integers(mask_other, mask_length * 2 + 1, size=num_mask)
+         elif mask_type == "normal":
+             lengths = rng.normal(mask_length, mask_other, size=num_mask)
+             lengths = [max(1, int(round(x))) for x in lengths]
+         elif mask_type == "poisson":
+             lengths = rng.poisson(mask_length, size=num_mask)
+             lengths = [int(round(x)) for x in lengths]
+         else:
+             raise Exception("unknown mask selection " + mask_type)
+
+         if sum(lengths) == 0:
+             if mask_type == "static":
+                 raise ValueError("this should never happen")
+             else:
+                 lengths = [min(mask_length, sz - 1)]
+
+         if no_overlap:
+             mask_idc = []
+
+             def arrange(s, e, length, keep_length):
+                 span_start = rng.integers(s, e - length)
+                 mask_idc.extend(span_start + i for i in range(length))
+
+                 new_parts = []
+                 if span_start - s - min_space >= keep_length:
+                     new_parts.append((s, span_start - min_space + 1))
+                 if e - span_start - length - min_space > keep_length:
+                     new_parts.append((span_start + length + min_space, e))
+                 return new_parts
+
+             parts = [(0, sz)]
+             min_length = min(lengths)
+             for length in sorted(lengths, reverse=True):
+                 lens = np.fromiter(
+                     (e - s if e - s >= length + min_space else 0 for s, e in parts),
+                     int,
+                 )
+                 l_sum = np.sum(lens)
+                 if l_sum == 0:
+                     break
+                 probs = lens / np.sum(lens)
+                 c = rng.choice(len(parts), p=probs)
+                 s, e = parts.pop(c)
+                 parts.extend(arrange(s, e, length, min_length))
+             mask_idc = np.asarray(mask_idc)
+         else:
+             if idc_select_ver == 1:
+                 min_len = min(lengths)
+                 if sz - min_len <= num_mask:
+                     min_len = sz - num_mask - 1
+                 mask_idc = rng.choice(sz - min_len, num_mask, replace=False)
+             elif idc_select_ver == 2:
+                 mask_idc = rng.choice(sz, num_mask, replace=False)
+             else:
+                 raise ValueError()
+
+             mask_idc = np.asarray(
+                 [
+                     mask_idc[j] + offset
+                     for j in range(len(mask_idc))
+                     for offset in range(lengths[j])
+                 ]
+             )
+
+         mask_idc = np.unique(mask_idc[mask_idc < sz])
+         if len(mask_idc) >= sz:
+             raise ValueError(
+                 (
+                     f"the entire sequence is masked. "
+                     f"sz={sz}; mask_idc={mask_idc}; "
+                     f"index={indices[i] if indices is not None else None}"
+                 )
+             )
+         mask_idcs.append(mask_idc)
+
+     target_len = None
+     if require_same_masks:
+         if add_masks:
+             target_len = max([len(m) for m in mask_idcs])
+         else:
+             target_len = min([len(m) for m in mask_idcs])
+
+     for i, mask_idc in enumerate(mask_idcs):
+         if target_len is not None and len(mask_idc) > target_len:
+             mask_idc = rng.choice(mask_idc, target_len, replace=False)
+
+         mask[i, mask_idc] = True
+
+         if target_len is not None and len(mask_idc) < target_len:
+             unmasked = np.flatnonzero(~mask[i])
+             to_mask = rng.choice(unmasked, target_len - len(mask_idc), replace=False)
+             mask[i, to_mask] = True
+
+         if mask_dropout > 0:
+             masked = np.flatnonzero(mask[i])
+             num_holes = np.rint(len(masked) * mask_dropout).astype(int)
+             to_drop = rng.choice(masked, num_holes, replace=False)
+             mask[i, to_drop] = False
+
+     return mask
+
+
+ def compute_block_mask_2d(
+     shape: Tuple[int, int],
+     mask_prob: float,
+     mask_length: int,
+     mask_prob_adjust: float = 0,
+     inverse_mask: bool = False,
+     require_same_masks: bool = True,
+     expand_adjcent: bool = False,
+     mask_dropout: float = 0,
+     non_overlapping: bool = False,
+     img_shape: tuple = None,  # for the situation when d[0] != d[1], especially for audio spectrograms
+     flexible_mask: bool = False,
+ ) -> torch.Tensor:
+
+     assert mask_length > 1
+
+     B, L = shape
+
+     d = (int(L**0.5), int(L**0.5))
+
+     if img_shape:
+         d = (img_shape[0], img_shape[1])
+
+     if flexible_mask:
+         index = np.random.randint(0, 3)
+         block_size_options = np.array([(6, 4), (5, 5), (8, 3)])
+         block_size = block_size_options[index]
+
+     if inverse_mask:
+         mask_prob = 1 - mask_prob
+
+     if flexible_mask:
+         mask = torch.zeros((B, d[0], d[1]))
+         mask_inds = torch.randint(
+             0,
+             L,
+             size=(
+                 B,
+                 int(
+                     L
+                     * ((mask_prob + mask_prob_adjust) / (block_size[0] * block_size[1]))
+                     * (1 + mask_dropout)
+                 ),
+             ),
+         )
+         mask.view(B, -1).scatter_(1, mask_inds, 1)
+         centers = mask.nonzero(as_tuple=True)
+
+         inds = ([], [], [])
+
+         offset = mask_length // 2
+         for i in range(block_size[0]):
+             for j in range(block_size[1]):
+                 k1 = i - offset
+                 k2 = j - offset
+                 inds[0].append(centers[0])
+                 inds[1].append(centers[1] + k1)
+                 inds[2].append(centers[2] + k2)
+
+         i0 = torch.cat(inds[0])
+         i1 = torch.cat(inds[1]).clamp_(min=0, max=d[0] - 1)
+         i2 = torch.cat(inds[2]).clamp_(min=0, max=d[1] - 1)
+
+         mask[(i0, i1, i2)] = 1
+
+     elif non_overlapping:
+         sz = math.ceil(d[0] / mask_length)
+         inp_len = sz * sz
+
+         inp = torch.zeros((B, 1, sz, sz))
+         w = torch.ones((1, 1, mask_length, mask_length))
+
+         mask_inds = torch.multinomial(
+             1 - inp.view(B, -1),
+             int(inp_len * (mask_prob + mask_prob_adjust) * (1 + mask_dropout)),
+             replacement=False,
+         )
+         inp.view(B, -1).scatter_(1, mask_inds, 1)
+
+         mask = torch.nn.functional.conv_transpose2d(inp, w, stride=mask_length).squeeze(
+             1
+         )
+         if mask.size(-1) > d[0]:
+             mask = mask[..., :d[0], :d[1]]
+     else:
+         mask = torch.zeros((B, d[0], d[1]))
+         mask_inds = torch.randint(
+             0,
+             L,
+             size=(
+                 B,
+                 int(
+                     L
+                     * ((mask_prob + mask_prob_adjust) / mask_length**2)
+                     * (1 + mask_dropout)
+                 ),
+             ),
+         )
+         mask.view(B, -1).scatter_(1, mask_inds, 1)
+         centers = mask.nonzero(as_tuple=True)
+
+         inds = ([], [], [])
+
+         offset = mask_length // 2
+         for i in range(mask_length):
+             for j in range(mask_length):
+                 k1 = i - offset
+                 k2 = j - offset
+                 inds[0].append(centers[0])
+                 inds[1].append(centers[1] + k1)
+                 inds[2].append(centers[2] + k2)
+
+         i0 = torch.cat(inds[0])
+         i1 = torch.cat(inds[1]).clamp_(min=0, max=d[0] - 1)
+         i2 = torch.cat(inds[2]).clamp_(min=0, max=d[1] - 1)
+
+         mask[(i0, i1, i2)] = 1
+
+     def get_nbs(b, m, w):
+         all_nbs = torch.nn.functional.conv2d(m.unsqueeze(1), w, padding="same")
+         all_nbs = all_nbs.clamp_max_(1).view(b, -1)
+         return all_nbs
+
+     if require_same_masks and expand_adjcent:
+         w = torch.zeros((1, 1, 3, 3))
+         w[..., 0, 1] = 1
+         w[..., 2, 1] = 1
+         w[..., 1, 0] = 1
+         w[..., 1, 2] = 1
+
+         all_nbs = get_nbs(B, mask, w)
+
+     mask = mask.reshape(B, -1)
+
+     if require_same_masks:
+         n_masks = mask.sum(dim=-1)
+         final_target_len = int(L * (mask_prob))
+         target_len = int(final_target_len * (1 + mask_dropout))
+
+         for i in range(len(mask)):
+             n = n_masks[i]
+             m = mask[i]
+             r = 0
+             while expand_adjcent and n < target_len:
+                 if r == 0:
+                     nbs = all_nbs[i]
+                 else:
+                     nbs = get_nbs(1, m.view(1, d[0], d[1]), w).flatten()
+
+                 cands = (1 - m + nbs) > 1
+                 cand_sz = int(cands.sum().item())
+
+                 assert cand_sz > 0, f"{nbs} {cand_sz}"
+
+                 to_mask = torch.multinomial(
+                     cands.float(), min(cand_sz, int(target_len - n)), replacement=False
+                 )
+                 m[to_mask] = 1
+                 assert to_mask.numel() > 0
+                 n += to_mask.numel()
+                 r += 1
+
+             if n > final_target_len:
+                 to_unmask = torch.multinomial(
+                     m, int(n - final_target_len), replacement=False
+                 )
+                 m[to_unmask] = 0
+             elif n < final_target_len:
+                 to_mask = torch.multinomial(
+                     (1 - m), int(final_target_len - n), replacement=False
+                 )
+                 m[to_mask] = 1
+
+     if inverse_mask:
+         mask = 1 - mask
+
+     return mask
+
+
+ def compute_block_mask_1d(
+     shape: Tuple[int, int],
+     mask_prob: float,
+     mask_length: int,
+     mask_prob_adjust: float = 0,
+     inverse_mask: bool = False,
+     require_same_masks: bool = True,
+     expand_adjcent: bool = False,
+     mask_dropout: float = 0,
+     non_overlapping: bool = False,
+ ) -> torch.Tensor:
+
+     B, L = shape
+
+     if inverse_mask:
+         mask_prob = 1 - mask_prob
+
+     if non_overlapping:
+         sz = math.ceil(L / mask_length)
+
+         inp = torch.zeros((B, 1, sz))
+         w = torch.ones((1, 1, mask_length))
+
+         mask_inds = torch.multinomial(
+             1 - inp.view(B, -1),
+             int(sz * (mask_prob + mask_prob_adjust) * (1 + mask_dropout)),
+             replacement=False,
+         )
+         inp.view(B, -1).scatter_(1, mask_inds, 1)
+
+         mask = torch.nn.functional.conv_transpose1d(inp, w, stride=mask_length).squeeze(
+             1
+         )
+         if mask.size(-1) > L:
+             mask = mask[..., :L]
+
+     else:
+         mask = torch.zeros((B, L))
+         mask_inds = torch.randint(
+             0,
+             L,
+             size=(
+                 B,
+                 int(
+                     L
+                     * ((mask_prob + mask_prob_adjust) / mask_length)
+                     * (1 + mask_dropout)
+                 ),
+             ),
+         )
+
+         mask.view(B, -1).scatter_(1, mask_inds, 1)
+         centers = mask.nonzero(as_tuple=True)
+
+         inds = ([], [])
+
+         offset = mask_length // 2
+         for i in range(mask_length):
+             k1 = i - offset
+             inds[0].append(centers[0])
+             inds[1].append(centers[1] + k1)
+
+         i0 = torch.cat(inds[0])
+         i1 = torch.cat(inds[1]).clamp_(min=0, max=L - 1)
+
+         mask[(i0, i1)] = 1
+
+     def get_nbs(b, m, w):
+         all_nbs = torch.nn.functional.conv1d(m.unsqueeze(1), w, padding="same")
+         all_nbs = all_nbs.clamp_max_(1).view(b, -1)
+         return all_nbs
+
+     if require_same_masks and expand_adjcent:
+         w = torch.ones((1, 1, 3))
+         w[..., 1] = 0
+         all_nbs = get_nbs(B, mask, w)
+
+     mask = mask.view(B, -1)
+
+     if require_same_masks:
+         n_masks = mask.sum(dim=-1)
+         final_target_len = int(L * (mask_prob))
+         target_len = int(final_target_len * (1 + mask_dropout))
+
+         for i in range(len(mask)):
+             n = n_masks[i]
+             m = mask[i]
+             r = 0
+             while expand_adjcent and n < target_len:
+                 if r == 0:
+                     nbs = all_nbs[i]
+                 else:
+                     nbs = get_nbs(1, m.unsqueeze(0), w).squeeze(0)
+
+                 cands = (1 - m + nbs) > 1
+                 cand_sz = int(cands.sum().item())
+
+                 assert cand_sz > 0, f"{nbs} {cand_sz}"
+
+                 to_mask = torch.multinomial(
+                     cands.float(), min(cand_sz, int(target_len - n)), replacement=False
+                 )
+                 m[to_mask] = 1
+                 assert to_mask.numel() > 0
+                 n += to_mask.numel()
+                 r += 1
+
+             if n > final_target_len:
+                 to_unmask = torch.multinomial(
+                     m, int(n - final_target_len), replacement=False
+                 )
+                 m[to_unmask] = 0
+             elif n < final_target_len:
+                 to_mask = torch.multinomial(
+                     (1 - m), int(final_target_len - n), replacement=False
+                 )
+                 m[to_mask] = 1
+
+     if inverse_mask:
+         mask = 1 - mask
+
+     return mask
+
+
+ def get_buckets(sizes, num_buckets):
+     buckets = np.unique(
+         np.percentile(
+             sizes,
+             np.linspace(0, 100, num_buckets + 1),
+             interpolation="lower",
+         )[1:]
+     )
+     return buckets
+
+
+ def get_bucketed_sizes(orig_sizes, buckets):
+     sizes = np.copy(orig_sizes)
+     assert np.min(sizes) >= 0
+     start_val = -1
+     for end_val in buckets:
+         mask = (sizes > start_val) & (sizes <= end_val)
+         sizes[mask] = end_val
+         start_val = end_val
+     return sizes
+
+
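A usage sketch for `compute_mask_indices` on its default path (mask_type="static", num_mask_ver=2, idc_select_ver=1), assuming the definitions above are importable; the shapes and probabilities are illustrative wav2vec-style settings, not values taken from this repository:

import numpy as np

mask = compute_mask_indices(
    shape=(2, 100),      # batch of 2 sequences, 100 timesteps each
    padding_mask=None,
    mask_prob=0.65,      # probability a timestep starts a masked span
    mask_length=10,      # each span covers 10 timesteps
    min_masks=2,
)
assert mask.shape == (2, 100) and mask.dtype == bool
# require_same_masks=True (the default) equalizes the mask count per row
assert mask[0].sum() == mask[1].sum()
print(mask[0].astype(int))
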
MuCodec/muq_dev/muq_fairseq/models/muq/__init__.py ADDED
@@ -0,0 +1 @@
+ from .muq_model import *
MuCodec/muq_dev/muq_fairseq/models/muq/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (203 Bytes).
 
MuCodec/muq_dev/muq_fairseq/models/muq/__pycache__/muq_model.cpython-310.pyc ADDED
Binary file (4.96 kB).
 
MuCodec/muq_dev/muq_fairseq/models/muq/model/__init__.py ADDED
@@ -0,0 +1,2 @@
+
+
MuCodec/muq_dev/muq_fairseq/models/muq/model/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (183 Bytes).
 
MuCodec/muq_dev/muq_fairseq/models/muq/model/__pycache__/muq.cpython-310.pyc ADDED
Binary file (15.8 kB).
 
MuCodec/muq_dev/muq_fairseq/models/muq/model/__pycache__/rvq.cpython-310.pyc ADDED
Binary file (14 kB).
 
MuCodec/muq_dev/muq_fairseq/models/muq/model/__pycache__/rvq_muq.cpython-310.pyc ADDED
Binary file (13.1 kB).
 
MuCodec/muq_dev/muq_fairseq/models/muq/model/muq.py ADDED
@@ -0,0 +1,520 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import random
3
+ import torch
4
+ from torch import nn
5
+ from einops import rearrange
6
+ import os
7
+ from fairseq.data.data_utils import compute_mask_indices
8
+ from fairseq.models.wav2vec.wav2vec2 import ConvFeatureExtractionModel
9
+ from fairseq.modules import LayerNorm
10
+
11
+ try:
12
+ from ..modules.random_quantizer import RandomProjectionQuantizer
13
+ from ..modules.features import MelSTFT
14
+ from ..modules.conv import Conv2dSubsampling
15
+ except:
16
+ import sys, os
17
+ sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
18
+ from modules.random_quantizer import RandomProjectionQuantizer
19
+ from modules.features import MelSTFT
20
+ from modules.conv import Conv2dSubsampling
21
+
22
+
23
+ class MuQ(nn.Module):
24
+ """
25
+ MuQ
26
+
27
+ Input: 128-band mel spectrogram
28
+ Frontend: 2-layer Residual convolution
29
+ Backend: 12-layer Conformer
30
+ Quantizer: a codebook for mel spectrogram
31
+ """
32
+
33
+ def __init__(
34
+ self,
35
+ num_codebooks=1,
36
+ codebook_dim=16,
37
+ codebook_size=4096,
38
+ features=["melspec_2048"],
39
+ hop_length=240,
40
+ n_mels=128,
41
+ conv_dim=512,
42
+ encoder_dim=1024,
43
+ encoder_depth=12,
44
+ mask_hop=0.4,
45
+ mask_prob=0.6,
46
+ is_flash=False,
47
+ stat_path=None, #"./data/fma_stats.json",
48
+ model_path=None, #"./data/pretrained_fma.pt",
49
+ w2v2_config_path=None, #"facebook/wav2vec2-conformer-rope-large-960h-ft",
50
+ use_rvq_target=False,
51
+ use_vq_target=False,
52
+ rvq_ckpt_path=None,
53
+ recon_loss_ratio=None,
54
+ label_rate=25,
55
+ use_hubert_masking_strategy=False,
56
+ use_hubert_featurizer=False,
57
+ hubert_conv_feature_layers="[(512,10,5)] + [(512,3,2)] * 3 + [(512,3,3)] + [(512,2,2)] * 2",
58
+ use_hubert_nce_loss=False,
59
+ hubert_final_dim=256,
60
+ rvq_n_codebooks=8,
61
+ rvq_multi_layer_num=1,
62
+ use_encodec_target=False,
63
+ ):
64
+ super(MuQ, self).__init__()
65
+
66
+ # global variables
67
+ self.hop_length = hop_length
68
+ self.mask_hop = mask_hop
69
+ self.mask_prob = mask_prob
70
+ self.num_codebooks = num_codebooks
71
+ self.codebook_size = codebook_size
72
+ self.features = features
73
+ self.recon_loss_ratio = recon_loss_ratio
74
+ self.n_fold = int(100//label_rate)
75
+ self.label_rate = label_rate
76
+ self.use_hubert_masking_strategy = use_hubert_masking_strategy
77
+ self.use_hubert_featurizer = use_hubert_featurizer
78
+ self.use_hubert_nce_loss = use_hubert_nce_loss
79
+
80
+ # load feature mean / std stats
81
+ import os
82
+ if stat_path is not None and os.path.exists(stat_path):
83
+ with open(stat_path, "r") as f:
84
+ self.stat = json.load(f)
85
+ else:
86
+ # print("No stats file found at `{}`, use default from msd.".format(stat_path))
87
+ self.stat = {"spec_256_cnt": 14394344256, "spec_256_mean": -23.34296658431829, "spec_256_std": 26.189295587132637, "spec_512_cnt": 28677104448, "spec_512_mean": -21.31267396860235, "spec_512_std": 26.52644536245769, "spec_1024_cnt": 57242624832, "spec_1024_mean": -18.852271129208273, "spec_1024_std": 26.443154583585663, "spec_2048_cnt": 114373665600, "spec_2048_mean": -15.638743433896792, "spec_2048_std": 26.115825961611545, "spec_4096_cnt": 228635747136, "spec_4096_mean": -11.715532502794836, "spec_4096_std": 25.763972210234062, "melspec_256_cnt": 14282760192, "melspec_256_mean": -26.962600400166156, "melspec_256_std": 36.13614100912126, "melspec_512_cnt": 14282760192, "melspec_512_mean": -9.108344167718862, "melspec_512_std": 24.71910937988429, "melspec_1024_cnt": 14282760192, "melspec_1024_mean": 0.37302579246531126, "melspec_1024_std": 18.684082325919388, "melspec_2048_cnt": 14282760192, "melspec_2048_mean": 6.768444971712967, "melspec_2048_std": 18.417922652295623, "melspec_4096_cnt": 14282760192, "melspec_4096_mean": 13.617164614990036, "melspec_4096_std": 18.08552130124525, "cqt_cnt": 9373061376, "cqt_mean": 0.46341379757927165, "cqt_std": 0.9543998080910191, "mfcc_256_cnt": 1339008768, "mfcc_256_mean": -11.681755459447485, "mfcc_256_std": 29.183186444668316, "mfcc_512_cnt": 1339008768, "mfcc_512_mean": -2.540581461792183, "mfcc_512_std": 31.93752185832081, "mfcc_1024_cnt": 1339008768, "mfcc_1024_mean": 6.606636263169779, "mfcc_1024_std": 34.151644801729624, "mfcc_2048_cnt": 1339008768, "mfcc_2048_mean": 5.281600844245184, "mfcc_2048_std": 33.12784541220003, "mfcc_4096_cnt": 1339008768, "mfcc_4096_mean": 4.7616569480166095, "mfcc_4096_std": 32.61458906894133, "chromagram_256_cnt": 1339008768, "chromagram_256_mean": 55.15596556703181, "chromagram_256_std": 73.91858278719991, "chromagram_512_cnt": 1339008768, "chromagram_512_mean": 175.73092252759895, "chromagram_512_std": 248.48485148525953, "chromagram_1024_cnt": 1339008768, "chromagram_1024_mean": 589.2947481634608, "chromagram_1024_std": 913.857929063196, "chromagram_2048_cnt": 1339008768, "chromagram_2048_mean": 2062.286388327397, "chromagram_2048_std": 3458.92657915397, "chromagram_4096_cnt": 1339008768, "chromagram_4096_mean": 7673.039107997085, "chromagram_4096_std": 13009.883158267234}
88
+
89
+ # feature extractor
90
+ self.preprocessor_melspec_2048 = MelSTFT(
91
+ n_fft=2048, hop_length=hop_length, is_db=True
92
+ )
93
+
94
+ # random quantizer
95
+ self.use_rvq_target = use_rvq_target
96
+ self.use_vq_target = use_vq_target
97
+ self.use_encodec_target = use_encodec_target
98
+
99
+ seed = 142
100
+ if self.use_rvq_like_target:
101
+ if use_rvq_target:
102
+ try:
103
+ from .rvq_muq import ResidualVectorQuantize
104
+ except:
105
+ import sys, os
106
+ sys.path.append(os.path.dirname(os.path.abspath(__file__)))
107
+ from rvq_muq import ResidualVectorQuantize
108
+
109
+ inp_dim = 128*self.n_fold
110
+ self.rvq = ResidualVectorQuantize(
111
+ input_dim = inp_dim,
112
+ n_codebooks = rvq_n_codebooks,
113
+ codebook_size = 1024,
114
+ codebook_dim = 16,
115
+ quantizer_dropout = 0.0,
116
+ use_multi_layer_num = rvq_multi_layer_num,
117
+ )
118
+ elif use_vq_target:
119
+ try:
120
+ from .rvq_muq import VectorQuantize
121
+ except:
122
+ import sys, os
123
+ sys.path.append(os.path.dirname(os.path.abspath(__file__)))
124
+ from rvq_muq import VectorQuantize
125
+
126
+ self.rvq = VectorQuantize(
127
+ input_dim = 128*self.n_fold,
128
+ codebook_size = 1024,
129
+ codebook_dim = 8,
130
+ stale_tolerance = 1000,
131
+ mfcc_clustering = False
132
+ )
133
+ elif use_encodec_target:
134
+ from encodec import EncodecModel
135
+ self.rvq = EncodecModel.encodec_model_24khz()
136
+ self.rvq.set_target_bandwidth(6.0)
137
+ for param in self.rvq.parameters():
138
+ param.requires_grad = False
139
+
140
+ import os
141
+ if rvq_ckpt_path is not None and os.path.exists(rvq_ckpt_path):
142
+ state_dict = torch.load(rvq_ckpt_path, map_location="cpu")
143
+ self.rvq.load_state_dict(state_dict)
144
+ else:
145
+ print(f'Checkpoint for rvq `{rvq_ckpt_path}` not found. Using random initialization.')
146
+ else:
147
+ for feature in self.features:
148
+ for i in range(num_codebooks):
149
+ setattr(
150
+ self,
151
+ f"quantizer_{feature}", # _{i}
152
+ RandomProjectionQuantizer(
153
+ n_mels * self.n_fold, codebook_dim, codebook_size, seed=seed + i
154
+ ),
155
+ )
156
+
157
+ if use_hubert_masking_strategy:
158
+ self.mask_emb = nn.Parameter(
159
+ torch.FloatTensor(encoder_dim).uniform_()
160
+ )
161
+
162
+ if use_hubert_featurizer:
163
+ feature_enc_layers = eval(hubert_conv_feature_layers) # noqa
164
+ hubert_feat_embed = feature_enc_layers[-1][0]
165
+ self.hubert_feature_extractor = ConvFeatureExtractionModel(
166
+ conv_layers=feature_enc_layers,
167
+ dropout=0.0,
168
+ mode='default', #cfg.extractor_mode,
169
+ conv_bias=False, #cfg.conv_bias,
170
+ )
171
+ self.post_extract_proj = (
172
+ nn.Linear(hubert_feat_embed, encoder_dim)
173
+ if hubert_feat_embed != encoder_dim
174
+ else None
175
+ )
176
+ self.layer_norm = LayerNorm(hubert_feat_embed)
177
+ else:
178
+ # two residual convolution layers + one projection layer
179
+ strides_factory = {
180
+ 4: [2, 2],
181
+ 2: [2, 1]
182
+ }
183
+ self.conv = Conv2dSubsampling(
184
+ 1, conv_dim, encoder_dim, strides=strides_factory.get(self.n_fold), n_bands=n_mels
185
+ )
186
+
187
+ # Conformer
188
+ if is_flash:
189
+ from modules.flash_conformer import (
190
+ Wav2Vec2ConformerEncoder,
191
+ Wav2Vec2ConformerConfig,
192
+ )
193
+ else:
194
+ from transformers.models.wav2vec2_conformer.modeling_wav2vec2_conformer import (
195
+ Wav2Vec2ConformerEncoder,
196
+ Wav2Vec2ConformerConfig,
197
+ )
198
+ import os
199
+ if w2v2_config_path is None or not os.path.exists(w2v2_config_path):
200
+ w2v2_config_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "w2v2_config.json")
201
+ print("load w2v2 config from:", w2v2_config_path)
202
+ config = Wav2Vec2ConformerConfig.from_pretrained(
203
+ w2v2_config_path
204
+ )
205
+ config.num_hidden_layers = encoder_depth
206
+ config.hidden_size = encoder_dim
207
+
208
+ self.conformer = Wav2Vec2ConformerEncoder(config)
209
+
210
+ if self.use_hubert_nce_loss:
211
+ self.label_embs_concat = nn.Parameter(
212
+ torch.FloatTensor(codebook_size, hubert_final_dim)
213
+ ) # embeddings of codes
214
+ nn.init.uniform_(self.label_embs_concat)
215
+ self.linear = nn.Linear(encoder_dim, hubert_final_dim) # final_proj
216
+ else:
217
+ # projection
218
+ self.linear = nn.Linear(encoder_dim, codebook_size) # N_SubSpec=8
219
+
220
+ # reconstruct melspec
221
+ if self.recon_loss_ratio is not None and self.recon_loss_ratio > 0:
222
+ self.recon_proj = nn.Linear(encoder_dim, n_mels * self.n_fold)
223
+ self.recon_loss = nn.MSELoss()
224
+
225
+ # loss function
226
+ self.loss = nn.CrossEntropyLoss()
227
+
228
+ # cls token (used for sequence classification)
229
+ random.seed(seed)
230
+ self.cls_token = nn.Parameter(torch.randn(encoder_dim))
231
+
232
+ # load model
233
+ if model_path:
234
+ S = torch.load(model_path)["state_dict"]
235
+ SS = {k[6:]: v for k, v in S.items()}
236
+ SS['quantizer_melspec_2048.random_projection'] = SS['quantizer_melspec_2048_0.random_projection']
237
+ SS['quantizer_melspec_2048.codebook'] = SS['quantizer_melspec_2048_0.codebook']
238
+ del SS['quantizer_melspec_2048_0.random_projection']
239
+ del SS['quantizer_melspec_2048_0.codebook']
240
+ unmatch = self.load_state_dict(SS, strict=False)
241
+ if len(unmatch.missing_keys) > 0:
242
+ print(f'Missing keys: {unmatch.missing_keys}')
243
+
244
+ @property
245
+ def use_rvq_like_target(self):
246
+ return self.use_rvq_target or self.use_vq_target or self.use_encodec_target
247
+
248
+
249
+ def apply_hubert_mask(self, x, padding_mask=None, target_list=None):
250
+ B, T, C = x.shape
251
+ if self.mask_prob > 0:
252
+ mask_length = int(self.mask_hop / (1/self.label_rate))
253
+ mask_indices = compute_mask_indices(
254
+ (B, T),
255
+ padding_mask,
256
+ self.mask_prob,
257
+ mask_length, # self.mask_length,
258
+ "static", #self.mask_selection,
259
+ 0, #self.mask_other,
260
+ min_masks=2,
261
+ no_overlap=False, #self.no_mask_overlap,
262
+ min_space=1, #self.mask_min_space,
263
+ )
264
+ mask_indices = torch.from_numpy(mask_indices).to(x.device)
265
+ x[mask_indices] = self.mask_emb
266
+ mask_indices = torch.nonzero(mask_indices)
267
+ else:
268
+ mask_indices = None
269
+
270
+ return x, mask_indices
271
+
272
+ def masking(self, x, attention_mask=None):
273
+ """random masking of 400ms with given probability"""
274
+ if self.use_hubert_masking_strategy:
275
+ return x, None
276
+ mx = x.clone()
277
+ b, t = mx.shape
278
+ len_masking_raw = int(24000 * self.mask_hop) # 9600 = 24000 * 0.4
279
+ len_masking_token = int(24000 / self.hop_length / 2 / 2 * self.mask_hop) # 10 = 25Hz * 0.4
280
+
281
+ # get random mask indices
282
+ start_indices = torch.rand(b, t // len_masking_raw) < self.mask_prob
283
+ time_domain_masked_indices = torch.nonzero(
284
+ start_indices.repeat_interleave(len_masking_raw, dim=1)
285
+ )
286
+ token_domain_masked_indices = torch.nonzero(
287
+ start_indices.repeat_interleave(len_masking_token, dim=1)
288
+ )
289
+
290
+ # mask with random values
291
+ masking_noise = (
292
+ torch.randn(time_domain_masked_indices.shape[0], dtype=x.dtype) * 0.1
293
+ ) # 0 mean 0.1 std
294
+ mx[tuple(time_domain_masked_indices.t())] = masking_noise.to(x.device)
295
+
296
+ return mx, token_domain_masked_indices
297
+
298
+
299
+ @torch.no_grad()
300
+ def preprocessing(self, x, features):
301
+ """extract classic audio features"""
302
+ # check precision
303
+ if x.dtype == torch.float16 or x.dtype == torch.bfloat16:
304
+ precision = 16
305
+ else:
306
+ precision = 32
307
+
308
+ out = {}
309
+ for key in features:
310
+ layer = getattr(self, "preprocessor_%s" % key)
311
+ layer.to(x.device)
312
+ dtype = x.dtype
313
+ out[key] = layer.float()(x.float())[..., :-1]
314
+ if precision == 16:
315
+ out[key] = out[key].half()
316
+ if out[key].dtype != dtype:
317
+ out[key].to(dtype=dtype)
318
+ return out
319
+
320
+ def encoder(self, x, *, attention_mask=None, is_features_only=False):
321
+ """2-layer conv + w2v-conformer"""
322
+ if not self.use_hubert_featurizer:
323
+ x = self.conv(x) # [3, 128, 3000] -> [3, 750, 1024]
324
+ if self.training and self.use_hubert_masking_strategy and not is_features_only:
325
+ x, mask_indices = self.apply_hubert_mask(x)
326
+ else:
327
+ mask_indices = None
328
+ if attention_mask is None:
329
+ out = self.conformer(x, output_hidden_states=True)
330
+ else:
331
+ attention_mask = attention_mask.bool()
332
+ skip_n = int(attention_mask.size(-1) / x.size(1))
333
+ attention_mask = attention_mask[:, ::skip_n]
334
+ attention_mask = attention_mask[:, :x.size(1)]
335
+ out = self.conformer(x, attention_mask=attention_mask, output_hidden_states=True)
336
+ hidden_emb = out["hidden_states"]
337
+ last_emb = out["last_hidden_state"]
338
+ logits = self.linear(last_emb)
339
+ interval = self.codebook_size
340
+ logits = {
341
+ key: logits[:, :, i * interval : (i + 1) * interval]
342
+ for i, key in enumerate(self.features)
343
+ }
344
+ return logits, hidden_emb, mask_indices
345
+
346
+ @torch.no_grad()
347
+ def normalize(self, x):
348
+ """normalize the input audio to have zero mean unit variance"""
349
+ for key in x.keys():
350
+ x[key] = (x[key] - self.stat["%s_mean" % key]) / self.stat["%s_std" % key] # {'melspec_2048_cnt': 14282760192, 'melspec_2048_mean': 6.768444971712967}
351
+ return x
352
+
353
+ @torch.no_grad()
354
+ def rearrange(self, x):
355
+ """rearrange the batch to flatten every 4 steps"""
356
+ for key in x.keys():
357
+ if key == "chromagram":
358
+ x[key] = rearrange(x[key], "b f t -> b t f")
359
+ else:
360
+ x[key] = rearrange(x[key], "b f (t s) -> b t (s f)", s=self.n_fold)
361
+ return x
362
+
363
+ def get_rvq_codes(self, inp, raw_wav):
364
+ if self.use_rvq_target:
365
+ quantized_prompt_embeds, codes, _, commitment_loss, codebook_loss, rvq_usage = self.rvq(inp)
366
+ return codes
367
+ if self.use_vq_target:
368
+ quantized_prompt_embeds, commitment_loss, codebook_loss, codes, _ = self.rvq(inp)
369
+ return codes.unsqueeze(1)
370
+ if self.use_encodec_target:
371
+ encoded_frames = self.rvq.encode(raw_wav.unsqueeze(1)) #list, B,[ 8,T ]
372
+ codes = torch.cat([encoded[0].detach() for encoded in encoded_frames], dim=-1)
373
+ if self.label_rate == 25:
374
+ codes = codes[:, :, ::3]
375
+ return codes
376
+
377
+ @torch.no_grad()
378
+ def tokenize(self, x, raw_wav):
379
+ out = {}
380
+ for key in x.keys():
381
+ if self.use_rvq_like_target:
382
+ self.rvq.eval()
383
+ inp = x[key].permute((0, 2, 1))
384
+ codes = self.get_rvq_codes(inp, raw_wav)
385
+ out[key] = torch.cat([codes[:, idx, ...] for idx in range(int(self.codebook_size//1024))], dim=-1) # (when use freq mask)->[Batch, N_SubSpec, SeqLen=8*750]
386
+ else:
387
+ layer = getattr(self, "quantizer_%s" % key)
388
+ out[key] = layer(x[key])
389
+ return out
390
+
391
+ def to_spec_wise_quad(self, x):
392
+ Batch, QuadSpec, Time = x.shape
393
+ SubSpec, N_SubSpec = 16, 8
394
+ assert 4 * SubSpec * N_SubSpec == QuadSpec == 4*128
395
+ x = rearrange(x, "b (q n s) t -> b (q s) (n t)", q=4, n=N_SubSpec, s=SubSpec)
396
+ return x # [Batch, SubSpec=16, N_SubSpec*Time=8*100Hz]
397
+
398
+ def get_targets(self, x, label=None):
399
+ if self.use_encodec_target:
400
+ raw_x = x.clone()
401
+ else:
402
+ raw_x = None
403
+ x = self.preprocessing(x, features=self.features) # -> {'melspec_2048': Tensor{Size([3, 128, 3000]) cuda:0 f32}}
404
+ x = self.normalize(x)
405
+ x = self.rearrange(x) # -> {'melspec_2048': Tensor{Size([3, 750, 512]) cuda:0 f32}}
406
+ melspec = x['melspec_2048']
407
+ if label is None:
408
+ target_tokens = self.tokenize(x, raw_x) # -> {'melspec_2048': Tensor{Size([3, 750]) cuda:0 i64}}
409
+ else:
410
+ # print("use_target from label")
411
+ target_tokens = {'melspec_2048': rearrange(label, "b n s -> b (n s)").long()}
412
+ return target_tokens, melspec
413
+
414
+ def get_predictions(self, x, *, mask=None, attention_mask=None, return_new_mask=False, is_features_only=False):
415
+ # preprocessing
416
+ if not self.use_hubert_featurizer:
417
+ x = self.preprocessing(x, features=["melspec_2048"])
418
+ x = self.normalize(x) # -> {'melspec_2048': Tensor{Size([3, 128, 3000]) cuda:0 f32}}
419
+ else:
420
+ features = self.hubert_feature_extractor(x)
421
+ features = self.layer_norm(features.transpose(1, 2))
422
+ if self.post_extract_proj is not None:
423
+ features = self.post_extract_proj(features)
424
+ x = {"melspec_2048": features}
425
+
426
+ # encoding
427
+ logits, hidden_emb, new_mask = self.encoder(x["melspec_2048"], attention_mask=attention_mask, is_features_only=is_features_only)
428
+
429
+ if return_new_mask:
430
+ return logits, hidden_emb, mask if new_mask is None else new_mask
431
+ else:
432
+ return logits, hidden_emb
433
+
434
+ def get_latent(self, x, layer_ix=12):
435
+ _, hidden_states = self.get_predictions(x)
436
+ emb = hidden_states[layer_ix]
437
+ return emb
438
+
439
+ def compute_nce(self, x, pos, negs):
440
+ neg_is_pos = (pos == negs).all(-1)
441
+ pos = pos.unsqueeze(0)
442
+ targets = torch.cat([pos, negs], dim=0)
443
+
444
+ logits = torch.cosine_similarity(x.float(), targets.float(), dim=-1).type_as(x)
445
+ logits /= 0.1
446
+ if neg_is_pos.any():
447
+ logits[1:][neg_is_pos] = float("-inf")
448
+ logits = logits.transpose(0, 1) # (num_x, num_cls+1)
449
+ return logits
450
+
451
+ def compute_hubert_nce_loss(self, proj_xs, targets):
452
+
453
+ label_embs_list = self.label_embs_concat.split(self.codebook_size, 0) # (self.num_classes, 0)
454
+
455
+ def compute_pred(proj_x, target, label_embs):
456
+ # compute logits for the i-th label set
457
+ y = torch.index_select(label_embs, 0, target.long())
458
+ negs = label_embs.unsqueeze(1).expand(-1, proj_x.size(0), -1)
459
+ return self.compute_nce(proj_x, y, negs)
460
+
461
+ logit_list = [
462
+ compute_pred(proj_x, t, label_embs_list[i])
463
+ for i, (proj_x, t) in enumerate(zip(proj_xs, targets))
464
+ ]
465
+
466
+ return sum(logit_list)
467
+
468
+
469
+ def get_loss(self, logits, target_tokens, masked_indices):
470
+ losses = {}
471
+ accuracies = {}
472
+ for key in logits.keys():
473
+ if not self.use_rvq_like_target:
474
+ masked_logits = logits[key][tuple(masked_indices.t())]
475
+ masked_tokens = target_tokens[key][tuple(masked_indices.t())]
476
+ else:
477
+ Batch, SeqLen, N_Codebook_x_CodebookSize = logits[key].shape # CodebookSize=4096
478
+ Batch, N_Codebook_x_SeqLen = target_tokens[key].shape # N_Codebook*SeqLen=4*750
479
+ N_Codebook = int(N_Codebook_x_SeqLen // SeqLen)
480
+ # print("not use_virtual, n codebook = ", N_Codebook)
481
+ target_tokens[key] = rearrange(target_tokens[key], "b (n s) -> b s n", n=N_Codebook) # Batch, SeqLen=750, N_Codebook=4
482
+ masked_logits = logits[key][tuple(masked_indices.t())]
483
+ masked_tokens = target_tokens[key][tuple(masked_indices.t())]
484
+ masked_logits = rearrange(masked_logits, "b (n c) -> (b n) c", n=N_Codebook)
485
+ masked_tokens = rearrange(masked_tokens, "b n -> (b n)", n=N_Codebook)
486
+
487
+ if self.use_hubert_nce_loss:
488
+ losses[key] = self.compute_hubert_nce_loss(masked_logits, masked_tokens)
489
+ else:
490
+ losses[key] = self.loss(masked_logits, masked_tokens)
491
+ accuracies[key] = (
492
+ torch.sum(masked_logits.argmax(-1) == masked_tokens)
493
+ / masked_tokens.numel()
494
+ )
495
+ return losses, accuracies
496
+
497
+ def get_recon_loss(self, last_hidden_emb, melspec, masked_indices):
498
+ pred_melspec = self.recon_proj(last_hidden_emb[tuple(masked_indices.t())])
499
+ target_melspec = melspec[tuple(masked_indices.t())]
500
+ recon_loss = self.recon_loss(pred_melspec, target_melspec)
501
+ return recon_loss
502
+
503
+ def forward(self, x, attention_mask=None, label=None):
504
+ dtype = x.dtype
505
+ # get target feature tokens
506
+ target_tokens, melspec = self.get_targets(x, label=label)
507
+
508
+ # masking
509
+ x, masked_indices = self.masking(x, attention_mask=attention_mask)
510
+
511
+ # forward
512
+ logits, hidden_emb, masked_indices = self.get_predictions(x, mask=masked_indices, attention_mask=attention_mask, return_new_mask=True)
513
+
514
+ # get loss
515
+ losses, accuracies = self.get_loss(logits, target_tokens, masked_indices)
516
+
517
+ if self.recon_loss_ratio:
518
+ losses["recon_loss"] = self.get_recon_loss(hidden_emb[-1], melspec, masked_indices) * self.recon_loss_ratio
519
+
520
+ return logits, hidden_emb, losses, accuracies
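
The RVQ-style branch of get_loss() above is easiest to verify with a standalone shape check. The sketch below is illustrative only: the sizes (Batch=2, SeqLen=750, N_Codebook=4, CodebookSize=1024) are assumptions taken from the comments in the code, not values the model fixes, and plain cross-entropy stands in for self.loss.

import torch
from einops import rearrange

B, S, N, C = 2, 750, 4, 1024
logits = torch.randn(B, S, N * C)                 # per-step logits over all N codebooks
targets = torch.randint(0, C, (B, N * S))         # flattened "(n s)" token layout

targets = rearrange(targets, "b (n s) -> b s n", n=N)    # (B, S, N)
masked_indices = torch.nonzero(torch.rand(B, S) < 0.5)   # (K, 2) rows of [batch, step]

masked_logits = logits[tuple(masked_indices.t())]        # (K, N*C)
masked_tokens = targets[tuple(masked_indices.t())]       # (K, N)

masked_logits = rearrange(masked_logits, "k (n c) -> (k n) c", n=N)
masked_tokens = rearrange(masked_tokens, "k n -> (k n)")
loss = torch.nn.functional.cross_entropy(masked_logits, masked_tokens)
print(masked_logits.shape, masked_tokens.shape, loss.item())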
MuCodec/muq_dev/muq_fairseq/models/muq/model/pred_ark_target_with_model.py ADDED
@@ -0,0 +1,151 @@
1
+ import sys
2
+ import torch.nn as nn
3
+ import torch
4
+ import os
5
+ sys.path.append(os.path.dirname(os.path.abspath(__file__)))
6
+ from rvq_muq import PreprocessorWithModel, ResidualVectorQuantize # this repo ships rvq_muq.py; the original 'rvq_musicfm' module is not included
7
+
8
+ class RVQ(nn.Module):
9
+ def __init__(self,
10
+ model_config,
11
+ rvq_ckpt_path,
12
+ preprocess,
13
+ ):
14
+ super().__init__()
15
+ self.rvq = ResidualVectorQuantize(**model_config)
16
+ if rvq_ckpt_path is not None:
17
+ self.rvq.load_state_dict(torch.load(rvq_ckpt_path, map_location='cpu'))
18
+ self.preprocess = preprocess
19
+
20
+ def get_targets(self, x):
21
+ self.rvq.eval()
22
+ x = self.preprocess(x)
23
+ quantized_prompt_embeds, codes, _, commitment_loss, codebook_loss, rvq_usage = self.rvq(x)
24
+ return codes.permute(1,0,2)
25
+
26
+ @torch.no_grad()
27
+ def encode_wavs(self, wavs):
28
+ wavs = wavs[..., :int((wavs.shape[-1]//320)*320)]
29
+ return self.get_targets(wavs)
30
+
31
+ def This_Music_ModelTarget_Config():
32
+ config = dict(
33
+ model = dict(
34
+ input_dim = 1024,
35
+ n_codebooks = 8,
36
+ codebook_size = 1024,
37
+ codebook_dim = 16,
38
+ quantizer_dropout = 0.0,
39
+ ),
40
+ train = dict(
41
+ batch_size = 32,
42
+ num_workers = 6,
43
+ valid_interval = 10,
44
+ save_interval = 100,
45
+ max_updates = 500000,
46
+ lr = 1e-4,
47
+ # device = 'cuda:1',
48
+ loss = 'commitment_loss * 0.25 + codebook_loss * 1.0 + (x - quantized_prompt_embeds).abs().mean()',
49
+ preprocess = PreprocessorWithModel(
50
+ model_dir= 'path/to/muq_fairseq',
51
+ checkpoint_dir='path/to/muq_m4a_75K.pt',
52
+ use_layer_idx=9,
53
+ )
54
+ ),
55
+ pred = dict(
56
+ rvq_ckpt_path='path/to/runs/Aug07_18-09-24_ts-828fa13e58384d0bba4144fda78ecc92-launcher/ckpt/RVQ_8100.pth',
57
+ sr=24000,
58
+ data_jsonl_path='path/to/data/music4all/train.json',
59
+ save_target_dir= 'path/to/data/music4all_ark/reiter_musicssl_m4a',
60
+ ),
61
+ )
62
+ return config
63
+
64
+
65
+ CLEN = 30
66
+ N_GPU_PER = 8
67
+ N_NODE = 4
68
+
69
+ def parse_lr(wave_length, sr):
70
+ n_step = int( wave_length // (sr*CLEN) )
71
+ if n_step == 0:
72
+ n_step = 1
73
+ print('wave_length: ', wave_length, 'sr: ', sr, 'n_step: ', n_step)
74
+ starts = torch.arange(n_step) * CLEN * sr
75
+ left_rights = torch.stack((starts, starts+CLEN*sr)).T
76
+ return left_rights[:10, ...] # cap at 10 chunks (the first 5 minutes) per track
77
+
78
+ @torch.no_grad()
79
+ def main(index, rank):
80
+ device = f'cuda:{rank}'
81
+ config = This_Music_ModelTarget_Config()
82
+ preprocess = config['train']['preprocess']
83
+ model = RVQ(
84
+ model_config = config['model'],
85
+ rvq_ckpt_path = config['pred']['rvq_ckpt_path'],
86
+ preprocess = preprocess
87
+ ).to(device)
88
+ model.eval()
89
+ sr = config['pred']['sr']
90
+
91
+ fname_nobase = os.path.basename(config['pred']['data_jsonl_path']).split('.')[0]
92
+ scp_dir = os.path.join(config['pred']['save_target_dir'], 'scp')
93
+ ark_dir = os.path.join(config['pred']['save_target_dir'], 'ark')
94
+ os.makedirs(scp_dir, exist_ok=True)
95
+ os.makedirs(ark_dir, exist_ok=True)
96
+
97
+ scp_path = os.path.join(scp_dir, f'{fname_nobase}.{index}_{rank}.scp')
98
+ ark_path = os.path.join(ark_dir, f'{fname_nobase}.{index}_{rank}.ark')
99
+
100
+ from kaldiio import WriteHelper
101
+
102
+ with open(config['pred']['data_jsonl_path']) as f:
103
+ lines = f.readlines()
104
+
105
+ print("Total:", len(lines))
106
+
107
+ from tqdm import tqdm
108
+ import json
109
+ import librosa
110
+ import time
111
+ from einops import rearrange
112
+ import numpy as np
113
+
114
+ # lines = lines[(index*N_GPU_PER+rank)::(N_GPU_PER*N_NODE)]
115
+
116
+ with WriteHelper(f'ark,scp:{ark_path},{scp_path}') as writer:
117
+ for idx, line in tqdm(enumerate(lines)):
118
+ try:
119
+ if idx % (N_GPU_PER*N_NODE) != (index*N_GPU_PER+rank):
120
+ continue
121
+ item = json.loads(line)
122
+ path = item['path']
123
+ wave, _ = librosa.load(path, sr=sr)
124
+ wave = torch.from_numpy(wave)
125
+ wave_length = wave.shape[-1]
126
+ if wave_length < sr*CLEN:
127
+ continue
128
+ left_rights = parse_lr(wave_length, sr)
129
+ lr = left_rights.tolist()
130
+ wavs = torch.stack(
131
+ [wave[l:r] for l,r in lr]
132
+ ).to(device)
133
+ targets = model.encode_wavs(wavs) # [N_Codebook=8, N_Chunks, T]
134
+
135
+ final_target = rearrange(targets, "c n f -> n (c f)").cpu().numpy().astype(np.int32)
136
+ for j in range(final_target.shape[0]):
137
+ writer(f'{idx}:{j}', final_target[j])
138
+ except Exception as e:
139
+ print(e)
140
+
141
+
142
+ if __name__ == '__main__':
143
+ import sys
144
+ index = int(sys.argv[1])
145
+ import multiprocessing
146
+ pool = multiprocessing.Pool(processes=N_GPU_PER)
147
+ for rank in range(N_GPU_PER):
148
+ pool.apply_async(main, (index, rank))
149
+ pool.close()
150
+ pool.join()
151
+ print("Done.")
MuCodec/muq_dev/muq_fairseq/models/muq/model/rvq.py ADDED
@@ -0,0 +1,459 @@
1
+
2
+ from typing import Union
3
+
4
+ import numpy as np
5
+ import torch
6
+ import torch.nn as nn
7
+ import torch.nn.functional as F
8
+ from einops import rearrange
9
+ from torch.nn.utils import weight_norm
10
+
11
+ def WNConv1d(*args, **kwargs):
12
+ return weight_norm(nn.Conv1d(*args, **kwargs))
13
+
14
+
15
+ class VectorQuantize(nn.Module):
16
+ """
17
+ Implementation of VQ similar to Karpathy's repo:
18
+ https://github.com/karpathy/deep-vector-quantization
19
+ Additionally uses following tricks from Improved VQGAN
20
+ (https://arxiv.org/pdf/2110.04627.pdf):
21
+ 1. Factorized codes: Perform nearest neighbor lookup in low-dimensional space
22
+ for improved codebook usage
23
+ 2. l2-normalized codes: Converts euclidean distance to cosine similarity which
24
+ improves training stability
25
+ """
26
+
27
+ def __init__(self, input_dim: int, codebook_size: int, codebook_dim: int, stale_tolerance: int = 1000, mfcc_clustering=False, n_layer=1):
28
+ super().__init__()
29
+ self.codebook_size = codebook_size
30
+ self.codebook_dim = codebook_dim
31
+ self.mfcc_clustering = mfcc_clustering
32
+
33
+ ProjClass = nn.Identity if mfcc_clustering else WNConv1d
34
+ if n_layer==1:
35
+ self.in_proj = ProjClass(input_dim, codebook_dim, kernel_size=1)
36
+ self.out_proj = ProjClass(codebook_dim, input_dim, kernel_size=1)
37
+ elif n_layer >= 2:
38
+ ndim_hidden = 128
39
+ self.in_proj = nn.Sequential(
40
+ ProjClass(input_dim, ndim_hidden, kernel_size=1),
41
+ *[nn.Sequential(nn.ReLU(), ProjClass(ndim_hidden, ndim_hidden, kernel_size=1),) for _ in range(n_layer-2)],
42
+ nn.ReLU(),
43
+ ProjClass(ndim_hidden, codebook_dim, kernel_size=1)
44
+ )
45
+ self.out_proj = nn.Sequential(
46
+ ProjClass(codebook_dim, ndim_hidden, kernel_size=1),
47
+ nn.ReLU(),
48
+ *[nn.Sequential(ProjClass(ndim_hidden, ndim_hidden, kernel_size=1), nn.ReLU()) for _ in range(n_layer-2)],
49
+ ProjClass(ndim_hidden, input_dim, kernel_size=1),
50
+ )
51
+ self.codebook = nn.Embedding(codebook_size, codebook_dim)
52
+ self.register_buffer("stale_counter", torch.zeros(self.codebook_size,))
53
+ self.stale_tolerance = stale_tolerance
54
+
55
+ def forward(self, z):
56
+ """Quantized the input tensor using a fixed codebook and returns
57
+ the corresponding codebook vectors
58
+
59
+ Parameters
60
+ ----------
61
+ z : Tensor[B x D x T]
62
+
63
+ Returns
64
+ -------
65
+ Tensor[B x D x T]
66
+ Quantized continuous representation of input
67
+ Tensor[1]
68
+ Commitment loss to train encoder to predict vectors closer to codebook
69
+ entries
70
+ Tensor[1]
71
+ Codebook loss to update the codebook
72
+ Tensor[B x T]
73
+ Codebook indices (quantized discrete representation of input)
74
+ Tensor[B x D x T]
75
+ Projected latents (continuous representation of input before quantization)
76
+ """
77
+
78
+ # Factorized codes (ViT-VQGAN) Project input into low-dimensional space
79
+
80
+ z_e = self.in_proj(z) # z_e : (B x D x T)
81
+ z_q, indices = self.decode_latents(z_e)
82
+
83
+ commitment_loss = F.mse_loss(z_e, z_q.detach(), reduction="none").mean([1, 2])
84
+ codebook_loss = F.mse_loss(z_q, z_e.detach(), reduction="none").mean([1, 2])
85
+
86
+ z_q = (
87
+ z_e + (z_q - z_e).detach()
88
+ ) # noop in forward pass, straight-through gradient estimator in backward pass
89
+
90
+ z_q = self.out_proj(z_q)
91
+
92
+ return z_q, commitment_loss, codebook_loss, indices, z_e
93
+
94
+ def embed_code(self, embed_id):
95
+ return F.embedding(embed_id, self.codebook.weight)
96
+
97
+ def decode_code(self, embed_id):
98
+ return self.embed_code(embed_id).transpose(1, 2)
99
+
100
+ def decode_latents(self, latents):
101
+ encodings = rearrange(latents, "b d t -> (b t) d")
102
+ codebook = self.codebook.weight # codebook: (N x D)
103
+
104
+ # L2 normalize encodings and codebook (ViT-VQGAN)
105
+ encodings = F.normalize(encodings)
106
+ codebook = F.normalize(codebook)
107
+
108
+ # Compute euclidean distance with codebook
109
+ dist = (
110
+ encodings.pow(2).sum(1, keepdim=True)
111
+ - 2 * encodings @ codebook.t()
112
+ + codebook.pow(2).sum(1, keepdim=True).t()
113
+ )
114
+ indices = rearrange((-dist).max(1)[1], "(b t) -> b t", b=latents.size(0))
115
+ z_q = self.decode_code(indices)
116
+
117
+ if(self.training):
118
+ onehots = torch.nn.functional.one_hot(indices, self.codebook_size).float() # B, T, codebook_size
119
+ stale_codes = (onehots.sum(0).sum(0) == 0).float()
120
+ self.stale_counter = self.stale_counter * stale_codes + stale_codes
121
+
122
+ # random replace codes that haven't been used for a while
123
+ replace_code = (self.stale_counter == self.stale_tolerance).float() # codebook_size
124
+ if replace_code.sum(-1) > 0:
125
+ print("Replace {} codes".format(replace_code.sum(-1)))
126
+ random_input_idx = torch.randperm(encodings.shape[0])
127
+ random_input = encodings[random_input_idx].view(encodings.shape)
128
+ if random_input.shape[0] < self.codebook_size:
129
+ random_input = torch.cat([random_input]*(self.codebook_size // random_input.shape[0] + 1), 0)
130
+ random_input = random_input[:self.codebook_size,:].contiguous() # codebook_size, dim
131
+
132
+ self.codebook.weight.data = self.codebook.weight.data * (1 - replace_code).unsqueeze(-1) + random_input * replace_code.unsqueeze(-1)
133
+ self.stale_counter = self.stale_counter * (1 - replace_code)
134
+
135
+ return z_q, indices
136
+
137
+
138
+ class ResidualVectorQuantize(nn.Module):
139
+ """
140
+ Introduced in SoundStream: An end2end neural audio codec
141
+ https://arxiv.org/abs/2107.03312
142
+ """
143
+
144
+ def __init__(
145
+ self,
146
+ input_dim: int = 512,
147
+ n_codebooks: int = 9,
148
+ codebook_size: int = 1024,
149
+ codebook_dim: Union[int, list] = 8,
150
+ quantizer_dropout: float = 0.0,
151
+ stale_tolerance: int = 100,
152
+ use_multi_layer_num:int = 1,
153
+ ):
154
+ super().__init__()
155
+ if isinstance(codebook_dim, int):
156
+ codebook_dim = [codebook_dim for _ in range(n_codebooks)]
157
+
158
+ self.n_codebooks = n_codebooks
159
+ self.codebook_dim = codebook_dim
160
+ self.codebook_size = codebook_size
161
+
162
+ self.quantizers = nn.ModuleList(
163
+ [
164
+ VectorQuantize(input_dim, codebook_size, codebook_dim[i], stale_tolerance=stale_tolerance, n_layer=use_multi_layer_num)
165
+ for i in range(n_codebooks)
166
+ ]
167
+ )
168
+ self.quantizer_dropout = quantizer_dropout
169
+
170
+ def forward(self, z, n_quantizers: int = None):
171
+ """Quantized the input tensor using a fixed set of `n` codebooks and returns
172
+ the corresponding codebook vectors
173
+ Parameters
174
+ ----------
175
+ z : Tensor[B x D x T]
176
+ n_quantizers : int, optional
177
+ No. of quantizers to use
178
+ (n_quantizers < self.n_codebooks ex: for quantizer dropout)
179
+ Note: if `self.quantizer_dropout` is True, this argument is ignored
180
+ when in training mode, and a random number of quantizers is used.
181
+ Returns
182
+ -------
183
+ dict
184
+ A dictionary with the following keys:
185
+
186
+ "z" : Tensor[B x D x T]
187
+ Quantized continuous representation of input
188
+ "codes" : Tensor[B x N x T]
189
+ Codebook indices for each codebook
190
+ (quantized discrete representation of input)
191
+ "latents" : Tensor[B x N*D x T]
192
+ Projected latents (continuous representation of input before quantization)
193
+ "vq/commitment_loss" : Tensor[1]
194
+ Commitment loss to train encoder to predict vectors closer to codebook
195
+ entries
196
+ "vq/codebook_loss" : Tensor[1]
197
+ Codebook loss to update the codebook
198
+ """
199
+ z_q = 0
200
+ residual = z
201
+ commitment_loss = 0
202
+ codebook_loss = 0
203
+
204
+ codebook_indices = []
205
+ latents = []
206
+
207
+ if n_quantizers is None:
208
+ n_quantizers = self.n_codebooks
209
+ if self.training:
210
+ n_quantizers = torch.ones((z.shape[0],)) * self.n_codebooks + 1
211
+ dropout = torch.randint(1, self.n_codebooks + 1, (z.shape[0],))
212
+ n_dropout = int(z.shape[0] * self.quantizer_dropout)
213
+ n_quantizers[:n_dropout] = dropout[:n_dropout]
214
+ n_quantizers = n_quantizers.to(z.device)
215
+ else:
216
+ n_quantizers = torch.ones((z.shape[0],)) * n_quantizers + 1
217
+ n_quantizers = n_quantizers.to(z.device)
218
+
219
+ for i, quantizer in enumerate(self.quantizers):
220
+ # if self.training is False and i >= n_quantizers:
221
+ # break
222
+
223
+ z_q_i, commitment_loss_i, codebook_loss_i, indices_i, z_e_i = quantizer(
224
+ residual
225
+ )
226
+
227
+ # Create mask to apply quantizer dropout
228
+ mask = (
229
+ torch.full((z.shape[0],), fill_value=i, device=z.device) < n_quantizers
230
+ )
231
+ z_q = z_q + z_q_i * mask[:, None, None]
232
+ residual = residual - z_q_i
233
+
234
+ # Sum losses
235
+ commitment_loss += (commitment_loss_i * mask).mean()
236
+ codebook_loss += (codebook_loss_i * mask).mean()
237
+
238
+ codebook_indices.append(indices_i)
239
+ latents.append(z_e_i)
240
+
241
+ codes = torch.stack(codebook_indices, dim=1)
242
+ latents = torch.cat(latents, dim=1)
243
+
244
+ encodings = F.one_hot(codes, self.codebook_size).float() # B N T codebook_size; unused here, retained from codebook-usage tracking
245
+
246
+ return z_q, codes, latents, commitment_loss, codebook_loss, n_quantizers.clamp(max=self.n_codebooks).long() - 1
247
+
248
+ def from_codes(self, codes: torch.Tensor):
249
+ """Given the quantized codes, reconstruct the continuous representation
250
+ Parameters
251
+ ----------
252
+ codes : Tensor[B x N x T]
253
+ Quantized discrete representation of input
254
+ Returns
255
+ -------
256
+ Tensor[B x D x T]
257
+ Quantized continuous representation of input
258
+ """
259
+ z_q = 0.0
260
+ z_p = []
261
+ n_codebooks = codes.shape[1]
262
+ for i in range(n_codebooks):
263
+ z_p_i = self.quantizers[i].decode_code(codes[:, i, :])
264
+ z_p.append(z_p_i)
265
+
266
+ z_q_i = self.quantizers[i].out_proj(z_p_i)
267
+ z_q = z_q + z_q_i
268
+ return z_q, torch.cat(z_p, dim=1), codes
269
+
270
+ def from_latents(self, latents: torch.Tensor):
271
+ """Given the unquantized latents, reconstruct the
272
+ continuous representation after quantization.
273
+
274
+ Parameters
275
+ ----------
276
+ latents : Tensor[B x N x T]
277
+ Continuous representation of input after projection
278
+
279
+ Returns
280
+ -------
281
+ Tensor[B x D x T]
282
+ Quantized representation of full-projected space
283
+ Tensor[B x D x T]
284
+ Quantized representation of latent space
285
+ """
286
+ z_q = 0
287
+ z_p = []
288
+ codes = []
289
+ dims = np.cumsum([0] + [q.codebook_dim for q in self.quantizers])
290
+
291
+ n_codebooks = np.where(dims <= latents.shape[1])[0].max(axis=0, keepdims=True)[
292
+ 0
293
+ ]
294
+ for i in range(n_codebooks):
295
+ j, k = dims[i], dims[i + 1]
296
+ z_p_i, codes_i = self.quantizers[i].decode_latents(latents[:, j:k, :])
297
+ z_p.append(z_p_i)
298
+ codes.append(codes_i)
299
+
300
+ z_q_i = self.quantizers[i].out_proj(z_p_i)
301
+ z_q = z_q + z_q_i
302
+
303
+ return z_q, torch.cat(z_p, dim=1), torch.stack(codes, dim=1)
304
+
305
+ from torch.utils.data import Dataset, DataLoader
306
+ import json, traceback
307
+ import torchaudio
308
+ import math
309
+
310
+ from typing import List, Tuple, Dict, Any
311
+
312
+ CLIPSECS = 5
313
+ def load_audio_by_json(json_path, max_keep, min_keep, tgt_sample_rate):
314
+ # read json file
315
+ print(json_path)
316
+ datas = []
317
+ inds = []
318
+ sizes = []
319
+ with open(json_path) as fp:
320
+ for ind,line in enumerate(fp):
321
+ data = json.loads(line)
322
+ datas.append(data)
323
+ inds.append(ind)
324
+ # sz = int(data['duration'] * data['sample_rate'])
325
+ sz = int(tgt_sample_rate * CLIPSECS)
326
+ sizes.append(sz)
327
+ tot = ind + 1
328
+ return datas,inds,tot,sizes
329
+
330
+ class Read_and_PadCrop_Normalized_T(torch.nn.Module):
331
+ def __init__(self, n_samples: int, sample_rate: int, randomize: bool = True):
332
+
333
+ super().__init__()
334
+
335
+ self.n_samples = n_samples
336
+ self.sample_rate = sample_rate
337
+ self.randomize = randomize
338
+
339
+
340
+ def __call__(self, filename: str, duration: float, cur_sample_rate: int) -> Tuple[torch.Tensor, float, float, int, int]:
341
+ if(duration<(float(self.n_samples)/self.sample_rate+1)):
342
+ # print(duration,(float(self.n_samples)/self.sample_rate+1))
343
+ chunk, _ = torchaudio.load(filename, frame_offset=0, num_frames=-1)
344
+ t_start = 0.
345
+ t_end = min(1.0, float(self.n_samples) / float(self.sample_rate) / duration)
346
+ offset = 0
347
+ # print('c1:',chunk.shape)
348
+ else:
349
+ offset = np.random.randint(0,int(duration*cur_sample_rate)-int(float(self.n_samples)/self.sample_rate*cur_sample_rate))
350
+ t_start = offset / float(cur_sample_rate) / duration
351
+ t_end = t_start + float(self.n_samples) / float(self.sample_rate) / duration
352
+ chunk, _ = torchaudio.load(filename, frame_offset=offset, num_frames=int(float(self.n_samples)/self.sample_rate*cur_sample_rate))
353
+ # print('offset:',offset)
354
+ # print('c0:',chunk.shape)
355
+ # Pad with silence if necessary.
356
+ if(chunk.shape[0]>1):
357
+ chunk = chunk[torch.randint(chunk.shape[0], size=(1,)),:].float()
358
+ else:
359
+ chunk = chunk[[0],:].float()
360
+ if(cur_sample_rate!=self.sample_rate):
361
+ # print('a:',cur_sample_rate,chunk.shape)
362
+ chunk = torchaudio.functional.resample(chunk, cur_sample_rate, self.sample_rate)
363
+ # print('b:',self.sample_rate,chunk.shape)
364
+ if chunk.shape[-1] < self.n_samples:
365
+ chunk = torch.cat([chunk, torch.zeros((1, self.n_samples - chunk.shape[-1],))],-1)
366
+ else:
367
+ chunk = chunk[:,0:self.n_samples]
368
+ seconds_start = math.floor(offset / cur_sample_rate)
369
+ seconds_total = math.floor(duration)
370
+
371
+ return (
372
+ chunk,
373
+ t_start,
374
+ t_end,
375
+ seconds_start,
376
+ seconds_total
377
+ )
378
+
379
+ class RVQDataset(Dataset):
380
+ def __init__(
381
+ self,
382
+ manifest_path: str,
383
+ sample_rate: float,
384
+ normalize: bool = False,
385
+ ):
386
+ self.sample_rate = sample_rate
387
+ self.datas,inds,tot,self.sizes = load_audio_by_json(manifest_path, None, None, self.sample_rate)
388
+ self.dataset_len = len(self.datas)
389
+
390
+ self.reader = Read_and_PadCrop_Normalized_T(n_samples=CLIPSECS*sample_rate,sample_rate = self.sample_rate)
391
+ self.normalize = normalize
392
+
393
+
394
+ def __getitem__(self, i):
395
+ # WORLD_SIZE = int(torch.distributed.get_world_size())
396
+ # WORLD_RANK = int(torch.distributed.get_rank())
397
+ # np.random.seed(1337 + self.epoch * WORLD_SIZE + WORLD_RANK + i)
398
+ # index = random.randint(0,len(self.sizes) - 1)
399
+ index = i
400
+ item = None
401
+ while item is None:
402
+ try:
403
+ wav = self.get_audio_by_slice(index)
404
+ # labels = self.get_labels(index)
405
+ # labels = None
406
+ # item = {"id": index, "source": wav, "label_list": labels}
407
+ item = {"id": index, "source": wav}
408
+ except Exception as e:
409
+ # print(e)
410
+ traceback.print_exc()
411
+ print(f'skip damaged data {index}')
412
+ index = np.random.randint(0, len(self.sizes)) # randint's high bound is exclusive
413
+ return item
414
+
415
+ def __len__(self):
416
+ return self.dataset_len
417
+
418
+ def get_audio_by_slice(self,index):
419
+
420
+ wav_path = self.datas[index]['path']
421
+ # print(wav_path)
422
+ audio_info = torchaudio.info(wav_path)
423
+ origin_sample_rate = audio_info.sample_rate
424
+ origin_duration = audio_info.num_frames / origin_sample_rate
425
+
426
+ wav, *ignored = self.reader(wav_path, origin_duration,origin_sample_rate)
427
+ wav = wav.float()
428
+
429
+ # _path, slice_ptr = parse_path(wav_path)
430
+ # original way
431
+ # if len(slice_ptr) == 0:
432
+ # wav, cur_sample_rate = sf.read(_path)
433
+ # else:
434
+ # assert _path.endswith(".zip")
435
+ # data = read_from_stored_zip(_path, slice_ptr[0], slice_ptr[1])
436
+ # f = io.BytesIO(data)
437
+ # wav, cur_sample_rate = sf.read(f)
438
+ # wav = torch.from_numpy(wav).float()
439
+ # print(wav.shape)
440
+ wav = wav.permute(1,0)
441
+ wav = self.postprocess(wav, self.sample_rate)
442
+ # print(wav.shape)
443
+
444
+ # wav = wav.squeeze(0)
445
+ return wav
446
+
447
+ def postprocess(self, wav, cur_sample_rate):
448
+ if wav.dim() == 2:
449
+ wav = wav.mean(-1)
450
+ assert wav.dim() == 1, wav.dim()
451
+
452
+ if cur_sample_rate != self.sample_rate:
453
+ raise Exception(f"sr {cur_sample_rate} != {self.sample_rate}")
454
+
455
+ if self.normalize:
456
+ with torch.no_grad():
457
+ wav = F.layer_norm(wav, wav.shape)
458
+ return wav
459
+
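
A minimal round-trip sketch for ResidualVectorQuantize, assuming this file is importable as rvq (e.g. when run from this directory). The 8-codebook configuration mirrors the configs used elsewhere in this repo; the batch and time sizes are illustrative:

import torch
from rvq import ResidualVectorQuantize

quantizer = ResidualVectorQuantize(input_dim=1024, n_codebooks=8,
                                   codebook_size=1024, codebook_dim=16)
quantizer.eval()
z = torch.randn(2, 1024, 100)                        # (B, D, T) continuous features
with torch.no_grad():
    z_q, codes, latents, commit_loss, cb_loss, usage = quantizer(z)
    print(codes.shape)                               # torch.Size([2, 8, 100])
    z_dec, _, _ = quantizer.from_codes(codes)        # tokens back to features
    assert torch.allclose(z_q, z_dec, atol=1e-5)     # forward and decode agree in eval mode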
MuCodec/muq_dev/muq_fairseq/models/muq/model/rvq_muq.py ADDED
@@ -0,0 +1,394 @@
1
+ try:
2
+ from .rvq import *
3
+ except ImportError:
4
+ import sys, os
5
+ sys.path.append(os.path.dirname(os.path.abspath(__file__)))
6
+ from rvq import *
7
+
8
+ try:
9
+ from ..modules.random_quantizer import RandomProjectionQuantizer
10
+ from ..modules.features import MelSTFT
11
+ from ..modules.conv import Conv2dSubsampling
12
+ except ImportError:
13
+ import sys, os
14
+ sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
15
+ from modules.random_quantizer import RandomProjectionQuantizer
16
+ from modules.features import MelSTFT
17
+ from modules.conv import Conv2dSubsampling
18
+
19
+ import fairseq
20
+
21
+ CLIPSECS = 5 # 5 for rvq, 30 for model
22
+
23
+ class RVQDataset(Dataset):
24
+ def __init__(
25
+ self,
26
+ manifest_path: str,
27
+ sample_rate: float,
28
+ normalize: bool = False,
29
+ ):
30
+ self.sample_rate = sample_rate
31
+ self.datas,inds,tot,self.sizes = load_audio_by_json(manifest_path, None, None, self.sample_rate)
32
+ self.dataset_len = len(self.datas)
33
+
34
+ self.reader = Read_and_PadCrop_Normalized_T(n_samples=CLIPSECS*sample_rate,sample_rate = self.sample_rate)
35
+ self.normalize = normalize
36
+
37
+
38
+ def __getitem__(self, i):
39
+ # WORLD_SIZE = int(torch.distributed.get_world_size())
40
+ # WORLD_RANK = int(torch.distributed.get_rank())
41
+ # np.random.seed(1337 + self.epoch * WORLD_SIZE + WORLD_RANK + i)
42
+ # index = random.randint(0,len(self.sizes) - 1)
43
+ index = i
44
+ item = None
45
+ while item is None:
46
+ try:
47
+ wav = self.get_audio_by_slice(index)
48
+ item = {"id": index, "source": wav}
49
+ except Exception as e:
50
+ # print(e)
51
+ traceback.print_exc()
52
+ print(f'skip damaged data {index}')
53
+ index = np.random.randint(0, len(self.sizes)) # randint's high bound is exclusive
54
+ return item
55
+
56
+ def __len__(self):
57
+ return self.dataset_len
58
+
59
+ def get_audio_by_slice(self,index):
60
+
61
+ wav_path = self.datas[index]['path']
62
+ audio_info = torchaudio.info(wav_path)
63
+ origin_sample_rate = audio_info.sample_rate
64
+ origin_duration = audio_info.num_frames / origin_sample_rate
65
+
66
+ wav, *ignored = self.reader(wav_path, origin_duration,origin_sample_rate)
67
+ wav = wav.float()
68
+
69
+ # _path, slice_ptr = parse_path(wav_path)
70
+ # original way
71
+ # if len(slice_ptr) == 0:
72
+ # wav, cur_sample_rate = sf.read(_path)
73
+ # else:
74
+ # assert _path.endswith(".zip")
75
+ # data = read_from_stored_zip(_path, slice_ptr[0], slice_ptr[1])
76
+ # f = io.BytesIO(data)
77
+ # wav, cur_sample_rate = sf.read(f)
78
+ # wav = torch.from_numpy(wav).float()
79
+ # print(wav.shape)
80
+ wav = wav.permute(1,0)
81
+ wav = self.postprocess(wav, self.sample_rate)
82
+ # print(wav.shape)
83
+
84
+ # wav = wav.squeeze(0)
85
+ return wav
86
+
87
+ def postprocess(self, wav, cur_sample_rate):
88
+ if wav.dim() == 2:
89
+ wav = wav.mean(-1)
90
+ assert wav.dim() == 1, wav.dim()
91
+
92
+ if cur_sample_rate != self.sample_rate:
93
+ raise Exception(f"sr {cur_sample_rate} != {self.sample_rate}")
94
+
95
+ if self.normalize:
96
+ with torch.no_grad():
97
+ wav = F.layer_norm(wav, wav.shape)
98
+ return wav
99
+
100
+ class Preprocessor(nn.Module):
101
+ def __init__(self,
102
+ codebook_dim=16,
103
+ codebook_size=4096,
104
+ hop_length=240,
105
+ n_mels=128,
106
+ stat_path=None,
107
+ is_spec_wise=False,
108
+ s=4,
109
+ ) -> None:
110
+ super().__init__()
111
+
112
+ self.features=["melspec_2048"]
113
+ self.s = s
114
+
115
+ # load feature mean / std stats
116
+ import os
117
+ if stat_path is not None and os.path.exists(stat_path):
118
+ with open(stat_path, "r") as f:
119
+ self.stat = json.load(f)
120
+ else:
121
+ # print("No stats file found at `{}`, use default from msd.".format(stat_path))
122
+ self.stat = {"spec_256_cnt": 14394344256, "spec_256_mean": -23.34296658431829, "spec_256_std": 26.189295587132637, "spec_512_cnt": 28677104448, "spec_512_mean": -21.31267396860235, "spec_512_std": 26.52644536245769, "spec_1024_cnt": 57242624832, "spec_1024_mean": -18.852271129208273, "spec_1024_std": 26.443154583585663, "spec_2048_cnt": 114373665600, "spec_2048_mean": -15.638743433896792, "spec_2048_std": 26.115825961611545, "spec_4096_cnt": 228635747136, "spec_4096_mean": -11.715532502794836, "spec_4096_std": 25.763972210234062, "melspec_256_cnt": 14282760192, "melspec_256_mean": -26.962600400166156, "melspec_256_std": 36.13614100912126, "melspec_512_cnt": 14282760192, "melspec_512_mean": -9.108344167718862, "melspec_512_std": 24.71910937988429, "melspec_1024_cnt": 14282760192, "melspec_1024_mean": 0.37302579246531126, "melspec_1024_std": 18.684082325919388, "melspec_2048_cnt": 14282760192, "melspec_2048_mean": 6.768444971712967, "melspec_2048_std": 18.417922652295623, "melspec_4096_cnt": 14282760192, "melspec_4096_mean": 13.617164614990036, "melspec_4096_std": 18.08552130124525, "cqt_cnt": 9373061376, "cqt_mean": 0.46341379757927165, "cqt_std": 0.9543998080910191, "mfcc_256_cnt": 1339008768, "mfcc_256_mean": -11.681755459447485, "mfcc_256_std": 29.183186444668316, "mfcc_512_cnt": 1339008768, "mfcc_512_mean": -2.540581461792183, "mfcc_512_std": 31.93752185832081, "mfcc_1024_cnt": 1339008768, "mfcc_1024_mean": 6.606636263169779, "mfcc_1024_std": 34.151644801729624, "mfcc_2048_cnt": 1339008768, "mfcc_2048_mean": 5.281600844245184, "mfcc_2048_std": 33.12784541220003, "mfcc_4096_cnt": 1339008768, "mfcc_4096_mean": 4.7616569480166095, "mfcc_4096_std": 32.61458906894133, "chromagram_256_cnt": 1339008768, "chromagram_256_mean": 55.15596556703181, "chromagram_256_std": 73.91858278719991, "chromagram_512_cnt": 1339008768, "chromagram_512_mean": 175.73092252759895, "chromagram_512_std": 248.48485148525953, "chromagram_1024_cnt": 1339008768, "chromagram_1024_mean": 589.2947481634608, "chromagram_1024_std": 913.857929063196, "chromagram_2048_cnt": 1339008768, "chromagram_2048_mean": 2062.286388327397, "chromagram_2048_std": 3458.92657915397, "chromagram_4096_cnt": 1339008768, "chromagram_4096_mean": 7673.039107997085, "chromagram_4096_std": 13009.883158267234}
123
+
124
+ # feature extractor
125
+ self.preprocessor_melspec_2048 = MelSTFT(
126
+ n_fft=2048, hop_length=hop_length, is_db=True
127
+ )
128
+
129
+ self.is_spec_wise = is_spec_wise
130
+
131
+
132
+ @torch.no_grad()
133
+ def normalize(self, x):
134
+ """normalize the input audio to have zero mean unit variance"""
135
+ for key in x.keys():
136
+ x[key] = (x[key] - self.stat["%s_mean" % key]) / self.stat["%s_std" % key] # {'melspec_2048_cnt': 14282760192, 'melspec_2048_mean': 6.768444971712967}
137
+ return x
138
+
139
+ @torch.no_grad()
140
+ def rearrange(self, x):
141
+ """rearrange the batch to flatten every 4 steps"""
142
+ for key in x.keys():
143
+ if key == "chromagram":
144
+ x[key] = rearrange(x[key], "b f t -> b t f")
145
+ else:
146
+ x[key] = rearrange(x[key], "b f (t s) -> b t (s f)", s=self.s)
147
+ return x
148
+
149
+ @torch.no_grad()
150
+ def preprocessing(self, x, features):
151
+ """extract classic audio features"""
152
+ # check precision
153
+ if x.dtype == torch.float16:
154
+ precision = 16
155
+ else:
156
+ precision = 32
157
+
158
+ out = {}
159
+ for key in features:
160
+ layer = getattr(self, "preprocessor_%s" % key)
161
+ out[key] = layer.float()(x.float())[..., :-1]
162
+ if precision == 16:
163
+ out[key] = out[key].half()
164
+ return out
165
+
166
+ @torch.no_grad()
167
+ def tokenize(self, x):
168
+ out = {}
169
+ for key in x.keys():
170
+ layer = getattr(self, "quantizer_%s" % key)
171
+ out[key] = layer(x[key])
172
+ return out
173
+
174
+ def to_spec_wise(self, x):
175
+ Batch, Spec, Time = x.shape
176
+ SubSpec, N_SubSpec = 16, 8
177
+ assert SubSpec * N_SubSpec == Spec == 128
178
+ x = rearrange(x, "b (n s) t -> b s (n t)", n=N_SubSpec, s=SubSpec)
179
+ return x # [Batch, SubSpec=16, N_SubSpec*Time=8*100Hz]
180
+
181
+ @torch.no_grad()
182
+ def __call__(self, x):
183
+ x = self.preprocessing(x, features=self.features) # -> {'melspec_2048': Tensor{Size([3, 128, 3000]) cuda:0 f32}}
184
+ x = self.normalize(x)
185
+ if self.is_spec_wise:
186
+ x = {k:self.to_spec_wise(v) for k,v in x.items()}
187
+ x = self.rearrange(x) # -> {'melspec_2048': Tensor{Size([3, 750, 512]) cuda:0 f32}}
188
+ return x['melspec_2048'].permute((0, 2, 1))
189
+
190
+
191
+ class CQTPreprocessor(nn.Module):
192
+ def __init__(self,
193
+ sr=24000,
194
+ hop=960,
195
+ nb=84,
196
+ to_db = True,
197
+ ) -> None:
198
+ super().__init__()
199
+
200
+ from nnAudio.features.cqt import CQT
201
+ import torchaudio
202
+ self.cqt_fn = CQT(
203
+ sr=sr,
204
+ hop_length=hop,
205
+ n_bins=nb,
206
+ fmin=32.7 if nb == 84 else 27.5, # 84 or 88
207
+ bins_per_octave=12,
208
+ filter_scale=1,
209
+ norm=1,
210
+ window='hann',
211
+ center=True,
212
+ pad_mode='constant',
213
+ trainable=False,
214
+ output_format='Magnitude',
215
+ verbose=True,
216
+ )
217
+ if to_db:
218
+ self.amplitude_to_db = torchaudio.transforms.AmplitudeToDB()
219
+ else:
220
+ self.amplitude_to_db = lambda x:x
221
+
222
+ @torch.no_grad()
223
+ def __call__(self, x):
224
+ return self.amplitude_to_db(self.cqt_fn(x))
225
+
226
+
227
+ from dataclasses import dataclass
228
+
229
+ @dataclass
230
+ class UserDirModule:
231
+ user_dir: str
232
+
233
+ def load_model(model_dir, checkpoint_dir):
234
+ '''Load Fairseq SSL model'''
235
+
236
+ if model_dir is not None:
237
+ model_path = UserDirModule(model_dir)
238
+ fairseq.utils.import_user_module(model_path)
239
+
240
+ model, cfg, task = fairseq.checkpoint_utils.load_model_ensemble_and_task([checkpoint_dir], strict=False)
241
+ model = model[0]
242
+
243
+ return model
244
+
245
+
246
+
247
+ class PreprocessorWithModel(nn.Module):
248
+ def __init__(self, model_dir, checkpoint_dir, use_layer_idx=9) -> None:
249
+ super().__init__()
250
+ self.model = load_model(model_dir=model_dir, checkpoint_dir=checkpoint_dir)
251
+ self.model.eval()
252
+ self.use_layer_idx = use_layer_idx
253
+
254
+ def forward(self, x):
255
+ with torch.no_grad():
256
+ self.model.eval()
257
+ res = self.model(x, features_only = True)
258
+ layer_results = res['layer_results']
259
+ return layer_results[self.use_layer_idx].permute(0,2,1)
260
+
261
+
262
+
263
+ def Music_Mel_Target_Config():
264
+ config = dict(
265
+ train_dataset = dict(
266
+ manifest_path = 'path/to/data/music4all/train.json',
267
+ sample_rate = 24000,
268
+ normalize = False,
269
+ ),
270
+ valid_dataset = dict(
271
+ manifest_path = 'path/to/data/music4all/valid.json',
272
+ sample_rate = 24000,
273
+ normalize = False,
274
+ ),
275
+ model = dict(
276
+ input_dim = 128*4,
277
+ n_codebooks = 8,
278
+ codebook_size = 1024,
279
+ codebook_dim = 16,
280
+ quantizer_dropout = 0.0,
281
+ ),
282
+ train = dict(
283
+ batch_size = 32,
284
+ num_workers = 6,
285
+ valid_interval = 10,
286
+ save_interval = 100,
287
+ max_updates = 500000,
288
+ lr = 1e-4,
289
+ device = 'cuda:0',
290
+ loss = 'commitment_loss * 0.25 + codebook_loss * 1.0 + (x - quantized_prompt_embeds).abs().mean()',
291
+ preprocess = Preprocessor()
292
+ )
293
+ )
294
+ return config
295
+
296
+
297
+ def main(config):
298
+ train_dataset = RVQDataset(**config['train_dataset'])
299
+ if config['valid_dataset']['manifest_path'] is None:
300
+ # split train and valid dataset
301
+ from torch.utils.data import random_split
302
+ train_dataset, valid_dataset = random_split(
303
+ train_dataset, lengths=[len(train_dataset) - 500, 500]
304
+ )
305
+ else:
306
+ valid_dataset = RVQDataset(**config['valid_dataset'])
307
+ train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=config['train']['batch_size'], drop_last=True, num_workers=config['train']['num_workers'])
308
+ valid_dataloader = DataLoader(valid_dataset, shuffle=False, batch_size=config['train']['batch_size'], drop_last=True, num_workers=config['train']['num_workers'])
309
+ model = ResidualVectorQuantize(**config['model'])
310
+
311
+ device = config['train']['device']
312
+ preprocess = config['train']['preprocess'].to(device)
313
+ model = model.to(device)
314
+
315
+ optimizer = torch.optim.Adam(model.parameters(), lr=config['train']['lr'])
316
+ cur_updates = 0
317
+ is_running = True
318
+ result = {}
319
+ from tqdm import tqdm
320
+ from tensorboardX import SummaryWriter
321
+ writer = SummaryWriter()
322
+ from collections import defaultdict
323
+ import os
324
+ from logging import getLogger
325
+ logger = getLogger()
326
+
327
+ while is_running:
328
+ results = defaultdict(lambda:0)
329
+ for item in tqdm(train_dataloader, desc='train'):
330
+ wavs = item['source']
331
+ optimizer.zero_grad()
332
+ wavs = wavs.to(device)
333
+ x = preprocess(wavs)
334
+ model.train()
335
+ quantized_prompt_embeds, codes, _, commitment_loss, codebook_loss, rvq_usage = model(x)
336
+ loss = eval(config['train']['loss']) # evaluates the config's loss expression against the locals above (x, quantized_prompt_embeds, commitment_loss, codebook_loss)
337
+ loss.backward()
338
+ optimizer.step()
339
+
340
+ results['loss/train'] += loss.item()
341
+ results['commitment_loss/train'] += commitment_loss.item()
342
+ results['codebook_loss/train'] += codebook_loss.item()
343
+ results['rvq_usage/train'] += rvq_usage.float().mean().item()
344
+
345
+ if cur_updates % config['train']['valid_interval'] == 0:
346
+ model.eval()
347
+ with torch.no_grad():
348
+ for item in tqdm(valid_dataloader, desc='valid'):
349
+ wavs = item['source']
350
+ wavs = wavs.to(device)
351
+ x = preprocess(wavs)
352
+ quantized_prompt_embeds, codes, _, commitment_loss, codebook_loss, rvq_usage = model(x)
353
+ valid_loss = eval(config['train']['loss'])
354
+
355
+ results['loss/valid'] += valid_loss.item()
356
+ results['commitment_loss/valid'] += commitment_loss.item()
357
+ results['codebook_loss/valid'] += codebook_loss.item()
358
+ results['rvq_usage/valid'] += rvq_usage.float().mean().item()
359
+
360
+ results['cur_updates'] = cur_updates
361
+ results['loss/train'] /= config['train']['valid_interval']
362
+ results['commitment_loss/train'] /= config['train']['valid_interval']
363
+ results['codebook_loss/train'] /= config['train']['valid_interval']
364
+ results['rvq_usage/train'] /= config['train']['valid_interval']
365
+
366
+ results['loss/valid'] /= len(valid_dataloader)
367
+ results['commitment_loss/valid'] /= len(valid_dataloader)
368
+ results['codebook_loss/valid'] /= len(valid_dataloader)
369
+ results['rvq_usage/valid'] /= len(valid_dataloader)
370
+
371
+ print('')
372
+ logger.info(str(results))
373
+ for k,v in results.items():
374
+ writer.add_scalar(k, v, cur_updates)
375
+
376
+ results.clear()
377
+
378
+ if cur_updates % config['train']['save_interval'] == 0:
379
+ os.makedirs(f'{writer.logdir}/ckpt/', exist_ok=True)
380
+ logger.info(f'saving checkpoint to {writer.logdir}/ckpt/RVQ_{cur_updates}.pth')
381
+ torch.save(model.state_dict(), f'{writer.logdir}/ckpt/RVQ_{cur_updates}.pth')
382
+
383
+
384
+ if cur_updates < config['train']['max_updates']:
385
+ cur_updates += 1
386
+ else:
387
+ is_running = False
388
+ break
389
+
390
+
391
+
392
+ if __name__ == '__main__':
393
+ config = Music_Mel_Target_Config()
394
+ main(config)
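
The frame stacking inside Preprocessor is what turns a 100 fps, 128-bin mel spectrogram into the 25 fps, 512-dim features the RVQ consumes. A standalone shape walk-through under the defaults (hop_length=240 at 24 kHz, s=4); the random tensor stands in for the normalized MelSTFT output:

import torch
from einops import rearrange

sr, hop, n_mels, s = 24000, 240, 128, 4
n_frames = (sr * 30) // hop                  # 30 s clip -> 3000 mel frames at 100 fps
mel = torch.randn(3, n_mels, n_frames)       # stand-in for the normalized melspec_2048
stacked = rearrange(mel, "b f (t s) -> b t (s f)", s=s)   # flatten every 4 frames
print(stacked.shape)                         # torch.Size([3, 750, 512]): 25 fps, 512-dim
print(stacked.permute(0, 2, 1).shape)        # (3, 512, 750): the B x D x T layout __call__ returns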
MuCodec/muq_dev/muq_fairseq/models/muq/model/w2v2_config.json ADDED
@@ -0,0 +1,113 @@
1
+ {
2
+ "activation_dropout": 0.1,
3
+ "adapter_kernel_size": 3,
4
+ "adapter_stride": 2,
5
+ "add_adapter": false,
6
+ "apply_spec_augment": true,
7
+ "architectures": [
8
+ "Wav2Vec2ConformerForCTC"
9
+ ],
10
+ "attention_dropout": 0.1,
11
+ "bos_token_id": 1,
12
+ "classifier_proj_size": 256,
13
+ "codevector_dim": 768,
14
+ "conformer_conv_dropout": 0.1,
15
+ "contrastive_logits_temperature": 0.1,
16
+ "conv_bias": true,
17
+ "conv_depthwise_kernel_size": 31,
18
+ "conv_dim": [
19
+ 512,
20
+ 512,
21
+ 512,
22
+ 512,
23
+ 512,
24
+ 512,
25
+ 512
26
+ ],
27
+ "conv_kernel": [
28
+ 10,
29
+ 3,
30
+ 3,
31
+ 3,
32
+ 3,
33
+ 2,
34
+ 2
35
+ ],
36
+ "conv_stride": [
37
+ 5,
38
+ 2,
39
+ 2,
40
+ 2,
41
+ 2,
42
+ 2,
43
+ 2
44
+ ],
45
+ "ctc_loss_reduction": "sum",
46
+ "ctc_zero_infinity": false,
47
+ "diversity_loss_weight": 0.1,
48
+ "do_stable_layer_norm": true,
49
+ "eos_token_id": 2,
50
+ "feat_extract_activation": "gelu",
51
+ "feat_extract_dropout": 0.0,
52
+ "feat_extract_norm": "layer",
53
+ "feat_proj_dropout": 0.1,
54
+ "feat_quantizer_dropout": 0.0,
55
+ "final_dropout": 0.1,
56
+ "gradient_checkpointing": false,
57
+ "hidden_act": "swish",
58
+ "hidden_dropout": 0.1,
59
+ "hidden_dropout_prob": 0.1,
60
+ "hidden_size": 1024,
61
+ "initializer_range": 0.02,
62
+ "intermediate_size": 4096,
63
+ "layer_norm_eps": 1e-05,
64
+ "layerdrop": 0.0,
65
+ "mask_feature_length": 10,
66
+ "mask_feature_min_masks": 0,
67
+ "mask_feature_prob": 0.0,
68
+ "mask_time_length": 10,
69
+ "mask_time_min_masks": 2,
70
+ "mask_time_prob": 0.05,
71
+ "max_source_positions": 5000,
72
+ "model_type": "wav2vec2-conformer",
73
+ "num_adapter_layers": 3,
74
+ "num_attention_heads": 16,
75
+ "num_codevector_groups": 2,
76
+ "num_codevectors_per_group": 320,
77
+ "num_conv_pos_embedding_groups": 16,
78
+ "num_conv_pos_embeddings": 128,
79
+ "num_feat_extract_layers": 7,
80
+ "num_hidden_layers": 24,
81
+ "num_negatives": 100,
82
+ "output_hidden_size": 1024,
83
+ "pad_token_id": 0,
84
+ "position_embeddings_type": "rotary",
85
+ "proj_codevector_dim": 768,
86
+ "rotary_embedding_base": 10000,
87
+ "tdnn_dilation": [
88
+ 1,
89
+ 2,
90
+ 3,
91
+ 1,
92
+ 1
93
+ ],
94
+ "tdnn_dim": [
95
+ 512,
96
+ 512,
97
+ 512,
98
+ 512,
99
+ 1500
100
+ ],
101
+ "tdnn_kernel": [
102
+ 5,
103
+ 3,
104
+ 3,
105
+ 1,
106
+ 1
107
+ ],
108
+ "torch_dtype": "float32",
109
+ "transformers_version": "4.19.0.dev0",
110
+ "use_weighted_layer_sum": false,
111
+ "vocab_size": 32,
112
+ "xvector_output_dim": 512
113
+ }
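
This file follows the Hugging Face Wav2Vec2ConformerConfig schema, so it can be instantiated directly; a small loading sketch (path assumed relative to this directory):

from transformers import Wav2Vec2ConformerConfig

config = Wav2Vec2ConformerConfig.from_json_file("w2v2_config.json")
print(config.hidden_size)                # 1024
print(config.num_hidden_layers)          # 24
print(config.position_embeddings_type)   # "rotary"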
MuCodec/muq_dev/muq_fairseq/models/muq/modules/__init__.py ADDED
@@ -0,0 +1,2 @@
1
+
2
+
MuCodec/muq_dev/muq_fairseq/models/muq/modules/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (185 Bytes).
 
MuCodec/muq_dev/muq_fairseq/models/muq/modules/__pycache__/conv.cpython-310.pyc ADDED
Binary file (2.72 kB).
 
MuCodec/muq_dev/muq_fairseq/models/muq/modules/__pycache__/features.cpython-310.pyc ADDED
Binary file (2.14 kB).
 
MuCodec/muq_dev/muq_fairseq/models/muq/modules/__pycache__/random_quantizer.cpython-310.pyc ADDED
Binary file (1.98 kB).
 
MuCodec/muq_dev/muq_fairseq/models/muq/modules/conv.py ADDED
@@ -0,0 +1,77 @@
1
+ from torch import nn
2
+ from einops import rearrange
3
+
4
+
5
+ class Res2dModule(nn.Module):
6
+ def __init__(self, idim, odim, stride=(2, 2)):
7
+ super(Res2dModule, self).__init__()
8
+ self.conv1 = nn.Conv2d(idim, odim, 3, padding=1, stride=stride)
9
+ self.bn1 = nn.BatchNorm2d(odim)
10
+ self.conv2 = nn.Conv2d(odim, odim, 3, padding=1)
11
+ self.bn2 = nn.BatchNorm2d(odim)
12
+ self.relu = nn.ReLU()
13
+
14
+ # residual
15
+ self.diff = False
16
+ if (idim != odim) or (stride[0] > 1):
17
+ self.conv3 = nn.Conv2d(idim, odim, 3, padding=1, stride=stride)
18
+ self.bn3 = nn.BatchNorm2d(odim)
19
+ self.diff = True
20
+
21
+ def forward(self, x):
22
+ out = self.bn2(self.conv2(self.relu(self.bn1(self.conv1(x)))))
23
+ if self.diff:
24
+ x = self.bn3(self.conv3(x))
25
+ out = x + out
26
+ out = self.relu(out)
27
+ return out
28
+
29
+
30
+ class Conv2dSubsampling(nn.Module):
31
+ """Convolutional 2D subsampling (to 1/4 length).
32
+
33
+ Args:
34
+ idim (int): Input dimension.
35
+ hdim (int): Hidden dimension.
36
+ odim (int): Output dimension.
37
+ strides (list): Sizes of strides.
38
+ n_bands (int): Number of frequency bands.
39
+ """
40
+
41
+ def __init__(self, idim, hdim, odim, strides=[2, 2], n_bands=64):
42
+ """Construct an Conv2dSubsampling object."""
43
+ super(Conv2dSubsampling, self).__init__()
44
+
45
+ self.conv = nn.Sequential(
46
+ Res2dModule(idim, hdim, (2, strides[0])),
47
+ Res2dModule(hdim, hdim, (2, strides[1])),
48
+ )
49
+ self.linear = nn.Linear(hdim * n_bands // 2 // 2, odim)
50
+
51
+ def forward(self, x):
52
+ """Subsample x.
53
+
54
+ Args:
55
+ x (torch.Tensor): Input tensor (#batch, idim, time).
56
+
57
+ Returns:
58
+ torch.Tensor: Subsampled tensor (#batch, time', odim),
59
+ where time' = time // 4.
60
+ """
61
+
62
+ if x.dim() == 3:
63
+ x = x.unsqueeze(1) # (b, c, f, t)
64
+ x = self.conv(x)
65
+ x = rearrange(x, "b c f t -> b t (c f)")
66
+ x = self.linear(x)
67
+ return x
68
+
69
+ if __name__ == '__main__':
70
+ import torch
71
+ conv_dim, encoder_dim = 512, 1024
72
+ conv = Conv2dSubsampling(
73
+ 1, conv_dim, encoder_dim, strides=[2, 1], n_bands=128
74
+ )
75
+ inp = torch.randn((1, 128, 3000))
76
+ out = conv(inp)
77
+ print(out.shape)
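
For the demo above (strides=[2, 1], n_bands=128): each Res2dModule uses 3x3 convolutions with padding 1, so the frequency axis is halved twice (128 -> 64 -> 32) while time is halved only once (3000 -> 1500 -> 1500), and the linear layer maps 512*32 features to 1024. The printed shape should therefore be torch.Size([1, 1500, 1024]).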
MuCodec/muq_dev/muq_fairseq/models/muq/modules/features.py ADDED
@@ -0,0 +1,67 @@
1
+ import torchaudio
2
+ from torch import nn
3
+ import torch
4
+
5
+
6
+ class MelSTFT(nn.Module):
7
+ def __init__(
8
+ self,
9
+ sample_rate=24000,
10
+ n_fft=2048,
11
+ hop_length=240,
12
+ n_mels=128,
13
+ is_db=False,
14
+ ):
15
+ super(MelSTFT, self).__init__()
16
+
17
+ # spectrogram
18
+ self.mel_stft = torchaudio.transforms.MelSpectrogram(
19
+ sample_rate=sample_rate, n_fft=n_fft, hop_length=hop_length, n_mels=n_mels
20
+ )
21
+
22
+ # amplitude to decibel
23
+ self.is_db = is_db
24
+ if is_db:
25
+ self.amplitude_to_db = torchaudio.transforms.AmplitudeToDB()
26
+
27
+ def forward(self, waveform):
28
+ if self.is_db:
29
+ return self.amplitude_to_db(self.mel_stft(waveform))
30
+ else:
31
+ return self.mel_stft(waveform)
32
+
33
+
34
+ class CQTPreprocessor(nn.Module):
35
+ def __init__(self,
36
+ sr=24000,
37
+ hop=960,
38
+ nb=84,
39
+ to_db = True,
40
+ ) -> None:
41
+ super().__init__()
42
+
43
+ from nnAudio.features.cqt import CQT
44
+ import torchaudio
45
+ self.cqt_fn = CQT(
46
+ sr=sr,
47
+ hop_length=hop,
48
+ n_bins=nb,
49
+ fmin=32.7 if nb == 84 else 27.5, # 84 or 88
50
+ bins_per_octave=12,
51
+ filter_scale=1,
52
+ norm=1,
53
+ window='hann',
54
+ center=True,
55
+ pad_mode='constant',
56
+ trainable=False,
57
+ output_format='Magnitude',
58
+ verbose=True,
59
+ )
60
+ if to_db:
61
+ self.amplitude_to_db = torchaudio.transforms.AmplitudeToDB()
62
+ else:
63
+ self.amplitude_to_db = lambda x:x
64
+
65
+ @torch.no_grad()
66
+ def __call__(self, x):
67
+ return self.amplitude_to_db(self.cqt_fn(x))
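
A quick check of MelSTFT's frame rate under its defaults (24 kHz with hop 240 gives 100 frames/s), assuming this file is importable as features. Note that torchaudio's centered STFT yields one extra frame, which is why Preprocessor trims with [..., :-1]:

import torch
from features import MelSTFT

mel = MelSTFT(sample_rate=24000, n_fft=2048, hop_length=240, n_mels=128, is_db=True)
wav = torch.randn(1, 24000 * 30)        # 30 s of audio
spec = mel(wav)
print(spec.shape)                       # torch.Size([1, 128, 3001]); 3000 frames after trimming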
MuCodec/muq_dev/muq_fairseq/models/muq/modules/flash_conformer.py ADDED
@@ -0,0 +1,2114 @@
1
+ # coding=utf-8
2
+ # Copyright 2022 The Fairseq Authors and the HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """ PyTorch Wav2Vec2-Conformer model."""
16
+
17
+ import math
18
+ from dataclasses import dataclass
19
+ from typing import Optional, Tuple, Union
20
+
21
+ import numpy as np
22
+ import torch
23
+ import torch.utils.checkpoint
24
+ from torch import nn
25
+ from torch.nn import CrossEntropyLoss
26
+ from torch.nn import functional as F
27
+
28
+ from transformers.activations import ACT2FN
29
+ from transformers.deepspeed import is_deepspeed_zero3_enabled
30
+ from transformers.modeling_outputs import (
31
+ BaseModelOutput,
32
+ CausalLMOutput,
33
+ SequenceClassifierOutput,
34
+ TokenClassifierOutput,
35
+ Wav2Vec2BaseModelOutput,
36
+ XVectorOutput,
37
+ )
38
+ from transformers.modeling_utils import PreTrainedModel
39
+ from transformers.utils import (
40
+ ModelOutput,
41
+ add_code_sample_docstrings,
42
+ add_start_docstrings,
43
+ add_start_docstrings_to_model_forward,
44
+ logging,
45
+ replace_return_docstrings,
46
+ )
47
+ from transformers.models.wav2vec2_conformer.configuration_wav2vec2_conformer import Wav2Vec2ConformerConfig
48
+
49
+
50
+ logger = logging.get_logger(__name__)
51
+
52
+
53
+ _HIDDEN_STATES_START_POSITION = 2
54
+
55
+ # General docstring
56
+ _CONFIG_FOR_DOC = "Wav2Vec2ConformerConfig"
57
+
58
+ # Base docstring
59
+ _CHECKPOINT_FOR_DOC = "facebook/wav2vec2-conformer-rope-large-960h-ft"
60
+ _EXPECTED_OUTPUT_SHAPE = [1, 292, 1024]
61
+
62
+ # CTC docstring
63
+ _CTC_EXPECTED_OUTPUT = "'MISTER QUILTER IS THE APOSTLE OF THE MIDDLE CLASSES AND WE ARE GLAD TO WELCOME HIS GOSPEL'"
64
+ _CTC_EXPECTED_LOSS = 64.21
65
+
66
+
67
+ WAV2VEC2_CONFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [
68
+ "facebook/wav2vec2-conformer-rel-pos-large",
69
+ # See all Wav2Vec2Conformer models at https://huggingface.co/models?filter=wav2vec2-conformer
70
+ ]
71
+
72
+
73
+ @dataclass
74
+ # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForPreTrainingOutput with Wav2Vec2->Wav2Vec2Conformer
75
+ class Wav2Vec2ConformerForPreTrainingOutput(ModelOutput):
76
+ """
77
+ Output type of [`Wav2Vec2ConformerForPreTraining`], with potential hidden states and attentions.
78
+
79
+ Args:
80
+ loss (*optional*, returned when `sample_negative_indices` are passed, `torch.FloatTensor` of shape `(1,)`):
81
+ Total loss as the sum of the contrastive loss (L_m) and the diversity loss (L_d) as stated in the [official
82
+ paper](https://arxiv.org/pdf/2006.11477.pdf) . (classification) loss.
83
+ projected_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.proj_codevector_dim)`):
84
+ Hidden-states of the model projected to *config.proj_codevector_dim* that can be used to predict the masked
85
+ projected quantized states.
86
+ projected_quantized_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.proj_codevector_dim)`):
87
+ Quantized extracted feature vectors projected to *config.proj_codevector_dim* representing the positive
88
+ target vectors for contrastive loss.
89
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
90
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
91
+ shape `(batch_size, sequence_length, hidden_size)`.
92
+
93
+ Hidden-states of the model at the output of each layer plus the initial embedding outputs.
94
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
95
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
96
+ sequence_length)`.
97
+
98
+ Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
99
+ heads.
100
+ contrastive_loss (*optional*, returned when `sample_negative_indices` are passed, `torch.FloatTensor` of shape `(1,)`):
101
+ The contrastive loss (L_m) as stated in the [official paper](https://arxiv.org/pdf/2006.11477.pdf) .
102
+ diversity_loss (*optional*, returned when `sample_negative_indices` are passed, `torch.FloatTensor` of shape `(1,)`):
103
+ The diversity loss (L_d) as stated in the [official paper](https://arxiv.org/pdf/2006.11477.pdf) .
104
+ """
105
+
106
+ loss: Optional[torch.FloatTensor] = None
107
+ projected_states: torch.FloatTensor = None
108
+ projected_quantized_states: torch.FloatTensor = None
109
+ codevector_perplexity: torch.FloatTensor = None
110
+ hidden_states: Optional[Tuple[torch.FloatTensor]] = None
111
+ attentions: Optional[Tuple[torch.FloatTensor]] = None
112
+ contrastive_loss: Optional[torch.FloatTensor] = None
113
+ diversity_loss: Optional[torch.FloatTensor] = None
114
+
115
+
116
+ # Copied from transformers.models.wav2vec2.modeling_wav2vec2._compute_mask_indices
+ def _compute_mask_indices(
+     shape: Tuple[int, int],
+     mask_prob: float,
+     mask_length: int,
+     attention_mask: Optional[torch.LongTensor] = None,
+     min_masks: int = 0,
+ ) -> np.ndarray:
+     """
+     Computes random mask spans for a given shape. Used to implement [SpecAugment: A Simple Data Augmentation Method for
+     ASR](https://arxiv.org/abs/1904.08779). Note that this method is not optimized to run on TPU and should be run on
+     CPU as part of the preprocessing during training.
+
+     Args:
+         shape: The shape for which to compute masks. This should be a tuple of size 2 where
+             the first element is the batch size and the second element is the length of the axis to span.
+         mask_prob: The percentage of the whole axis (between 0 and 1) which will be masked. The number of
+             independently generated mask spans of length `mask_length` is computed by
+             `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
+             actual percentage will be smaller.
+         mask_length: size of the mask
+         min_masks: minimum number of masked spans
+         attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
+             each batch dimension.
+     """
+     batch_size, sequence_length = shape
+
+     if mask_length < 1:
+         raise ValueError("`mask_length` has to be bigger than 0.")
+
+     if mask_length > sequence_length:
+         raise ValueError(
+             f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length}"
+             f" and `sequence_length`: {sequence_length}"
+         )
+
+     # epsilon is used for probabilistic rounding
+     epsilon = np.random.rand(1).item()
+
+     def compute_num_masked_span(input_length):
+         """Given input length, compute how many spans should be masked"""
+         num_masked_span = int(mask_prob * input_length / mask_length + epsilon)
+         num_masked_span = max(num_masked_span, min_masks)
+
+         # make sure num masked span <= sequence_length
+         if num_masked_span * mask_length > sequence_length:
+             num_masked_span = sequence_length // mask_length
+
+         # make sure num_masked_span is also <= input_length - (mask_length - 1)
+         if input_length - (mask_length - 1) < num_masked_span:
+             num_masked_span = max(input_length - (mask_length - 1), 0)
+
+         return num_masked_span
+
+     # compute number of masked spans in batch
+     input_lengths = (
+         attention_mask.sum(-1).detach().tolist()
+         if attention_mask is not None
+         else [sequence_length for _ in range(batch_size)]
+     )
+
+     # SpecAugment mask to fill
+     spec_aug_mask = np.zeros((batch_size, sequence_length), dtype=bool)
+     spec_aug_mask_idxs = []
+
+     max_num_masked_span = compute_num_masked_span(sequence_length)
+
+     if max_num_masked_span == 0:
+         return spec_aug_mask
+
+     for input_length in input_lengths:
+         # compute num of masked spans for this input
+         num_masked_span = compute_num_masked_span(input_length)
+
+         # get random indices to mask
+         spec_aug_mask_idx = np.random.choice(
+             np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False
+         )
+
+         # pick first sampled index that will serve as a dummy index to pad vector
+         # to ensure same dimension for all batches due to probabilistic rounding
+         # Picking first sample just pads those vectors twice.
+         if len(spec_aug_mask_idx) == 0:
+             # this case can only happen if `input_length` is strictly smaller than
+             # `sequence_length` in which case the last token has to be a padding
+             # token which we can use as a dummy mask id
+             dummy_mask_idx = sequence_length - 1
+         else:
+             dummy_mask_idx = spec_aug_mask_idx[0]
+
+         spec_aug_mask_idx = np.concatenate(
+             [spec_aug_mask_idx, np.ones(max_num_masked_span - num_masked_span, dtype=np.int32) * dummy_mask_idx]
+         )
+         spec_aug_mask_idxs.append(spec_aug_mask_idx)
+
+     spec_aug_mask_idxs = np.array(spec_aug_mask_idxs)
+
+     # expand masked indices to masked spans
+     spec_aug_mask_idxs = np.broadcast_to(
+         spec_aug_mask_idxs[:, :, None], (batch_size, max_num_masked_span, mask_length)
+     )
+     spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length)
+
+     # add offset to the starting indexes so that indexes now create a span
+     offsets = np.arange(mask_length)[None, None, :]
+     offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape(
+         batch_size, max_num_masked_span * mask_length
+     )
+     spec_aug_mask_idxs = spec_aug_mask_idxs + offsets
+
+     # ensure that we cannot have indices larger than sequence_length
+     if spec_aug_mask_idxs.max() > sequence_length - 1:
+         spec_aug_mask_idxs[spec_aug_mask_idxs > sequence_length - 1] = sequence_length - 1
+
+     # scatter indices to mask
+     np.put_along_axis(spec_aug_mask, spec_aug_mask_idxs, 1, -1)
+
+     return spec_aug_mask
+
+
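+ # Editor's usage sketch (illustration only; not part of the upstream API): build a
+ # SpecAugment mask for a (2, 100) feature grid, masking ~5% in spans of length 10,
+ # then convert it to the boolean torch tensor that `_mask_hidden_states` expects:
+ #
+ #     mask = _compute_mask_indices(shape=(2, 100), mask_prob=0.05, mask_length=10)
+ #     assert mask.shape == (2, 100) and mask.dtype == bool
+ #     mask_t = torch.tensor(mask, dtype=torch.bool)
+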
+ # Copied from transformers.models.wav2vec2.modeling_wav2vec2._sample_negative_indices
+ def _sample_negative_indices(
+     features_shape: Tuple, num_negatives: int, mask_time_indices: Optional[np.ndarray] = None
+ ):
+     """
+     Sample `num_negatives` vectors from feature vectors.
+     """
+     batch_size, sequence_length = features_shape
+
+     # generate indices of the positive vectors themselves, repeat them `num_negatives` times
+     sequence_length_range = np.arange(sequence_length)
+
+     # get `num_negatives` random vector indices from the same utterance
+     sampled_negative_indices = np.zeros(shape=(batch_size, sequence_length, num_negatives), dtype=np.int32)
+
+     mask_time_indices = (
+         mask_time_indices.astype(bool) if mask_time_indices is not None else np.ones(features_shape, dtype=bool)
+     )
+
+     for batch_idx in range(batch_size):
+         high = mask_time_indices[batch_idx].sum() - 1
+         mapped_masked_indices = sequence_length_range[mask_time_indices[batch_idx]]
+
+         feature_indices = np.broadcast_to(np.arange(high + 1)[:, None], (high + 1, num_negatives))
+         sampled_indices = np.random.randint(0, high, size=(high + 1, num_negatives))
+         # avoid sampling the same positive vector, but keep the distribution uniform
+         sampled_indices[sampled_indices >= feature_indices] += 1
+
+         # remap to actual indices
+         sampled_negative_indices[batch_idx][mask_time_indices[batch_idx]] = mapped_masked_indices[sampled_indices]
+
+         # correct for batch size
+         sampled_negative_indices[batch_idx] += batch_idx * sequence_length
+
+     return sampled_negative_indices
+
+
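+ # Editor's usage sketch (illustration only): negatives are drawn from the masked
+ # positions of the same utterance, and the returned indices are offset by
+ # `batch_idx * sequence_length`, i.e. they index the batch-flattened feature matrix:
+ #
+ #     neg = _sample_negative_indices((2, 100), num_negatives=100, mask_time_indices=mask)
+ #     assert neg.shape == (2, 100, 100)  # one row of negatives per time step
+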
+ # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2NoLayerNormConvLayer with Wav2Vec2->Wav2Vec2Conformer
+ class Wav2Vec2ConformerNoLayerNormConvLayer(nn.Module):
+     def __init__(self, config, layer_id=0):
+         super().__init__()
+         self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1
+         self.out_conv_dim = config.conv_dim[layer_id]
+
+         self.conv = nn.Conv1d(
+             self.in_conv_dim,
+             self.out_conv_dim,
+             kernel_size=config.conv_kernel[layer_id],
+             stride=config.conv_stride[layer_id],
+             bias=config.conv_bias,
+         )
+         self.activation = ACT2FN[config.feat_extract_activation]
+
+     def forward(self, hidden_states):
+         hidden_states = self.conv(hidden_states)
+         hidden_states = self.activation(hidden_states)
+         return hidden_states
+
+
+ # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2LayerNormConvLayer with Wav2Vec2->Wav2Vec2Conformer
+ class Wav2Vec2ConformerLayerNormConvLayer(nn.Module):
+     def __init__(self, config, layer_id=0):
+         super().__init__()
+         self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1
+         self.out_conv_dim = config.conv_dim[layer_id]
+
+         self.conv = nn.Conv1d(
+             self.in_conv_dim,
+             self.out_conv_dim,
+             kernel_size=config.conv_kernel[layer_id],
+             stride=config.conv_stride[layer_id],
+             bias=config.conv_bias,
+         )
+         self.layer_norm = nn.LayerNorm(self.out_conv_dim, elementwise_affine=True)
+         self.activation = ACT2FN[config.feat_extract_activation]
+
+     def forward(self, hidden_states):
+         hidden_states = self.conv(hidden_states)
+
+         hidden_states = hidden_states.transpose(-2, -1)
+         hidden_states = self.layer_norm(hidden_states)
+         hidden_states = hidden_states.transpose(-2, -1)
+
+         hidden_states = self.activation(hidden_states)
+         return hidden_states
+
+
+ # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2GroupNormConvLayer with Wav2Vec2->Wav2Vec2Conformer
+ class Wav2Vec2ConformerGroupNormConvLayer(nn.Module):
+     def __init__(self, config, layer_id=0):
+         super().__init__()
+         self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1
+         self.out_conv_dim = config.conv_dim[layer_id]
+
+         self.conv = nn.Conv1d(
+             self.in_conv_dim,
+             self.out_conv_dim,
+             kernel_size=config.conv_kernel[layer_id],
+             stride=config.conv_stride[layer_id],
+             bias=config.conv_bias,
+         )
+         self.activation = ACT2FN[config.feat_extract_activation]
+
+         self.layer_norm = nn.GroupNorm(num_groups=self.out_conv_dim, num_channels=self.out_conv_dim, affine=True)
+
+     def forward(self, hidden_states):
+         hidden_states = self.conv(hidden_states)
+         hidden_states = self.layer_norm(hidden_states)
+         hidden_states = self.activation(hidden_states)
+         return hidden_states
+
+
+ # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2PositionalConvEmbedding with Wav2Vec2->Wav2Vec2Conformer
+ class Wav2Vec2ConformerPositionalConvEmbedding(nn.Module):
+     def __init__(self, config):
+         super().__init__()
+         self.conv = nn.Conv1d(
+             config.hidden_size,
+             config.hidden_size,
+             kernel_size=config.num_conv_pos_embeddings,
+             padding=config.num_conv_pos_embeddings // 2,
+             groups=config.num_conv_pos_embedding_groups,
+         )
+
+         if is_deepspeed_zero3_enabled():
+             import deepspeed
+
+             with deepspeed.zero.GatheredParameters(self.conv.weight, modifier_rank=0):
+                 self.conv = nn.utils.weight_norm(self.conv, name="weight", dim=2)
+             deepspeed.zero.register_external_parameter(self, self.conv.weight_v)
+             deepspeed.zero.register_external_parameter(self, self.conv.weight_g)
+         else:
+             self.conv = nn.utils.weight_norm(self.conv, name="weight", dim=2)
+
+         self.padding = Wav2Vec2ConformerSamePadLayer(config.num_conv_pos_embeddings)
+         self.activation = ACT2FN[config.feat_extract_activation]
+
+     def forward(self, hidden_states):
+         hidden_states = hidden_states.transpose(1, 2)
+
+         hidden_states = self.conv(hidden_states)
+         hidden_states = self.padding(hidden_states)
+         hidden_states = self.activation(hidden_states)
+
+         hidden_states = hidden_states.transpose(1, 2)
+         return hidden_states
+
+
+ class Wav2Vec2ConformerRotaryPositionalEmbedding(nn.Module):
+     """Rotary positional embedding
+     Reference: https://blog.eleuther.ai/rotary-embeddings/ Paper: https://arxiv.org/pdf/2104.09864.pdf
+     """
+
+     def __init__(self, config):
+         super().__init__()
+         dim = config.hidden_size // config.num_attention_heads
+         base = config.rotary_embedding_base
+
+         inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
+         self.register_buffer("inv_freq", inv_freq)
+         self.cached_sequence_length = None
+         self.cached_rotary_positional_embedding = None
+
+     def forward(self, hidden_states):
+         sequence_length = hidden_states.shape[1]
+
+         if sequence_length == self.cached_sequence_length and self.cached_rotary_positional_embedding is not None:
+             return self.cached_rotary_positional_embedding
+
+         self.cached_sequence_length = sequence_length
+         time_stamps = torch.arange(sequence_length).type_as(self.inv_freq)
+         freqs = torch.einsum("i,j->ij", time_stamps, self.inv_freq)
+         embeddings = torch.cat((freqs, freqs), dim=-1)
+
+         cos_embeddings = embeddings.cos()[:, None, None, :]
+         sin_embeddings = embeddings.sin()[:, None, None, :]
+         self.cached_rotary_positional_embedding = torch.stack([cos_embeddings, sin_embeddings])
+         return self.cached_rotary_positional_embedding
+
+
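+ # Editor's note (sketch): the stacked cache has shape (2, seq_len, 1, 1, head_dim)
+ # and holds the cos/sin tables. For a feature pair (x1, x2) at position t with
+ # inverse frequency theta, the rotation applied downstream is
+ #     x1' = x1 * cos(t * theta) - x2 * sin(t * theta)
+ #     x2' = x2 * cos(t * theta) + x1 * sin(t * theta)
+ # which `Wav2Vec2ConformerSelfAttention._apply_rotary_embedding` realizes via the
+ # "rotate-half" form x' = x * cos + rotate_half(x) * sin.
+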
+ class Wav2Vec2ConformerRelPositionalEmbedding(nn.Module):
+     """Relative positional encoding module."""
+
+     def __init__(self, config):
+         super().__init__()
+         self.max_len = config.max_source_positions
+         self.d_model = config.hidden_size
+         self.pe = None
+         self.extend_pe(torch.tensor(0.0).expand(1, self.max_len))
+
+     def extend_pe(self, x):
+         # Reset the positional encodings
+         if self.pe is not None:
+             # self.pe contains both positive and negative parts
+             # the length of self.pe is 2 * input_len - 1
+             if self.pe.size(1) >= x.size(1) * 2 - 1:
+                 if self.pe.dtype != x.dtype or self.pe.device != x.device:
+                     self.pe = self.pe.to(dtype=x.dtype, device=x.device)
+                 return
+         # Suppose `i` is the position of the query vector and `j` is the
+         # position of the key vector. We use positive relative positions when keys
+         # are to the left (i>j) and negative relative positions otherwise (i<j).
+         pe_positive = torch.zeros(x.size(1), self.d_model)
+         pe_negative = torch.zeros(x.size(1), self.d_model)
+         position = torch.arange(0, x.size(1), dtype=torch.float32).unsqueeze(1)
+         div_term = torch.exp(
+             torch.arange(0, self.d_model, 2, dtype=torch.float32) * -(math.log(10000.0) / self.d_model)
+         )
+         pe_positive[:, 0::2] = torch.sin(position * div_term)
+         pe_positive[:, 1::2] = torch.cos(position * div_term)
+         pe_negative[:, 0::2] = torch.sin(-1 * position * div_term)
+         pe_negative[:, 1::2] = torch.cos(-1 * position * div_term)
+
+         # Reverse the order of positive indices and concat both positive and
+         # negative indices. This is used to support the shifting trick
+         # as in https://arxiv.org/abs/1901.02860
+         pe_positive = torch.flip(pe_positive, [0]).unsqueeze(0)
+         pe_negative = pe_negative[1:].unsqueeze(0)
+         pe = torch.cat([pe_positive, pe_negative], dim=1)
+         self.pe = pe.to(device=x.device, dtype=x.dtype)
+
+     def forward(self, hidden_states: torch.Tensor):
+         self.extend_pe(hidden_states)
+         start_idx = self.pe.size(1) // 2 - hidden_states.size(1) + 1
+         end_idx = self.pe.size(1) // 2 + hidden_states.size(1)
+         relative_position_embeddings = self.pe[:, start_idx:end_idx]
+
+         return relative_position_embeddings
+
+
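+ # Editor's note (sketch): `self.pe` holds 2 * max_len - 1 encodings ordered from the
+ # largest positive relative offset down to the most negative one, with offset zero at
+ # the centre; `forward` slices out the 2 * seq_len - 1 entries centred on that midpoint.
+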
+ # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2SamePadLayer with Wav2Vec2->Wav2Vec2Conformer
+ class Wav2Vec2ConformerSamePadLayer(nn.Module):
+     def __init__(self, num_conv_pos_embeddings):
+         super().__init__()
+         self.num_pad_remove = 1 if num_conv_pos_embeddings % 2 == 0 else 0
+
+     def forward(self, hidden_states):
+         if self.num_pad_remove > 0:
+             hidden_states = hidden_states[:, :, : -self.num_pad_remove]
+         return hidden_states
+
+
+ # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeatureEncoder with Wav2Vec2->Wav2Vec2Conformer
+ class Wav2Vec2ConformerFeatureEncoder(nn.Module):
+     """Construct the features from raw audio waveform"""
+
+     def __init__(self, config):
+         super().__init__()
+
+         if config.feat_extract_norm == "group":
+             conv_layers = [Wav2Vec2ConformerGroupNormConvLayer(config, layer_id=0)] + [
+                 Wav2Vec2ConformerNoLayerNormConvLayer(config, layer_id=i + 1)
+                 for i in range(config.num_feat_extract_layers - 1)
+             ]
+         elif config.feat_extract_norm == "layer":
+             conv_layers = [
+                 Wav2Vec2ConformerLayerNormConvLayer(config, layer_id=i) for i in range(config.num_feat_extract_layers)
+             ]
+         else:
+             raise ValueError(
+                 f"`config.feat_extract_norm` is {config.feat_extract_norm}, but has to be one of ['group', 'layer']"
+             )
+         self.conv_layers = nn.ModuleList(conv_layers)
+         self.gradient_checkpointing = False
+         self._requires_grad = True
+
+     def _freeze_parameters(self):
+         for param in self.parameters():
+             param.requires_grad = False
+         self._requires_grad = False
+
+     def forward(self, input_values):
+         hidden_states = input_values[:, None]
+
+         # make sure hidden_states require grad for gradient_checkpointing
+         if self._requires_grad and self.training:
+             hidden_states.requires_grad = True
+
+         for conv_layer in self.conv_layers:
+             if self._requires_grad and self.gradient_checkpointing and self.training:
+
+                 def create_custom_forward(module):
+                     def custom_forward(*inputs):
+                         return module(*inputs)
+
+                     return custom_forward
+
+                 hidden_states = torch.utils.checkpoint.checkpoint(
+                     create_custom_forward(conv_layer),
+                     hidden_states,
+                 )
+             else:
+                 hidden_states = conv_layer(hidden_states)
+
+         return hidden_states
+
+
+ # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeatureProjection with Wav2Vec2->Wav2Vec2Conformer
+ class Wav2Vec2ConformerFeatureProjection(nn.Module):
+     def __init__(self, config):
+         super().__init__()
+         self.layer_norm = nn.LayerNorm(config.conv_dim[-1], eps=config.layer_norm_eps)
+         self.projection = nn.Linear(config.conv_dim[-1], config.hidden_size)
+         self.dropout = nn.Dropout(config.feat_proj_dropout)
+
+     def forward(self, hidden_states):
+         # non-projected hidden states are needed for quantization
+         norm_hidden_states = self.layer_norm(hidden_states)
+         hidden_states = self.projection(norm_hidden_states)
+         hidden_states = self.dropout(hidden_states)
+         return hidden_states, norm_hidden_states
+
+
+ # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeedForward with Wav2Vec2->Wav2Vec2Conformer
+ class Wav2Vec2ConformerFeedForward(nn.Module):
+     def __init__(self, config):
+         super().__init__()
+         self.intermediate_dropout = nn.Dropout(config.activation_dropout)
+
+         self.intermediate_dense = nn.Linear(config.hidden_size, config.intermediate_size)
+         if isinstance(config.hidden_act, str):
+             self.intermediate_act_fn = ACT2FN[config.hidden_act]
+         else:
+             self.intermediate_act_fn = config.hidden_act
+
+         self.output_dense = nn.Linear(config.intermediate_size, config.hidden_size)
+         self.output_dropout = nn.Dropout(config.hidden_dropout)
+
+     def forward(self, hidden_states):
+         hidden_states = self.intermediate_dense(hidden_states)
+         hidden_states = self.intermediate_act_fn(hidden_states)
+         hidden_states = self.intermediate_dropout(hidden_states)
+
+         hidden_states = self.output_dense(hidden_states)
+         hidden_states = self.output_dropout(hidden_states)
+         return hidden_states
+
+
+ class Wav2Vec2ConformerConvolutionModule(nn.Module):
+     """Convolution block used in the conformer block"""
+
+     def __init__(self, config):
+         super().__init__()
+         if (config.conv_depthwise_kernel_size - 1) % 2 == 1:
+             raise ValueError("`config.conv_depthwise_kernel_size` should be an odd number for 'SAME' padding")
+         self.layer_norm = nn.LayerNorm(config.hidden_size)
+         self.pointwise_conv1 = torch.nn.Conv1d(
+             config.hidden_size,
+             2 * config.hidden_size,
+             kernel_size=1,
+             stride=1,
+             padding=0,
+             bias=False,
+         )
+         self.glu = torch.nn.GLU(dim=1)
+         self.depthwise_conv = torch.nn.Conv1d(
+             config.hidden_size,
+             config.hidden_size,
+             config.conv_depthwise_kernel_size,
+             stride=1,
+             padding=(config.conv_depthwise_kernel_size - 1) // 2,
+             groups=config.hidden_size,
+             bias=False,
+         )
+         self.batch_norm = torch.nn.BatchNorm1d(config.hidden_size)
+         self.activation = ACT2FN[config.hidden_act]
+         self.pointwise_conv2 = torch.nn.Conv1d(
+             config.hidden_size,
+             config.hidden_size,
+             kernel_size=1,
+             stride=1,
+             padding=0,
+             bias=False,
+         )
+         self.dropout = torch.nn.Dropout(config.conformer_conv_dropout)
+
+     def forward(self, hidden_states):
+         hidden_states = self.layer_norm(hidden_states)
+         # exchange the temporal dimension and the feature dimension
+         hidden_states = hidden_states.transpose(1, 2)
+
+         # GLU mechanism
+         # => (batch, 2*channel, time)
+         hidden_states = self.pointwise_conv1(hidden_states)
+         # => (batch, channel, time)
+         hidden_states = self.glu(hidden_states)
+
+         # 1D Depthwise Conv
+         hidden_states = self.depthwise_conv(hidden_states)
+         hidden_states = self.batch_norm(hidden_states)
+         hidden_states = self.activation(hidden_states)
+
+         hidden_states = self.pointwise_conv2(hidden_states)
+         hidden_states = self.dropout(hidden_states)
+         hidden_states = hidden_states.transpose(1, 2)
+         return hidden_states
+
+
+ class Wav2Vec2ConformerSelfAttention(nn.Module):
+     """Construct a Wav2Vec2ConformerSelfAttention object.
+     Can be enhanced with rotary or relative position embeddings.
+     """
+
+     def __init__(self, config):
+         super().__init__()
+
+         self.head_size = config.hidden_size // config.num_attention_heads
+         self.num_heads = config.num_attention_heads
+         self.position_embeddings_type = config.position_embeddings_type
+
+         self.linear_q = nn.Linear(config.hidden_size, config.hidden_size)
+         self.linear_k = nn.Linear(config.hidden_size, config.hidden_size)
+         self.linear_v = nn.Linear(config.hidden_size, config.hidden_size)
+         self.linear_out = nn.Linear(config.hidden_size, config.hidden_size)
+
+         self.dropout = nn.Dropout(p=config.attention_dropout)
+         self.dropout_p = config.attention_dropout
+
+         self.is_causal = config.is_causal
+
+         if self.position_embeddings_type == "relative":
+             # linear transformation for positional encoding
+             self.linear_pos = nn.Linear(config.hidden_size, config.hidden_size, bias=False)
+             # these two learnable biases are used in matrix c and matrix d
+             # as described in https://arxiv.org/abs/1901.02860 Section 3.3
+             self.pos_bias_u = nn.Parameter(torch.zeros(self.num_heads, self.head_size))
+             self.pos_bias_v = nn.Parameter(torch.zeros(self.num_heads, self.head_size))
+
+     def forward(
+         self,
+         hidden_states: torch.Tensor,
+         attention_mask: Optional[torch.Tensor] = None,
+         relative_position_embeddings: Optional[torch.Tensor] = None,
+         output_attentions: bool = False,
+     ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
+         # self-attention mechanism
+         batch_size, sequence_length, hidden_size = hidden_states.size()
+
+         # make sure query/key states can be != value states
+         query_key_states = hidden_states
+         value_states = hidden_states
+
+         if self.position_embeddings_type == "rotary":
+             if relative_position_embeddings is None:
+                 raise ValueError(
+                     "`relative_position_embeddings` has to be defined when `self.position_embeddings_type == 'rotary'`"
+                 )
+             query_key_states = self._apply_rotary_embedding(query_key_states, relative_position_embeddings)
+
+         # project query_key_states and value_states
+         query = self.linear_q(query_key_states).view(batch_size, -1, self.num_heads, self.head_size)
+         key = self.linear_k(query_key_states).view(batch_size, -1, self.num_heads, self.head_size)
+         value = self.linear_v(value_states).view(batch_size, -1, self.num_heads, self.head_size)
+
+         # => (batch, head, time1, d_k)
+         query = query.transpose(1, 2)
+         key = key.transpose(1, 2)
+         value = value.transpose(1, 2)
+
+         # fused scaled-dot-product attention, restricted to the FlashAttention kernel
+         with torch.backends.cuda.sdp_kernel(enable_math=False, enable_flash=True, enable_mem_efficient=False):
+             hidden_states = F.scaled_dot_product_attention(
+                 query, key, value, attn_mask=attention_mask, dropout_p=self.dropout_p, is_causal=self.is_causal
+             )
+         # the fused kernel does not materialize attention probabilities
+         probs = None
+
+         # eager attention path replaced by the fused kernel above:
+         # # apply attention_mask if necessary
+         # if attention_mask is not None:
+         #     scores = scores + attention_mask
+
+         # # => (batch, head, time1, time2)
+         # probs = torch.softmax(scores, dim=-1)
+         # probs = self.dropout(probs)
+
+         # # => (batch, head, time1, d_k)
+         # hidden_states = torch.matmul(probs, value)
+
+         # => (batch, time1, hidden_size)
+         hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, self.num_heads * self.head_size)
+         hidden_states = self.linear_out(hidden_states)
+
+         return hidden_states, probs
+
+     def _apply_rotary_embedding(self, hidden_states, relative_position_embeddings):
+         batch_size, sequence_length, hidden_size = hidden_states.size()
+         hidden_states = hidden_states.view(batch_size, sequence_length, self.num_heads, self.head_size)
+
+         cos = relative_position_embeddings[0, :sequence_length, ...]
+         sin = relative_position_embeddings[1, :sequence_length, ...]
+
+         # rotate hidden_states with rotary embeddings
+         hidden_states = hidden_states.transpose(0, 1)
+         rotated_states_begin = hidden_states[..., : self.head_size // 2]
+         rotated_states_end = hidden_states[..., self.head_size // 2 :]
+         rotated_states = torch.cat((-rotated_states_end, rotated_states_begin), dim=rotated_states_begin.ndim - 1)
+         hidden_states = (hidden_states * cos) + (rotated_states * sin)
+         hidden_states = hidden_states.transpose(0, 1)
+
+         hidden_states = hidden_states.view(batch_size, sequence_length, self.num_heads * self.head_size)
+
+         return hidden_states
+
+     def _apply_relative_embeddings(self, query, key, relative_position_embeddings):
+         # 1. project positional embeddings
+         # => (batch, head, 2*time1-1, d_k)
+         proj_relative_position_embeddings = self.linear_pos(relative_position_embeddings)
+         proj_relative_position_embeddings = proj_relative_position_embeddings.view(
+             relative_position_embeddings.size(0), -1, self.num_heads, self.head_size
+         )
+         proj_relative_position_embeddings = proj_relative_position_embeddings.transpose(1, 2)
+         proj_relative_position_embeddings = proj_relative_position_embeddings.transpose(2, 3)
+
+         # 2. Add bias to query
+         # => (batch, head, time1, d_k)
+         query = query.transpose(1, 2)
+         q_with_bias_u = (query + self.pos_bias_u).transpose(1, 2)
+         q_with_bias_v = (query + self.pos_bias_v).transpose(1, 2)
+
+         # 3. attention score: first compute matrix a and matrix c
+         # as described in https://arxiv.org/abs/1901.02860 Section 3.3
+         # => (batch, head, time1, time2)
+         scores_ac = torch.matmul(q_with_bias_u, key.transpose(-2, -1))
+
+         # 4. then compute matrix b and matrix d
+         # => (batch, head, time1, 2*time1-1)
+         scores_bd = torch.matmul(q_with_bias_v, proj_relative_position_embeddings)
+
+         # 5. shift matrix b and matrix d
+         zero_pad = torch.zeros((*scores_bd.size()[:3], 1), device=scores_bd.device, dtype=scores_bd.dtype)
+         scores_bd_padded = torch.cat([zero_pad, scores_bd], dim=-1)
+         scores_bd_padded_shape = scores_bd.size()[:2] + (scores_bd.shape[3] + 1, scores_bd.shape[2])
+         scores_bd_padded = scores_bd_padded.view(*scores_bd_padded_shape)
+         scores_bd = scores_bd_padded[:, :, 1:].view_as(scores_bd)
+         scores_bd = scores_bd[:, :, :, : scores_bd.size(-1) // 2 + 1]
+
+         # 6. sum matrices
+         # => (batch, head, time1, time2)
+         scores = (scores_ac + scores_bd) / math.sqrt(self.head_size)
+
+         return scores
+
+
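+ # Editor's note (sketch): step 5 of `_apply_relative_embeddings` is the Transformer-XL
+ # "relative shift" trick (https://arxiv.org/abs/1901.02860): padding a zero column and
+ # re-viewing the (time1, 2*time1-1) position-score matrix realigns each row so that its
+ # columns index relative offsets, after which only the valid time2 positions are kept.
+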
+ class Wav2Vec2ConformerEncoderLayer(nn.Module):
+     """Conformer block based on https://arxiv.org/abs/2005.08100."""
+
+     def __init__(self, config):
+         super().__init__()
+         embed_dim = config.hidden_size
+         dropout = config.attention_dropout
+
+         # Feed-forward 1
+         self.ffn1_layer_norm = nn.LayerNorm(embed_dim)
+         self.ffn1 = Wav2Vec2ConformerFeedForward(config)
+
+         # Self-Attention
+         self.self_attn_layer_norm = nn.LayerNorm(embed_dim)
+         self.self_attn_dropout = torch.nn.Dropout(dropout)
+         self.self_attn = Wav2Vec2ConformerSelfAttention(config)
+
+         # Conformer Convolution
+         self.conv_module = Wav2Vec2ConformerConvolutionModule(config)
+
+         # Feed-forward 2
+         self.ffn2_layer_norm = nn.LayerNorm(embed_dim)
+         self.ffn2 = Wav2Vec2ConformerFeedForward(config)
+         self.final_layer_norm = nn.LayerNorm(embed_dim)
+
+     def forward(
+         self,
+         hidden_states,
+         attention_mask: Optional[torch.Tensor] = None,
+         relative_position_embeddings: Optional[torch.Tensor] = None,
+         output_attentions: bool = False,
+     ):
+         # 1. Feed-Forward 1 layer
+         residual = hidden_states
+         hidden_states = self.ffn1_layer_norm(hidden_states)
+         hidden_states = self.ffn1(hidden_states)
+         hidden_states = hidden_states * 0.5 + residual
+         residual = hidden_states
+
+         # 2. Self-Attention layer
+         hidden_states = self.self_attn_layer_norm(hidden_states)
+         hidden_states, attn_weights = self.self_attn(
+             hidden_states=hidden_states,
+             attention_mask=attention_mask,
+             relative_position_embeddings=relative_position_embeddings,
+             output_attentions=output_attentions,
+         )
+         hidden_states = self.self_attn_dropout(hidden_states)
+         hidden_states = hidden_states + residual
+
+         # 3. Convolutional Layer
+         residual = hidden_states
+         hidden_states = self.conv_module(hidden_states)
+         hidden_states = residual + hidden_states
+
+         # 4. Feed-Forward 2 Layer
+         residual = hidden_states
+         hidden_states = self.ffn2_layer_norm(hidden_states)
+         hidden_states = self.ffn2(hidden_states)
+         hidden_states = hidden_states * 0.5 + residual
+         hidden_states = self.final_layer_norm(hidden_states)
+
+         return hidden_states, attn_weights
+
+
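+ # Editor's note (sketch): the layer follows the "macaron" structure of the Conformer
+ # paper, with half-step residuals around both feed-forward modules:
+ #     x = x + 0.5 * FFN1(LN(x))
+ #     x = x + SelfAttn(LN(x))
+ #     x = x + Conv(x)          # the conv module applies its own LayerNorm first
+ #     x = LN(x + 0.5 * FFN2(LN(x)))
+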
+ class Wav2Vec2ConformerEncoder(nn.Module):
+     def __init__(self, config, is_causal=False):
+         super().__init__()
+         config.is_causal = is_causal
+         self.config = config
+
+         if config.position_embeddings_type == "relative":
+             self.embed_positions = Wav2Vec2ConformerRelPositionalEmbedding(config)
+         elif config.position_embeddings_type == "rotary":
+             self.embed_positions = Wav2Vec2ConformerRotaryPositionalEmbedding(config)
+         else:
+             self.embed_positions = None
+
+         self.pos_conv_embed = Wav2Vec2ConformerPositionalConvEmbedding(config)
+         self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+         self.dropout = nn.Dropout(config.hidden_dropout)
+         self.layers = nn.ModuleList([Wav2Vec2ConformerEncoderLayer(config) for _ in range(config.num_hidden_layers)])
+         self.gradient_checkpointing = False
+
+     def forward(
+         self,
+         hidden_states,
+         attention_mask=None,
+         output_attentions=False,
+         output_hidden_states=False,
+         return_dict=True,
+     ):
+         all_hidden_states = () if output_hidden_states else None
+         all_self_attentions = () if output_attentions else None
+
+         if attention_mask is not None:
+             # make sure padded tokens output 0
+             hidden_states[~attention_mask] = 0.0
+
+             # extend attention_mask
+             attention_mask = 1.0 - attention_mask[:, None, None, :].to(dtype=hidden_states.dtype)
+             attention_mask = attention_mask * torch.finfo(hidden_states.dtype).min
+             attention_mask = attention_mask.expand(
+                 attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1]
+             )
+
+         hidden_states = self.dropout(hidden_states)
+
+         if self.embed_positions is not None:
+             relative_position_embeddings = self.embed_positions(hidden_states)
+         else:
+             relative_position_embeddings = None
+
+         deepspeed_zero3_is_enabled = is_deepspeed_zero3_enabled()
+
+         for i, layer in enumerate(self.layers):
+             if output_hidden_states:
+                 all_hidden_states = all_hidden_states + (hidden_states,)
+
+             # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
+             dropout_probability = np.random.uniform(0, 1)
+
+             skip_the_layer = self.training and (dropout_probability < self.config.layerdrop)
+             if not skip_the_layer or deepspeed_zero3_is_enabled:
+                 # under deepspeed zero3 all gpus must run in sync
+                 if self.gradient_checkpointing and self.training:
+                     # create gradient checkpointing function
+                     def create_custom_forward(module):
+                         def custom_forward(*inputs):
+                             return module(*inputs, output_attentions)
+
+                         return custom_forward
+
+                     layer_outputs = torch.utils.checkpoint.checkpoint(
+                         create_custom_forward(layer),
+                         hidden_states,
+                         attention_mask,
+                         relative_position_embeddings,
+                     )
+                 else:
+                     layer_outputs = layer(
+                         hidden_states,
+                         attention_mask=attention_mask,
+                         relative_position_embeddings=relative_position_embeddings,
+                         output_attentions=output_attentions,
+                     )
+                 hidden_states = layer_outputs[0]
+
+             if skip_the_layer:
+                 layer_outputs = (None, None)
+
+             if output_attentions:
+                 all_self_attentions = all_self_attentions + (layer_outputs[1],)
+
+         hidden_states = self.layer_norm(hidden_states)
+         if output_hidden_states:
+             all_hidden_states = all_hidden_states + (hidden_states,)
+
+         if not return_dict:
+             return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
+         return BaseModelOutput(
+             last_hidden_state=hidden_states,
+             hidden_states=all_hidden_states,
+             attentions=all_self_attentions,
+         )
+
+
+ # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2GumbelVectorQuantizer with Wav2Vec2->Wav2Vec2Conformer
+ class Wav2Vec2ConformerGumbelVectorQuantizer(nn.Module):
+     """
+     Vector quantization using gumbel softmax. See [CATEGORICAL REPARAMETERIZATION WITH
+     GUMBEL-SOFTMAX](https://arxiv.org/pdf/1611.01144.pdf) for more information.
+     """
+
+     def __init__(self, config):
+         super().__init__()
+         self.num_groups = config.num_codevector_groups
+         self.num_vars = config.num_codevectors_per_group
+
+         if config.codevector_dim % self.num_groups != 0:
+             raise ValueError(
+                 f"`config.codevector_dim` {config.codevector_dim} must be divisible "
+                 f"by `config.num_codevector_groups` {self.num_groups} for concatenation"
+             )
+
+         # storage for codebook variables (codewords)
+         self.codevectors = nn.Parameter(
+             torch.FloatTensor(1, self.num_groups * self.num_vars, config.codevector_dim // self.num_groups)
+         )
+         self.weight_proj = nn.Linear(config.conv_dim[-1], self.num_groups * self.num_vars)
+
+         # can be decayed for training
+         self.temperature = 2
+
+     @staticmethod
+     def _compute_perplexity(probs, mask=None):
+         if mask is not None:
+             mask_extended = mask.flatten()[:, None, None].expand(probs.shape)
+             probs = torch.where(mask_extended, probs, torch.zeros_like(probs))
+             marginal_probs = probs.sum(dim=0) / mask.sum()
+         else:
+             marginal_probs = probs.mean(dim=0)
+
+         perplexity = torch.exp(-torch.sum(marginal_probs * torch.log(marginal_probs + 1e-7), dim=-1)).sum()
+         return perplexity
+
+     def forward(self, hidden_states, mask_time_indices=None):
+         batch_size, sequence_length, hidden_size = hidden_states.shape
+
+         # project to codevector dim
+         hidden_states = self.weight_proj(hidden_states)
+         hidden_states = hidden_states.view(batch_size * sequence_length * self.num_groups, -1)
+
+         if self.training:
+             # sample code vector probs via gumbel in a differentiable way
+             codevector_probs = nn.functional.gumbel_softmax(
+                 hidden_states.float(), tau=self.temperature, hard=True
+             ).type_as(hidden_states)
+
+             # compute perplexity
+             codevector_soft_dist = torch.softmax(
+                 hidden_states.view(batch_size * sequence_length, self.num_groups, -1).float(), dim=-1
+             )
+             perplexity = self._compute_perplexity(codevector_soft_dist, mask_time_indices)
+         else:
+             # take argmax in non-differentiable way
+             # compute hard codevector distribution (one hot)
+             codevector_idx = hidden_states.argmax(dim=-1)
+             codevector_probs = hidden_states.new_zeros(hidden_states.shape).scatter_(
+                 -1, codevector_idx.view(-1, 1), 1.0
+             )
+             codevector_probs = codevector_probs.view(batch_size * sequence_length, self.num_groups, -1)
+
+             perplexity = self._compute_perplexity(codevector_probs, mask_time_indices)
+
+         codevector_probs = codevector_probs.view(batch_size * sequence_length, -1)
+         # use probs to retrieve codevectors
+         codevectors_per_group = codevector_probs.unsqueeze(-1) * self.codevectors
+         codevectors = codevectors_per_group.view(batch_size * sequence_length, self.num_groups, self.num_vars, -1)
+         codevectors = codevectors.sum(-2).view(batch_size, sequence_length, -1)
+
+         return codevectors, perplexity
+
+
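+ # Editor's note (sketch): the returned perplexity is exp(entropy) of the marginal
+ # code distribution, summed over groups, so it ranges from num_groups (codebook
+ # collapse) up to num_groups * num_vars (uniform usage). Downstream, the wav2vec 2.0
+ # diversity loss (G * V - perplexity) / (G * V) pushes the quantizer toward uniform
+ # code usage.
+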
+ # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2Adapter with Wav2Vec2->Wav2Vec2Conformer
+ class Wav2Vec2ConformerAdapter(nn.Module):
+     def __init__(self, config):
+         super().__init__()
+
+         # feature dim might need to be down-projected
+         if config.output_hidden_size != config.hidden_size:
+             self.proj = nn.Linear(config.hidden_size, config.output_hidden_size)
+             self.proj_layer_norm = nn.LayerNorm(config.output_hidden_size)
+         else:
+             self.proj = self.proj_layer_norm = None
+
+         self.layers = nn.ModuleList(Wav2Vec2ConformerAdapterLayer(config) for _ in range(config.num_adapter_layers))
+         self.layerdrop = config.layerdrop
+
+     def forward(self, hidden_states):
+         # down project hidden_states if necessary
+         if self.proj is not None and self.proj_layer_norm is not None:
+             hidden_states = self.proj(hidden_states)
+             hidden_states = self.proj_layer_norm(hidden_states)
+
+         hidden_states = hidden_states.transpose(1, 2)
+
+         for layer in self.layers:
+             layerdrop_prob = np.random.random()
+             if not self.training or (layerdrop_prob > self.layerdrop):
+                 hidden_states = layer(hidden_states)
+
+         hidden_states = hidden_states.transpose(1, 2)
+         return hidden_states
+
+
+ # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2AdapterLayer with Wav2Vec2->Wav2Vec2Conformer
+ class Wav2Vec2ConformerAdapterLayer(nn.Module):
+     def __init__(self, config):
+         super().__init__()
+         self.conv = nn.Conv1d(
+             config.output_hidden_size,
+             2 * config.output_hidden_size,
+             config.adapter_kernel_size,
+             stride=config.adapter_stride,
+             padding=1,
+         )
+
+     def forward(self, hidden_states):
+         hidden_states = self.conv(hidden_states)
+         hidden_states = nn.functional.glu(hidden_states, dim=1)
+
+         return hidden_states
+
+
+ class Wav2Vec2ConformerPreTrainedModel(PreTrainedModel):
+     """
+     An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+     models.
+     """
+
+     config_class = Wav2Vec2ConformerConfig
+     base_model_prefix = "wav2vec2_conformer"
+     main_input_name = "input_values"
+     _keys_to_ignore_on_load_missing = [r"position_ids"]
+     supports_gradient_checkpointing = True
+
+     def _init_weights(self, module):
+         """Initialize the weights"""
+         # Wav2Vec2ConformerForPreTraining's last 2 linear layers need standard Linear init.
+         if isinstance(module, Wav2Vec2ConformerForPreTraining):
+             module.project_hid.reset_parameters()
+             module.project_q.reset_parameters()
+             module.project_hid._is_hf_initialized = True
+             module.project_q._is_hf_initialized = True
+         # gumbel softmax requires special init
+         elif isinstance(module, Wav2Vec2ConformerGumbelVectorQuantizer):
+             module.weight_proj.weight.data.normal_(mean=0.0, std=1)
+             module.weight_proj.bias.data.zero_()
+             nn.init.uniform_(module.codevectors)
+         elif isinstance(module, Wav2Vec2ConformerSelfAttention):
+             if hasattr(module, "pos_bias_u"):
+                 nn.init.xavier_uniform_(module.pos_bias_u)
+             if hasattr(module, "pos_bias_v"):
+                 nn.init.xavier_uniform_(module.pos_bias_v)
+         elif isinstance(module, Wav2Vec2ConformerPositionalConvEmbedding):
+             nn.init.normal_(
+                 module.conv.weight,
+                 mean=0,
+                 std=2 * math.sqrt(1 / (module.conv.kernel_size[0] * module.conv.in_channels)),
+             )
+             nn.init.constant_(module.conv.bias, 0)
+         elif isinstance(module, Wav2Vec2ConformerFeatureProjection):
+             k = math.sqrt(1 / module.projection.in_features)
+             nn.init.uniform_(module.projection.weight, a=-k, b=k)
+             nn.init.uniform_(module.projection.bias, a=-k, b=k)
+         elif isinstance(module, nn.Linear):
+             module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+
+             if module.bias is not None:
+                 module.bias.data.zero_()
+         elif isinstance(module, (nn.LayerNorm, nn.GroupNorm)):
+             module.bias.data.zero_()
+             module.weight.data.fill_(1.0)
+         elif isinstance(module, nn.Conv1d):
+             nn.init.kaiming_normal_(module.weight)
+
+             if module.bias is not None:
+                 k = math.sqrt(module.groups / (module.in_channels * module.kernel_size[0]))
+                 nn.init.uniform_(module.bias, a=-k, b=k)
+
+     def _get_feat_extract_output_lengths(
+         self, input_lengths: Union[torch.LongTensor, int], add_adapter: Optional[bool] = None
+     ):
+         """
+         Computes the output length of the convolutional layers
+         """
+
+         add_adapter = self.config.add_adapter if add_adapter is None else add_adapter
+
+         def _conv_out_length(input_length, kernel_size, stride):
+             # 1D convolutional layer output length formula taken
+             # from https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html
+             return torch.div(input_length - kernel_size, stride, rounding_mode="floor") + 1
+
+         for kernel_size, stride in zip(self.config.conv_kernel, self.config.conv_stride):
+             input_lengths = _conv_out_length(input_lengths, kernel_size, stride)
+
+         if add_adapter:
+             for _ in range(self.config.num_adapter_layers):
+                 input_lengths = _conv_out_length(input_lengths, 1, self.config.adapter_stride)
+
+         return input_lengths
+
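+     # Editor's worked example (assuming the default wav2vec2-style feature extractor,
+     # conv_kernel=(10, 3, 3, 3, 3, 2, 2) and conv_stride=(5, 2, 2, 2, 2, 2, 2)):
+     # applying floor((L - k) / s) + 1 per layer gives
+     # 16000 -> 3199 -> 1599 -> 799 -> 399 -> 199 -> 99 -> 49,
+     # i.e. one second of 16 kHz audio yields 49 feature frames (~20 ms hop).
+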
+     def _get_feature_vector_attention_mask(
+         self, feature_vector_length: int, attention_mask: torch.LongTensor, add_adapter=None
+     ):
+         # Effectively attention_mask.sum(-1), but not inplace to be able to run
+         # in inference mode.
+         non_padded_lengths = attention_mask.cumsum(dim=-1)[:, -1]
+
+         output_lengths = self._get_feat_extract_output_lengths(non_padded_lengths, add_adapter=add_adapter)
+         output_lengths = output_lengths.to(torch.long)
+
+         batch_size = attention_mask.shape[0]
+
+         attention_mask = torch.zeros(
+             (batch_size, feature_vector_length), dtype=attention_mask.dtype, device=attention_mask.device
+         )
+         # these two operations make sure that all values before the output length indices are attended to
+         attention_mask[(torch.arange(attention_mask.shape[0], device=attention_mask.device), output_lengths - 1)] = 1
+         attention_mask = attention_mask.flip([-1]).cumsum(-1).flip([-1]).bool()
+         return attention_mask
+
+     def _set_gradient_checkpointing(self, module, value=False):
+         if isinstance(module, (Wav2Vec2ConformerEncoder, Wav2Vec2ConformerFeatureEncoder)):
+             module.gradient_checkpointing = value
+
+
+ WAV2VEC2_CONFORMER_START_DOCSTRING = r"""
+     Wav2Vec2Conformer was proposed in [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech
+     Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael
+     Auli.
+
+     This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+     library implements for all its models (such as downloading or saving, etc.).
+
+     This model is a PyTorch [nn.Module](https://pytorch.org/docs/stable/nn.html#nn.Module) sub-class. Use it as a
+     regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage and behavior.
+
+     Parameters:
+         config ([`Wav2Vec2ConformerConfig`]): Model configuration class with all the parameters of the model.
+             Initializing with a config file does not load the weights associated with the model, only the
+             configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+ """
+
+
+ WAV2VEC2_CONFORMER_INPUTS_DOCSTRING = r"""
+     Args:
+         input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
+             Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
+             into an array of type `List[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install
+             soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and
+             conversion into a tensor of type `torch.FloatTensor`. See [`Wav2Vec2Processor.__call__`] for details.
+         attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+             Mask to avoid performing convolution and attention on padding token indices. Mask values selected in `[0,
+             1]`:
+
+             - 1 for tokens that are **not masked**,
+             - 0 for tokens that are **masked**.
+
+             [What are attention masks?](../glossary#attention-mask)
+
+             <Tip warning={true}>
+
+             `attention_mask` should only be passed if the corresponding processor has `config.return_attention_mask ==
+             True`. For all models whose processor has `config.return_attention_mask == False`, such as
+             [wav2vec2-conformer-rel-pos-large](https://huggingface.co/facebook/wav2vec2-conformer-rel-pos-large),
+             `attention_mask` should **not** be passed to avoid degraded performance when doing batched inference. For
+             such models `input_values` should simply be padded with 0 and passed without `attention_mask`. Be aware
+             that these models also yield slightly different results depending on whether `input_values` is padded or
+             not.
+
+             </Tip>
+
+         output_attentions (`bool`, *optional*):
+             Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+             tensors for more detail.
+         output_hidden_states (`bool`, *optional*):
+             Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+             more detail.
+         return_dict (`bool`, *optional*):
+             Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+ """
+
+
+ @add_start_docstrings(
+     "The bare Wav2Vec2Conformer Model transformer outputting raw hidden-states without any specific head on top.",
+     WAV2VEC2_CONFORMER_START_DOCSTRING,
+ )
+ class Wav2Vec2ConformerModel(Wav2Vec2ConformerPreTrainedModel):
+     def __init__(self, config: Wav2Vec2ConformerConfig):
+         super().__init__(config)
+         self.config = config
+         self.feature_extractor = Wav2Vec2ConformerFeatureEncoder(config)
+         self.feature_projection = Wav2Vec2ConformerFeatureProjection(config)
+
+         # model only needs masking vector if mask prob is > 0.0
+         if config.mask_time_prob > 0.0 or config.mask_feature_prob > 0.0:
+             self.masked_spec_embed = nn.Parameter(torch.FloatTensor(config.hidden_size).uniform_())
+
+         self.encoder = Wav2Vec2ConformerEncoder(config)
+
+         self.adapter = Wav2Vec2ConformerAdapter(config) if config.add_adapter else None
+
+         # Initialize weights and apply final processing
+         self.post_init()
+
+     # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2Model.freeze_feature_encoder
+     def freeze_feature_encoder(self):
+         """
+         Calling this function will disable the gradient computation for the feature encoder so that its parameters will
+         not be updated during training.
+         """
+         self.feature_extractor._freeze_parameters()
+
+     # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2Model._mask_hidden_states
+     def _mask_hidden_states(
+         self,
+         hidden_states: torch.FloatTensor,
+         mask_time_indices: Optional[torch.FloatTensor] = None,
+         attention_mask: Optional[torch.LongTensor] = None,
+     ):
+         """
+         Masks extracted features along time axis and/or along feature axis according to
+         [SpecAugment](https://arxiv.org/abs/1904.08779).
+         """
+
+         # `config.apply_spec_augment` can set masking to False
+         if not getattr(self.config, "apply_spec_augment", True):
+             return hidden_states
+
+         # generate indices & apply SpecAugment along time axis
+         batch_size, sequence_length, hidden_size = hidden_states.size()
+
+         if mask_time_indices is not None:
+             # apply SpecAugment along time axis with given mask_time_indices
+             hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype)
+         elif self.config.mask_time_prob > 0 and self.training:
+             mask_time_indices = _compute_mask_indices(
+                 (batch_size, sequence_length),
+                 mask_prob=self.config.mask_time_prob,
+                 mask_length=self.config.mask_time_length,
+                 attention_mask=attention_mask,
+                 min_masks=self.config.mask_time_min_masks,
+             )
+             mask_time_indices = torch.tensor(mask_time_indices, device=hidden_states.device, dtype=torch.bool)
+             hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype)
+
+         if self.config.mask_feature_prob > 0 and self.training:
+             # generate indices & apply SpecAugment along feature axis
+             mask_feature_indices = _compute_mask_indices(
+                 (batch_size, hidden_size),
+                 mask_prob=self.config.mask_feature_prob,
+                 mask_length=self.config.mask_feature_length,
+                 min_masks=self.config.mask_feature_min_masks,
+             )
+             mask_feature_indices = torch.tensor(mask_feature_indices, device=hidden_states.device, dtype=torch.bool)
+             mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1)
+             hidden_states[mask_feature_indices] = 0
+
+         return hidden_states
+
+     @add_start_docstrings_to_model_forward(WAV2VEC2_CONFORMER_INPUTS_DOCSTRING)
+     @add_code_sample_docstrings(
+         checkpoint=_CHECKPOINT_FOR_DOC,
+         output_type=Wav2Vec2BaseModelOutput,
+         config_class=_CONFIG_FOR_DOC,
+         modality="audio",
+         expected_output=_EXPECTED_OUTPUT_SHAPE,
+     )
+     # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2Model.forward with wav2vec2->wav2vec2_conformer
+     def forward(
+         self,
+         input_values: Optional[torch.Tensor],
+         attention_mask: Optional[torch.Tensor] = None,
+         mask_time_indices: Optional[torch.FloatTensor] = None,
+         output_attentions: Optional[bool] = None,
+         output_hidden_states: Optional[bool] = None,
+         return_dict: Optional[bool] = None,
+     ) -> Union[Tuple, Wav2Vec2BaseModelOutput]:
+         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+         output_hidden_states = (
+             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+         )
+         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+         extract_features = self.feature_extractor(input_values)
+         extract_features = extract_features.transpose(1, 2)
+
+         if attention_mask is not None:
+             # compute reduced attention_mask corresponding to feature vectors
+             attention_mask = self._get_feature_vector_attention_mask(
+                 extract_features.shape[1], attention_mask, add_adapter=False
+             )
+
+         hidden_states, extract_features = self.feature_projection(extract_features)
+         hidden_states = self._mask_hidden_states(
+             hidden_states, mask_time_indices=mask_time_indices, attention_mask=attention_mask
+         )
+
+         encoder_outputs = self.encoder(
+             hidden_states,
+             attention_mask=attention_mask,
+             output_attentions=output_attentions,
+             output_hidden_states=output_hidden_states,
+             return_dict=return_dict,
+         )
+
+         hidden_states = encoder_outputs[0]
+
+         if self.adapter is not None:
+             hidden_states = self.adapter(hidden_states)
+
+         if not return_dict:
+             return (hidden_states, extract_features) + encoder_outputs[1:]
+
+         return Wav2Vec2BaseModelOutput(
+             last_hidden_state=hidden_states,
+             extract_features=extract_features,
+             hidden_states=encoder_outputs.hidden_states,
+             attentions=encoder_outputs.attentions,
+         )
+
+
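+ # Editor's usage sketch (random weights, illustration only):
+ #
+ #     config = Wav2Vec2ConformerConfig()
+ #     model = Wav2Vec2ConformerModel(config)
+ #     wave = torch.randn(1, 16000)          # 1 s of 16 kHz audio
+ #     out = model(wave)                     # Wav2Vec2BaseModelOutput
+ #     out.last_hidden_state.shape           # (1, 49, config.hidden_size) with defaults
+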
+ @add_start_docstrings(
+     """Wav2Vec2Conformer Model with a quantizer and `VQ` head on top.""", WAV2VEC2_CONFORMER_START_DOCSTRING
+ )
+ class Wav2Vec2ConformerForPreTraining(Wav2Vec2ConformerPreTrainedModel):
+     # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForPreTraining.__init__ with Wav2Vec2->Wav2Vec2Conformer,wav2vec2->wav2vec2_conformer
+     def __init__(self, config: Wav2Vec2ConformerConfig):
+         super().__init__(config)
+         self.wav2vec2_conformer = Wav2Vec2ConformerModel(config)
+         self.dropout_features = nn.Dropout(config.feat_quantizer_dropout)
+
+         self.quantizer = Wav2Vec2ConformerGumbelVectorQuantizer(config)
+
+         self.project_hid = nn.Linear(config.hidden_size, config.proj_codevector_dim)
+         self.project_q = nn.Linear(config.codevector_dim, config.proj_codevector_dim)
+
+         # Initialize weights and apply final processing
+         self.post_init()
+
+     # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForPreTraining.set_gumbel_temperature
+     def set_gumbel_temperature(self, temperature: int):
+         """
+         Set the Gumbel softmax temperature to a given value. Only necessary for training.
+         """
+         self.quantizer.temperature = temperature
+
+     # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForPreTraining.freeze_feature_encoder with wav2vec2->wav2vec2_conformer
+     def freeze_feature_encoder(self):
+         """
+         Calling this function will disable the gradient computation for the feature encoder so that its parameters will
+         not be updated during training.
+         """
+         self.wav2vec2_conformer.feature_extractor._freeze_parameters()
+
+     @staticmethod
+     # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForPreTraining.compute_contrastive_logits
+     def compute_contrastive_logits(
+         target_features: torch.FloatTensor,
+         negative_features: torch.FloatTensor,
+         predicted_features: torch.FloatTensor,
+         temperature: float = 0.1,
+     ):
+         """
+         Compute logits for contrastive loss using cosine similarity as the distance measure between
+         `[positive_feature, negative_features]` and `[predicted_features]`. Additionally, temperature can be applied.
+         """
+         target_features = torch.cat([target_features, negative_features], dim=0)
+
+         logits = torch.cosine_similarity(predicted_features.float(), target_features.float(), dim=-1).type_as(
+             target_features
+         )
+
+         # apply temperature
+         logits = logits / temperature
+         return logits
+
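+     # Editor's note (sketch): `compute_contrastive_logits` concatenates the positive
+     # target in front of the negatives along a new leading axis, so with K negatives
+     # the returned logits have shape (1 + K, batch, seq_len); index 0 along the first
+     # axis is the positive class for the cross-entropy-based contrastive loss.
+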
+     @add_start_docstrings_to_model_forward(WAV2VEC2_CONFORMER_INPUTS_DOCSTRING)
+     @replace_return_docstrings(output_type=Wav2Vec2ConformerForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
+     # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForPreTraining.forward with Wav2Vec2->Wav2Vec2Conformer,wav2vec2->wav2vec2_conformer,wav2vec2_conformer-base->wav2vec2-conformer-rel-pos-large
+     def forward(
+         self,
+         input_values: Optional[torch.Tensor],
+         attention_mask: Optional[torch.Tensor] = None,
+         mask_time_indices: Optional[torch.BoolTensor] = None,
+         sampled_negative_indices: Optional[torch.BoolTensor] = None,
+         output_attentions: Optional[bool] = None,
+         output_hidden_states: Optional[bool] = None,
+         return_dict: Optional[bool] = None,
+     ) -> Union[Tuple, Wav2Vec2ConformerForPreTrainingOutput]:
+         r"""
+         mask_time_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
+             Indices to mask extracted features for contrastive loss. When in training mode, the model learns to
+             predict masked extracted features in *config.proj_codevector_dim* space.
+         sampled_negative_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_negatives)`, *optional*):
+             Indices indicating which quantized target vectors are used as negative sampled vectors in contrastive loss.
+             Required input for pre-training.
+
+         Returns:
+
+         Example:
+
+         ```python
+         >>> import torch
+         >>> from transformers import AutoFeatureExtractor, Wav2Vec2ConformerForPreTraining
+         >>> from transformers.models.wav2vec2_conformer.modeling_wav2vec2_conformer import (
+         ...     _compute_mask_indices,
+         ...     _sample_negative_indices,
+         ... )
+         >>> from datasets import load_dataset
+
+         >>> feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-conformer-rel-pos-large")
+         >>> model = Wav2Vec2ConformerForPreTraining.from_pretrained("facebook/wav2vec2-conformer-rel-pos-large")
+
+         >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+         >>> input_values = feature_extractor(ds[0]["audio"]["array"], return_tensors="pt").input_values  # Batch size 1
+
+         >>> # compute masked indices
+         >>> batch_size, raw_sequence_length = input_values.shape
+         >>> sequence_length = model._get_feat_extract_output_lengths(raw_sequence_length).item()
+         >>> mask_time_indices = _compute_mask_indices(
+         ...     shape=(batch_size, sequence_length), mask_prob=0.2, mask_length=2
+         ... )
+         >>> sampled_negative_indices = _sample_negative_indices(
+         ...     features_shape=(batch_size, sequence_length),
+         ...     num_negatives=model.config.num_negatives,
+         ...     mask_time_indices=mask_time_indices,
+         ... )
+         >>> mask_time_indices = torch.tensor(data=mask_time_indices, device=input_values.device, dtype=torch.long)
+         >>> sampled_negative_indices = torch.tensor(
+         ...     data=sampled_negative_indices, device=input_values.device, dtype=torch.long
+         ... )
+
+         >>> with torch.no_grad():
+         ...     outputs = model(input_values, mask_time_indices=mask_time_indices)
+
+         >>> # compute cosine similarity between predicted (=projected_states) and target (=projected_quantized_states)
+         >>> cosine_sim = torch.cosine_similarity(outputs.projected_states, outputs.projected_quantized_states, dim=-1)
+
+         >>> # show that cosine similarity is much higher than random
+         >>> cosine_sim[mask_time_indices.to(torch.bool)].mean() > 0.5
+         tensor(True)
+
+         >>> # for contrastive loss training model should be put into train mode
+         >>> model = model.train()
+         >>> loss = model(
+         ...     input_values, mask_time_indices=mask_time_indices, sampled_negative_indices=sampled_negative_indices
+         ... ).loss
+         ```"""
+
+         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+         if mask_time_indices is not None:
+             mask_time_indices = mask_time_indices.to(torch.bool)
+
+         outputs = self.wav2vec2_conformer(
+             input_values,
+             attention_mask=attention_mask,
+             output_attentions=output_attentions,
+             output_hidden_states=output_hidden_states,
+             mask_time_indices=mask_time_indices,
+             return_dict=return_dict,
+         )
+
+         # 1. project all transformed features (including masked) to final vq dim
+         transformer_features = self.project_hid(outputs[0])
+
+         # 2. quantize all (unmasked) extracted features and project to final vq dim
+         extract_features = self.dropout_features(outputs[1])
+
+         if attention_mask is not None:
+             # compute reduced attention_mask corresponding to feature vectors
1522
+ attention_mask = self._get_feature_vector_attention_mask(
1523
+ extract_features.shape[1], attention_mask, add_adapter=False
1524
+ )
1525
+
1526
+ quantized_features, codevector_perplexity = self.quantizer(
1527
+ extract_features, mask_time_indices=mask_time_indices
1528
+ )
1529
+ quantized_features = self.project_q(quantized_features)
1530
+
1531
+ loss = contrastive_loss = diversity_loss = None
1532
+ if sampled_negative_indices is not None:
1533
+ batch_size, sequence_length, hidden_size = quantized_features.shape
1534
+
1535
+ # for training, we sample negatives
1536
+ # 3. sample K negatives (distractors) quantized states for contrastive loss
1537
+ # if attention_mask is passed, make sure that padded feature vectors cannot be sampled
1538
+ # sample negative quantized vectors BTC => (BxT)C
1539
+ negative_quantized_features = quantized_features.view(-1, hidden_size)[
1540
+ sampled_negative_indices.long().view(-1)
1541
+ ]
1542
+ negative_quantized_features = negative_quantized_features.view(
1543
+ batch_size, sequence_length, -1, hidden_size
1544
+ ).permute(2, 0, 1, 3)
1545
+
1546
+ # 4. compute logits, corresponding to `logs = sim(c_t, [q_t, \sim{q}_t]) / \kappa`
1547
+ # of equation (3) in https://arxiv.org/pdf/2006.11477.pdf
1548
+ logits = self.compute_contrastive_logits(
1549
+ quantized_features[None, :],
1550
+ negative_quantized_features,
1551
+ transformer_features,
1552
+ self.config.contrastive_logits_temperature,
1553
+ )
1554
+
1555
+ # 5. if a negative vector is identical to the positive (i.e. when codebook utilization is low),
1556
+ # its cosine similarity will be masked
1557
+ neg_is_pos = (quantized_features == negative_quantized_features).all(-1)
1558
+
1559
+ if neg_is_pos.any():
1560
+ logits[1:][neg_is_pos] = float("-inf")
1561
+
1562
+ # 6. compute contrastive loss \mathbf{L}_m = cross_entropy(logs) =
1563
+ # -log(exp(sim(c_t, q_t)/\kappa) / \sum_{\sim{q}} exp(sim(c_t, \sim{q})/\kappa))
1564
+ logits = logits.transpose(0, 2).reshape(-1, logits.size(0))
1565
+ target = ((1 - mask_time_indices.long()) * -100).transpose(0, 1).flatten()
1566
+
1567
+ contrastive_loss = nn.functional.cross_entropy(logits.float(), target, reduction="sum")
1568
+ # 7. compute diversity loss: \mathbf{L}_d
1569
+ num_codevectors = self.config.num_codevectors_per_group * self.config.num_codevector_groups
1570
+ diversity_loss = ((num_codevectors - codevector_perplexity) / num_codevectors) * mask_time_indices.sum()
1571
+
1572
+ # 8. \mathbf{L} = \mathbf{L}_m + \alpha * \mathbf{L}_d
1573
+ loss = contrastive_loss + self.config.diversity_loss_weight * diversity_loss
1574
+
1575
+ if not return_dict:
1576
+ if loss is not None:
1577
+ return (loss, transformer_features, quantized_features, codevector_perplexity) + outputs[2:]
1578
+ return (transformer_features, quantized_features, codevector_perplexity) + outputs[2:]
1579
+
1580
+ return Wav2Vec2ConformerForPreTrainingOutput(
1581
+ loss=loss,
1582
+ projected_states=transformer_features,
1583
+ projected_quantized_states=quantized_features,
1584
+ codevector_perplexity=codevector_perplexity,
1585
+ hidden_states=outputs.hidden_states,
1586
+ attentions=outputs.attentions,
1587
+ contrastive_loss=contrastive_loss,
1588
+ diversity_loss=diversity_loss,
1589
+ )
1590
+
1591
+
1592
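For reference, steps 6–8 of the `forward` above implement the wav2vec 2.0 pre-training objective (eqs. (3)–(4) of the paper linked in the comments). With $\kappa$ = `contrastive_logits_temperature`, $\alpha$ = `diversity_loss_weight`, and $GV$ = `num_codevector_groups * num_codevectors_per_group`, the loss summed over masked time steps is:

$$
\mathcal{L} = \mathcal{L}_m + \alpha\,\mathcal{L}_d,\qquad
\mathcal{L}_m = -\log\frac{\exp\big(\mathrm{sim}(\mathbf{c}_t,\mathbf{q}_t)/\kappa\big)}{\sum_{\tilde{\mathbf{q}}\in Q_t}\exp\big(\mathrm{sim}(\mathbf{c}_t,\tilde{\mathbf{q}})/\kappa\big)},\qquad
\mathcal{L}_d = \frac{GV-\text{perplexity}}{GV}
$$

where the diversity term is scaled by the number of masked steps, exactly as in the code.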
+ @add_start_docstrings(
+     """Wav2Vec2Conformer Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).""",
+     WAV2VEC2_CONFORMER_START_DOCSTRING,
+ )
+ class Wav2Vec2ConformerForCTC(Wav2Vec2ConformerPreTrainedModel):
+     # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForCTC.__init__ with Wav2Vec2->Wav2Vec2Conformer,wav2vec2->wav2vec2_conformer
+     def __init__(self, config):
+         super().__init__(config)
+
+         self.wav2vec2_conformer = Wav2Vec2ConformerModel(config)
+         self.dropout = nn.Dropout(config.final_dropout)
+
+         if config.vocab_size is None:
+             raise ValueError(
+                 f"You are trying to instantiate {self.__class__} with a configuration that "
+                 "does not define the vocabulary size of the language model head. Please "
+                 "instantiate the model as follows: `Wav2Vec2ConformerForCTC.from_pretrained(..., vocab_size=vocab_size)`, "
+                 "or define `vocab_size` in your model's configuration."
+             )
+         output_hidden_size = (
+             config.output_hidden_size if hasattr(config, "add_adapter") and config.add_adapter else config.hidden_size
+         )
+         self.lm_head = nn.Linear(output_hidden_size, config.vocab_size)
+
+         # Initialize weights and apply final processing
+         self.post_init()
+
+     # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForCTC.freeze_feature_encoder with wav2vec2->wav2vec2_conformer
+     def freeze_feature_encoder(self):
+         """
+         Calling this function will disable the gradient computation for the feature encoder so that its parameters will
+         not be updated during training.
+         """
+         self.wav2vec2_conformer.feature_extractor._freeze_parameters()
+
+     @add_start_docstrings_to_model_forward(WAV2VEC2_CONFORMER_INPUTS_DOCSTRING)
+     @add_code_sample_docstrings(
+         checkpoint=_CHECKPOINT_FOR_DOC,
+         output_type=CausalLMOutput,
+         config_class=_CONFIG_FOR_DOC,
+         expected_output=_CTC_EXPECTED_OUTPUT,
+         expected_loss=_CTC_EXPECTED_LOSS,
+     )
+     # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForCTC.forward with Wav2Vec2->Wav2Vec2Conformer,wav2vec2->wav2vec2_conformer
+     def forward(
+         self,
+         input_values: Optional[torch.Tensor],
+         attention_mask: Optional[torch.Tensor] = None,
+         output_attentions: Optional[bool] = None,
+         output_hidden_states: Optional[bool] = None,
+         return_dict: Optional[bool] = None,
+         labels: Optional[torch.Tensor] = None,
+     ) -> Union[Tuple, CausalLMOutput]:
+         r"""
+         labels (`torch.LongTensor` of shape `(batch_size, target_length)`, *optional*):
+             Labels for connectionist temporal classification. Note that `target_length` has to be smaller or equal to
+             the sequence length of the output logits. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`.
+             All labels set to `-100` are ignored (masked); the loss is only computed for labels in `[0, ...,
+             config.vocab_size - 1]`.
+         """
+
+         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+         outputs = self.wav2vec2_conformer(
+             input_values,
+             attention_mask=attention_mask,
+             output_attentions=output_attentions,
+             output_hidden_states=output_hidden_states,
+             return_dict=return_dict,
+         )
+
+         hidden_states = outputs[0]
+         hidden_states = self.dropout(hidden_states)
+
+         logits = self.lm_head(hidden_states)
+
+         loss = None
+         if labels is not None:
+             if labels.max() >= self.config.vocab_size:
+                 raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}")
+
+             # retrieve loss input_lengths from attention_mask
+             attention_mask = (
+                 attention_mask if attention_mask is not None else torch.ones_like(input_values, dtype=torch.long)
+             )
+             input_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(-1)).to(torch.long)
+
+             # assuming that padded tokens are filled with -100
+             # when not being attended to
+             labels_mask = labels >= 0
+             target_lengths = labels_mask.sum(-1)
+             flattened_targets = labels.masked_select(labels_mask)
+
+             # ctc_loss doesn't support fp16
+             log_probs = nn.functional.log_softmax(logits, dim=-1, dtype=torch.float32).transpose(0, 1)
+
+             with torch.backends.cudnn.flags(enabled=False):
+                 loss = nn.functional.ctc_loss(
+                     log_probs,
+                     flattened_targets,
+                     input_lengths,
+                     target_lengths,
+                     blank=self.config.pad_token_id,
+                     reduction=self.config.ctc_loss_reduction,
+                     zero_infinity=self.config.ctc_zero_infinity,
+                 )
+
+         if not return_dict:
+             output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:]
+             return ((loss,) + output) if loss is not None else output
+
+         return CausalLMOutput(
+             loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions
+         )
+
+
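A minimal inference sketch for the CTC head; the checkpoint name is illustrative (any CTC fine-tuned Wav2Vec2Conformer checkpoint works) and `raw_audio` is assumed to be a 1-D float waveform at 16 kHz:

```python
import torch
from transformers import AutoProcessor, Wav2Vec2ConformerForCTC

# illustrative checkpoint; substitute your own fine-tuned model
name = "facebook/wav2vec2-conformer-rope-large-960h-ft"
processor = AutoProcessor.from_pretrained(name)
model = Wav2Vec2ConformerForCTC.from_pretrained(name)

inputs = processor(raw_audio, sampling_rate=16_000, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits        # (batch, frames, vocab_size)
pred_ids = torch.argmax(logits, dim=-1)    # greedy CTC decoding
transcription = processor.batch_decode(pred_ids)
```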
+ @add_start_docstrings(
+     """
+     Wav2Vec2Conformer Model with a sequence classification head on top (a linear layer over the pooled output) for
+     tasks like SUPERB Keyword Spotting.
+     """,
+     WAV2VEC2_CONFORMER_START_DOCSTRING,
+ )
+ class Wav2Vec2ConformerForSequenceClassification(Wav2Vec2ConformerPreTrainedModel):
+     # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForSequenceClassification.__init__ with Wav2Vec2->Wav2Vec2Conformer,wav2vec2->wav2vec2_conformer
+     def __init__(self, config):
+         super().__init__(config)
+
+         if hasattr(config, "add_adapter") and config.add_adapter:
+             raise ValueError(
+                 "Sequence classification does not support the use of Wav2Vec2Conformer adapters (config.add_adapter=True)"
+             )
+         self.wav2vec2_conformer = Wav2Vec2ConformerModel(config)
+         num_layers = config.num_hidden_layers + 1  # transformer layers + input embeddings
+         if config.use_weighted_layer_sum:
+             self.layer_weights = nn.Parameter(torch.ones(num_layers) / num_layers)
+         self.projector = nn.Linear(config.hidden_size, config.classifier_proj_size)
+         self.classifier = nn.Linear(config.classifier_proj_size, config.num_labels)
+
+         # Initialize weights and apply final processing
+         self.post_init()
+
+     # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForSequenceClassification.freeze_feature_encoder with wav2vec2->wav2vec2_conformer
+     def freeze_feature_encoder(self):
+         """
+         Calling this function will disable the gradient computation for the feature encoder so that its parameters will
+         not be updated during training.
+         """
+         self.wav2vec2_conformer.feature_extractor._freeze_parameters()
+
+     def freeze_base_model(self):
+         """
+         Calling this function will disable the gradient computation for the base model so that its parameters will not
+         be updated during training. Only the classification head will be updated.
+         """
+         for param in self.wav2vec2_conformer.parameters():
+             param.requires_grad = False
+
+     @add_start_docstrings_to_model_forward(WAV2VEC2_CONFORMER_INPUTS_DOCSTRING)
+     @add_code_sample_docstrings(
+         checkpoint=_CHECKPOINT_FOR_DOC,
+         output_type=SequenceClassifierOutput,
+         config_class=_CONFIG_FOR_DOC,
+         modality="audio",
+     )
+     # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForSequenceClassification.forward with Wav2Vec2->Wav2Vec2Conformer,wav2vec2->wav2vec2_conformer,WAV_2_VEC_2->WAV2VEC2_CONFORMER
+     def forward(
+         self,
+         input_values: Optional[torch.Tensor],
+         attention_mask: Optional[torch.Tensor] = None,
+         output_attentions: Optional[bool] = None,
+         output_hidden_states: Optional[bool] = None,
+         return_dict: Optional[bool] = None,
+         labels: Optional[torch.Tensor] = None,
+     ) -> Union[Tuple, SequenceClassifierOutput]:
+         r"""
+         labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+             Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+             config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss); if
+             `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+         """
+
+         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+         output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states
+
+         outputs = self.wav2vec2_conformer(
+             input_values,
+             attention_mask=attention_mask,
+             output_attentions=output_attentions,
+             output_hidden_states=output_hidden_states,
+             return_dict=return_dict,
+         )
+
+         if self.config.use_weighted_layer_sum:
+             hidden_states = outputs[_HIDDEN_STATES_START_POSITION]
+             hidden_states = torch.stack(hidden_states, dim=1)
+             norm_weights = nn.functional.softmax(self.layer_weights, dim=-1)
+             hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1)
+         else:
+             hidden_states = outputs[0]
+
+         hidden_states = self.projector(hidden_states)
+         if attention_mask is None:
+             pooled_output = hidden_states.mean(dim=1)
+         else:
+             padding_mask = self._get_feature_vector_attention_mask(hidden_states.shape[1], attention_mask)
+             hidden_states[~padding_mask] = 0.0
+             pooled_output = hidden_states.sum(dim=1) / padding_mask.sum(dim=1).view(-1, 1)
+
+         logits = self.classifier(pooled_output)
+
+         loss = None
+         if labels is not None:
+             loss_fct = CrossEntropyLoss()
+             loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1))
+
+         if not return_dict:
+             output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:]
+             return ((loss,) + output) if loss is not None else output
+
+         return SequenceClassifierOutput(
+             loss=loss,
+             logits=logits,
+             hidden_states=outputs.hidden_states,
+             attentions=outputs.attentions,
+         )
+
+
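A minimal sketch of the mask-aware mean pooling used above (shapes are illustrative): padded frames are zeroed out, then each example is averaged over its number of valid frames only.

```python
import torch

hidden_states = torch.randn(2, 6, 4)                      # (batch, frames, proj_size)
padding_mask = torch.tensor([[1, 1, 1, 1, 0, 0],
                             [1, 1, 1, 1, 1, 1]]).bool()  # True = real frame
hidden_states[~padding_mask] = 0.0                        # zero out padded frames
pooled = hidden_states.sum(dim=1) / padding_mask.sum(dim=1).view(-1, 1)
```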
+ @add_start_docstrings(
+     """
+     Wav2Vec2Conformer Model with a frame classification head on top for tasks like Speaker Diarization.
+     """,
+     WAV2VEC2_CONFORMER_START_DOCSTRING,
+ )
+ class Wav2Vec2ConformerForAudioFrameClassification(Wav2Vec2ConformerPreTrainedModel):
+     # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForAudioFrameClassification.__init__ with Wav2Vec2->Wav2Vec2Conformer,wav2vec2->wav2vec2_conformer,WAV_2_VEC_2->WAV2VEC2_CONFORMER
+     def __init__(self, config):
+         super().__init__(config)
+
+         if hasattr(config, "add_adapter") and config.add_adapter:
+             raise ValueError(
+                 "Audio frame classification does not support the use of Wav2Vec2Conformer adapters (config.add_adapter=True)"
+             )
+         self.wav2vec2_conformer = Wav2Vec2ConformerModel(config)
+         num_layers = config.num_hidden_layers + 1  # transformer layers + input embeddings
+         if config.use_weighted_layer_sum:
+             self.layer_weights = nn.Parameter(torch.ones(num_layers) / num_layers)
+         self.classifier = nn.Linear(config.hidden_size, config.num_labels)
+         self.num_labels = config.num_labels
+
+         self.init_weights()
+
+     # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForAudioFrameClassification.freeze_feature_encoder with wav2vec2->wav2vec2_conformer
+     def freeze_feature_encoder(self):
+         """
+         Calling this function will disable the gradient computation for the feature encoder so that its parameters will
+         not be updated during training.
+         """
+         self.wav2vec2_conformer.feature_extractor._freeze_parameters()
+
+     # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForAudioFrameClassification.freeze_base_model with wav2vec2->wav2vec2_conformer
+     def freeze_base_model(self):
+         """
+         Calling this function will disable the gradient computation for the base model so that its parameters will not
+         be updated during training. Only the classification head will be updated.
+         """
+         for param in self.wav2vec2_conformer.parameters():
+             param.requires_grad = False
+
+     @add_start_docstrings_to_model_forward(WAV2VEC2_CONFORMER_INPUTS_DOCSTRING)
+     @add_code_sample_docstrings(
+         checkpoint=_CHECKPOINT_FOR_DOC,
+         output_type=TokenClassifierOutput,
+         config_class=_CONFIG_FOR_DOC,
+         modality="audio",
+     )
+     # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForAudioFrameClassification.forward with wav2vec2->wav2vec2_conformer
+     def forward(
+         self,
+         input_values: Optional[torch.Tensor],
+         attention_mask: Optional[torch.Tensor] = None,
+         labels: Optional[torch.Tensor] = None,
+         output_attentions: Optional[bool] = None,
+         output_hidden_states: Optional[bool] = None,
+         return_dict: Optional[bool] = None,
+     ) -> Union[Tuple, TokenClassifierOutput]:
+         r"""
+         labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+             Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+             config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss); if
+             `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+         """
+
+         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+         output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states
+
+         outputs = self.wav2vec2_conformer(
+             input_values,
+             attention_mask=attention_mask,
+             output_attentions=output_attentions,
+             output_hidden_states=output_hidden_states,
+             return_dict=return_dict,
+         )
+
+         if self.config.use_weighted_layer_sum:
+             hidden_states = outputs[_HIDDEN_STATES_START_POSITION]
+             hidden_states = torch.stack(hidden_states, dim=1)
+             norm_weights = nn.functional.softmax(self.layer_weights, dim=-1)
+             hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1)
+         else:
+             hidden_states = outputs[0]
+
+         logits = self.classifier(hidden_states)
+
+         loss = None
+         if labels is not None:
+             loss_fct = CrossEntropyLoss()
+             loss = loss_fct(logits.view(-1, self.num_labels), torch.argmax(labels.view(-1, self.num_labels), axis=1))
+
+         if not return_dict:
+             output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:]
+             return output
+
+         return TokenClassifierOutput(
+             loss=loss,
+             logits=logits,
+             hidden_states=outputs.hidden_states,
+             attentions=outputs.attentions,
+         )
+
+
+ # Copied from transformers.models.wav2vec2.modeling_wav2vec2.AMSoftmaxLoss
+ class AMSoftmaxLoss(nn.Module):
+     def __init__(self, input_dim, num_labels, scale=30.0, margin=0.4):
+         super(AMSoftmaxLoss, self).__init__()
+         self.scale = scale
+         self.margin = margin
+         self.num_labels = num_labels
+         self.weight = nn.Parameter(torch.randn(input_dim, num_labels), requires_grad=True)
+         self.loss = nn.CrossEntropyLoss()
+
+     def forward(self, hidden_states, labels):
+         labels = labels.flatten()
+         weight = nn.functional.normalize(self.weight, dim=0)
+         hidden_states = nn.functional.normalize(hidden_states, dim=1)
+         cos_theta = torch.mm(hidden_states, weight)
+         psi = cos_theta - self.margin
+
+         onehot = nn.functional.one_hot(labels, self.num_labels)
+         logits = self.scale * torch.where(onehot.bool(), psi, cos_theta)
+         loss = self.loss(logits, labels)
+
+         return loss
+
+
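The AM-Softmax objective above applies an additive margin $m$ to the target-class cosine logit before scaling; with the defaults $s = 30$ and $m = 0.4$ it reads:

$$
\mathcal{L}_{\mathrm{AMS}} = -\log
\frac{e^{s(\cos\theta_{y} - m)}}{e^{s(\cos\theta_{y} - m)} + \sum_{j \neq y} e^{s\cos\theta_{j}}}
$$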
+ # Copied from transformers.models.wav2vec2.modeling_wav2vec2.TDNNLayer
+ class TDNNLayer(nn.Module):
+     def __init__(self, config, layer_id=0):
+         super().__init__()
+         self.in_conv_dim = config.tdnn_dim[layer_id - 1] if layer_id > 0 else config.tdnn_dim[layer_id]
+         self.out_conv_dim = config.tdnn_dim[layer_id]
+         self.kernel_size = config.tdnn_kernel[layer_id]
+         self.dilation = config.tdnn_dilation[layer_id]
+
+         self.kernel = nn.Linear(self.in_conv_dim * self.kernel_size, self.out_conv_dim)
+         self.activation = nn.ReLU()
+
+     def forward(self, hidden_states):
+         hidden_states = hidden_states.unsqueeze(1)
+         hidden_states = nn.functional.unfold(
+             hidden_states,
+             (self.kernel_size, self.in_conv_dim),
+             stride=(1, self.in_conv_dim),
+             dilation=(self.dilation, 1),
+         )
+         hidden_states = hidden_states.transpose(1, 2)
+         hidden_states = self.kernel(hidden_states)
+
+         hidden_states = self.activation(hidden_states)
+         return hidden_states
+
+
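The `unfold` + `nn.Linear` pattern in `TDNNLayer` is functionally a dilated 1-D convolution over time; a rough equivalence sketch (shapes only, not a parameter-for-parameter replacement):

```python
import torch
import torch.nn as nn

x = torch.randn(2, 50, 512)                    # (batch, time, in_conv_dim)
conv = nn.Conv1d(512, 512, kernel_size=5, dilation=1)
y = conv(x.transpose(1, 2)).transpose(1, 2)    # (batch, time - 4, out_conv_dim)
```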
+ @add_start_docstrings(
+     """
+     Wav2Vec2Conformer Model with an XVector feature extraction head on top for tasks like Speaker Verification.
+     """,
+     WAV2VEC2_CONFORMER_START_DOCSTRING,
+ )
+ class Wav2Vec2ConformerForXVector(Wav2Vec2ConformerPreTrainedModel):
+     def __init__(self, config):
+         super().__init__(config)
+
+         self.wav2vec2_conformer = Wav2Vec2ConformerModel(config)
+         num_layers = config.num_hidden_layers + 1  # transformer layers + input embeddings
+         if config.use_weighted_layer_sum:
+             self.layer_weights = nn.Parameter(torch.ones(num_layers) / num_layers)
+         self.projector = nn.Linear(config.hidden_size, config.tdnn_dim[0])
+
+         tdnn_layers = [TDNNLayer(config, i) for i in range(len(config.tdnn_dim))]
+         self.tdnn = nn.ModuleList(tdnn_layers)
+
+         self.feature_extractor = nn.Linear(config.tdnn_dim[-1] * 2, config.xvector_output_dim)
+         self.classifier = nn.Linear(config.xvector_output_dim, config.xvector_output_dim)
+
+         self.objective = AMSoftmaxLoss(config.xvector_output_dim, config.num_labels)
+
+         self.init_weights()
+
+     # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForXVector.freeze_feature_encoder with wav2vec2->wav2vec2_conformer
+     def freeze_feature_encoder(self):
+         """
+         Calling this function will disable the gradient computation for the feature encoder so that its parameters will
+         not be updated during training.
+         """
+         self.wav2vec2_conformer.feature_extractor._freeze_parameters()
+
+     # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForXVector.freeze_base_model with wav2vec2->wav2vec2_conformer
+     def freeze_base_model(self):
+         """
+         Calling this function will disable the gradient computation for the base model so that its parameters will not
+         be updated during training. Only the classification head will be updated.
+         """
+         for param in self.wav2vec2_conformer.parameters():
+             param.requires_grad = False
+
+     # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForXVector._get_tdnn_output_lengths with wav2vec2->wav2vec2_conformer
+     def _get_tdnn_output_lengths(self, input_lengths: Union[torch.LongTensor, int]):
+         """
+         Computes the output length of the TDNN layers
+         """
+
+         def _conv_out_length(input_length, kernel_size, stride):
+             # 1D convolutional layer output length formula taken
+             # from https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html
+             return (input_length - kernel_size) // stride + 1
+
+         for kernel_size in self.config.tdnn_kernel:
+             input_lengths = _conv_out_length(input_lengths, kernel_size, 1)
+
+         return input_lengths
+
+     @add_start_docstrings_to_model_forward(WAV2VEC2_CONFORMER_INPUTS_DOCSTRING)
+     @add_code_sample_docstrings(
+         checkpoint=_CHECKPOINT_FOR_DOC,
+         output_type=XVectorOutput,
+         config_class=_CONFIG_FOR_DOC,
+         modality="audio",
+     )
+     # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForXVector.forward with Wav2Vec2->Wav2Vec2Conformer,wav2vec2->wav2vec2_conformer,WAV_2_VEC_2->WAV2VEC2_CONFORMER
+     def forward(
+         self,
+         input_values: Optional[torch.Tensor],
+         attention_mask: Optional[torch.Tensor] = None,
+         output_attentions: Optional[bool] = None,
+         output_hidden_states: Optional[bool] = None,
+         return_dict: Optional[bool] = None,
+         labels: Optional[torch.Tensor] = None,
+     ) -> Union[Tuple, XVectorOutput]:
+         r"""
+         labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+             Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+             config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss); if
+             `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+         """
+
+         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+         output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states
+
+         outputs = self.wav2vec2_conformer(
+             input_values,
+             attention_mask=attention_mask,
+             output_attentions=output_attentions,
+             output_hidden_states=output_hidden_states,
+             return_dict=return_dict,
+         )
+
+         if self.config.use_weighted_layer_sum:
+             hidden_states = outputs[_HIDDEN_STATES_START_POSITION]
+             hidden_states = torch.stack(hidden_states, dim=1)
+             norm_weights = nn.functional.softmax(self.layer_weights, dim=-1)
+             hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1)
+         else:
+             hidden_states = outputs[0]
+
+         hidden_states = self.projector(hidden_states)
+
+         for tdnn_layer in self.tdnn:
+             hidden_states = tdnn_layer(hidden_states)
+
+         # Statistic Pooling
+         if attention_mask is None:
+             mean_features = hidden_states.mean(dim=1)
+             std_features = hidden_states.std(dim=1)
+         else:
+             feat_extract_output_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(dim=1))
+             tdnn_output_lengths = self._get_tdnn_output_lengths(feat_extract_output_lengths)
+             mean_features = []
+             std_features = []
+             for i, length in enumerate(tdnn_output_lengths):
+                 mean_features.append(hidden_states[i, :length].mean(dim=0))
+                 std_features.append(hidden_states[i, :length].std(dim=0))
+             mean_features = torch.stack(mean_features)
+             std_features = torch.stack(std_features)
+         statistic_pooling = torch.cat([mean_features, std_features], dim=-1)
+
+         output_embeddings = self.feature_extractor(statistic_pooling)
+         logits = self.classifier(output_embeddings)
+
+         loss = None
+         if labels is not None:
+             loss = self.objective(logits, labels)
+
+         if not return_dict:
+             output = (logits, output_embeddings) + outputs[_HIDDEN_STATES_START_POSITION:]
+             return ((loss,) + output) if loss is not None else output
+
+         return XVectorOutput(
+             loss=loss,
+             logits=logits,
+             embeddings=output_embeddings,
+             hidden_states=outputs.hidden_states,
+             attentions=outputs.attentions,
+         )
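The statistic pooling step above reduces a variable-length frame sequence to a fixed-size utterance embedding by concatenating the temporal mean and standard deviation of the TDNN outputs:

$$
\mathbf{s}=\big[\,\mu(\mathbf{h}_{1:T})\,;\,\sigma(\mathbf{h}_{1:T})\,\big]\in\mathbb{R}^{2d},\qquad d=\texttt{tdnn\_dim}[-1]
$$

which is why `feature_extractor` takes an input of size `config.tdnn_dim[-1] * 2`.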
MuCodec/muq_dev/muq_fairseq/models/muq/modules/random_quantizer.py ADDED
@@ -0,0 +1,68 @@
+ import torch
+ from torch import nn, einsum
+ from einops import rearrange
+
+
+ class RandomProjectionQuantizer(nn.Module):
+     """
+     Random projection and codebook lookup module.
+
+     Some code is borrowed from:
+     https://github.com/lucidrains/vector-quantize-pytorch/blob/master/vector_quantize_pytorch/random_projection_quantizer.py
+     But normalization here uses a pre-computed global mean & variance instead of layer norm.
+     """
+
+     def __init__(
+         self,
+         input_dim,
+         codebook_dim,
+         codebook_size,
+         seed=142,
+     ):
+         super().__init__()
+
+         # random seed
+         torch.manual_seed(seed)
+
+         # randomly initialized projection
+         random_projection = torch.empty(input_dim, codebook_dim)
+         nn.init.xavier_normal_(random_projection)
+         self.register_buffer("random_projection", random_projection)
+
+         # randomly initialized codebook
+         codebook = torch.empty(codebook_size, codebook_dim)
+         nn.init.normal_(codebook)
+         self.register_buffer("codebook", codebook)
+
+     def codebook_lookup(self, x):
+         # reshape
+         b = x.shape[0]
+         x = rearrange(x, "b n e -> (b n) e")
+
+         # L2 normalization
+         normalized_x = nn.functional.normalize(x, dim=1, p=2)
+         normalized_codebook = nn.functional.normalize(self.codebook, dim=1, p=2)
+
+         # compute distances
+         distances = torch.cdist(normalized_codebook, normalized_x)
+
+         # get nearest
+         nearest_indices = torch.argmin(distances, dim=0)
+
+         # reshape
+         xq = rearrange(nearest_indices, "(b n) -> b n", b=b)
+
+         return xq
+
+     @torch.no_grad()
+     def forward(self, x):
+         # always eval
+         self.eval()
+
+         # random projection [batch, length, input_dim] -> [batch, length, codebook_dim]
+         x = einsum("b n d, d e -> b n e", x, self.random_projection)
+
+         # codebook lookup
+         xq = self.codebook_lookup(x)
+
+         return xq
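A minimal usage sketch of `RandomProjectionQuantizer`; the dimensions below are illustrative (the MuQ config in this repo defaults to `codebook_dim=16`, `codebook_size=4096`):

```python
import torch

quantizer = RandomProjectionQuantizer(input_dim=128, codebook_dim=16, codebook_size=4096)
feats = torch.randn(2, 250, 128)   # (batch, frames, feature_dim), e.g. mel bins
tokens = quantizer(feats)          # (2, 250) integer codebook indices, no gradients
```

Because both the projection and the codebook are frozen random buffers, the module is deterministic for a given seed and is used purely as a label generator.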
MuCodec/muq_dev/muq_fairseq/models/muq/muq_model.py ADDED
@@ -0,0 +1,139 @@
+ try:
+     from .model.muq import MuQ
+ except ImportError:
+     import sys, os
+     sys.path.append(os.path.dirname(os.path.abspath(__file__)))
+     from model.muq import MuQ
+ try:
+     from fairseq.fairseq.dataclass import FairseqDataclass
+     from fairseq.fairseq.models import BaseFairseqModel, register_model
+     from fairseq.fairseq.tasks.fairseq_task import FairseqTask
+ except ImportError:
+     from fairseq.dataclass import FairseqDataclass
+     from fairseq.models import BaseFairseqModel, register_model
+     from fairseq.tasks.fairseq_task import FairseqTask
+
+ from dataclasses import dataclass, field
+ from typing import List, Tuple, Optional
+ import torch
+
+ from logging import getLogger
+
+ logger = getLogger(__name__)
+
+ @dataclass
+ class MuQConfig(FairseqDataclass):
+     label_rate: int = field(default=25)
+     num_codebooks: int = field(default=1)
+     codebook_dim: int = field(default=16)
+     codebook_size: int = field(default=4096)
+     features: List[str] = field(default_factory=lambda: ["melspec_2048"])
+     hop_length: int = field(default=240)
+     n_mels: int = field(default=128)
+     conv_dim: int = field(default=512)
+     encoder_dim: int = field(default=1024)
+     encoder_depth: int = field(default=12)
+     mask_hop: float = field(default=0.4)
+     mask_prob: float = field(default=0.6)
+     is_flash: bool = field(default=False)
+     stat_path: Optional[str] = field(default=None)
+     model_path: Optional[str] = field(default=None)
+     w2v2_config_path: Optional[str] = field(default=None)
+     use_rvq_target: bool = field(default=False)
+     use_vq_target: bool = field(default=False)
+     rvq_ckpt_path: Optional[str] = field(default=None)
+     recon_loss_ratio: Optional[float] = field(default=None)
+     resume_checkpoint: Optional[str] = None
+     use_hubert_masking_strategy: bool = field(default=False)
+     use_hubert_featurizer: bool = field(default=False)
+     hubert_conv_feature_layers: str = field(default_factory=lambda: "[(512,10,5)] + [(512,3,2)] * 3 + [(512,3,3)] + [(512,2,2)] * 2")
+     rvq_n_codebooks: int = field(default=8)
+     rvq_multi_layer_num: int = field(default=1)
+     use_encodec_target: bool = field(default=False)
+
+ SAMPLE_RATE = 24_000
+
+ @register_model("muq", dataclass=MuQConfig)
+ class MuQModel(BaseFairseqModel):
+     def __init__(self, cfg: MuQConfig, task_cfg: FairseqTask):
+         super().__init__()
+         self.cfg = cfg
+         self.model = MuQ(
+             num_codebooks=cfg.num_codebooks,
+             codebook_dim=cfg.codebook_dim,
+             codebook_size=cfg.codebook_size,
+             features=cfg.features,
+             n_mels=cfg.n_mels,
+             conv_dim=cfg.conv_dim,
+             encoder_dim=cfg.encoder_dim,
+             encoder_depth=cfg.encoder_depth,
+             mask_hop=cfg.mask_hop,
+             mask_prob=cfg.mask_prob,
+             is_flash=cfg.is_flash,
+             stat_path=cfg.stat_path,
+             model_path=cfg.model_path,
+             w2v2_config_path=cfg.w2v2_config_path,
+             use_rvq_target=cfg.use_rvq_target,
+             use_vq_target=cfg.use_vq_target,
+             rvq_ckpt_path=cfg.rvq_ckpt_path,
+             recon_loss_ratio=cfg.recon_loss_ratio,
+             label_rate=cfg.label_rate,
+             use_hubert_masking_strategy=cfg.use_hubert_masking_strategy,
+             use_hubert_featurizer=cfg.use_hubert_featurizer,
+             hubert_conv_feature_layers=cfg.hubert_conv_feature_layers,
+             rvq_n_codebooks=cfg.rvq_n_codebooks,
+             rvq_multi_layer_num=cfg.rvq_multi_layer_num,
+             use_encodec_target=cfg.use_encodec_target,
+         )
+
+     def forward(
+         self,
+         source: torch.Tensor,  # B,L
+         features_only: bool = False,
+         label=None,  # pre-extracted labels, dim is [Batch, N_Codebook, SeqLen]
+         **kwargs,
+     ):
+         # trim the waveform to a whole number of label frames (SAMPLE_RATE // label_rate samples per frame)
+         source = source[..., : int((source.shape[-1] // (SAMPLE_RATE // self.cfg.label_rate)) * (SAMPLE_RATE // self.cfg.label_rate))]
+         if features_only:
+             if 'attention_mask' in kwargs:
+                 attention_mask = kwargs['attention_mask']
+             elif 'padding_mask' in kwargs:
+                 attention_mask = ~kwargs['padding_mask'].bool()
+             else:
+                 attention_mask = None
+             _, hidden_states = self.model.get_predictions(source, attention_mask=attention_mask, is_features_only=True)
+             result = {
+                 "layer_results": hidden_states
+             }
+             return result
+         else:
+             result = {}
+             logits, hidden_emb, losses, accuracies = self.model(source, label=label)
+             result["losses"] = losses
+             result["accuracies"] = accuracies
+             result["logits"] = logits
+             result["hidden_emb"] = hidden_emb
+             for k, v in losses.items():
+                 result[k] = v
+             return result
+
+     @classmethod
+     def build_model(cls, cfg: MuQConfig, task: FairseqTask):
+         """Build a new model instance."""
+
+         model = MuQModel(cfg, task.cfg)
+         import numpy as np
+         s = 0
+         for param in model.parameters():
+             s += np.prod(param.size())
+         # print('# of parameters: ' + str(s / 1024.0 / 1024.0))
+
+         if cfg.get("resume_checkpoint", None):
+             print("Loading checkpoint from {}".format(cfg.resume_checkpoint))
+             model.load_state_dict(torch.load(cfg.resume_checkpoint)['model'], strict=False)
+
+         return model
+
+     def get_losses(self, result, batch):
+         return result['losses']
+
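A hedged usage sketch of `MuQModel` feature extraction; the waveform length and call pattern are illustrative and the model is assumed to be built by fairseq from a `MuQConfig`:

```python
import torch

# wav: 5 seconds of 24 kHz audio; forward() trims it to a whole number of
# label frames (SAMPLE_RATE // label_rate samples per frame)
wav = torch.randn(1, 24_000 * 5)
# model = MuQModel.build_model(cfg, task)   # built by fairseq from MuQConfig
# out = model(wav, features_only=True)
# hidden_states = out["layer_results"]      # per-layer hidden states
```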
MuCodec/muq_dev/muq_fairseq/tasks/__pycache__/muq_pretraining.cpython-310.pyc ADDED
Binary file (9.93 kB).
 
MuCodec/muq_dev/muq_fairseq/tasks/muq_pretraining.py ADDED
@@ -0,0 +1,354 @@
+ # Copyright (c) 2017-present, Facebook, Inc.
+ # All rights reserved.
+ #
+ # This source code is licensed under the license found in the LICENSE file in
+ # the root directory of this source tree. An additional grant of patent rights
+ # can be found in the PATENTS file in the same directory.
+
+ import logging
+ import os
+ import sys
+ from typing import Dict, List, Optional, Tuple
+
+ import numpy as np
+ import torch
+
+ from dataclasses import dataclass, field
+ from fairseq.data import Dictionary, HubertDataset
+ from fairseq.dataclass.configs import FairseqDataclass
+ from fairseq.tasks import register_task
+ from fairseq.tasks.fairseq_task import FairseqTask
+ from omegaconf import MISSING
+
+ from ..data.mert_dataset import MERTDataset
+ from ..data.ark_dataset import ArkDataset
+
+ logger = logging.getLogger(__name__)
+
+
+ class LabelEncoder(object):
+     def __init__(self, dictionary: Dictionary) -> None:
+         self.dictionary = dictionary
+
+     def __call__(self, label: str) -> List[str]:
+         # encode_line returns a torch.IntTensor; should be all 1s for vanilla HuBERT
+         return self.dictionary.encode_line(
+             label,
+             append_eos=False,
+             add_if_not_exist=False,
+         )
+
+
+ class PaddedNumpyLabelEncoder(object):
+     def __init__(self):
+         # self.dictionary = dictionary
+         pass
+
+     def __call__(self, label):
+         t = torch.IntTensor(np.asarray(label))
+         t = t[t >= 0]  # remove padded -1 values at the end
+         return t
+
+
+ @dataclass
+ class MuQPretrainingConfig(FairseqDataclass):
+     data: str = field(default=MISSING, metadata={"help": "path to data directory"})
+     sharding_data: int = field(
+         default=-1,
+         metadata={
+             "help": "set this param >1 to use a sharded dataset to prevent OOM; "
+             "prepare data tsv and label files by adding a postfix, e.g. shard 28 of 64: "
+             "train_28_64.tsv and train_28_64.encodec_6"
+         },
+     )
+     load_random_data_shard: bool = field(
+         default=True,
+         metadata={
+             "help": "whether to load shards randomly or in order when using sharding_data"
+         },
+     )
+     fine_tuning: bool = field(
+         default=False, metadata={"help": "set to true if fine-tuning Hubert"}
+     )
+     labels: List[str] = field(
+         default_factory=lambda: ["ltr"],
+         metadata={
+             "help": (
+                 "extension of the label files to load: frame-level labels for"
+                 " pre-training, and sequence-level labels for fine-tuning"
+             )
+         },
+     )
+     label_dir: Optional[str] = field(
+         default=None,
+         metadata={
+             "help": "if set, looks for labels in this directory instead",
+         },
+     )
+     label_scp_path: Optional[str] = field(
+         default=None,
+         metadata={
+             'help': 'if set, load labels from an scp file'
+         }
+     )
+     label_scp_clip_duration: float = field(
+         default=-1,
+         metadata={
+             'help': 'clip duration for loading scp labels; if set to -1, this has no effect.'
+         }
+     )
+     label_rate: float = field(
+         default=-1.0,
+         metadata={"help": "label frame rate. -1.0 for sequence labels"},
+     )
+     sample_rate: int = field(
+         default=16_000,
+         metadata={
+             "help": "target sample rate. audio files will be up/down "
+             "sampled to this rate"
+         },
+     )
+     normalize: bool = field(
+         default=False,
+         metadata={"help": "if set, normalizes input to have 0 mean and unit variance"},
+     )
+     enable_padding: bool = field(
+         default=False,
+         metadata={"help": "pad shorter samples instead of cropping"},
+     )
+     max_keep_size: Optional[int] = field(
+         default=None,
+         metadata={"help": "exclude samples longer than this"},
+     )
+     max_sample_size: Optional[int] = field(
+         default=None,
+         metadata={"help": "max sample size to crop to for batching"},
+     )
+     min_sample_size: Optional[int] = field(
+         default=None,
+         metadata={"help": "min sample size to crop to for batching"},
+     )
+     single_target: Optional[bool] = field(
+         default=False,
+         metadata={
+             "help": "if set, AddTargetDatasets outputs same keys as AddTargetDataset"
+         },
+     )
+     random_crop: Optional[bool] = field(
+         default=True,
+         metadata={"help": "always crop from the beginning if false"},
+     )
+     pad_audio: Optional[bool] = field(
+         default=False,
+         metadata={"help": "pad audio to the longest one in the batch if true"},
+     )
+
+     store_labels: Optional[bool] = field(
+         default=False,
+         metadata={"help": "whether to load all of the labels into memory"},
+     )
+
+     numpy_memmap_label: Optional[bool] = field(
+         default=False,
+         metadata={"help": "whether the label file is saved as a numpy file, each line ended with -1 padding"},
+     )
+
+     augmentation_effects: Optional[str] = field(
+         default="[]",
+         metadata={
+             "help": (
+                 "a list of effects that may be applied to the audio, "
+                 "example: \"['random_mute', 'random_Gaussian', 'reverse_polarity']\"; "
+                 "supported: random_mute"
+             )
+         },
+     )
+     augmentation_probs: Optional[str] = field(
+         default="[]",
+         metadata={
+             "help": (
+                 "the corresponding probabilities for the data augmentation effects, "
+                 "example: \"[0.1, 0.5, 0.8]\"; "
+                 "the sum does not need to be 1.0, and multiple effects can be applied to the same audio"
+             )
+         },
+     )
+
+     # inbatch_noise_augment_len_range: Optional[List[int]] = field(
+     #     default_factory=lambda: [8000, 24000],
+     #     default=[8000, 24000],
+     inbatch_noise_augment_len_range: Optional[str] = field(
+         default="[8000, 24000]",
+         metadata={
+             "help": (
+                 "the length range of the mix-up noise augmentation, in samples"
+             )
+         },
+     )
+     # inbatch_noise_augment_number_range: Optional[List[int]] = field(
+     #     default_factory=lambda: [1, 3],
+     #     default=[1, 3],
+     inbatch_noise_augment_number_range: Optional[str] = field(
+         default="[1, 3]",
+         metadata={
+             "help": (
+                 "the range of the number of mix-up noise augmentations"
+             )
+         },
+     )
+     inbatch_noise_augment_volume: float = field(
+         default=1.0,
+         metadata={
+             "help": (
+                 "the coefficient used to modify the volume of the noise audio wavs"
+             )
+         },
+     )
+     dynamic_crops: Optional[str] = field(
+         default="[]",
+         metadata={
+             "help": (
+                 "used to set the maximum audio length for training, "
+                 "example: \"[1, 2, 3, 4, 5, 10]\""
+             )
+         },
+     )
+     dynamic_crops_epoches: Optional[str] = field(
+         default="[]",
+         metadata={
+             "help": (
+                 "used to set the training epochs at which the maximum audio length changes, "
+                 "example: \"[1, 10, 20, 40, 80, 160]\"; "
+                 "its length needs to equal len(dynamic_crops)"
+             )
+         },
+     )
+
+     cqt_loss_bin_dataloader: Optional[int] = field(
+         default=-1,
+         metadata={
+             "help": (
+                 "use this parameter to prepare the cqt prediction objective in the dataloader"
+             )
+         },
+     )
+
+     clip_secs: int = field(
+         default=5,
+         metadata={
+             "help": "clip secs for each audio"
+         }
+     )
+
+     dataset_shuffle: bool = field(
+         default=True,
+         metadata={
+             "help": (
+                 "dataset shuffle when sampling a batch"
+             )
+         },
+     )
+
+
251
+ @register_task("muq_pretraining", dataclass=MuQPretrainingConfig)
252
+ class MuQPretrainingTask(FairseqTask):
253
+
254
+ cfg: MuQPretrainingConfig
255
+
256
+ def __init__(
257
+ self,
258
+ cfg: MuQPretrainingConfig,
259
+ ) -> None:
260
+ super().__init__(cfg)
261
+
262
+ logger.info(f"current directory is {os.getcwd()}")
263
+ logger.info(f"MuQPretrainingTask Config {cfg}")
264
+
265
+ self.cfg = cfg
266
+ self.fine_tuning = cfg.fine_tuning
267
+
268
+ if cfg.fine_tuning:
269
+ self.state.add_factory("target_dictionary", self.load_dictionaries)
270
+ else:
271
+ self.state.add_factory("dictionaries", self.load_dictionaries)
272
+
273
+ self.blank_symbol = "<s>"
274
+
275
+ # use eval() to pass list parameters, skirt the fairseq/torch error: Can't pickle <enum 'Choices'>: attribute lookup Choices on fairseq.dataclass.constants failed
276
+ self.augmentation_effects = eval(self.cfg.augmentation_effects)
277
+ self.augmentation_probs = eval(self.cfg.augmentation_probs)
278
+ if len(self.augmentation_effects) > 0:
279
+ assert len(self.augmentation_effects) == len(self.augmentation_probs)
280
+ logger.info(f"Applying audio augmentation {self.augmentation_effects}, probabilities: {self.augmentation_probs}")
281
+
282
+ self.inbatch_noise_augment_number_range = eval(self.cfg.inbatch_noise_augment_number_range)
283
+ self.inbatch_noise_augment_len_range = eval(self.cfg.inbatch_noise_augment_len_range)
284
+
285
+ self.max_sample_size = self.cfg.max_sample_size
286
+
287
+ self.dynamic_crops = eval(self.cfg.dynamic_crops)
288
+ self.dynamic_crops_epoches = eval(self.cfg.dynamic_crops_epoches)
289
+ assert len(self.dynamic_crops) == len(self.dynamic_crops_epoches)
290
+ if len(self.dynamic_crops) > 0:
291
+ assert self.dynamic_crops_epoches[0] == 1
292
+
293
+ self.cqt_loss_bin_dataloader = self.cfg.cqt_loss_bin_dataloader
294
+
295
+ self.numpy_memmap_label = self.cfg.numpy_memmap_label
296
+ self.store_labels = self.cfg.store_labels
297
+ if self.numpy_memmap_label:
298
+ assert self.store_labels
299
+
300
+ @property
301
+ def source_dictionary(self) -> Optional[Dictionary]:
302
+ return None
303
+
304
+ @property
305
+ def target_dictionary(self) -> Optional[Dictionary]:
306
+ return self.state.target_dictionary
307
+
308
+ @property
309
+ def dictionaries(self) -> List[Dictionary]:
310
+ return self.state.dictionaries
311
+
312
+ @classmethod
313
+ def setup_task(
314
+ cls, cfg: MuQPretrainingConfig, **kwargs
315
+ ) -> "MuQPretrainingTask":
316
+ return cls(cfg)
317
+
318
+ def load_dictionaries(self):
319
+ label_dir = self.cfg.data if (self.cfg.label_dir is None or self.cfg.label_dir == '') else self.cfg.label_dir
320
+ print(label_dir)
321
+ dictionaries = [
322
+ Dictionary.load(f"{label_dir}/dict.{label}.txt")
323
+ for label in self.cfg.labels
324
+ ]
325
+ return dictionaries[0] if self.cfg.fine_tuning else dictionaries
326
+
327
+ def get_label_dir(self) -> str:
328
+ if self.cfg.label_dir is None or self.cfg.label_dir=='':
329
+ return self.cfg.data
330
+ return self.cfg.label_dir
331
+
332
+
333
+ def is_force_load_dataset(self, epoch, training_restore=False):
334
+ # find the threshold that holds epoch \in [threshold, next_threshold)
335
+ return (epoch in self.dynamic_crops_epoches) or training_restore or (self.cfg.sharding_data > 1)
336
+
337
+
338
+ def set_dynamic_crop_max_sample(self, epoch):
339
+ pass
340
+
341
+ def load_dataset(self, split: str, **kwargs) -> None:
342
+ pass
343
+
344
+ def load_dataset_ark(self, split, **kwargs):
345
+ pass
346
+
347
+ def load_dataset_mert(self, split: str, **kwargs) -> None:
348
+ pass
349
+
350
+ def max_positions(self) -> Tuple[int, int]:
351
+ return (sys.maxsize, sys.maxsize)
352
+
353
+ def filter_indices_by_size(self, indices: np.array, *args, **kwargs) -> np.array:
354
+ return indices
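A tiny sketch of how `PaddedNumpyLabelEncoder` behaves: trailing -1 padding is stripped before the labels are used (values illustrative):

```python
enc = PaddedNumpyLabelEncoder()
enc([5, 12, 3, -1, -1])   # -> tensor([ 5, 12,  3], dtype=torch.int32)
```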
MuCodec/tools/__pycache__/get_melvaehifigan48k.cpython-310.pyc ADDED
Binary file (35.6 kB).
 
MuCodec/tools/__pycache__/torch_tools.cpython-310.pyc ADDED
Binary file (2.74 kB).
 
MuCodec/tools/__pycache__/torch_tools.cpython-312.pyc ADDED
Binary file (4.48 kB).
 
checkpoints/Qwen3-0.6B/.gitattributes ADDED
@@ -0,0 +1,36 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
checkpoints/Qwen3-0.6B/LICENSE ADDED
@@ -0,0 +1,202 @@
1
+
2
+ Apache License
3
+ Version 2.0, January 2004
4
+ http://www.apache.org/licenses/
5
+
6
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
7
+
8
+ 1. Definitions.
9
+
10
+ "License" shall mean the terms and conditions for use, reproduction,
11
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright 2024 Alibaba Cloud
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
checkpoints/Qwen3-0.6B/README.md ADDED
@@ -0,0 +1,301 @@
+ ---
+ library_name: transformers
+ license: apache-2.0
+ license_link: https://huggingface.co/Qwen/Qwen3-0.6B/blob/main/LICENSE
+ pipeline_tag: text-generation
+ base_model:
+ - Qwen/Qwen3-0.6B-Base
+ ---
+
+ # Qwen3-0.6B
+ <a href="https://chat.qwen.ai/" target="_blank" style="margin: 2px;">
+     <img alt="Chat" src="https://img.shields.io/badge/%F0%9F%92%9C%EF%B8%8F%20Qwen%20Chat%20-536af5" style="display: inline-block; vertical-align: middle;"/>
+ </a>
+
+ ## Qwen3 Highlights
+
+ Qwen3 is the latest generation of large language models in the Qwen series, offering a comprehensive suite of dense and mixture-of-experts (MoE) models. Built upon extensive training, Qwen3 delivers groundbreaking advancements in reasoning, instruction-following, agent capabilities, and multilingual support, with the following key features:
+
+ - **Unique support for seamless switching between thinking mode** (for complex logical reasoning, math, and coding) and **non-thinking mode** (for efficient, general-purpose dialogue) **within a single model**, ensuring optimal performance across various scenarios.
+ - **Significant enhancement of its reasoning capabilities**, surpassing the previous QwQ (in thinking mode) and Qwen2.5 instruct models (in non-thinking mode) on mathematics, code generation, and commonsense logical reasoning.
+ - **Superior human preference alignment**, excelling in creative writing, role-playing, multi-turn dialogues, and instruction following, delivering a more natural, engaging, and immersive conversational experience.
+ - **Expertise in agent capabilities**, enabling precise integration with external tools in both thinking and non-thinking modes and achieving leading performance among open-source models in complex agent-based tasks.
+ - **Support for 100+ languages and dialects** with strong capabilities for **multilingual instruction following** and **translation**.
+
+ ## Model Overview
+
+ **Qwen3-0.6B** has the following features:
+ - Type: Causal Language Models
+ - Training Stage: Pretraining & Post-training
+ - Number of Parameters: 0.6B
+ - Number of Parameters (Non-Embedding): 0.44B
+ - Number of Layers: 28
+ - Number of Attention Heads (GQA): 16 for Q and 8 for KV
+ - Context Length: 32,768
+
+ For more details, including benchmark evaluation, hardware requirements, and inference performance, please refer to our [blog](https://qwenlm.github.io/blog/qwen3/), [GitHub](https://github.com/QwenLM/Qwen3), and [Documentation](https://qwen.readthedocs.io/en/latest/).
+
+ > [!TIP]
+ > If you encounter significant endless repetitions, please refer to the [Best Practices](#best-practices) section for optimal sampling parameters, and set the ``presence_penalty`` to 1.5.
+
+ ## Quickstart
+
+ The code for Qwen3 has been merged into the latest Hugging Face `transformers`, and we advise you to use the latest version of `transformers`.
+
+ With `transformers<4.51.0`, you will encounter the following error:
+ ```
+ KeyError: 'qwen3'
+ ```
+
+ The following code snippet illustrates how to use the model to generate content based on given inputs.
+ ```python
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+
+ model_name = "Qwen/Qwen3-0.6B"
+
+ # load the tokenizer and the model
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
+ model = AutoModelForCausalLM.from_pretrained(
+     model_name,
+     torch_dtype="auto",
+     device_map="auto"
+ )
+
+ # prepare the model input
+ prompt = "Give me a short introduction to large language model."
+ messages = [
+     {"role": "user", "content": prompt}
+ ]
+ text = tokenizer.apply_chat_template(
+     messages,
+     tokenize=False,
+     add_generation_prompt=True,
+     enable_thinking=True  # Switches between thinking and non-thinking modes. Default is True.
+ )
+ model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
+
+ # conduct text completion
+ generated_ids = model.generate(
+     **model_inputs,
+     max_new_tokens=32768
+ )
+ output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist()
+
+ # parsing thinking content
+ try:
+     # rindex finding 151668 (</think>)
+     index = len(output_ids) - output_ids[::-1].index(151668)
+ except ValueError:
+     index = 0
+
+ thinking_content = tokenizer.decode(output_ids[:index], skip_special_tokens=True).strip("\n")
+ content = tokenizer.decode(output_ids[index:], skip_special_tokens=True).strip("\n")
+
+ print("thinking content:", thinking_content)
+ print("content:", content)
+ ```
+
+ For deployment, you can use `sglang>=0.4.6.post1` or `vllm>=0.8.5` to create an OpenAI-compatible API endpoint:
+ - SGLang:
+     ```shell
+     python -m sglang.launch_server --model-path Qwen/Qwen3-0.6B --reasoning-parser qwen3
+     ```
+ - vLLM:
+     ```shell
+     vllm serve Qwen/Qwen3-0.6B --enable-reasoning --reasoning-parser deepseek_r1
+     ```
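+
+ Once the server is up, you can query it with any OpenAI-compatible client. The snippet below is a minimal sketch using the official `openai` Python package; the base URL assumes the vLLM command above with its default port 8000 (adjust it for SGLang, whose default port is 30000), and `presence_penalty` is shown only as an optional guard against endless repetitions.
+ ```python
+ from openai import OpenAI
+
+ # Assumed local endpoint started by the vLLM command above (default port 8000).
+ client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
+
+ response = client.chat.completions.create(
+     model="Qwen/Qwen3-0.6B",
+     messages=[{"role": "user", "content": "Give me a short introduction to large language models."}],
+     temperature=0.6,
+     top_p=0.95,
+     presence_penalty=1.5,  # optional: mitigates endless repetitions
+ )
+ print(response.choices[0].message.content)
+ ```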
+
+ For local use, applications such as Ollama, LMStudio, MLX-LM, llama.cpp, and KTransformers also support Qwen3.
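+ For example, with Ollama (assuming the `qwen3` tag published on the Ollama registry):
+ ```shell
+ ollama run qwen3:0.6b
+ ```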
+
+ ## Switching Between Thinking and Non-Thinking Mode
+
+ > [!TIP]
+ > The `enable_thinking` switch is also available in APIs created by SGLang and vLLM.
+ > Please refer to our documentation for [SGLang](https://qwen.readthedocs.io/en/latest/deployment/sglang.html#thinking-non-thinking-modes) and [vLLM](https://qwen.readthedocs.io/en/latest/deployment/vllm.html#thinking-non-thinking-modes) users.
+
+ ### `enable_thinking=True`
+
+ By default, Qwen3 has thinking capabilities enabled, similar to QwQ-32B. This means the model will use its reasoning abilities to enhance the quality of generated responses. For example, when explicitly setting `enable_thinking=True` or leaving it as the default value in `tokenizer.apply_chat_template`, the model will engage its thinking mode.
+
+ ```python
+ text = tokenizer.apply_chat_template(
+     messages,
+     tokenize=False,
+     add_generation_prompt=True,
+     enable_thinking=True  # True is the default value for enable_thinking
+ )
+ ```
+
+ In this mode, the model will generate think content wrapped in a `<think>...</think>` block, followed by the final response.
+
+ > [!NOTE]
+ > For thinking mode, use `Temperature=0.6`, `TopP=0.95`, `TopK=20`, and `MinP=0` (the default setting in `generation_config.json`). **DO NOT use greedy decoding**, as it can lead to performance degradation and endless repetitions. For more detailed guidance, please refer to the [Best Practices](#best-practices) section.
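+
+ If your serving stack does not read `generation_config.json`, these parameters can be passed explicitly. A minimal sketch, reusing `model` and `model_inputs` from the Quickstart above (`min_p` requires a reasonably recent `transformers` release):
+ ```python
+ # Thinking-mode sampling; do_sample must be True, since greedy decoding degrades quality.
+ generated_ids = model.generate(
+     **model_inputs,
+     max_new_tokens=32768,
+     do_sample=True,
+     temperature=0.6,
+     top_p=0.95,
+     top_k=20,
+     min_p=0.0,
+ )
+ ```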
+
+
+ ### `enable_thinking=False`
+
+ We provide a hard switch to strictly disable the model's thinking behavior, aligning its functionality with the previous Qwen2.5-Instruct models. This mode is particularly useful in scenarios where disabling thinking is essential for enhancing efficiency.
+
+ ```python
+ text = tokenizer.apply_chat_template(
+     messages,
+     tokenize=False,
+     add_generation_prompt=True,
+     enable_thinking=False  # Setting enable_thinking=False disables thinking mode
+ )
+ ```
+
+ In this mode, the model will not generate any think content and will not include a `<think>...</think>` block.
+
+ > [!NOTE]
+ > For non-thinking mode, we suggest using `Temperature=0.7`, `TopP=0.8`, `TopK=20`, and `MinP=0`. For more detailed guidance, please refer to the [Best Practices](#best-practices) section.
+
+ ### Advanced Usage: Switching Between Thinking and Non-Thinking Modes via User Input
+
+ We provide a soft switch mechanism that allows users to dynamically control the model's behavior when `enable_thinking=True`. Specifically, you can add `/think` and `/no_think` to user prompts or system messages to switch the model's thinking mode from turn to turn. The model will follow the most recent instruction in multi-turn conversations.
+
+ Here is an example of a multi-turn conversation:
+
+ ```python
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+
+ class QwenChatbot:
+     def __init__(self, model_name="Qwen/Qwen3-0.6B"):
+         self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+         self.model = AutoModelForCausalLM.from_pretrained(model_name)
+         self.history = []
+
+     def generate_response(self, user_input):
+         messages = self.history + [{"role": "user", "content": user_input}]
+
+         text = self.tokenizer.apply_chat_template(
+             messages,
+             tokenize=False,
+             add_generation_prompt=True
+         )
+
+         inputs = self.tokenizer(text, return_tensors="pt")
+         response_ids = self.model.generate(**inputs, max_new_tokens=32768)[0][len(inputs.input_ids[0]):].tolist()
+         response = self.tokenizer.decode(response_ids, skip_special_tokens=True)
+
+         # Update history
+         self.history.append({"role": "user", "content": user_input})
+         self.history.append({"role": "assistant", "content": response})
+
+         return response
+
+ # Example Usage
+ if __name__ == "__main__":
+     chatbot = QwenChatbot()
+
+     # First input (without /think or /no_think tags, thinking mode is enabled by default)
+     user_input_1 = "How many r's in strawberries?"
+     print(f"User: {user_input_1}")
+     response_1 = chatbot.generate_response(user_input_1)
+     print(f"Bot: {response_1}")
+     print("----------------------")
+
+     # Second input with /no_think
+     user_input_2 = "Then, how many r's in blueberries? /no_think"
+     print(f"User: {user_input_2}")
+     response_2 = chatbot.generate_response(user_input_2)
+     print(f"Bot: {response_2}")
+     print("----------------------")
+
+     # Third input with /think
+     user_input_3 = "Really? /think"
+     print(f"User: {user_input_3}")
+     response_3 = chatbot.generate_response(user_input_3)
+     print(f"Bot: {response_3}")
+ ```
+
+ > [!NOTE]
+ > For API compatibility, when `enable_thinking=True`, regardless of whether the user uses `/think` or `/no_think`, the model will always output a block wrapped in `<think>...</think>`. However, the content inside this block may be empty if thinking is disabled.
+ > When `enable_thinking=False`, the soft switches are not valid. Regardless of any `/think` or `/no_think` tags input by the user, the model will not generate think content and will not include a `<think>...</think>` block.
+
+ ## Agentic Use
+
+ Qwen3 excels in tool-calling capabilities. We recommend using [Qwen-Agent](https://github.com/QwenLM/Qwen-Agent) to make the best use of the agentic ability of Qwen3. Qwen-Agent encapsulates tool-calling templates and tool-calling parsers internally, greatly reducing coding complexity.
+
+ To define the available tools, you can use the MCP configuration file, use the integrated tools of Qwen-Agent, or integrate other tools yourself.
+ ```python
+ from qwen_agent.agents import Assistant
+
+ # Define LLM
+ llm_cfg = {
+     'model': 'Qwen3-0.6B',
+
+     # Use the endpoint provided by Alibaba Model Studio:
+     # 'model_type': 'qwen_dashscope',
+     # 'api_key': os.getenv('DASHSCOPE_API_KEY'),
+
+     # Use a custom endpoint compatible with OpenAI API:
+     'model_server': 'http://localhost:8000/v1',  # api_base
+     'api_key': 'EMPTY',
+
+     # Other parameters:
+     # 'generate_cfg': {
+     #     # Add: when the response content is `<think>this is the thought</think>this is the answer`;
+     #     # Do not add: when the response has been separated by reasoning_content and content.
+     #     'thought_in_content': True,
+     # },
+ }
+
+ # Define Tools
+ tools = [
+     {'mcpServers': {  # You can specify the MCP configuration file
+         'time': {
+             'command': 'uvx',
+             'args': ['mcp-server-time', '--local-timezone=Asia/Shanghai']
+         },
+         "fetch": {
+             "command": "uvx",
+             "args": ["mcp-server-fetch"]
+         }
+     }
+     },
+     'code_interpreter',  # Built-in tools
+ ]
+
+ # Define Agent
+ bot = Assistant(llm=llm_cfg, function_list=tools)
+
+ # Streaming generation
+ messages = [{'role': 'user', 'content': 'https://qwenlm.github.io/blog/ Introduce the latest developments of Qwen'}]
+ for responses in bot.run(messages=messages):
+     pass
+ print(responses)
+ ```
+
+ ## Best Practices
+
+ To achieve optimal performance, we recommend the following settings:
+
+ 1. **Sampling Parameters**:
+    - For thinking mode (`enable_thinking=True`), use `Temperature=0.6`, `TopP=0.95`, `TopK=20`, and `MinP=0`. **DO NOT use greedy decoding**, as it can lead to performance degradation and endless repetitions.
+    - For non-thinking mode (`enable_thinking=False`), we suggest using `Temperature=0.7`, `TopP=0.8`, `TopK=20`, and `MinP=0`.
+    - For supported frameworks, you can adjust the `presence_penalty` parameter between 0 and 2 to reduce endless repetitions. However, using a higher value may occasionally result in language mixing and a slight decrease in model performance.
+
+ 2. **Adequate Output Length**: We recommend using an output length of 32,768 tokens for most queries. For benchmarking on highly complex problems, such as those found in math and programming competitions, we suggest setting the max output length to 38,912 tokens. This provides the model with sufficient space to generate detailed and comprehensive responses, thereby enhancing its overall performance.
+
+ 3. **Standardize Output Format**: We recommend using prompts to standardize model outputs when benchmarking.
+    - **Math Problems**: Include "Please reason step by step, and put your final answer within \boxed{}." in the prompt.
+    - **Multiple-Choice Questions**: Add the following JSON structure to the prompt to standardize responses: "Please show your choice in the `answer` field with only the choice letter, e.g., `"answer": "C"`."
+
+ 4. **No Thinking Content in History**: In multi-turn conversations, the historical model output should only include the final output part and does not need to include the thinking content. This is implemented in the provided Jinja2 chat template. However, for frameworks that do not directly use the Jinja2 chat template, it is up to the developers to ensure that this best practice is followed; a sketch of stripping the thinking content is shown after this list.
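+
+ The following is a minimal sketch (not part of the official API) of how such a framework might strip the `<think>...</think>` block from an assistant response before appending it to the history; it reuses the conventions of the `QwenChatbot` example above.
+ ```python
+ import re
+
+ def strip_thinking(response: str) -> str:
+     """Remove a leading <think>...</think> block so it is not kept in history."""
+     return re.sub(r"<think>.*?</think>", "", response, count=1, flags=re.DOTALL).lstrip("\n")
+
+ # Usage: store only the final answer in the conversation history.
+ response = "<think>\nCounting the r's...\n</think>\n\nThere are three r's in \"strawberries\"."
+ history_entry = {"role": "assistant", "content": strip_thinking(response)}
+ print(history_entry)
+ ```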
+
+ ### Citation
+
+ If you find our work helpful, feel free to cite us.
+
+ ```
+ @misc{qwen3technicalreport,
+       title={Qwen3 Technical Report},
+       author={Qwen Team},
+       year={2025},
+       eprint={2505.09388},
+       archivePrefix={arXiv},
+       primaryClass={cs.CL},
+       url={https://arxiv.org/abs/2505.09388},
+ }
+ ```
checkpoints/Qwen3-0.6B/config.json ADDED
@@ -0,0 +1,33 @@
+ {
+   "architectures": [
+     "Qwen3ForCausalLM"
+   ],
+   "attention_bias": false,
+   "attention_dropout": 0.0,
+   "bos_token_id": 151643,
+   "eos_token_id": 151645,
+   "head_dim": 128,
+   "hidden_act": "silu",
+   "hidden_size": 1024,
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "max_position_embeddings": 40960,
+   "max_window_layers": 28,
+   "model_type": "qwen3",
+   "num_attention_heads": 16,
+   "num_hidden_layers": 28,
+   "num_key_value_heads": 8,
+   "rms_norm_eps": 1e-06,
+   "rope_scaling": null,
+   "rope_theta": 1000000,
+   "sliding_window": null,
+   "tie_word_embeddings": true,
+   "torch_dtype": "bfloat16",
+   "transformers_version": "4.51.0",
+   "use_cache": true,
+   "use_sliding_window": false,
+   "vocab_size": 151936,
+   "magel_chord_dropout_trigger_prob": 0.6,
+   "magel_structure_dropout_trigger_prob": 0.6,
+   "magel_num_audio_token": 16384
+ }
checkpoints/Qwen3-0.6B/generation_config.json ADDED
@@ -0,0 +1,13 @@
+ {
+   "bos_token_id": 151643,
+   "do_sample": true,
+   "eos_token_id": [
+     151645,
+     151643
+   ],
+   "pad_token_id": 151643,
+   "temperature": 0.6,
+   "top_k": 20,
+   "top_p": 0.95,
+   "transformers_version": "4.51.0"
+ }
checkpoints/Qwen3-0.6B/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoints/Qwen3-0.6B/tokenizer_config.json ADDED
@@ -0,0 +1,239 @@
+ {
+   "add_bos_token": false,
+   "add_prefix_space": false,
+   "added_tokens_decoder": {
+     "151643": {
+       "content": "<|endoftext|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151644": {
+       "content": "<|im_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151645": {
+       "content": "<|im_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151646": {
+       "content": "<|object_ref_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151647": {
+       "content": "<|object_ref_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151648": {
+       "content": "<|box_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151649": {
+       "content": "<|box_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151650": {
+       "content": "<|quad_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151651": {
+       "content": "<|quad_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151652": {
+       "content": "<|vision_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151653": {
+       "content": "<|vision_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151654": {
+       "content": "<|vision_pad|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151655": {
+       "content": "<|image_pad|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151656": {
+       "content": "<|video_pad|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151657": {
+       "content": "<tool_call>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151658": {
+       "content": "</tool_call>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151659": {
+       "content": "<|fim_prefix|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151660": {
+       "content": "<|fim_middle|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151661": {
+       "content": "<|fim_suffix|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151662": {
+       "content": "<|fim_pad|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151663": {
+       "content": "<|repo_name|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151664": {
+       "content": "<|file_sep|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151665": {
+       "content": "<tool_response>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151666": {
+       "content": "</tool_response>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151667": {
+       "content": "<think>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151668": {
+       "content": "</think>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     }
+   },
+   "additional_special_tokens": [
+     "<|im_start|>",
+     "<|im_end|>",
+     "<|object_ref_start|>",
+     "<|object_ref_end|>",
+     "<|box_start|>",
+     "<|box_end|>",
+     "<|quad_start|>",
+     "<|quad_end|>",
+     "<|vision_start|>",
+     "<|vision_end|>",
+     "<|vision_pad|>",
+     "<|image_pad|>",
+     "<|video_pad|>"
+   ],
+   "bos_token": null,
+   "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0].role == 'system' %}\n {{- messages[0].content + '\\n\\n' }}\n {%- endif %}\n {{- \"# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0].role == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0].content + '<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}\n{%- for message in messages[::-1] %}\n {%- set index = (messages|length - 1) - loop.index0 %}\n {%- if ns.multi_step_tool and message.role == \"user\" and message.content is string and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}\n {%- set ns.multi_step_tool = false %}\n {%- set ns.last_query_index = index %}\n {%- endif %}\n{%- endfor %}\n{%- for message in messages %}\n {%- if message.content is string %}\n {%- set content = message.content %}\n {%- else %}\n {%- set content = '' %}\n {%- endif %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) %}\n {{- '<|im_start|>' + message.role + '\\n' + content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {%- set reasoning_content = '' %}\n {%- if message.reasoning_content is string %}\n {%- set reasoning_content = message.reasoning_content %}\n {%- else %}\n {%- if '</think>' in content %}\n {%- set reasoning_content = content.split('</think>')[0].rstrip('\\n').split('<think>')[-1].lstrip('\\n') %}\n {%- set content = content.split('</think>')[-1].lstrip('\\n') %}\n {%- endif %}\n {%- endif %}\n {%- if loop.index0 > ns.last_query_index %}\n {%- if loop.last or (not loop.last and reasoning_content) %}\n {{- '<|im_start|>' + message.role + '\\n<think>\\n' + reasoning_content.strip('\\n') + '\\n</think>\\n\\n' + content.lstrip('\\n') }}\n {%- else %}\n {{- '<|im_start|>' + message.role + '\\n' + content }}\n {%- endif %}\n {%- else %}\n {{- '<|im_start|>' + message.role + '\\n' + content }}\n {%- endif %}\n {%- if message.tool_calls %}\n {%- for tool_call in message.tool_calls %}\n {%- if (loop.first and content) or (not loop.first) %}\n {{- '\\n' }}\n {%- endif %}\n {%- if tool_call.function %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {%- if tool_call.arguments is string %}\n {{- tool_call.arguments }}\n {%- else %}\n {{- tool_call.arguments | tojson }}\n {%- endif %}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {%- endif %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if loop.first or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n {%- if enable_thinking is defined and enable_thinking is false %}\n {{- '<think>\\n\\n</think>\\n\\n' }}\n {%- endif %}\n{%- endif %}",
+   "clean_up_tokenization_spaces": false,
+   "eos_token": "<|im_end|>",
+   "errors": "replace",
+   "model_max_length": 131072,
+   "pad_token": "<|endoftext|>",
+   "split_special_tokens": false,
+   "tokenizer_class": "Qwen2Tokenizer",
+   "unk_token": null
+ }
checkpoints/Qwen3-0.6B/vocab.json ADDED
The diff for this file is too large to render. See raw diff