Ricecake123 committed
Commit e79b770
1 Parent(s): 298e47d

first commit

This view is limited to 50 files because the commit contains too many changes; see the raw diff for the full change set.
Files changed (50)
  1. AR/__init__.py +0 -0
  2. AR/__pycache__/__init__.cpython-310.pyc +0 -0
  3. AR/__pycache__/__init__.cpython-39.pyc +0 -0
  4. AR/data/__init__.py +0 -0
  5. AR/data/__pycache__/__init__.cpython-310.pyc +0 -0
  6. AR/data/__pycache__/__init__.cpython-39.pyc +0 -0
  7. AR/data/__pycache__/bucket_sampler.cpython-310.pyc +0 -0
  8. AR/data/__pycache__/bucket_sampler.cpython-39.pyc +0 -0
  9. AR/data/__pycache__/data_module.cpython-310.pyc +0 -0
  10. AR/data/__pycache__/data_module.cpython-39.pyc +0 -0
  11. AR/data/__pycache__/dataset.cpython-310.pyc +0 -0
  12. AR/data/__pycache__/dataset.cpython-39.pyc +0 -0
  13. AR/data/bucket_sampler.py +157 -0
  14. AR/data/data_module.py +66 -0
  15. AR/data/dataset.py +302 -0
  16. AR/exps/__init__.py +0 -0
  17. AR/exps/beats/BEATs.py +179 -0
  18. AR/exps/beats/README.md +127 -0
  19. AR/exps/beats/Tokenizers.py +172 -0
  20. AR/exps/beats/__init__.py +2 -0
  21. AR/exps/beats/backbone.py +791 -0
  22. AR/exps/beats/config.py +19 -0
  23. AR/exps/beats/modules.py +220 -0
  24. AR/exps/beats/ontology.json +0 -0
  25. AR/exps/beats/quantizer.py +235 -0
  26. AR/exps/get_beats_librilight.py +321 -0
  27. AR/exps/get_phones.py +232 -0
  28. AR/exps/get_phones_librilight.py +198 -0
  29. AR/exps/get_txt_librilight.py +255 -0
  30. AR/exps/split_train_val.py +35 -0
  31. AR/exps/t2s.py +197 -0
  32. AR/exps/test.py +139 -0
  33. AR/exps/text.txt +10 -0
  34. AR/exps/train.py +103 -0
  35. AR/exps/train_librilight_6k.py +170 -0
  36. AR/models/__init__.py +0 -0
  37. AR/models/__pycache__/__init__.cpython-310.pyc +0 -0
  38. AR/models/__pycache__/__init__.cpython-39.pyc +0 -0
  39. AR/models/__pycache__/t2s_lightning_module.cpython-310.pyc +0 -0
  40. AR/models/__pycache__/t2s_lightning_module.cpython-39.pyc +0 -0
  41. AR/models/__pycache__/t2s_model.cpython-310.pyc +0 -0
  42. AR/models/__pycache__/t2s_model.cpython-39.pyc +0 -0
  43. AR/models/__pycache__/utils.cpython-310.pyc +0 -0
  44. AR/models/__pycache__/utils.cpython-39.pyc +0 -0
  45. AR/models/t2s_lightning_module.py +128 -0
  46. AR/models/t2s_model.py +298 -0
  47. AR/models/utils.py +164 -0
  48. AR/modules/__init__.py +0 -0
  49. AR/modules/__pycache__/__init__.cpython-310.pyc +0 -0
  50. AR/modules/__pycache__/__init__.cpython-39.pyc +0 -0
AR/__init__.py ADDED
File without changes
AR/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (135 Bytes)
AR/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (138 Bytes)
AR/data/__init__.py ADDED
File without changes
AR/data/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (140 Bytes)
AR/data/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (143 Bytes)
AR/data/__pycache__/bucket_sampler.cpython-310.pyc ADDED
Binary file (4.42 kB)
AR/data/__pycache__/bucket_sampler.cpython-39.pyc ADDED
Binary file (4.39 kB)
AR/data/__pycache__/data_module.cpython-310.pyc ADDED
Binary file (2.27 kB)
AR/data/__pycache__/data_module.cpython-39.pyc ADDED
Binary file (2.29 kB)
AR/data/__pycache__/dataset.cpython-310.pyc ADDED
Binary file (6.58 kB)
AR/data/__pycache__/dataset.cpython-39.pyc ADDED
Binary file (6.57 kB)
AR/data/bucket_sampler.py ADDED
@@ -0,0 +1,157 @@
+ # modified from https://github.com/feng-yufei/shared_debugging_code/blob/main/bucketsampler.py
+ import itertools
+ import math
+ import random
+ from random import shuffle
+ from typing import Iterator
+ from typing import Optional
+ from typing import TypeVar
+
+ import torch
+ import torch.distributed as dist
+ from torch.utils.data import Dataset
+ from torch.utils.data import Sampler
+
+ __all__ = [
+     "DistributedBucketSampler",
+ ]
+
+ T_co = TypeVar('T_co', covariant=True)
+
+
+ class DistributedBucketSampler(Sampler[T_co]):
+     r"""
+     Sort the dataset w.r.t. input length, divide the samples into buckets,
+     sort within each bucket, divide the buckets into batches, and sort the batches.
+     """
+
+     def __init__(self,
+                  dataset: Dataset,
+                  num_replicas: Optional[int]=None,
+                  rank: Optional[int]=None,
+                  shuffle: bool=True,
+                  seed: int=0,
+                  drop_last: bool=False,
+                  batch_size: int=32) -> None:
+         if num_replicas is None:
+             if not dist.is_available():
+                 raise RuntimeError(
+                     "Requires distributed package to be available")
+             num_replicas = dist.get_world_size()
+         if rank is None:
+             if not dist.is_available():
+                 raise RuntimeError(
+                     "Requires distributed package to be available")
+             rank = dist.get_rank()
+             torch.cuda.set_device(rank)
+         if rank >= num_replicas or rank < 0:
+             raise ValueError("Invalid rank {}, rank should be in the interval"
+                              " [0, {}]".format(rank, num_replicas - 1))
+         self.dataset = dataset
+         self.num_replicas = num_replicas
+         self.rank = rank
+         self.epoch = 0
+         self.drop_last = drop_last
+         # If the dataset length is evenly divisible by # of replicas, then there
+         # is no need to drop any data, since the dataset will be split equally.
+         if self.drop_last and len(
+                 self.dataset) % self.num_replicas != 0:  # type: ignore[arg-type]
+             # Split to nearest available length that is evenly divisible.
+             # This is to ensure each rank receives the same amount of data when
+             # using this Sampler.
+             self.num_samples = math.ceil(
+                 (len(self.dataset) - self.num_replicas) /
+                 self.num_replicas  # type: ignore[arg-type]
+             )
+         else:
+             self.num_samples = math.ceil(
+                 len(self.dataset) / self.num_replicas)  # type: ignore[arg-type]
+         self.total_size = self.num_samples * self.num_replicas
+         self.shuffle = shuffle
+         self.seed = seed
+         self.batch_size = batch_size
+         self.id_with_length = self._get_sample_lengths()
+         self.id_buckets = self.make_buckets(bucket_width=2.0)
+
+     def _get_sample_lengths(self):
+         id_with_lengths = []
+         for i in range(len(self.dataset)):
+             id_with_lengths.append((i, self.dataset.get_sample_length(i)))
+         id_with_lengths.sort(key=lambda x: x[1])
+         return id_with_lengths
+
+     def make_buckets(self, bucket_width: float=2.0):
+         buckets = []
+         cur = []
+         max_sec = bucket_width
+         for id, sec in self.id_with_length:
+             if sec < max_sec:
+                 cur.append(id)
+             else:
+                 buckets.append(cur)
+                 cur = [id]
+                 max_sec += bucket_width
+         if len(cur) > 0:
+             buckets.append(cur)
+         return buckets
+
+     def __iter__(self) -> Iterator[T_co]:
+         if self.shuffle:
+             # deterministically shuffle based on epoch and seed
+             g = torch.Generator()
+             g.manual_seed(self.seed + self.epoch)
+             random.seed(self.epoch + self.seed)
+             shuffled_bucket = []
+             for buc in self.id_buckets:
+                 buc_copy = buc.copy()
+                 shuffle(buc_copy)
+                 shuffled_bucket.append(buc_copy)
+             grouped_batch_size = self.batch_size * self.num_replicas
+             shuffled_bucket = list(itertools.chain(*shuffled_bucket))
+             n_batch = int(math.ceil(len(shuffled_bucket) / grouped_batch_size))
+             batches = [
+                 shuffled_bucket[b * grouped_batch_size:(b + 1) *
+                                 grouped_batch_size] for b in range(n_batch)
+             ]
+             shuffle(batches)
+             indices = list(itertools.chain(*batches))
+         else:
+             # type: ignore[arg-type]
+             indices = list(range(len(self.dataset)))
+
+         if not self.drop_last:
+             # add extra samples to make it evenly divisible
+             padding_size = self.total_size - len(indices)
+             if padding_size <= len(indices):
+                 indices += indices[:padding_size]
+             else:
+                 indices += (indices * math.ceil(padding_size /
+                                                 len(indices)))[:padding_size]
+         else:
+             # remove tail of data to make it evenly divisible.
+             indices = indices[:self.total_size]
+         assert len(indices) == self.total_size
+
+         # subsample
+         indices = indices[self.rank:self.total_size:self.num_replicas]
+         assert len(indices) == self.num_samples
+
+         return iter(indices)
+
+     def __len__(self) -> int:
+         return self.num_samples
+
+     def set_epoch(self, epoch: int) -> None:
+         r"""
+         Sets the epoch for this sampler. When :attr:`shuffle=True`, this ensures all replicas
+         use a different random ordering for each epoch. Otherwise, the next iteration of this
+         sampler will yield the same ordering.
+
+         Args:
+             epoch (int): Epoch number.
+         """
+         self.epoch = epoch
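A minimal usage sketch of the sampler above, assuming a toy in-memory dataset that exposes the `get_sample_length(idx)` hook the sampler calls. The dataset class and durations below are made up for illustration, and passing `num_replicas=1, rank=0` sidesteps the need for an initialized `torch.distributed` process group:

```python
import random
from torch.utils.data import Dataset

from AR.data.bucket_sampler import DistributedBucketSampler


class ToyDurationDataset(Dataset):
    """Hypothetical dataset: each item only carries a duration in seconds."""

    def __init__(self, durations):
        self.durations = durations

    def __len__(self):
        return len(self.durations)

    def __getitem__(self, idx):
        return idx

    def get_sample_length(self, idx):
        # The sampler sorts and buckets items by this per-sample duration.
        return self.durations[idx]


ds = ToyDurationDataset([random.uniform(0.5, 10.0) for _ in range(100)])

# Explicit num_replicas/rank mimic single-process training, so torch.distributed
# does not have to be initialized for this sketch.
sampler = DistributedBucketSampler(ds, num_replicas=1, rank=0, batch_size=8)
sampler.set_epoch(0)

indices = list(sampler)
print(len(indices))  # 100: every sample, ordered so a batch holds similar durations
```

Because the indices inside each grouped batch come from the same duration bucket, padding waste per batch stays low while the batch order itself is still shuffled every epoch.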
AR/data/data_module.py ADDED
@@ -0,0 +1,66 @@
+ # modified from https://github.com/feng-yufei/shared_debugging_code/blob/main/data_module.py
+ from pytorch_lightning import LightningDataModule
+ from AR.data.bucket_sampler import DistributedBucketSampler
+ from AR.data.dataset import Text2SemanticDataset
+ from torch.utils.data import DataLoader
+
+
+ class Text2SemanticDataModule(LightningDataModule):
+     def __init__(self, config, train_semantic_path, train_phoneme_path,
+                  dev_semantic_path=None, dev_phoneme_path=None):
+         super().__init__()
+         self.config = config
+         self.train_semantic_path = train_semantic_path
+         self.train_phoneme_path = train_phoneme_path
+         self.dev_semantic_path = dev_semantic_path
+         self.dev_phoneme_path = dev_phoneme_path
+         self.num_workers = self.config['data']['num_workers']
+
+     def prepare_data(self):
+         pass
+
+     def setup(self, stage=None, output_logs=False):
+         self._train_dataset = Text2SemanticDataset(
+             phoneme_path=self.train_phoneme_path,
+             semantic_path=self.train_semantic_path,
+             max_sec=self.config['data']['max_sec'],
+             pad_val=self.config['data']['pad_val'])
+         self._dev_dataset = self._train_dataset
+         # self._dev_dataset = Text2SemanticDataset(
+         #     phoneme_path=self.dev_phoneme_path,
+         #     semantic_path=self.dev_semantic_path,
+         #     max_sample=self.config['data']['max_eval_sample'],
+         #     max_sec=self.config['data']['max_sec'],
+         #     pad_val=self.config['data']['pad_val'])
+
+     def train_dataloader(self):
+         batch_size = self.config['train']['batch_size']
+         sampler = DistributedBucketSampler(
+             self._train_dataset, batch_size=batch_size)
+         return DataLoader(
+             self._train_dataset,
+             batch_size=batch_size,
+             sampler=sampler,
+             collate_fn=self._train_dataset.collate,
+             num_workers=self.num_workers,
+             persistent_workers=True,
+             prefetch_factor=16
+         )
+
+     def val_dataloader(self):
+         return DataLoader(
+             self._dev_dataset,
+             batch_size=1,
+             shuffle=False,
+             collate_fn=self._train_dataset.collate,
+             num_workers=max(self.num_workers, 12),
+             persistent_workers=True,
+             prefetch_factor=16
+         )
+
+     # Is this ever actually used?
+     def test_dataloader(self):
+         return DataLoader(
+             self._dev_dataset,
+             batch_size=1,
+             shuffle=False,
+             collate_fn=self._train_dataset.collate)
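A rough sketch of how this module gets wired up. The config keys below only mirror the lookups made in the code above (`config['data'][...]` and `config['train']['batch_size']`); the concrete values and file paths are placeholders, not taken from this commit, and `train_dataloader()` still expects a distributed context because of the `DistributedBucketSampler` it builds:

```python
from AR.data.data_module import Text2SemanticDataModule

# Hypothetical config dict; only the key layout is implied by the module above.
config = {
    "data": {"num_workers": 4, "max_sec": 54, "pad_val": 1024},
    "train": {"batch_size": 8},
}

dm = Text2SemanticDataModule(
    config,
    train_semantic_path="exp/6-name2semantic.tsv",  # placeholder paths
    train_phoneme_path="exp/2-name2text.txt",
)
dm.setup()
train_loader = dm.train_dataloader()  # DataLoader driven by DistributedBucketSampler
```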
AR/data/dataset.py ADDED
@@ -0,0 +1,302 @@
+ # modified from https://github.com/feng-yufei/shared_debugging_code/blob/main/t2s_dataset.py
+ import pdb
+ import sys
+ # sys.path.append("/data/docker/liujing04/gpt-vits/mq-vits-s1bert_no_bert")
+ import traceback, os
+ from typing import Dict
+ from typing import List
+
+ import numpy as np
+ import pandas as pd
+ import torch, json
+ from torch.utils.data import DataLoader
+ from torch.utils.data import Dataset
+ from transformers import AutoTokenizer
+
+ from text import cleaned_text_to_sequence
+ # from config import exp_dir
+
+
+ def batch_sequences(sequences: List[np.array], axis: int = 0, pad_value: int = 0):
+     seq = sequences[0]
+     ndim = seq.ndim
+     if axis < 0:
+         axis += ndim
+     dtype = seq.dtype
+     pad_value = dtype.type(pad_value)
+     seq_lengths = [seq.shape[axis] for seq in sequences]
+     max_length = np.max(seq_lengths)
+
+     padded_sequences = []
+     for seq, length in zip(sequences, seq_lengths):
+         padding = [(0, 0)] * axis + [(0, max_length - length)] + [(0, 0)] * (
+             ndim - axis - 1)
+         padded_seq = np.pad(
+             seq, padding, mode='constant', constant_values=pad_value)
+         padded_sequences.append(padded_seq)
+     batch = np.stack(padded_sequences)
+     return batch
+
+
+ class Text2SemanticDataset(Dataset):
+     """dataset class for text tokens to semantic model training."""
+
+     def __init__(self,
+                  phoneme_path: str,
+                  semantic_path: str,
+                  max_sample: int = None,
+                  max_sec: int = 100,
+                  pad_val: int = 1024,
+                  # min value of phoneme/sec
+                  min_ps_ratio: int = 3,
+                  # max value of phoneme/sec
+                  max_ps_ratio: int = 25) -> None:
+         super().__init__()
+
+         self.semantic_data = pd.read_csv(semantic_path, delimiter='\t', encoding="utf-8")
+         # get dict
+         self.path2 = phoneme_path  # "%s/2-name2text.txt" % exp_dir
+         self.path3 = "%s/3-bert" % (os.path.basename(phoneme_path))  # "%s/3-bert" % exp_dir, the bert_dir
+         self.path6 = semantic_path  # "%s/6-name2semantic.tsv" % exp_dir
+         assert os.path.exists(self.path2)
+         assert os.path.exists(self.path6)
+         self.phoneme_data = {}
+         with open(self.path2, "r", encoding="utf8") as f:
+             lines = f.read().strip("\n").split("\n")
+
+         for line in lines:
+             tmp = line.split("\t")
+             if len(tmp) != 4:
+                 continue
+             self.phoneme_data[tmp[0]] = [tmp[1], tmp[2], tmp[3]]
+
+         # self.phoneme_data = np.load(phoneme_path, allow_pickle=True).item()
+         # pad for semantic tokens
+         self.PAD: int = pad_val
+         # self.hz = 25
+         # with open("/data/docker/liujing04/gpt-vits/mq-vits-s1bert_no_bert/configs/s2.json", "r") as f: data = f.read()
+         # data = json.loads(data)["model"]["semantic_frame_rate"]  # 50hz
+         # self.hz = int(data[:-2])
+         self.hz = int(os.environ.get("hz", "25hz")[:-2])
+
+         # max seconds of semantic token
+         self.max_sec = max_sec
+         self.min_ps_ratio = min_ps_ratio
+         self.max_ps_ratio = max_ps_ratio
+
+         if max_sample is not None:
+             self.semantic_data = self.semantic_data[:max_sample]
+
+         # {idx: (semantic, phoneme)}
+         # semantic list, phoneme list
+         self.semantic_phoneme = []
+         self.item_names = []
+
+         self.inited = False
+
+         if not self.inited:
+             # run the initialization pass
+             self.init_batch()
+             self.inited = True
+             del self.semantic_data
+             del self.phoneme_data
+         # self.tokenizer = AutoTokenizer.from_pretrained("hfl/chinese-roberta-wwm-ext-large")
+         # self.tokenizer = AutoTokenizer.from_pretrained("/data/docker/liujing04/bert-vits2/Bert-VITS2-master20231106/bert/chinese-roberta-wwm-ext-large")
+
+     def init_batch(self):
+         semantic_data_len = len(self.semantic_data)
+         phoneme_data_len = len(self.phoneme_data.keys())
+         print("semantic_data_len:", semantic_data_len)
+         print("phoneme_data_len:", phoneme_data_len)
+         idx = 0
+         num_not_in = 0
+         num_deleted_bigger = 0
+         num_deleted_ps = 0
+         for i in range(semantic_data_len):
+             # iterate over the rows in order
+             # get str
+             item_name = self.semantic_data['item_name'][i]
+             # print(self.phoneme_data)
+             try:
+                 phoneme, word2ph, text = self.phoneme_data[item_name]
+             except Exception:
+                 traceback.print_exc()
+                 # print(f"{item_name} not in self.phoneme_data !")
+                 num_not_in += 1
+                 continue
+
+             semantic_str = self.semantic_data['semantic_audio'][i]
+             # get token list
+             semantic_ids = [int(idx) for idx in semantic_str.split(' ')]
+             # (T); no need to reshape to (1, T), since only the length is needed
+             # filter 1: estimate the total duration from the token count and drop samples
+             # longer than max_sec (set in the config, e.g. 60 s); 40 * 25 = 1k tokens
+             if len(semantic_ids) > self.max_sec * self.hz:
+                 num_deleted_bigger += 1
+                 continue
+             # (T,); this is fast enough to do up front, no need to handle it per item in __getitem__
+             phoneme = phoneme.split(' ')
+
+             try:
+                 phoneme_ids = cleaned_text_to_sequence(phoneme)
+             except Exception:
+                 traceback.print_exc()
+                 # print(f"{item_name} not in self.phoneme_data !")
+                 num_not_in += 1
+                 continue
+             # if len(phoneme_ids) > 400:  # filter 2: the fixed cap was replaced by the semantic/2.5 limit below
+             if len(phoneme_ids) > self.max_sec * self.hz / 2.5:
+                 num_deleted_ps += 1
+                 continue
+             # if len(semantic_ids) > 1000:  # filter 3
+             #     num_deleted_bigger += 1
+             #     continue
+
+             ps_ratio = len(phoneme_ids) / (len(semantic_ids) / self.hz)
+
+             # filter 4: keep only 3~25 phonemes per second
+             if ps_ratio > self.max_ps_ratio or ps_ratio < self.min_ps_ratio:
+                 num_deleted_ps += 1
+                 # print(item_name)
+                 continue
+
+             self.semantic_phoneme.append((semantic_ids, phoneme_ids))
+             idx += 1
+             self.item_names.append(item_name)
+
+         # with ~20 items training does not run at all, and with ~30 no checkpoint gets saved,
+         # so very small datasets are repeated until they reach at least min_num items
+         min_num = 100
+         leng = len(self.semantic_phoneme)
+         if leng < min_num:
+             tmp1 = self.semantic_phoneme
+             tmp2 = self.item_names
+             self.semantic_phoneme = []
+             self.item_names = []
+             for _ in range(max(2, int(min_num / leng))):
+                 self.semantic_phoneme += tmp1
+                 self.item_names += tmp2
+         if num_not_in > 0:
+             print(f"there are {num_not_in} semantic datas not in phoneme datas")
+         if num_deleted_bigger > 0:
+             print(
+                 f"deleted {num_deleted_bigger} audios whose duration is bigger than {self.max_sec} seconds"
+             )
+         if num_deleted_ps > 0:
+             # 4702 for LibriTTS. LibriTTS is labelled data -- does it still need filtering? Yes, it has extreme values as high as 100.
+             print(
+                 f"deleted {num_deleted_ps} audios whose phoneme/sec is bigger than {self.max_ps_ratio} or smaller than {self.min_ps_ratio}"
+             )
+         '''
+         there are 31 semantic datas not in phoneme datas
+         deleted 34 audios who's duration are bigger than 54 seconds
+         deleted 3190 audios who's phoneme/sec are bigger than 25 or smaller than 3
+         dataset.__len__(): 366463
+         '''
+         # 345410 for LibriTTS
+         print("dataset.__len__():", self.__len__())
+
+     def __get_item_names__(self) -> List[str]:
+         return self.item_names
+
+     def __len__(self) -> int:
+         return len(self.semantic_phoneme)
+
+     def __getitem__(self, idx: int) -> Dict:
+         semantic_ids, phoneme_ids = self.semantic_phoneme[idx]
+         item_name = self.item_names[idx]
+         phoneme_ids_len = len(phoneme_ids)
+         # semantic tokens target
+         semantic_ids_len = len(semantic_ids)
+
+         flag = 0
+         path_bert = "%s/%s.pt" % (self.path3, item_name)
+         if os.path.exists(path_bert):
+             bert_feature = torch.load(path_bert, map_location="cpu")
+         else:
+             flag = 1
+         if flag == 1:
+             # bert_feature = torch.zeros_like(phoneme_ids, dtype=torch.float32)
+             bert_feature = None
+         else:
+             assert bert_feature.shape[-1] == len(phoneme_ids)
+         return {
+             'idx': idx,
+             'phoneme_ids': phoneme_ids,
+             'phoneme_ids_len': phoneme_ids_len,
+             'semantic_ids': semantic_ids,
+             'semantic_ids_len': semantic_ids_len,
+             'bert_feature': bert_feature,
+         }
+
+     def get_sample_length(self, idx: int):
+         semantic_ids = self.semantic_phoneme[idx][0]
+         sec = 1.0 * len(semantic_ids) / self.hz
+         return sec
+
+     def collate(self, examples: List[Dict]) -> Dict:
+         sample_index: List[int] = []
+         phoneme_ids: List[torch.Tensor] = []
+         phoneme_ids_lens: List[int] = []
+         semantic_ids: List[torch.Tensor] = []
+         semantic_ids_lens: List[int] = []
+
+         for item in examples:
+             sample_index.append(item["idx"])
+             phoneme_ids.append(np.array(item["phoneme_ids"], dtype=np.int64))
+             semantic_ids.append(np.array(item["semantic_ids"], dtype=np.int64))
+             phoneme_ids_lens.append(item["phoneme_ids_len"])
+             semantic_ids_lens.append(item["semantic_ids_len"])
+
+         # pad 0
+         phoneme_ids = batch_sequences(phoneme_ids)
+         semantic_ids = batch_sequences(semantic_ids, pad_value=self.PAD)
+
+         # convert each batch to torch.tensor
+         phoneme_ids = torch.tensor(phoneme_ids)
+         semantic_ids = torch.tensor(semantic_ids)
+         phoneme_ids_lens = torch.tensor(phoneme_ids_lens)
+         semantic_ids_lens = torch.tensor(semantic_ids_lens)
+         bert_padded = torch.FloatTensor(len(examples), 1024, max(phoneme_ids_lens))
+         bert_padded.zero_()
+
+         for idx, item in enumerate(examples):
+             bert = item['bert_feature']
+             if bert is not None:
+                 bert_padded[idx, :, :bert.shape[-1]] = bert
+
+         return {
+             # List[int]
+             "ids": sample_index,
+             # torch.Tensor (B, max_phoneme_length)
+             "phoneme_ids": phoneme_ids,
+             # torch.Tensor (B)
+             "phoneme_ids_len": phoneme_ids_lens,
+             # torch.Tensor (B, max_semantic_ids_length)
+             "semantic_ids": semantic_ids,
+             # torch.Tensor (B)
+             "semantic_ids_len": semantic_ids_lens,
+             # torch.Tensor (B, 1024, max_phoneme_length)
+             "bert_feature": bert_padded,
+         }
+
+
+ if __name__ == '__main__':
+     root_dir = '/data/docker/liujing04/gpt-vits/prepare/dump_mix/'
+     dataset = Text2SemanticDataset(
+         phoneme_path=root_dir + 'phoneme_train.npy',
+         semantic_path=root_dir + 'semantic_train.tsv')
+
+     batch_size = 12
+     dataloader = DataLoader(
+         dataset,
+         batch_size=batch_size,
+         collate_fn=dataset.collate,
+         shuffle=False)
+     for i, batch in enumerate(dataloader):
+         if i % 1000 == 0:
+             print(i)
+         # if i == 0:
+         #     print('batch["ids"]:', batch["ids"])
+         #     print('batch["phoneme_ids"]:', batch["phoneme_ids"],
+         #           batch["phoneme_ids"].shape)
+         #     print('batch["phoneme_ids_len"]:', batch["phoneme_ids_len"],
+         #           batch["phoneme_ids_len"].shape)
+         #     print('batch["semantic_ids"]:', batch["semantic_ids"],
+         #           batch["semantic_ids"].shape)
+         #     print('batch["semantic_ids_len"]:', batch["semantic_ids_len"],
+         #           batch["semantic_ids_len"].shape)
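A small sanity check of the `batch_sequences` helper defined above: it right-pads every sequence along the chosen axis with `pad_value` until all of them match the longest one, which is how the collate function pads semantic tokens with `self.PAD`:

```python
import numpy as np

from AR.data.dataset import batch_sequences

a = np.array([5, 7, 9], dtype=np.int64)
b = np.array([1, 2, 3, 4, 5], dtype=np.int64)

batch = batch_sequences([a, b], pad_value=1024)
print(batch.shape)  # (2, 5)
print(batch[0])     # [   5    7    9 1024 1024]
```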
AR/exps/__init__.py ADDED
File without changes
AR/exps/beats/BEATs.py ADDED
@@ -0,0 +1,179 @@
+ # --------------------------------------------------------
+ # BEATs: Audio Pre-Training with Acoustic Tokenizers (https://arxiv.org/abs/2212.09058)
+ # Github source: https://github.com/microsoft/unilm/tree/master/beats
+ # Copyright (c) 2022 Microsoft
+ # Licensed under The MIT License [see LICENSE for details]
+ # Based on fairseq code bases
+ # https://github.com/pytorch/fairseq
+ # --------------------------------------------------------
+ import logging
+ from typing import Optional
+
+ import torch
+ import torch.nn as nn
+ import torchaudio.compliance.kaldi as ta_kaldi
+ from torch.nn import LayerNorm
+
+ from .backbone import TransformerEncoder
+
+ logger = logging.getLogger(__name__)
+
+
+ class BEATsConfig:
+     def __init__(self, cfg=None):
+         self.input_patch_size: int = -1  # patch size of patch embedding
+         self.embed_dim: int = 512  # patch embedding dimension
+         self.conv_bias: bool = False  # include bias in conv encoder
+
+         self.encoder_layers: int = 12  # num encoder layers in the transformer
+         self.encoder_embed_dim: int = 768  # encoder embedding dimension
+         self.encoder_ffn_embed_dim: int = 3072  # encoder embedding dimension for FFN
+         self.encoder_attention_heads: int = 12  # num encoder attention heads
+         self.activation_fn: str = "gelu"  # activation function to use
+
+         self.layer_wise_gradient_decay_ratio: float = 1.0  # ratio for layer-wise gradient decay
+         self.layer_norm_first: bool = False  # apply layernorm first in the transformer
+         self.deep_norm: bool = False  # apply deep_norm first in the transformer
+
+         # dropouts
+         self.dropout: float = 0.1  # dropout probability for the transformer
+         self.attention_dropout: float = 0.1  # dropout probability for attention weights
+         self.activation_dropout: float = 0.0  # dropout probability after activation in FFN
+         self.encoder_layerdrop: float = 0.0  # probability of dropping a transformer layer
+         self.dropout_input: float = 0.0  # dropout to apply to the input (after feat extr)
+
+         # positional embeddings
+         self.conv_pos: int = 128  # number of filters for convolutional positional embeddings
+         self.conv_pos_groups: int = 16  # number of groups for convolutional positional embedding
+
+         # relative position embedding
+         self.relative_position_embedding: bool = False  # apply relative position embedding
+         self.num_buckets: int = 320  # number of buckets for relative position embedding
+         self.max_distance: int = 1280  # maximum distance for relative position embedding
+         self.gru_rel_pos: bool = False  # apply gated relative position embedding
+
+         # label predictor
+         self.finetuned_model: bool = False  # whether the model is a fine-tuned model.
+         self.predictor_dropout: float = 0.1  # dropout probability for the predictor
+         self.predictor_class: int = 527  # target class number for the predictor
+
+         if cfg is not None:
+             self.update(cfg)
+
+     def update(self, cfg: dict):
+         self.__dict__.update(cfg)
+
+
+ class BEATs(nn.Module):
+     def __init__(
+             self,
+             cfg: BEATsConfig, ) -> None:
+         super().__init__()
+         logger.info(f"BEATs Config: {cfg.__dict__}")
+
+         self.cfg = cfg
+
+         self.embed = cfg.embed_dim
+         self.post_extract_proj = (nn.Linear(self.embed, cfg.encoder_embed_dim)
+                                   if self.embed != cfg.encoder_embed_dim else
+                                   None)
+
+         self.input_patch_size = cfg.input_patch_size
+         self.patch_embedding = nn.Conv2d(
+             1,
+             self.embed,
+             kernel_size=self.input_patch_size,
+             stride=self.input_patch_size,
+             bias=cfg.conv_bias)
+
+         self.dropout_input = nn.Dropout(cfg.dropout_input)
+
+         assert not cfg.deep_norm or not cfg.layer_norm_first
+         self.encoder = TransformerEncoder(cfg)
+         self.layer_norm = LayerNorm(self.embed)
+
+         if cfg.finetuned_model:
+             self.predictor_dropout = nn.Dropout(cfg.predictor_dropout)
+             self.predictor = nn.Linear(cfg.encoder_embed_dim,
+                                        cfg.predictor_class)
+         else:
+             self.predictor = None
+
+     def forward_padding_mask(
+             self,
+             features: torch.Tensor,
+             padding_mask: torch.Tensor, ) -> torch.Tensor:
+         extra = padding_mask.size(1) % features.size(1)
+         if extra > 0:
+             padding_mask = padding_mask[:, :-extra]
+         padding_mask = padding_mask.view(
+             padding_mask.size(0), features.size(1), -1)
+         padding_mask = padding_mask.all(-1)
+         return padding_mask
+
+     def preprocess(
+             self,
+             source: torch.Tensor,
+             fbank_mean: float=15.41663,
+             fbank_std: float=6.55582, ) -> torch.Tensor:
+         fbanks = []
+         for waveform in source:
+             waveform = waveform.unsqueeze(0) * 2**15
+             fbank = ta_kaldi.fbank(
+                 waveform,
+                 num_mel_bins=128,
+                 sample_frequency=16000,
+                 frame_length=25,
+                 frame_shift=10)
+             fbanks.append(fbank)
+         fbank = torch.stack(fbanks, dim=0)
+         fbank = (fbank - fbank_mean) / (2 * fbank_std)
+         return fbank
+
+     def extract_features(
+             self,
+             source: torch.Tensor,
+             padding_mask: Optional[torch.Tensor]=None,
+             fbank_mean: float=15.41663,
+             fbank_std: float=6.55582, ):
+         fbank = self.preprocess(
+             source, fbank_mean=fbank_mean, fbank_std=fbank_std)
+
+         if padding_mask is not None:
+             padding_mask = self.forward_padding_mask(fbank, padding_mask)
+
+         fbank = fbank.unsqueeze(1)
+         features = self.patch_embedding(fbank)
+         features = features.reshape(features.shape[0], features.shape[1], -1)
+         features = features.transpose(1, 2)
+         features = self.layer_norm(features)
+
+         if padding_mask is not None:
+             padding_mask = self.forward_padding_mask(features, padding_mask)
+
+         if self.post_extract_proj is not None:
+             features = self.post_extract_proj(features)
+
+         x = self.dropout_input(features)
+
+         x, layer_results = self.encoder(
+             x,
+             padding_mask=padding_mask, )
+
+         if self.predictor is not None:
+             x = self.predictor_dropout(x)
+             logits = self.predictor(x)
+
+             if padding_mask is not None and padding_mask.any():
+                 logits[padding_mask] = 0
+                 logits = logits.sum(dim=1)
+                 logits = logits / (~padding_mask).sum(
+                     dim=1).unsqueeze(-1).expand_as(logits)
+             else:
+                 logits = logits.mean(dim=1)
+
+             lprobs = torch.sigmoid(logits)
+
+             return lprobs, padding_mask
+         else:
+             return x, padding_mask
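To make the feature path in `extract_features` concrete, here is a shape walk-through of the patch embedding step on its own, with an assumed `input_patch_size` of 16 (the real value comes from the checkpoint's config): a `(batch, frames, 128)` fbank is cut into 16x16 patches, flattened, and handed to the Transformer encoder as a token sequence.

```python
import torch
import torch.nn as nn

embed_dim, patch = 512, 16  # assumed values; BEATsConfig supplies the real ones
patch_embedding = nn.Conv2d(1, embed_dim, kernel_size=patch, stride=patch)

fbank = torch.randn(2, 998, 128)                 # (batch, frames, 128 mel bins), as produced by preprocess()
features = patch_embedding(fbank.unsqueeze(1))   # (2, 512, 62, 8): 998//16 x 128//16 patches
features = features.reshape(features.shape[0], features.shape[1], -1).transpose(1, 2)
print(features.shape)                            # torch.Size([2, 496, 512]) -> encoder input tokens
```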
AR/exps/beats/README.md ADDED
@@ -0,0 +1,127 @@
1
+
2
+ # BEATs
3
+
4
+ [**BEATs**](https://arxiv.org/abs/2212.09058): **Audio Pre-Training with Acoustic Tokenizers**
5
+
6
+ Official PyTorch implementation and pretrained models of BEATs
7
+
8
+ ## Pre-Trained and Fine-Tuned Tokenizers and Models
9
+ Iterations | Tokenizer | Pre-Trained Model | AudioSet Fine-Tuned Model 1 | AudioSet Fine-Tuned Model 2
10
+ |---|---|---|---|---
11
+ Iter1 | Random Projection | [BEATs_iter1](https://valle.blob.core.windows.net/share/BEATs/BEATs_iter1.pt?sv=2020-08-04&st=2023-03-01T07%3A51%3A05Z&se=2033-03-02T07%3A51%3A00Z&sr=c&sp=rl&sig=QJXmSJG9DbMKf48UDIU1MfzIro8HQOf3sqlNXiflY1I%3D) | [Fine-tuned BEATs_iter1 (cpt1)](https://valle.blob.core.windows.net/share/BEATs/BEATs_iter1_finetuned_on_AS2M_cpt1.pt?sv=2020-08-04&st=2023-03-01T07%3A51%3A05Z&se=2033-03-02T07%3A51%3A00Z&sr=c&sp=rl&sig=QJXmSJG9DbMKf48UDIU1MfzIro8HQOf3sqlNXiflY1I%3D) | [Fine-tuned BEATs_iter1 (cpt2)](https://valle.blob.core.windows.net/share/BEATs/BEATs_iter1_finetuned_on_AS2M_cpt2.pt?sv=2020-08-04&st=2023-03-01T07%3A51%3A05Z&se=2033-03-02T07%3A51%3A00Z&sr=c&sp=rl&sig=QJXmSJG9DbMKf48UDIU1MfzIro8HQOf3sqlNXiflY1I%3D) |
12
+ Iter2 | [Tokenizer_iter2](https://valle.blob.core.windows.net/share/BEATs/Tokenizer_iter2.pt?sv=2020-08-04&st=2023-03-01T07%3A51%3A05Z&se=2033-03-02T07%3A51%3A00Z&sr=c&sp=rl&sig=QJXmSJG9DbMKf48UDIU1MfzIro8HQOf3sqlNXiflY1I%3D)| [BEATs_iter2](https://valle.blob.core.windows.net/share/BEATs/BEATs_iter2.pt?sv=2020-08-04&st=2023-03-01T07%3A51%3A05Z&se=2033-03-02T07%3A51%3A00Z&sr=c&sp=rl&sig=QJXmSJG9DbMKf48UDIU1MfzIro8HQOf3sqlNXiflY1I%3D) | [Fine-tuned BEATs_iter2 (cpt1)](https://valle.blob.core.windows.net/share/BEATs/BEATs_iter2_finetuned_on_AS2M_cpt1.pt?sv=2020-08-04&st=2023-03-01T07%3A51%3A05Z&se=2033-03-02T07%3A51%3A00Z&sr=c&sp=rl&sig=QJXmSJG9DbMKf48UDIU1MfzIro8HQOf3sqlNXiflY1I%3D) | [Fine-tuned BEATs_iter2 (cpt2)](https://valle.blob.core.windows.net/share/BEATs/BEATs_iter2_finetuned_on_AS2M_cpt2.pt?sv=2020-08-04&st=2023-03-01T07%3A51%3A05Z&se=2033-03-02T07%3A51%3A00Z&sr=c&sp=rl&sig=QJXmSJG9DbMKf48UDIU1MfzIro8HQOf3sqlNXiflY1I%3D) |
13
+ Iter3 | [Tokenizer_iter3](https://valle.blob.core.windows.net/share/BEATs/Tokenizer_iter3.pt?sv=2020-08-04&st=2023-03-01T07%3A51%3A05Z&se=2033-03-02T07%3A51%3A00Z&sr=c&sp=rl&sig=QJXmSJG9DbMKf48UDIU1MfzIro8HQOf3sqlNXiflY1I%3D)| [BEATs_iter3](https://valle.blob.core.windows.net/share/BEATs/BEATs_iter3.pt?sv=2020-08-04&st=2023-03-01T07%3A51%3A05Z&se=2033-03-02T07%3A51%3A00Z&sr=c&sp=rl&sig=QJXmSJG9DbMKf48UDIU1MfzIro8HQOf3sqlNXiflY1I%3D) | [Fine-tuned BEATs_iter3 (cpt1)](https://valle.blob.core.windows.net/share/BEATs/BEATs_iter3_finetuned_on_AS2M_cpt1.pt?sv=2020-08-04&st=2023-03-01T07%3A51%3A05Z&se=2033-03-02T07%3A51%3A00Z&sr=c&sp=rl&sig=QJXmSJG9DbMKf48UDIU1MfzIro8HQOf3sqlNXiflY1I%3D) | [Fine-tuned BEATs_iter3 (cpt2)](https://valle.blob.core.windows.net/share/BEATs/BEATs_iter3_finetuned_on_AS2M_cpt2.pt?sv=2020-08-04&st=2023-03-01T07%3A51%3A05Z&se=2033-03-02T07%3A51%3A00Z&sr=c&sp=rl&sig=QJXmSJG9DbMKf48UDIU1MfzIro8HQOf3sqlNXiflY1I%3D) |
14
+ Iter3+ | [Tokenizer_iter3+ (AS20K)](https://valle.blob.core.windows.net/share/BEATs/Tokenizer_iter3_plus_AS20K.pt?sv=2020-08-04&st=2023-03-01T07%3A51%3A05Z&se=2033-03-02T07%3A51%3A00Z&sr=c&sp=rl&sig=QJXmSJG9DbMKf48UDIU1MfzIro8HQOf3sqlNXiflY1I%3D)| [BEATs_iter3+ (AS20K)](https://valle.blob.core.windows.net/share/BEATs/BEATs_iter3_plus_AS20K.pt?sv=2020-08-04&st=2023-03-01T07%3A51%3A05Z&se=2033-03-02T07%3A51%3A00Z&sr=c&sp=rl&sig=QJXmSJG9DbMKf48UDIU1MfzIro8HQOf3sqlNXiflY1I%3D) | [Fine-tuned BEATs_iter3+ (AS20K) (cpt1)](https://valle.blob.core.windows.net/share/BEATs/BEATs_iter3_plus_AS20K_finetuned_on_AS2M_cpt1.pt?sv=2020-08-04&st=2023-03-01T07%3A51%3A05Z&se=2033-03-02T07%3A51%3A00Z&sr=c&sp=rl&sig=QJXmSJG9DbMKf48UDIU1MfzIro8HQOf3sqlNXiflY1I%3D) | [Fine-tuned BEATs_iter3+ (AS20K) (cpt2)](https://valle.blob.core.windows.net/share/BEATs/BEATs_iter3_plus_AS20K_finetuned_on_AS2M_cpt2.pt?sv=2020-08-04&st=2023-03-01T07%3A51%3A05Z&se=2033-03-02T07%3A51%3A00Z&sr=c&sp=rl&sig=QJXmSJG9DbMKf48UDIU1MfzIro8HQOf3sqlNXiflY1I%3D) |
15
+ Iter3+ | [Tokenizer_iter3+ (AS2M)](https://valle.blob.core.windows.net/share/BEATs/Tokenizer_iter3_plus_AS2M.pt?sv=2020-08-04&st=2023-03-01T07%3A51%3A05Z&se=2033-03-02T07%3A51%3A00Z&sr=c&sp=rl&sig=QJXmSJG9DbMKf48UDIU1MfzIro8HQOf3sqlNXiflY1I%3D)| [BEATs_iter3+ (AS2M)](https://valle.blob.core.windows.net/share/BEATs/BEATs_iter3_plus_AS2M.pt?sv=2020-08-04&st=2023-03-01T07%3A51%3A05Z&se=2033-03-02T07%3A51%3A00Z&sr=c&sp=rl&sig=QJXmSJG9DbMKf48UDIU1MfzIro8HQOf3sqlNXiflY1I%3D) | [Fine-tuned BEATs_iter3+ (AS2M) (cpt1)](https://valle.blob.core.windows.net/share/BEATs/BEATs_iter3_plus_AS2M_finetuned_on_AS2M_cpt1.pt?sv=2020-08-04&st=2023-03-01T07%3A51%3A05Z&se=2033-03-02T07%3A51%3A00Z&sr=c&sp=rl&sig=QJXmSJG9DbMKf48UDIU1MfzIro8HQOf3sqlNXiflY1I%3D) | [Fine-tuned BEATs_iter3+ (AS2M) (cpt2)](https://valle.blob.core.windows.net/share/BEATs/BEATs_iter3_plus_AS2M_finetuned_on_AS2M_cpt2.pt?sv=2020-08-04&st=2023-03-01T07%3A51%3A05Z&se=2033-03-02T07%3A51%3A00Z&sr=c&sp=rl&sig=QJXmSJG9DbMKf48UDIU1MfzIro8HQOf3sqlNXiflY1I%3D) |
16
+
17
+
18
+ ### Load Tokenizers
19
+
20
+ ```python
21
+ import torch
22
+ from Tokenizers import TokenizersConfig, Tokenizers
23
+
24
+ # load the pre-trained checkpoints
25
+ checkpoint = torch.load('/path/to/tokenizer.pt')
26
+
27
+ cfg = TokenizersConfig(checkpoint['cfg'])
28
+ BEATs_tokenizer = Tokenizers(cfg)
29
+ BEATs_tokenizer.load_state_dict(checkpoint['model'])
30
+ BEATs_tokenizer.eval()
31
+
32
+ # tokenize the audio and generate the labels
33
+ audio_input_16khz = torch.randn(1, 10000)
34
+ padding_mask = torch.zeros(1, 10000).bool()
35
+
36
+ labels = BEATs_tokenizer.extract_labels(audio_input_16khz, padding_mask=padding_mask)
37
+ ```
38
+
39
+
40
+ ### Load Pre-Trained Models
41
+
42
+ ```python
43
+ import torch
44
+ from BEATs import BEATs, BEATsConfig
45
+
46
+ # load the pre-trained checkpoints
47
+ checkpoint = torch.load('/path/to/model.pt')
48
+
49
+ cfg = BEATsConfig(checkpoint['cfg'])
50
+ BEATs_model = BEATs(cfg)
51
+ BEATs_model.load_state_dict(checkpoint['model'])
52
+ BEATs_model.eval()
53
+
54
+ # extract the the audio representation
55
+ audio_input_16khz = torch.randn(1, 10000)
56
+ padding_mask = torch.zeros(1, 10000).bool()
57
+
58
+ representation = BEATs_model.extract_features(audio_input_16khz, padding_mask=padding_mask)[0]
59
+ ```
60
+
61
+
62
+ ### Load Fine-tuned Models
63
+
64
+ ```python
65
+ import torch
66
+ from BEATs import BEATs, BEATsConfig
67
+
68
+ # load the fine-tuned checkpoints
69
+ checkpoint = torch.load('/path/to/model.pt')
70
+
71
+ cfg = BEATsConfig(checkpoint['cfg'])
72
+ BEATs_model = BEATs(cfg)
73
+ BEATs_model.load_state_dict(checkpoint['model'])
74
+ BEATs_model.eval()
75
+
76
+ # predict the classification probability of each class
77
+ audio_input_16khz = torch.randn(3, 10000)
78
+ padding_mask = torch.zeros(3, 10000).bool()
79
+
80
+ probs = BEATs_model.extract_features(audio_input_16khz, padding_mask=padding_mask)[0]
81
+
82
+ for i, (top5_label_prob, top5_label_idx) in enumerate(zip(*probs.topk(k=5))):
83
+ top5_label = [checkpoint['label_dict'][label_idx.item()] for label_idx in top5_label_idx]
84
+ print(f'Top 5 predicted labels of the {i}th audio are {top5_label} with probability of {top5_label_prob}')
85
+ ```
86
+
87
+ ## Evaluation Results
88
+
89
+ ### Comparing with the SOTA Single Models
90
+ ![alt text](Evaluation_Results/Comparing_with_the_SOTA_Single_Models.png)
91
+
92
+
93
+ ### Comparing with the SOTA Ensemble Models
94
+ ![alt text](Evaluation_Results/Comparing_with_the_SOTA_Ensemble_Models.png)
95
+
96
+
97
+ ### Comparing Different BEATS Tokenizers
98
+ ![alt text](Evaluation_Results/Comparing_Different_BEATS_Tokenizers.png)
99
+
100
+
101
+ ### Comparing Different Pre-Training Targets
102
+ ![alt text](Evaluation_Results/Comparing_Different_Pre-Training_Targets.png)
103
+
104
+
105
+ ## License
106
+ This project is licensed under the license found in the LICENSE file in the root directory of this source tree.
107
+ Portions of the source code are based on the [FAIRSEQ](https://github.com/pytorch/fairseq) and [VQGAN](https://github.com/CompVis/taming-transformers) project.
108
+
109
+ [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct)
110
+
111
+
112
+ ### Reference
113
+ If you find our work is useful in your research, please cite the following paper:
114
+ ``` latex
115
+ @article{Chen2022beats,
116
+ title = {BEATs: Audio Pre-Training with Acoustic Tokenizers},
117
+ author = {Sanyuan Chen and Yu Wu and Chengyi Wang and Shujie Liu and Daniel Tompkins and Zhuo Chen and Furu Wei},
118
+ eprint={2212.09058},
119
+ archivePrefix={arXiv},
120
+ year={2022}
121
+ }
122
+ ```
123
+ ### Contact Information
124
+
125
+ For help or issues using BEATs models, please submit a GitHub issue.
126
+
127
+ For other communications related to BEATs, please contact Yu Wu (`yuwu1@microsoft.com`).
AR/exps/beats/Tokenizers.py ADDED
@@ -0,0 +1,172 @@
+ # --------------------------------------------------------
+ # BEATs: Audio Pre-Training with Acoustic Tokenizers (https://arxiv.org/abs/2212.09058)
+ # Github source: https://github.com/microsoft/unilm/tree/master/beats
+ # Copyright (c) 2022 Microsoft
+ # Licensed under The MIT License [see LICENSE for details]
+ # Based on fairseq code bases
+ # https://github.com/pytorch/fairseq
+ # --------------------------------------------------------
+ import logging
+ from typing import Optional
+
+ import torch
+ import torch.nn as nn
+ import torchaudio.compliance.kaldi as ta_kaldi
+ from backbone import (
+     TransformerEncoder, )
+ from quantizer import (
+     NormEMAVectorQuantizer, )
+ from torch.nn import LayerNorm
+
+ logger = logging.getLogger(__name__)
+
+
+ class TokenizersConfig:
+     def __init__(self, cfg=None):
+         self.input_patch_size: int = -1  # patch size of patch embedding
+         self.embed_dim: int = 512  # patch embedding dimension
+         self.conv_bias: bool = False  # include bias in conv encoder
+
+         self.encoder_layers: int = 12  # num encoder layers in the transformer
+         self.encoder_embed_dim: int = 768  # encoder embedding dimension
+         self.encoder_ffn_embed_dim: int = 3072  # encoder embedding dimension for FFN
+         self.encoder_attention_heads: int = 12  # num encoder attention heads
+         self.activation_fn: str = "gelu"  # activation function to use
+
+         self.layer_norm_first: bool = False  # apply layernorm first in the transformer
+         self.deep_norm: bool = False  # apply deep_norm first in the transformer
+
+         # dropouts
+         self.dropout: float = 0.1  # dropout probability for the transformer
+         self.attention_dropout: float = 0.1  # dropout probability for attention weights
+         self.activation_dropout: float = 0.0  # dropout probability after activation in FFN
+         self.encoder_layerdrop: float = 0.0  # probability of dropping a transformer layer
+         self.dropout_input: float = 0.0  # dropout to apply to the input (after feat extr)
+
+         # positional embeddings
+         self.conv_pos: int = 128  # number of filters for convolutional positional embeddings
+         self.conv_pos_groups: int = 16  # number of groups for convolutional positional embedding
+
+         # relative position embedding
+         self.relative_position_embedding: bool = False  # apply relative position embedding
+         self.num_buckets: int = 320  # number of buckets for relative position embedding
+         self.max_distance: int = 1280  # maximum distance for relative position embedding
+         self.gru_rel_pos: bool = False  # apply gated relative position embedding
+
+         # quantizer
+         self.quant_n: int = 1024  # codebook number in quantizer
+         self.quant_dim: int = 256  # codebook dimension in quantizer
+
+         if cfg is not None:
+             self.update(cfg)
+
+     def update(self, cfg: dict):
+         self.__dict__.update(cfg)
+
+
+ class Tokenizers(nn.Module):
+     def __init__(
+             self,
+             cfg: TokenizersConfig, ) -> None:
+         super().__init__()
+         logger.info(f"Tokenizers Config: {cfg.__dict__}")
+
+         self.cfg = cfg
+
+         self.embed = cfg.embed_dim
+         self.post_extract_proj = (nn.Linear(self.embed, cfg.encoder_embed_dim)
+                                   if self.embed != cfg.encoder_embed_dim else
+                                   None)
+
+         self.input_patch_size = cfg.input_patch_size
+         self.patch_embedding = nn.Conv2d(
+             1,
+             self.embed,
+             kernel_size=self.input_patch_size,
+             stride=self.input_patch_size,
+             bias=cfg.conv_bias)
+
+         self.dropout_input = nn.Dropout(cfg.dropout_input)
+
+         assert not cfg.deep_norm or not cfg.layer_norm_first
+         self.encoder = TransformerEncoder(cfg)
+         self.layer_norm = LayerNorm(self.embed)
+
+         self.quantize = NormEMAVectorQuantizer(
+             n_embed=cfg.quant_n,
+             embedding_dim=cfg.quant_dim,
+             beta=1.0,
+             kmeans_init=True,
+             decay=0.99, )
+         self.quant_n = cfg.quant_n
+         self.quantize_layer = nn.Sequential(
+             nn.Linear(cfg.encoder_embed_dim, cfg.encoder_embed_dim),
+             nn.Tanh(),
+             nn.Linear(cfg.encoder_embed_dim, cfg.quant_dim)  # for quantize
+         )
+
+     def forward_padding_mask(
+             self,
+             features: torch.Tensor,
+             padding_mask: torch.Tensor, ) -> torch.Tensor:
+         extra = padding_mask.size(1) % features.size(1)
+         if extra > 0:
+             padding_mask = padding_mask[:, :-extra]
+         padding_mask = padding_mask.view(
+             padding_mask.size(0), features.size(1), -1)
+         padding_mask = padding_mask.all(-1)
+         return padding_mask
+
+     def preprocess(
+             self,
+             source: torch.Tensor,
+             fbank_mean: float=15.41663,
+             fbank_std: float=6.55582, ) -> torch.Tensor:
+         fbanks = []
+         for waveform in source:
+             waveform = waveform.unsqueeze(0) * 2**15
+             fbank = ta_kaldi.fbank(
+                 waveform,
+                 num_mel_bins=128,
+                 sample_frequency=16000,
+                 frame_length=25,
+                 frame_shift=10)
+             fbanks.append(fbank)
+         fbank = torch.stack(fbanks, dim=0)
+         fbank = (fbank - fbank_mean) / (2 * fbank_std)
+         return fbank
+
+     def extract_labels(
+             self,
+             source: torch.Tensor,
+             padding_mask: Optional[torch.Tensor]=None,
+             fbank_mean: float=15.41663,
+             fbank_std: float=6.55582, ):
+         fbank = self.preprocess(
+             source, fbank_mean=fbank_mean, fbank_std=fbank_std)
+
+         if padding_mask is not None:
+             padding_mask = self.forward_padding_mask(fbank, padding_mask)
+
+         fbank = fbank.unsqueeze(1)
+         features = self.patch_embedding(fbank)
+         features = features.reshape(features.shape[0], features.shape[1], -1)
+         features = features.transpose(1, 2)
+         features = self.layer_norm(features)
+
+         if padding_mask is not None:
+             padding_mask = self.forward_padding_mask(features, padding_mask)
+
+         if self.post_extract_proj is not None:
+             features = self.post_extract_proj(features)
+
+         x = self.dropout_input(features)
+
+         x, layer_results = self.encoder(
+             x,
+             padding_mask=padding_mask, )
+
+         quantize_input = self.quantize_layer(x)
+         quantize_feature, embed_loss, embed_ind = self.quantize(quantize_input)
+
+         return embed_ind
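The labels returned by `extract_labels` are codebook indices. As a simplified, self-contained illustration of that final step only (this is plain nearest-neighbour assignment, not the `NormEMAVectorQuantizer` used above, and the shapes follow the `quant_n`/`quant_dim` defaults):

```python
import torch
import torch.nn.functional as F

torch.manual_seed(0)
codebook = F.normalize(torch.randn(1024, 256), dim=-1)  # quant_n x quant_dim
frames = F.normalize(torch.randn(2, 496, 256), dim=-1)  # (batch, frames, quant_dim), as after quantize_layer

# Each frame is mapped to the index of its nearest codebook entry -> discrete acoustic tokens.
dists = torch.cdist(frames, codebook.unsqueeze(0).expand(frames.size(0), -1, -1))
labels = dists.argmin(dim=-1)
print(labels.shape)  # torch.Size([2, 496])
```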
AR/exps/beats/__init__.py ADDED
@@ -0,0 +1,2 @@
+ # this folder is modified from https://github.com/microsoft/unilm/tree/master/beats
+ # ontology.json is from https://github.com/audioset/ontology/
AR/exps/beats/backbone.py ADDED
@@ -0,0 +1,791 @@
1
+ # --------------------------------------------------------
2
+ # BEATs: Audio Pre-Training with Acoustic Tokenizers (https://arxiv.org/abs/2212.09058)
3
+ # Github source: https://github.com/microsoft/unilm/tree/master/beats
4
+ # Copyright (c) 2022 Microsoft
5
+ # Licensed under The MIT License [see LICENSE for details]
6
+ # Based on fairseq code bases
7
+ # https://github.com/pytorch/fairseq
8
+ # --------------------------------------------------------
9
+ import math
10
+ from typing import Dict
11
+ from typing import Optional
12
+ from typing import Tuple
13
+
14
+ import numpy as np
15
+ import torch
16
+ import torch.nn.functional as F
17
+ from torch import nn
18
+ from torch import Tensor
19
+ from torch.nn import LayerNorm
20
+ from torch.nn import Parameter
21
+
22
+ from .modules import get_activation_fn
23
+ from .modules import GLU_Linear
24
+ from .modules import GradMultiply
25
+ from .modules import quant_noise
26
+ from .modules import SamePad
27
+
28
+
29
+ class TransformerEncoder(nn.Module):
30
+ def __init__(self, args):
31
+ super().__init__()
32
+
33
+ self.dropout = args.dropout
34
+ self.embedding_dim = args.encoder_embed_dim
35
+
36
+ self.pos_conv = nn.Conv1d(
37
+ self.embedding_dim,
38
+ self.embedding_dim,
39
+ kernel_size=args.conv_pos,
40
+ padding=args.conv_pos // 2,
41
+ groups=args.conv_pos_groups, )
42
+ dropout = 0
43
+ std = math.sqrt(
44
+ (4 * (1.0 - dropout)) / (args.conv_pos * self.embedding_dim))
45
+ nn.init.normal_(self.pos_conv.weight, mean=0, std=std)
46
+ nn.init.constant_(self.pos_conv.bias, 0)
47
+
48
+ self.pos_conv = nn.utils.weight_norm(
49
+ self.pos_conv, name="weight", dim=2)
50
+ self.pos_conv = nn.Sequential(self.pos_conv,
51
+ SamePad(args.conv_pos), nn.GELU())
52
+
53
+ if hasattr(args, "relative_position_embedding"):
54
+ self.relative_position_embedding = args.relative_position_embedding
55
+ self.num_buckets = args.num_buckets
56
+ self.max_distance = args.max_distance
57
+ else:
58
+ self.relative_position_embedding = False
59
+ self.num_buckets = 0
60
+ self.max_distance = 0
61
+
62
+ self.layers = nn.ModuleList([
63
+ TransformerSentenceEncoderLayer(
64
+ embedding_dim=self.embedding_dim,
65
+ ffn_embedding_dim=args.encoder_ffn_embed_dim,
66
+ num_attention_heads=args.encoder_attention_heads,
67
+ dropout=self.dropout,
68
+ attention_dropout=args.attention_dropout,
69
+ activation_dropout=args.activation_dropout,
70
+ activation_fn=args.activation_fn,
71
+ layer_norm_first=args.layer_norm_first,
72
+ deep_norm=args.deep_norm,
73
+ has_relative_attention_bias=self.relative_position_embedding,
74
+ num_buckets=self.num_buckets,
75
+ max_distance=self.max_distance,
76
+ gru_rel_pos=args.gru_rel_pos,
77
+ encoder_layers=args.encoder_layers, )
78
+ for i in range(args.encoder_layers)
79
+ ])
80
+ if self.relative_position_embedding:
81
+ for i in range(1, args.encoder_layers):
82
+ del self.layers[i].self_attn.relative_attention_bias
83
+ self.layers[i].self_attn.relative_attention_bias = self.layers[
84
+ 0].self_attn.relative_attention_bias
85
+
86
+ self.layer_norm_first = args.layer_norm_first
87
+ self.layer_norm = LayerNorm(self.embedding_dim)
88
+ self.layerdrop = args.encoder_layerdrop
89
+
90
+ self.apply(init_bert_params)
91
+
92
+ if args.deep_norm:
93
+ deep_norm_beta = math.pow(8 * args.encoder_layers, -1 / 4)
94
+ for i in range(args.encoder_layers):
95
+ nn.init.xavier_normal_(
96
+ self.layers[i].self_attn.k_proj.weight, gain=1)
97
+ nn.init.xavier_normal_(
98
+ self.layers[i].self_attn.v_proj.weight, gain=deep_norm_beta)
99
+ nn.init.xavier_normal_(
100
+ self.layers[i].self_attn.q_proj.weight, gain=1)
101
+ nn.init.xavier_normal_(
102
+ self.layers[i].self_attn.out_proj.weight,
103
+ gain=deep_norm_beta)
104
+ nn.init.xavier_normal_(
105
+ self.layers[i].fc1.weight, gain=deep_norm_beta)
106
+ nn.init.xavier_normal_(
107
+ self.layers[i].fc2.weight, gain=deep_norm_beta)
108
+
109
+ self.layer_wise_gradient_decay_ratio = getattr(
110
+ args, "layer_wise_gradient_decay_ratio", 1)
111
+
112
+ def forward(self, x, padding_mask=None, layer=None):
113
+ x, layer_results = self.extract_features(x, padding_mask, layer)
114
+
115
+ if self.layer_norm_first and layer is None:
116
+ x = self.layer_norm(x)
117
+
118
+ return x, layer_results
119
+
120
+ def extract_features(self, x, padding_mask=None, tgt_layer=None):
121
+
122
+ if padding_mask is not None:
123
+ x[padding_mask] = 0
124
+
125
+ x_conv = self.pos_conv(x.transpose(1, 2))
126
+ x_conv = x_conv.transpose(1, 2)
127
+ x = x + x_conv
128
+
129
+ if not self.layer_norm_first:
130
+ x = self.layer_norm(x)
131
+
132
+ x = F.dropout(x, p=self.dropout, training=self.training)
133
+
134
+ # B x T x C -> T x B x C
135
+ x = x.transpose(0, 1)
136
+
137
+ layer_results = []
138
+ z = None
139
+ if tgt_layer is not None:
140
+ layer_results.append((x, z))
141
+ r = None
142
+ pos_bias = None
143
+ for i, layer in enumerate(self.layers):
144
+ if self.layer_wise_gradient_decay_ratio != 1.0:
145
+ x = GradMultiply.apply(x, self.layer_wise_gradient_decay_ratio)
146
+ dropout_probability = np.random.random()
147
+ if not self.training or (dropout_probability > self.layerdrop):
148
+ x, z, pos_bias = layer(
149
+ x,
150
+ self_attn_padding_mask=padding_mask,
151
+ need_weights=False,
152
+ pos_bias=pos_bias)
153
+ if tgt_layer is not None:
154
+ layer_results.append((x, z))
155
+ if i == tgt_layer:
156
+ r = x
157
+ break
158
+
159
+ if r is not None:
160
+ x = r
161
+
162
+ # T x B x C -> B x T x C
163
+ x = x.transpose(0, 1)
164
+
165
+ return x, layer_results
166
+
167
+
168
+ class TransformerSentenceEncoderLayer(nn.Module):
169
+ def __init__(
170
+ self,
171
+ embedding_dim: float=768,
172
+ ffn_embedding_dim: float=3072,
173
+ num_attention_heads: float=8,
174
+ dropout: float=0.1,
175
+ attention_dropout: float=0.1,
176
+ activation_dropout: float=0.1,
177
+ activation_fn: str="relu",
178
+ layer_norm_first: bool=False,
179
+ deep_norm: bool=False,
180
+ has_relative_attention_bias: bool=False,
181
+ num_buckets: int=0,
182
+ max_distance: int=0,
183
+ rescale_init: bool=False,
184
+ gru_rel_pos: bool=False,
185
+ encoder_layers: int=0, ) -> None:
186
+
187
+ super().__init__()
188
+ self.embedding_dim = embedding_dim
189
+ self.dropout = dropout
190
+ self.activation_dropout = activation_dropout
191
+
192
+ self.activation_name = activation_fn
193
+ self.activation_fn = get_activation_fn(activation_fn)
194
+ self.self_attn = MultiheadAttention(
195
+ self.embedding_dim,
196
+ num_attention_heads,
197
+ dropout=attention_dropout,
198
+ self_attention=True,
199
+ has_relative_attention_bias=has_relative_attention_bias,
200
+ num_buckets=num_buckets,
201
+ max_distance=max_distance,
202
+ rescale_init=rescale_init,
203
+ gru_rel_pos=gru_rel_pos, )
204
+
205
+ self.dropout1 = nn.Dropout(dropout)
206
+ self.dropout2 = nn.Dropout(self.activation_dropout)
207
+ self.dropout3 = nn.Dropout(dropout)
208
+
209
+ self.layer_norm_first = layer_norm_first
210
+
211
+ self.self_attn_layer_norm = LayerNorm(self.embedding_dim)
212
+
213
+ if self.activation_name == "glu":
214
+ self.fc1 = GLU_Linear(self.embedding_dim, ffn_embedding_dim,
215
+ "swish")
216
+ else:
217
+ self.fc1 = nn.Linear(self.embedding_dim, ffn_embedding_dim)
218
+ self.fc2 = nn.Linear(ffn_embedding_dim, self.embedding_dim)
219
+
220
+ self.final_layer_norm = LayerNorm(self.embedding_dim)
221
+
222
+ self.deep_norm = deep_norm
223
+ if self.deep_norm:
224
+ self.deep_norm_alpha = math.pow(2 * encoder_layers, 1 / 4)
225
+ else:
226
+ self.deep_norm_alpha = 1
227
+
228
+ def forward(self,
229
+ x: torch.Tensor,
230
+ self_attn_mask: torch.Tensor=None,
231
+ self_attn_padding_mask: torch.Tensor=None,
232
+ need_weights: bool=False,
233
+ pos_bias=None):
234
+ residual = x
235
+
236
+ if self.layer_norm_first:
237
+ x = self.self_attn_layer_norm(x)
238
+ x, attn, pos_bias = self.self_attn(
239
+ query=x,
240
+ key=x,
241
+ value=x,
242
+ key_padding_mask=self_attn_padding_mask,
243
+ need_weights=False,
244
+ attn_mask=self_attn_mask,
245
+ position_bias=pos_bias)
246
+ x = self.dropout1(x)
247
+ x = residual + x
248
+
249
+ residual = x
250
+ x = self.final_layer_norm(x)
251
+ if self.activation_name == "glu":
252
+ x = self.fc1(x)
253
+ else:
254
+ x = self.activation_fn(self.fc1(x))
255
+ x = self.dropout2(x)
256
+ x = self.fc2(x)
257
+ x = self.dropout3(x)
258
+ x = residual + x
259
+ else:
260
+ x, attn, pos_bias = self.self_attn(
261
+ query=x,
262
+ key=x,
263
+ value=x,
264
+ key_padding_mask=self_attn_padding_mask,
265
+ need_weights=need_weights,
266
+ attn_mask=self_attn_mask,
267
+ position_bias=pos_bias)
268
+
269
+ x = self.dropout1(x)
270
+ x = residual * self.deep_norm_alpha + x
271
+
272
+ x = self.self_attn_layer_norm(x)
273
+
274
+ residual = x
275
+ if self.activation_name == "glu":
276
+ x = self.fc1(x)
277
+ else:
278
+ x = self.activation_fn(self.fc1(x))
279
+ x = self.dropout2(x)
280
+ x = self.fc2(x)
281
+ x = self.dropout3(x)
282
+ x = residual * self.deep_norm_alpha + x
283
+ x = self.final_layer_norm(x)
284
+
285
+ return x, attn, pos_bias
286
+
287
+
288
+ class MultiheadAttention(nn.Module):
289
+ """Multi-headed attention.
290
+
291
+ See "Attention Is All You Need" for more details.
292
+ """
293
+
294
+ def __init__(
295
+ self,
296
+ embed_dim,
297
+ num_heads,
298
+ kdim=None,
299
+ vdim=None,
300
+ dropout=0.0,
301
+ bias=True,
302
+ add_bias_kv=False,
303
+ add_zero_attn=False,
304
+ self_attention=False,
305
+ encoder_decoder_attention=False,
306
+ q_noise=0.0,
307
+ qn_block_size=8,
308
+ has_relative_attention_bias=False,
309
+ num_buckets=32,
310
+ max_distance=128,
311
+ gru_rel_pos=False,
312
+ rescale_init=False, ):
313
+ super().__init__()
314
+ self.embed_dim = embed_dim
315
+ self.kdim = kdim if kdim is not None else embed_dim
316
+ self.vdim = vdim if vdim is not None else embed_dim
317
+ self.qkv_same_dim = self.kdim == embed_dim and self.vdim == embed_dim
318
+
319
+ self.num_heads = num_heads
320
+ self.dropout_module = nn.Dropout(dropout)
321
+
322
+ self.has_relative_attention_bias = has_relative_attention_bias
323
+ self.num_buckets = num_buckets
324
+ self.max_distance = max_distance
325
+ if self.has_relative_attention_bias:
326
+ self.relative_attention_bias = nn.Embedding(num_buckets, num_heads)
327
+
328
+ self.head_dim = embed_dim // num_heads
329
+ self.q_head_dim = self.head_dim
330
+ self.k_head_dim = self.head_dim
331
+ assert (self.head_dim * num_heads == self.embed_dim
332
+ ), "embed_dim must be divisible by num_heads"
333
+ self.scaling = self.head_dim**-0.5
334
+
335
+ self.self_attention = self_attention
336
+ self.encoder_decoder_attention = encoder_decoder_attention
337
+
338
+ assert not self.self_attention or self.qkv_same_dim, (
339
+ "Self-attention requires query, key and "
340
+ "value to be of the same size")
341
+
342
+ k_bias = True
343
+ if rescale_init:
344
+ k_bias = False
345
+
346
+ k_embed_dim = embed_dim
347
+ q_embed_dim = embed_dim
348
+
349
+ self.k_proj = quant_noise(
350
+ nn.Linear(self.kdim, k_embed_dim, bias=k_bias), q_noise,
351
+ qn_block_size)
352
+ self.v_proj = quant_noise(
353
+ nn.Linear(self.vdim, embed_dim, bias=bias), q_noise, qn_block_size)
354
+ self.q_proj = quant_noise(
355
+ nn.Linear(embed_dim, q_embed_dim, bias=bias), q_noise,
356
+ qn_block_size)
357
+
358
+ self.out_proj = quant_noise(
359
+ nn.Linear(embed_dim, embed_dim, bias=bias), q_noise, qn_block_size)
360
+
361
+ if add_bias_kv:
362
+ self.bias_k = Parameter(torch.Tensor(1, 1, embed_dim))
363
+ self.bias_v = Parameter(torch.Tensor(1, 1, embed_dim))
364
+ else:
365
+ self.bias_k = self.bias_v = None
366
+
367
+ self.add_zero_attn = add_zero_attn
368
+
369
+ self.gru_rel_pos = gru_rel_pos
370
+ if self.gru_rel_pos:
371
+ self.grep_linear = nn.Linear(self.q_head_dim, 8)
372
+ self.grep_a = nn.Parameter(torch.ones(1, num_heads, 1, 1))
373
+
374
+ self.reset_parameters()
375
+
376
+ def reset_parameters(self):
377
+ if self.qkv_same_dim:
378
+ # Empirically observed the convergence to be much better with
379
+ # the scaled initialization
380
+ nn.init.xavier_uniform_(self.k_proj.weight, gain=1 / math.sqrt(2))
381
+ nn.init.xavier_uniform_(self.v_proj.weight, gain=1 / math.sqrt(2))
382
+ nn.init.xavier_uniform_(self.q_proj.weight, gain=1 / math.sqrt(2))
383
+ else:
384
+ nn.init.xavier_uniform_(self.k_proj.weight)
385
+ nn.init.xavier_uniform_(self.v_proj.weight)
386
+ nn.init.xavier_uniform_(self.q_proj.weight)
387
+
388
+ nn.init.xavier_uniform_(self.out_proj.weight)
389
+ if self.out_proj.bias is not None:
390
+ nn.init.constant_(self.out_proj.bias, 0.0)
391
+ if self.bias_k is not None:
392
+ nn.init.xavier_normal_(self.bias_k)
393
+ if self.bias_v is not None:
394
+ nn.init.xavier_normal_(self.bias_v)
395
+ if self.has_relative_attention_bias:
396
+ nn.init.xavier_normal_(self.relative_attention_bias.weight)
397
+
398
+ def _relative_positions_bucket(self, relative_positions,
399
+ bidirectional=True):
400
+ num_buckets = self.num_buckets
401
+ max_distance = self.max_distance
402
+ relative_buckets = 0
403
+
404
+ if bidirectional:
405
+ num_buckets = num_buckets // 2
406
+ relative_buckets += (
407
+ relative_positions > 0).to(torch.long) * num_buckets
408
+ relative_positions = torch.abs(relative_positions)
409
+ else:
410
+ relative_positions = -torch.min(
411
+ relative_positions, torch.zeros_like(relative_positions))
412
+
413
+ max_exact = num_buckets // 2
414
+ is_small = relative_positions < max_exact
415
+
416
+ relative_postion_if_large = max_exact + (
417
+ torch.log(relative_positions.float() / max_exact) / math.log(
418
+ max_distance / max_exact) *
419
+ (num_buckets - max_exact)).to(torch.long)
420
+ relative_postion_if_large = torch.min(
421
+ relative_postion_if_large,
422
+ torch.full_like(relative_postion_if_large, num_buckets - 1))
423
+
424
+ relative_buckets += torch.where(is_small, relative_positions,
425
+ relative_postion_if_large)
426
+ return relative_buckets
427
+
428
+ def compute_bias(self, query_length, key_length):
429
+ context_position = torch.arange(query_length, dtype=torch.long)[:, None]
430
+ memory_position = torch.arange(key_length, dtype=torch.long)[None, :]
431
+ relative_position = memory_position - context_position
432
+ relative_position_bucket = self._relative_positions_bucket(
433
+ relative_position, bidirectional=True)
434
+ relative_position_bucket = relative_position_bucket.to(
435
+ self.relative_attention_bias.weight.device)
436
+ values = self.relative_attention_bias(relative_position_bucket)
437
+ values = values.permute([2, 0, 1])
438
+ return values
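A quick shape sketch (an assumed usage example, not part of the committed file): `compute_bias` produces a relative-position bias of shape `(num_heads, query_length, key_length)`, which `forward` then tiles to `(bsz * num_heads, tgt_len, src_len)` before adding it to the attention logits.

    # assuming `mha` was built with has_relative_attention_bias=True and num_buckets > 0
    bias = mha.compute_bias(query_length=4, key_length=6)
    assert bias.shape == (mha.num_heads, 4, 6)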
439
+
440
+ def forward(self,
441
+ query,
442
+ key: Optional[Tensor],
443
+ value: Optional[Tensor],
444
+ key_padding_mask: Optional[Tensor]=None,
445
+ incremental_state: Optional[Dict[str, Dict[str, Optional[
446
+ Tensor]]]]=None,
447
+ need_weights: bool=True,
448
+ static_kv: bool=False,
449
+ attn_mask: Optional[Tensor]=None,
450
+ before_softmax: bool=False,
451
+ need_head_weights: bool=False,
452
+ position_bias: Optional[Tensor]=None
453
+ ) -> Tuple[Tensor, Optional[Tensor], Optional[Tensor]]:
454
+ """Input shape: Time x Batch x Channel
455
+
456
+ Args:
457
+ key_padding_mask (ByteTensor, optional): mask to exclude
458
+ keys that are pads, of shape `(batch, src_len)`, where
459
+ padding elements are indicated by 1s.
460
+ need_weights (bool, optional): return the attention weights,
461
+ averaged over heads (default: False).
462
+ attn_mask (ByteTensor, optional): typically used to
463
+ implement causal attention, where the mask prevents the
464
+ attention from looking forward in time (default: None).
465
+ before_softmax (bool, optional): return the raw attention
466
+ weights and values before the attention softmax.
467
+ need_head_weights (bool, optional): return the attention
468
+ weights for each head. Implies *need_weights*. Default:
469
+ return the average attention weights over all heads.
470
+ """
471
+ if need_head_weights:
472
+ need_weights = True
473
+
474
+ is_tpu = query.device.type == "xla"
475
+
476
+ tgt_len, bsz, embed_dim = query.size()
477
+ src_len = tgt_len
478
+ assert embed_dim == self.embed_dim
479
+ assert list(query.size()) == [tgt_len, bsz, embed_dim]
480
+ if key is not None:
481
+ src_len, key_bsz, _ = key.size()
482
+ if not torch.jit.is_scripting():
483
+ assert key_bsz == bsz
484
+ assert value is not None
485
+ assert (src_len, bsz) == value.shape[:2]
486
+
487
+ if self.has_relative_attention_bias and position_bias is None:
488
+ position_bias = self.compute_bias(tgt_len, src_len)
489
+ position_bias = position_bias.unsqueeze(0).repeat(
490
+ bsz, 1, 1, 1).view(bsz * self.num_heads, tgt_len, src_len)
491
+
492
+ if incremental_state is not None:
493
+ saved_state = self._get_input_buffer(incremental_state)
494
+ if saved_state is not None and "prev_key" in saved_state:
495
+ # previous time steps are cached - no need to recompute
496
+ # key and value if they are static
497
+ if static_kv:
498
+ assert self.encoder_decoder_attention and not self.self_attention
499
+ key = value = None
500
+ else:
501
+ saved_state = None
502
+
503
+ if self.self_attention:
504
+ q = self.q_proj(query)
505
+ k = self.k_proj(query)
506
+ v = self.v_proj(query)
507
+ elif self.encoder_decoder_attention:
508
+ # encoder-decoder attention
509
+ q = self.q_proj(query)
510
+ if key is None:
511
+ assert value is None
512
+ k = v = None
513
+ else:
514
+ k = self.k_proj(key)
515
+ v = self.v_proj(key)
516
+
517
+ else:
518
+ assert key is not None and value is not None
519
+ q = self.q_proj(query)
520
+ k = self.k_proj(key)
521
+ v = self.v_proj(value)
522
+ q *= self.scaling
523
+ alpha = 32
524
+ q *= 1 / alpha
525
+
526
+ if self.bias_k is not None:
527
+ assert self.bias_v is not None
528
+ k = torch.cat([k, self.bias_k.repeat(1, bsz, 1)])
529
+ v = torch.cat([v, self.bias_v.repeat(1, bsz, 1)])
530
+ if attn_mask is not None:
531
+ attn_mask = torch.cat(
532
+ [attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)],
533
+ dim=1)
534
+ if key_padding_mask is not None:
535
+ key_padding_mask = torch.cat(
536
+ [
537
+ key_padding_mask,
538
+ key_padding_mask.new_zeros(key_padding_mask.size(0), 1),
539
+ ],
540
+ dim=1, )
541
+
542
+ q = (q.contiguous().view(tgt_len, bsz * self.num_heads, self.q_head_dim)
543
+ .transpose(0, 1))
544
+ if k is not None:
545
+ k = (k.contiguous().view(-1, bsz * self.num_heads, self.k_head_dim)
546
+ .transpose(0, 1))
547
+ if v is not None:
548
+ v = (v.contiguous().view(-1, bsz * self.num_heads, self.head_dim)
549
+ .transpose(0, 1))
550
+
551
+ if saved_state is not None:
552
+ # saved states are stored with shape (bsz, num_heads, seq_len, head_dim)
553
+ if "prev_key" in saved_state:
554
+ _prev_key = saved_state["prev_key"]
555
+ assert _prev_key is not None
556
+ prev_key = _prev_key.view(bsz * self.num_heads, -1,
557
+ self.head_dim)
558
+ if static_kv:
559
+ k = prev_key
560
+ else:
561
+ assert k is not None
562
+ k = torch.cat([prev_key, k], dim=1)
563
+ src_len = k.size(1)
564
+ if "prev_value" in saved_state:
565
+ _prev_value = saved_state["prev_value"]
566
+ assert _prev_value is not None
567
+ prev_value = _prev_value.view(bsz * self.num_heads, -1,
568
+ self.head_dim)
569
+ if static_kv:
570
+ v = prev_value
571
+ else:
572
+ assert v is not None
573
+ v = torch.cat([prev_value, v], dim=1)
574
+ prev_key_padding_mask: Optional[Tensor] = None
575
+ if "prev_key_padding_mask" in saved_state:
576
+ prev_key_padding_mask = saved_state["prev_key_padding_mask"]
577
+ assert k is not None and v is not None
578
+ key_padding_mask = MultiheadAttention._append_prev_key_padding_mask(
579
+ key_padding_mask=key_padding_mask,
580
+ prev_key_padding_mask=prev_key_padding_mask,
581
+ batch_size=bsz,
582
+ src_len=k.size(1),
583
+ static_kv=static_kv, )
584
+
585
+ saved_state["prev_key"] = k.view(bsz, self.num_heads, -1,
586
+ self.head_dim)
587
+ saved_state["prev_value"] = v.view(bsz, self.num_heads, -1,
588
+ self.head_dim)
589
+ saved_state["prev_key_padding_mask"] = key_padding_mask
590
+ # In this branch incremental_state is never None
591
+ assert incremental_state is not None
592
+ incremental_state = self._set_input_buffer(incremental_state,
593
+ saved_state)
594
+ assert k is not None
595
+ assert k.size(1) == src_len
596
+
597
+ # This is part of a workaround to get around fork/join parallelism
598
+ # not supporting Optional types.
599
+ if key_padding_mask is not None and key_padding_mask.dim() == 0:
600
+ key_padding_mask = None
601
+
602
+ if key_padding_mask is not None:
603
+ assert key_padding_mask.size(0) == bsz
604
+ assert key_padding_mask.size(1) == src_len
605
+
606
+ if self.add_zero_attn:
607
+ assert v is not None
608
+ src_len += 1
609
+ k = torch.cat(
610
+ [k, k.new_zeros((k.size(0), 1) + k.size()[2:])], dim=1)
611
+ v = torch.cat(
612
+ [v, v.new_zeros((v.size(0), 1) + v.size()[2:])], dim=1)
613
+ if attn_mask is not None:
614
+ attn_mask = torch.cat(
615
+ [attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)],
616
+ dim=1)
617
+ if key_padding_mask is not None:
618
+ key_padding_mask = torch.cat(
619
+ [
620
+ key_padding_mask,
621
+ torch.zeros(key_padding_mask.size(0),
622
+ 1).type_as(key_padding_mask),
623
+ ],
624
+ dim=1, )
625
+
626
+ attn_weights = torch.bmm(q, k.transpose(1, 2))
627
+ attn_weights = (
628
+ attn_weights - attn_weights.max(dim=-1, keepdim=True)[0]) * alpha
629
+ attn_weights = self.apply_sparse_mask(attn_weights, tgt_len, src_len,
630
+ bsz)
631
+
632
+ assert list(
633
+ attn_weights.size()) == [bsz * self.num_heads, tgt_len, src_len]
634
+
635
+ if attn_mask is not None:
636
+ attn_mask = attn_mask.unsqueeze(0)
637
+ attn_weights += attn_mask
638
+
639
+ if key_padding_mask is not None:
640
+ # don't attend to padding symbols
641
+ attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len,
642
+ src_len)
643
+ if not is_tpu:
644
+ attn_weights = attn_weights.masked_fill(
645
+ key_padding_mask.unsqueeze(1).unsqueeze(2).to(torch.bool),
646
+ float("-inf"), )
647
+ else:
648
+ attn_weights = attn_weights.transpose(0, 2)
649
+ attn_weights = attn_weights.masked_fill(key_padding_mask,
650
+ float("-inf"))
651
+ attn_weights = attn_weights.transpose(0, 2)
652
+ attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len,
653
+ src_len)
654
+
655
+ if before_softmax:
656
+ return attn_weights, v, position_bias
657
+
658
+ if position_bias is not None:
659
+ attn_mask_rel_pos = position_bias
660
+ if self.gru_rel_pos == 1:
661
+ query_layer = q.view(bsz, self.num_heads, tgt_len,
662
+ self.q_head_dim) * alpha / self.scaling
663
+ _B, _H, _L, __ = query_layer.size()
664
+ gate_a, gate_b = torch.sigmoid(
665
+ self.grep_linear(query_layer).view(_B, _H, _L, 2, 4).sum(
666
+ -1, keepdim=False)).chunk(
667
+ 2, dim=-1)
668
+ gate_a_1 = gate_a * (gate_b * self.grep_a - 1.0) + 2.0
669
+ attn_mask_rel_pos = gate_a_1.view(bsz * self.num_heads, tgt_len,
670
+ 1) * position_bias
671
+
672
+ attn_mask_rel_pos = attn_mask_rel_pos.view(attn_weights.size())
673
+
674
+ attn_weights = attn_weights + attn_mask_rel_pos
675
+
676
+ attn_weights_float = F.softmax(attn_weights, dim=-1)
677
+ attn_weights = attn_weights_float.type_as(attn_weights)
678
+ attn_probs = self.dropout_module(attn_weights)
679
+
680
+ assert v is not None
681
+ attn = torch.bmm(attn_probs, v)
682
+ assert list(
683
+ attn.size()) == [bsz * self.num_heads, tgt_len, self.head_dim]
684
+ attn = attn.transpose(0, 1).contiguous().view(tgt_len, bsz, embed_dim)
685
+ attn = self.out_proj(attn)
686
+ attn_weights: Optional[Tensor] = None
687
+ if need_weights:
688
+ attn_weights = attn_weights_float.view(bsz, self.num_heads, tgt_len,
689
+ src_len).transpose(1, 0)
690
+ if not need_head_weights:
691
+ # average attention weights over heads
692
+ attn_weights = attn_weights.mean(dim=0)
693
+
694
+ return attn, attn_weights, position_bias
695
+
696
+ @staticmethod
697
+ def _append_prev_key_padding_mask(
698
+ key_padding_mask: Optional[Tensor],
699
+ prev_key_padding_mask: Optional[Tensor],
700
+ batch_size: int,
701
+ src_len: int,
702
+ static_kv: bool, ) -> Optional[Tensor]:
703
+ # saved key padding masks have shape (bsz, seq_len)
704
+ if prev_key_padding_mask is not None and static_kv:
705
+ new_key_padding_mask = prev_key_padding_mask
706
+ elif prev_key_padding_mask is not None and key_padding_mask is not None:
707
+ new_key_padding_mask = torch.cat(
708
+ [prev_key_padding_mask.float(), key_padding_mask.float()],
709
+ dim=1)
710
+ # During incremental decoding, as the padding token enters and
711
+ # leaves the frame, there will be a time when prev or current
712
+ # is None
713
+ elif prev_key_padding_mask is not None:
714
+ if src_len > prev_key_padding_mask.size(1):
715
+ filler = torch.zeros(
716
+ (batch_size, src_len - prev_key_padding_mask.size(1)),
717
+ device=prev_key_padding_mask.device, )
718
+ new_key_padding_mask = torch.cat(
719
+ [prev_key_padding_mask.float(), filler.float()], dim=1)
720
+ else:
721
+ new_key_padding_mask = prev_key_padding_mask.float()
722
+ elif key_padding_mask is not None:
723
+ if src_len > key_padding_mask.size(1):
724
+ filler = torch.zeros(
725
+ (batch_size, src_len - key_padding_mask.size(1)),
726
+ device=key_padding_mask.device, )
727
+ new_key_padding_mask = torch.cat(
728
+ [filler.float(), key_padding_mask.float()], dim=1)
729
+ else:
730
+ new_key_padding_mask = key_padding_mask.float()
731
+ else:
732
+ new_key_padding_mask = prev_key_padding_mask
733
+ return new_key_padding_mask
734
+
735
+ def _get_input_buffer(
736
+ self,
737
+ incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]]
738
+ ) -> Dict[str, Optional[Tensor]]:
739
+ result = self.get_incremental_state(incremental_state, "attn_state")
740
+ if result is not None:
741
+ return result
742
+ else:
743
+ empty_result: Dict[str, Optional[Tensor]] = {}
744
+ return empty_result
745
+
746
+ def _set_input_buffer(
747
+ self,
748
+ incremental_state: Dict[str, Dict[str, Optional[Tensor]]],
749
+ buffer: Dict[str, Optional[Tensor]], ):
750
+ return self.set_incremental_state(incremental_state, "attn_state",
751
+ buffer)
752
+
753
+ def apply_sparse_mask(self,
754
+ attn_weights,
755
+ tgt_len: int,
756
+ src_len: int,
757
+ bsz: int):
758
+ return attn_weights
759
+
760
+
761
+ def init_bert_params(module):
762
+ """
763
+ Initialize the weights specific to the BERT Model.
764
+ This overrides the default initializations depending on the specified arguments.
765
+ 1. If normal_init_linear_weights is set then weights of linear
766
+ layer will be initialized using the normal distribution and
767
+ bais will be set to the specified value.
768
+ 2. If normal_init_embed_weights is set then weights of embedding
769
+ layer will be initialized using the normal distribution.
770
+ 3. If normal_init_proj_weights is set then weights of
771
+ in_project_weight for MultiHeadAttention initialized using
772
+ the normal distribution (to be validated).
773
+ """
774
+
775
+ def normal_(data):
776
+ # with FSDP, module params will be on CUDA, so we cast them back to CPU
777
+ # so that the RNG is consistent with and without FSDP
778
+ data.copy_(data.cpu().normal_(mean=0.0, std=0.02).to(data.device))
779
+
780
+ if isinstance(module, nn.Linear):
781
+ normal_(module.weight.data)
782
+ if module.bias is not None:
783
+ module.bias.data.zero_()
784
+ if isinstance(module, nn.Embedding):
785
+ normal_(module.weight.data)
786
+ if module.padding_idx is not None:
787
+ module.weight.data[module.padding_idx].zero_()
788
+ if isinstance(module, MultiheadAttention):
789
+ normal_(module.q_proj.weight.data)
790
+ normal_(module.k_proj.weight.data)
791
+ normal_(module.v_proj.weight.data)
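A minimal sketch (assumed usage) of applying this initializer: `init_bert_params` is meant to be passed to `nn.Module.apply`, which walks the module tree and re-initializes every `nn.Linear`, `nn.Embedding` and `MultiheadAttention` it finds.

    import torch.nn as nn

    # hypothetical stand-in module tree; any encoder built from these layers works the same way
    model = nn.Sequential(nn.Linear(768, 3072), nn.Linear(3072, 768))
    model.apply(init_bert_params)  # weights ~ N(0, 0.02), biases and padding embeddings zeroed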
AR/exps/beats/config.py ADDED
@@ -0,0 +1,19 @@
1
+ import json
2
+ import os
3
+
4
+ # Directory containing the current script
5
+ script_dir = os.path.dirname(os.path.abspath(__file__))
6
+
7
+ # Filename of the ontology JSON file
8
+ json_filename = "ontology.json"
9
+
10
+ # Build the full path to the JSON file
11
+ json_path = os.path.join(script_dir, json_filename)
12
+
13
+ id_name_dict = {}
14
+
15
+ with open(json_path, 'r') as f:
16
+ json_items = json.load(f)
17
+ # '/m/0dgw9r' -> 'Human sounds' and etc.
18
+ for item in json_items:
19
+ id_name_dict[item['id']] = item['name']
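A small usage sketch (assumed, based on the mapping built above): `id_name_dict` maps AudioSet ontology ids to human-readable names and is consumed by `get_BEATs_top1` in `AR/exps/get_beats_librilight.py`.

    from AR.exps.beats.config import id_name_dict

    # '/m/0dgw9r' is the id glossed above as 'Human sounds'
    print(id_name_dict['/m/0dgw9r'])  # -> 'Human sounds'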
AR/exps/beats/modules.py ADDED
@@ -0,0 +1,220 @@
1
+ # --------------------------------------------------------
2
+ # BEATs: Audio Pre-Training with Acoustic Tokenizers (https://arxiv.org/abs/2212.09058)
3
+ # Github source: https://github.com/microsoft/unilm/tree/master/beats
4
+ # Copyright (c) 2022 Microsoft
5
+ # Licensed under The MIT License [see LICENSE for details]
6
+ # Based on fairseq code bases
7
+ # https://github.com/pytorch/fairseq
8
+ # --------------------------------------------------------
9
+ import math
10
+ import warnings
11
+
12
+ import torch
13
+ import torch.nn.functional as F
14
+ from torch import nn
15
+
16
+
17
+ class GradMultiply(torch.autograd.Function):
18
+ @staticmethod
19
+ def forward(ctx, x, scale):
20
+ ctx.scale = scale
21
+ res = x.new(x)
22
+ return res
23
+
24
+ @staticmethod
25
+ def backward(ctx, grad):
26
+ return grad * ctx.scale, None
27
+
28
+
29
+ class SamePad(nn.Module):
30
+ def __init__(self, kernel_size, causal=False):
31
+ super().__init__()
32
+ if causal:
33
+ self.remove = kernel_size - 1
34
+ else:
35
+ self.remove = 1 if kernel_size % 2 == 0 else 0
36
+
37
+ def forward(self, x):
38
+ if self.remove > 0:
39
+ x = x[:, :, :-self.remove]
40
+ return x
41
+
42
+
43
+ class Swish(nn.Module):
44
+ def __init__(self):
45
+ super(Swish, self).__init__()
46
+ self.act = torch.nn.Sigmoid()
47
+
48
+ def forward(self, x):
49
+ return x * self.act(x)
50
+
51
+
52
+ class GLU_Linear(nn.Module):
53
+ def __init__(self,
54
+ input_dim,
55
+ output_dim,
56
+ glu_type="sigmoid",
57
+ bias_in_glu=True):
58
+ super(GLU_Linear, self).__init__()
59
+
60
+ self.glu_type = glu_type
61
+ self.output_dim = output_dim
62
+
63
+ if glu_type == "sigmoid":
64
+ self.glu_act = torch.nn.Sigmoid()
65
+ elif glu_type == "swish":
66
+ self.glu_act = Swish()
67
+ elif glu_type == "relu":
68
+ self.glu_act = torch.nn.ReLU()
69
+ elif glu_type == "gelu":
70
+ self.glu_act = torch.nn.GELU()
71
+
72
+ if bias_in_glu:
73
+ self.linear = nn.Linear(input_dim, output_dim * 2, True)
74
+ else:
75
+ self.linear = nn.Linear(input_dim, output_dim * 2, False)
76
+
77
+ def forward(self, x):
78
+ # to be consistent with GLU_Linear, we assume the input always has the #channel (#dim) in the last dimension of the tensor, so need to switch the dimension first for 1D-Conv case
79
+ x = self.linear(x)
80
+
81
+ if self.glu_type == "bilinear":
82
+ x = (x[:, :, 0:self.output_dim] *
83
+ x[:, :, self.output_dim:self.output_dim * 2])
84
+ else:
85
+ x = (x[:, :, 0:self.output_dim] *
86
+ self.glu_act(x[:, :, self.output_dim:self.output_dim * 2]))
87
+
88
+ return x
89
+
90
+
91
+ def gelu_accurate(x):
92
+ if not hasattr(gelu_accurate, "_a"):
93
+ gelu_accurate._a = math.sqrt(2 / math.pi)
94
+ return (0.5 * x * (1 + torch.tanh(gelu_accurate._a *
95
+ (x + 0.044715 * torch.pow(x, 3)))))
96
+
97
+
98
+ def gelu(x: torch.Tensor) -> torch.Tensor:
99
+ return torch.nn.functional.gelu(x.float()).type_as(x)
100
+
101
+
102
+ def get_activation_fn(activation: str):
103
+ """Returns the activation function corresponding to `activation`"""
104
+
105
+ if activation == "relu":
106
+ return F.relu
107
+ elif activation == "gelu":
108
+ return gelu
109
+ elif activation == "gelu_fast":
110
+ warnings.warn(
111
+ "--activation-fn=gelu_fast has been renamed to gelu_accurate")
112
+ return gelu_accurate
113
+ elif activation == "gelu_accurate":
114
+ return gelu_accurate
115
+ elif activation == "tanh":
116
+ return torch.tanh
117
+ elif activation == "linear":
118
+ return lambda x: x
119
+ elif activation == "glu":
120
+ return lambda x: x
121
+ else:
122
+ raise RuntimeError(
123
+ "--activation-fn {} not supported".format(activation))
124
+
125
+
126
+ def quant_noise(module, p, block_size):
127
+ """
128
+ Wraps modules and applies quantization noise to the weights for
129
+ subsequent quantization with Iterative Product Quantization as
130
+ described in "Training with Quantization Noise for Extreme Model Compression"
131
+
132
+ Args:
133
+ - module: nn.Module
134
+ - p: amount of Quantization Noise
135
+ - block_size: size of the blocks for subsequent quantization with iPQ
136
+
137
+ Remarks:
138
+ - Module weights must have the right sizes wrt the block size
139
+ - Only Linear, Embedding and Conv2d modules are supported for the moment
140
+ - For more detail on how to quantize by blocks with convolutional weights,
141
+ see "And the Bit Goes Down: Revisiting the Quantization of Neural Networks"
142
+ - We implement the simplest form of noise here as stated in the paper
143
+ which consists in randomly dropping blocks
144
+ """
145
+
146
+ # if no quantization noise, don't register hook
147
+ if p <= 0:
148
+ return module
149
+
150
+ # supported modules
151
+ assert isinstance(module, (nn.Linear, nn.Embedding, nn.Conv2d))
152
+
153
+ # test whether module.weight has the right sizes wrt block_size
154
+ is_conv = module.weight.ndim == 4
155
+
156
+ # 2D matrix
157
+ if not is_conv:
158
+ assert (
159
+ module.weight.size(1) %
160
+ block_size == 0), "Input features must be a multiple of block sizes"
161
+
162
+ # 4D matrix
163
+ else:
164
+ # 1x1 convolutions
165
+ if module.kernel_size == (1, 1):
166
+ assert (module.in_channels % block_size == 0
167
+ ), "Input channels must be a multiple of block sizes"
168
+ # regular convolutions
169
+ else:
170
+ k = module.kernel_size[0] * module.kernel_size[1]
171
+ assert k % block_size == 0, "Kernel size must be a multiple of block size"
172
+
173
+ def _forward_pre_hook(mod, input):
174
+ # no noise for evaluation
175
+ if mod.training:
176
+ if not is_conv:
177
+ # gather weight and sizes
178
+ weight = mod.weight
179
+ in_features = weight.size(1)
180
+ out_features = weight.size(0)
181
+
182
+ # split weight matrix into blocks and randomly drop selected blocks
183
+ mask = torch.zeros(
184
+ in_features // block_size * out_features,
185
+ device=weight.device)
186
+ mask.bernoulli_(p)
187
+ mask = mask.repeat_interleave(block_size, -1).view(-1,
188
+ in_features)
189
+
190
+ else:
191
+ # gather weight and sizes
192
+ weight = mod.weight
193
+ in_channels = mod.in_channels
194
+ out_channels = mod.out_channels
195
+
196
+ # split weight matrix into blocks and randomly drop selected blocks
197
+ if mod.kernel_size == (1, 1):
198
+ mask = torch.zeros(
199
+ int(in_channels // block_size * out_channels),
200
+ device=weight.device, )
201
+ mask.bernoulli_(p)
202
+ mask = mask.repeat_interleave(block_size, -1).view(
203
+ -1, in_channels)
204
+ else:
205
+ mask = torch.zeros(
206
+ weight.size(0), weight.size(1), device=weight.device)
207
+ mask.bernoulli_(p)
208
+ mask = (
209
+ mask.unsqueeze(2).unsqueeze(3)
210
+ .repeat(1, 1, mod.kernel_size[0], mod.kernel_size[1]))
211
+
212
+ # scale weights and apply mask
213
+ mask = mask.to(
214
+ torch.
215
+ bool) # x.bool() is not currently supported in TorchScript
216
+ s = 1 / (1 - p)
217
+ mod.weight.data = s * weight.masked_fill(mask, 0)
218
+
219
+ module.register_forward_pre_hook(_forward_pre_hook)
220
+ return module
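A minimal usage sketch (assumed, not part of the commit), mirroring how `MultiheadAttention` wraps its projection layers with `quant_noise`:

    import torch.nn as nn

    proj = quant_noise(nn.Linear(768, 768, bias=True), p=0.1, block_size=8)
    # in training mode the pre-hook drops random weight blocks and rescales the rest by 1 / (1 - p);
    # in eval mode the hook is a no-op, so `proj` behaves like a plain nn.Linear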
AR/exps/beats/ontology.json ADDED
The diff for this file is too large to render. See raw diff
 
AR/exps/beats/quantizer.py ADDED
@@ -0,0 +1,235 @@
1
+ # --------------------------------------------------------
2
+ # BEATs: Audio Pre-Training with Acoustic Tokenizers (https://arxiv.org/abs/2212.09058)
3
+ # Github source: https://github.com/microsoft/unilm/tree/master/beats
4
+ # Copyright (c) 2022 Microsoft
5
+ # Licensed under The MIT License [see LICENSE for details]
6
+ # Based on VQGAN code bases
7
+ # https://github.com/CompVis/taming-transformers
8
+ # --------------------------------------------------------
9
+ import torch
10
+ import torch.distributed as distributed
11
+ import torch.nn as nn
12
+ import torch.nn.functional as F
13
+
14
+ try:
15
+ from einops import rearrange, repeat
16
+ except ImportError:
17
+ pass
18
+
19
+
20
+ def l2norm(t):
21
+ return F.normalize(t, p=2, dim=-1)
22
+
23
+
24
+ def ema_inplace(moving_avg, new, decay):
25
+ moving_avg.data.mul_(decay).add_(new, alpha=(1 - decay))
26
+
27
+
28
+ def sample_vectors(samples, num):
29
+ num_samples, device = samples.shape[0], samples.device
30
+
31
+ if num_samples >= num:
32
+ indices = torch.randperm(num_samples, device=device)[:num]
33
+ else:
34
+ indices = torch.randint(0, num_samples, (num, ), device=device)
35
+
36
+ return samples[indices]
37
+
38
+
39
+ def kmeans(samples, num_clusters, num_iters=10, use_cosine_sim=False):
40
+ dim, dtype, device = samples.shape[-1], samples.dtype, samples.device
41
+
42
+ means = sample_vectors(samples, num_clusters)
43
+
44
+ for _ in range(num_iters):
45
+ if use_cosine_sim:
46
+ dists = samples @ means.t()
47
+ else:
48
+ diffs = rearrange(samples, 'n d -> n () d') \
49
+ - rearrange(means, 'c d -> () c d')
50
+ dists = -(diffs**2).sum(dim=-1)
51
+
52
+ buckets = dists.max(dim=-1).indices
53
+ bins = torch.bincount(buckets, minlength=num_clusters)
54
+ zero_mask = bins == 0
55
+ bins_min_clamped = bins.masked_fill(zero_mask, 1)
56
+
57
+ new_means = buckets.new_zeros(num_clusters, dim, dtype=dtype)
58
+ new_means.scatter_add_(0, repeat(buckets, 'n -> n d', d=dim), samples)
59
+ new_means = new_means / bins_min_clamped[..., None]
60
+
61
+ if use_cosine_sim:
62
+ new_means = l2norm(new_means)
63
+
64
+ means = torch.where(zero_mask[..., None], means, new_means)
65
+
66
+ return means, bins
67
+
68
+
69
+ class EmbeddingEMA(nn.Module):
70
+ def __init__(self,
71
+ num_tokens,
72
+ codebook_dim,
73
+ decay=0.99,
74
+ eps=1e-5,
75
+ kmeans_init=True,
76
+ codebook_init_path=''):
77
+ super().__init__()
78
+ self.num_tokens = num_tokens
79
+ self.codebook_dim = codebook_dim
80
+ self.decay = decay
81
+ self.eps = eps
82
+ if codebook_init_path == '':
83
+ if not kmeans_init:
84
+ weight = torch.randn(num_tokens, codebook_dim)
85
+ weight = l2norm(weight)
86
+ else:
87
+ weight = torch.zeros(num_tokens, codebook_dim)
88
+ self.register_buffer('initted', torch.Tensor([not kmeans_init]))
89
+ else:
90
+ print(f"load init codebook weight from {codebook_init_path}")
91
+ codebook_ckpt_weight = torch.load(
92
+ codebook_init_path, map_location='cpu')
93
+ weight = codebook_ckpt_weight.clone()
94
+ self.register_buffer('initted', torch.Tensor([True]))
95
+
96
+ self.weight = nn.Parameter(weight, requires_grad=False)
97
+ self.cluster_size = nn.Parameter(
98
+ torch.zeros(num_tokens), requires_grad=False)
99
+ self.embed_avg = nn.Parameter(weight.clone(), requires_grad=False)
100
+ # self.register_buffer('initted', torch.Tensor([not kmeans_init]))
101
+ self.update = True
102
+
103
+ @torch.jit.ignore
104
+ def init_embed_(self, data):
105
+ if self.initted:
106
+ return
107
+ print("Performing Kemans init for codebook")
108
+ embed, cluster_size = kmeans(
109
+ data, self.num_tokens, 10, use_cosine_sim=True)
110
+ self.weight.data.copy_(embed)
111
+ self.cluster_size.data.copy_(cluster_size)
112
+ self.initted.data.copy_(torch.Tensor([True]))
113
+
114
+ def forward(self, embed_id):
115
+ return F.embedding(embed_id, self.weight)
116
+
117
+ def cluster_size_ema_update(self, new_cluster_size):
118
+ self.cluster_size.data.mul_(self.decay).add_(
119
+ new_cluster_size, alpha=1 - self.decay)
120
+
121
+ def embed_avg_ema_update(self, new_embed_avg):
122
+ self.embed_avg.data.mul_(self.decay).add_(
123
+ new_embed_avg, alpha=1 - self.decay)
124
+
125
+ def weight_update(self, num_tokens):
126
+ n = self.cluster_size.sum()
127
+ smoothed_cluster_size = (
128
+ (self.cluster_size + self.eps) / (n + num_tokens * self.eps) * n)
129
+ # normalize embedding average with smoothed cluster size
130
+ embed_normalized = self.embed_avg / smoothed_cluster_size.unsqueeze(1)
131
+ # embed_normalized = l2norm(self.embed_avg / smoothed_cluster_size.unsqueeze(1))
132
+ self.weight.data.copy_(embed_normalized)
133
+
134
+
135
+ def norm_ema_inplace(moving_avg, new, decay):
136
+ moving_avg.data.mul_(decay).add_(new, alpha=(1 - decay))
137
+ moving_avg.data.copy_(l2norm(moving_avg.data))
138
+
139
+
140
+ class NormEMAVectorQuantizer(nn.Module):
141
+ def __init__(self,
142
+ n_embed,
143
+ embedding_dim,
144
+ beta,
145
+ decay=0.99,
146
+ eps=1e-5,
147
+ statistic_code_usage=True,
148
+ kmeans_init=False,
149
+ codebook_init_path=''):
150
+ super().__init__()
151
+ self.codebook_dim = embedding_dim
152
+ self.num_tokens = n_embed
153
+ self.beta = beta
154
+ self.decay = decay
155
+
156
+ # learnable = True if orthogonal_reg_weight > 0 else False
157
+ self.embedding = EmbeddingEMA(self.num_tokens, self.codebook_dim, decay,
158
+ eps, kmeans_init, codebook_init_path)
159
+
160
+ self.statistic_code_usage = statistic_code_usage
161
+ if statistic_code_usage:
162
+ self.register_buffer('cluster_size', torch.zeros(n_embed))
163
+ if distributed.is_available() and distributed.is_initialized():
164
+ print(
165
+ "ddp is enable, so use ddp_reduce to sync the statistic_code_usage for each gpu!"
166
+ )
167
+ self.all_reduce_fn = distributed.all_reduce
168
+ else:
169
+ self.all_reduce_fn = nn.Identity()
170
+
171
+ def reset_cluster_size(self, device):
172
+ if self.statistic_code_usage:
173
+ self.register_buffer('cluster_size', torch.zeros(self.num_tokens))
174
+ self.cluster_size = self.cluster_size.to(device)
175
+
176
+ def forward(self, z):
177
+ # reshape z -> (batch, height, width, channel) and flatten
178
+ # z, 'b c h w -> b h w c'
179
+ # z = rearrange(z, 'b c h w -> b h w c')
180
+ # z = z.transpose(1, 2)
181
+ z = l2norm(z)
182
+ z_flattened = z.reshape(-1, self.codebook_dim)
183
+
184
+ self.embedding.init_embed_(z_flattened)
185
+
186
+ d = z_flattened.pow(2).sum(dim=1, keepdim=True) + \
187
+ self.embedding.weight.pow(2).sum(dim=1) - 2 * \
188
+ torch.einsum('bd,nd->bn', z_flattened, self.embedding.weight) # 'n d -> d n'
189
+
190
+ encoding_indices = torch.argmin(d, dim=1)
191
+
192
+ z_q = self.embedding(encoding_indices).view(z.shape)
193
+
194
+ encodings = F.one_hot(encoding_indices, self.num_tokens).type(z.dtype)
195
+
196
+ if not self.training:
197
+ with torch.no_grad():
198
+ cluster_size = encodings.sum(0)
199
+ self.all_reduce_fn(cluster_size)
200
+ ema_inplace(self.cluster_size, cluster_size, self.decay)
201
+
202
+ if self.training and self.embedding.update:
203
+ # EMA cluster size
204
+
205
+ bins = encodings.sum(0)
206
+ self.all_reduce_fn(bins)
207
+
208
+ # self.embedding.cluster_size_ema_update(bins)
209
+ ema_inplace(self.cluster_size, bins, self.decay)
210
+
211
+ zero_mask = (bins == 0)
212
+ bins = bins.masked_fill(zero_mask, 1.)
213
+
214
+ embed_sum = z_flattened.t() @ encodings
215
+ self.all_reduce_fn(embed_sum)
216
+
217
+ embed_normalized = (embed_sum / bins.unsqueeze(0)).t()
218
+ embed_normalized = l2norm(embed_normalized)
219
+
220
+ embed_normalized = torch.where(
221
+ zero_mask[..., None], self.embedding.weight, embed_normalized)
222
+ norm_ema_inplace(self.embedding.weight, embed_normalized,
223
+ self.decay)
224
+
225
+ # compute loss for embedding
226
+ loss = self.beta * F.mse_loss(z_q.detach(), z)
227
+
228
+ # preserve gradients
229
+ z_q = z + (z_q - z).detach()
230
+
231
+ # reshape back to match original input shape
232
+ # z_q, 'b h w c -> b c h w'
233
+ # z_q = rearrange(z_q, 'b h w c -> b c h w')
234
+ # z_q = z_q.transpose(1, 2)
235
+ return z_q, loss, encoding_indices
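A shape sketch (assumed inputs; the kmeans init path requires `einops`, imported at the top of this file): the quantizer maps a batch of feature vectors to their nearest codebook entries and returns the straight-through quantized features, the commitment loss, and the code indices.

    import torch

    vq = NormEMAVectorQuantizer(n_embed=1024, embedding_dim=256, beta=1.0, kmeans_init=True)
    z = torch.randn(16, 256)
    z_q, loss, indices = vq(z)   # z_q: (16, 256), loss: scalar, indices: (16,)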
AR/exps/get_beats_librilight.py ADDED
@@ -0,0 +1,321 @@
1
+ # Use the AudioTag tool BEATs to filter out audios whose top-1 tag is not 'speech'
2
+ # non_speech.npy stores a Python dict with the tags of non-speech audios; it is smaller and faster to load and search
3
+ # the audio_tag directory stores {utt_id}.txt files; the first line is the lowercase top-1 tag
4
+ import argparse
5
+ import os
6
+ import time
7
+ import traceback
8
+ from concurrent.futures import ThreadPoolExecutor
9
+ from pathlib import Path
10
+
11
+ import librosa
12
+ import numpy as np
13
+ import torch
14
+ import tqdm
15
+ from AR.exps.beats.BEATs import BEATs
16
+ from AR.exps.beats.BEATs import BEATsConfig
17
+ from AR.exps.beats.config import id_name_dict
18
+ from soundstorm.s2.exps.hubert.feature_utils import get_shard_range
19
+ from soundstorm.utils import check_txt_file
20
+
21
+
22
+ def get_BEATs_top1(wav,
23
+ BEATs_model,
24
+ BEATs_label_dict,
25
+ device: str='cpu',
26
+ topk: int=1):
27
+ wav = torch.tensor(wav).unsqueeze(0).to(device)
28
+ padding_mask = torch.zeros(wav.shape).bool().to(device)
29
+ probs = BEATs_model.extract_features(wav, padding_mask=padding_mask)[0]
30
+ # single-clip inference (batch size 1)
31
+ probs = probs[0]
32
+ topk_label_prob, topk_label_idx = probs.topk(k=topk)
33
+ topk_label = [
34
+ BEATs_label_dict[label_idx.item()] for label_idx in topk_label_idx
35
+ ]
36
+ topk_label_name = [id_name_dict[label] for label in topk_label]
37
+ top1_label = topk_label_name[0]
38
+ return top1_label
39
+
40
+
41
+ def process_sentence(args,
42
+ fp: Path,
43
+ train_dump_dir: Path,
44
+ dev_dump_dir: Path,
45
+ test_dump_dir: Path,
46
+ VAD_dict,
47
+ BEATs_model,
48
+ BEATs_label_dict,
49
+ device: str='cpu'):
50
+ utt_id = fp.stem
51
+ sr = args.sr
52
+ record = []
53
+ train_audio_tag_dir = train_dump_dir / "audio_tag"
54
+ train_audio_tag_dir.mkdir(parents=True, exist_ok=True)
55
+
56
+ dev_audio_tag_dir = dev_dump_dir / "audio_tag"
57
+ dev_audio_tag_dir.mkdir(parents=True, exist_ok=True)
58
+
59
+ test_audio_tag_dir = test_dump_dir / "audio_tag"
60
+ test_audio_tag_dir.mkdir(parents=True, exist_ok=True)
61
+
62
+ try:
63
+ # get info for path
64
+ wav_path_list = str(fp).strip().split('/')
65
+ sub_dataset, spk_id, book_name = wav_path_list[-4], wav_path_list[
66
+ -3], wav_path_list[-2]
67
+ wav_name = wav_path_list[-1][:-5]
68
+ assert wav_name == utt_id
69
+ # key_name for big wav
70
+ key_name = f'{wav_name}#{sub_dataset}#{spk_id}#{book_name}'
71
+ # handle the case where this audio is missing from the VAD dict
72
+ if key_name not in VAD_dict.keys():
73
+ print(key_name, 'not in VAD_dict !')
74
+ return record
75
+ wav = None
76
+ sorted_split_VAD_dict = sorted(VAD_dict[key_name].items())
77
+ len_dict = len(sorted_split_VAD_dict)
78
+ for index, item in enumerate(sorted_split_VAD_dict):
79
+ split_name, value = item
80
+ start, end = value
81
+ # train | dev | test
82
+ if index == len_dict - 1:
83
+ subset = 'test'
84
+ audio_tag_path = test_audio_tag_dir / (split_name + ".txt")
85
+ elif index == len_dict - 2:
86
+ subset = 'dev'
87
+ audio_tag_path = dev_audio_tag_dir / (split_name + ".txt")
88
+ else:
89
+ subset = 'train'
90
+ audio_tag_path = train_audio_tag_dir / (split_name + ".txt")
91
+
92
+ if os.path.exists(audio_tag_path) and check_txt_file(
93
+ audio_tag_path):
94
+ # print(audio_tag_path, 'exits!')
95
+ pass
96
+ else:
97
+ # this check ensures the big wav is loaded only once within the sub-wav loop
98
+ if wav is None:
99
+ # load big wav
100
+ # loading at the outermost level would waste time when all sub-wav features already exist
101
+ wav, _ = librosa.load(str(fp), sr=sr)
102
+ sub_wav = wav[int(start * sr):int(end * sr)]
103
+ audio_tag_top1 = get_BEATs_top1(
104
+ wav=sub_wav,
105
+ BEATs_model=BEATs_model,
106
+ BEATs_label_dict=BEATs_label_dict,
107
+ device=device)
108
+
109
+ with open(audio_tag_path, 'w') as f:
110
+ f.write(audio_tag_top1)
111
+
112
+ sub_record = {
113
+ "utt_id": split_name,
114
+ "audio_tag_path": audio_tag_path,
115
+ "subset": subset
116
+ }
117
+ # record becomes a List of Dict
118
+ record.append(sub_record)
119
+ except Exception:
120
+ print("occur Exception")
121
+ traceback.print_exc()
122
+ # record may be an incomplete list
123
+ return record
124
+ return record
125
+
126
+
127
+ def process_sentences(args,
128
+ fps: Path,
129
+ train_dump_dir: Path,
130
+ dev_dump_dir: Path,
131
+ test_dump_dir: Path,
132
+ VAD_dict,
133
+ BEATs_model,
134
+ BEATs_label_dict,
135
+ device: str='cpu',
136
+ nprocs: int=1):
137
+ print("nprocs:", nprocs)
138
+ if nprocs == 1:
139
+ results = []
140
+ for fp in tqdm.tqdm(fps, total=len(fps)):
141
+ record = process_sentence(
142
+ args=args,
143
+ fp=fp,
144
+ train_dump_dir=train_dump_dir,
145
+ dev_dump_dir=dev_dump_dir,
146
+ test_dump_dir=test_dump_dir,
147
+ VAD_dict=VAD_dict,
148
+ BEATs_model=BEATs_model,
149
+ BEATs_label_dict=BEATs_label_dict,
150
+ device=device)
151
+ if record:
152
+ results.append(record)
153
+ else:
154
+ with ThreadPoolExecutor(nprocs) as pool:
155
+ futures = []
156
+ with tqdm.tqdm(total=len(fps)) as progress:
157
+ for fp in fps:
158
+ future = pool.submit(process_sentence, args, fp,
159
+ train_dump_dir, dev_dump_dir,
160
+ test_dump_dir, VAD_dict, BEATs_model,
161
+ BEATs_label_dict, device)
162
+ future.add_done_callback(lambda p: progress.update())
163
+ futures.append(future)
164
+
165
+ results = []
166
+ for ft in futures:
167
+ record = ft.result()
168
+ if record:
169
+ results.append(record)
170
+
171
+ # torch.save() to a large `.pth` file
172
+ non_speech_dict = dict()
173
+ non_speech_dict['train'] = {}
174
+ non_speech_dict['dev'] = {}
175
+ non_speech_dict['test'] = {}
176
+ # each record is a List of Dict: one record per big wav, one sub_record per sub wav
177
+ print(f"start to save {args.rank}_{args.nshard}.npy ...")
178
+ save_start_time = time.time()
179
+ for record in tqdm.tqdm(results, total=len(results), colour='green'):
180
+ for sub_record in record:
181
+ # wrap in try/except because the txt file may be corrupted
182
+ try:
183
+ utt_id = sub_record["utt_id"]
184
+ subset = sub_record["subset"]
185
+ audio_tag_top1 = check_txt_file(sub_record["audio_tag_path"])
186
+ if audio_tag_top1 is not False:
187
+ if 'speech' not in audio_tag_top1.lower():
188
+ non_speech_dict[subset][utt_id] = audio_tag_top1
189
+ else:
190
+ # print(f'audio tag result of {utt_id} is speech')
191
+ pass
192
+ else:
193
+ print(f'audio tag result of {utt_id} is False')
194
+ except Exception:
195
+ print(f"{utt_id} occur Exception")
196
+ traceback.print_exc()
197
+ continue
198
+
199
+ train_filename = train_dump_dir / f'non_speech_{args.rank}_{args.nshard}.npy'
200
+ dev_filename = dev_dump_dir / f'non_speech_{args.rank}_{args.nshard}.npy'
201
+ test_filename = test_dump_dir / f'non_speech_{args.rank}_{args.nshard}.npy'
202
+ np.save(train_filename, non_speech_dict['train'])
203
+ print(f"npy file '{train_filename}' write down")
204
+
205
+ np.save(dev_filename, non_speech_dict['dev'])
206
+ print(f"npy file '{dev_filename}' write down")
207
+
208
+ np.save(test_filename, non_speech_dict['test'])
209
+ print(f"npy file '{test_filename}' write down")
210
+ print('time of save stage:', time.time() - save_start_time)
211
+
212
+
213
+ def main():
214
+ # parse config and args
215
+ parser = argparse.ArgumentParser(
216
+ description="Use the AudioTag tool BEATs to filter out audios whose top-1 tag is not 'speech'."
217
+ )
218
+
219
+ parser.add_argument(
220
+ "--data_dir", default=None, type=str, help="directory to dataset.")
221
+
222
+ parser.add_argument(
223
+ "--dump_dir",
224
+ type=str,
225
+ required=True,
226
+ help="directory to dump feature files.")
227
+
228
+ parser.add_argument(
229
+ "--num-cpu", type=int, default=1, help="number of process.")
230
+
231
+ parser.add_argument(
232
+ '--sr', type=int, default=16000, help='sample rate of model')
233
+
234
+ # For LibriLight dataset
235
+ parser.add_argument(
236
+ "--sub_dataset",
237
+ default="small",
238
+ type=str,
239
+ help="name of sub dataset of LibriLight",
240
+ choices=['small', 'medium', 'large', 'duplicate'], )
241
+ parser.add_argument(
242
+ "--VAD_path", type=str, default='./VAD/librilight_segment_dict.npy')
243
+ parser.add_argument("--nshard", type=int, default=3)
244
+ parser.add_argument("--rank", type=int, default=0)
245
+
246
+ # for BEATs
247
+ parser.add_argument(
248
+ "--BEATs_ckpt_path",
249
+ type=str,
250
+ default='./pretrained_model/BEATs_iter1_finetuned_on_AS2M_cpt1.pt')
251
+
252
+ args = parser.parse_args()
253
+
254
+ data_dir = Path(args.data_dir).expanduser()
255
+ dump_dir = Path(args.dump_dir).expanduser()
256
+ # use absolute path
257
+ dump_dir = dump_dir.resolve()
258
+ dump_dir.mkdir(parents=True, exist_ok=True)
259
+
260
+ assert data_dir.is_dir()
261
+
262
+ # sub_dataset here
263
+ sub_dataset_dir = data_dir / args.sub_dataset
264
+ # only spk_id in list, sorted in lexicographical order
265
+ speaker_list = sorted(os.listdir(sub_dataset_dir))
266
+ start, end = get_shard_range(len(speaker_list), args.nshard, args.rank)
267
+ # speaker_list for this rank
268
+ speaker_list = speaker_list[start:end]
269
+
270
+ all_wav_files = []
271
+
272
+ for speaker in speaker_list:
273
+ wav_files = sorted(list((sub_dataset_dir / speaker).rglob("*/*.flac")))
274
+ # filter out ._*.flac
275
+ wav_files = [
276
+ file for file in wav_files if not file.name.startswith('._')
277
+ ]
278
+ all_wav_files += wav_files
279
+
280
+ print(f"num of wav files in rank {args.rank}:", len(all_wav_files))
281
+ # get VAD info
282
+ VAD_dict = np.load(args.VAD_path, allow_pickle=True).item()
283
+
284
+ sub_dataset_dump_dir = dump_dir / args.sub_dataset
285
+ sub_dataset_dump_dir.mkdir(parents=True, exist_ok=True)
286
+ train_dump_dir = sub_dataset_dump_dir / "train"
287
+ train_dump_dir.mkdir(parents=True, exist_ok=True)
288
+ dev_dump_dir = sub_dataset_dump_dir / "dev"
289
+ dev_dump_dir.mkdir(parents=True, exist_ok=True)
290
+ test_dump_dir = sub_dataset_dump_dir / "test"
291
+ test_dump_dir.mkdir(parents=True, exist_ok=True)
292
+
293
+ BEATs_ckpt = torch.load(args.BEATs_ckpt_path)
294
+
295
+ BEATs_cfg = BEATsConfig(BEATs_ckpt['cfg'])
296
+ BEATs_model = BEATs(BEATs_cfg)
297
+ BEATs_model.load_state_dict(BEATs_ckpt['model'])
298
+ BEATs_model.eval()
299
+ # cpu or cuda
300
+ device = 'cpu'
301
+ BEATs_model.to(device)
302
+
303
+ BEATs_label_dict = BEATs_ckpt['label_dict']
304
+
305
+ # each big wav contributes one dev and one test split; the ratio is roughly 96:2:2
306
+ if all_wav_files:
307
+ process_sentences(
308
+ args=args,
309
+ fps=all_wav_files,
310
+ train_dump_dir=train_dump_dir,
311
+ dev_dump_dir=dev_dump_dir,
312
+ test_dump_dir=test_dump_dir,
313
+ VAD_dict=VAD_dict,
314
+ BEATs_model=BEATs_model,
315
+ BEATs_label_dict=BEATs_label_dict,
316
+ device=device,
317
+ nprocs=args.num_cpu)
318
+
319
+
320
+ if __name__ == "__main__":
321
+ main()
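A small sketch (hypothetical paths) of consuming the output: each `non_speech_{rank}_{nshard}.npy` is a dict mapping a split's utt_id to its non-speech top-1 tag, so a downstream filter only needs a membership test.

    import numpy as np

    # hypothetical dump layout for --dump_dir dump --sub_dataset small --rank 0 --nshard 3
    non_speech = np.load('dump/small/train/non_speech_0_3.npy', allow_pickle=True).item()
    keep_utt = 'some_split_name' not in non_speech   # True when the top-1 tag was 'speech'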
AR/exps/get_phones.py ADDED
@@ -0,0 +1,232 @@
1
+ """
2
+ 1. read text of dataset
3
+ 2. text -> IPA by GruutPhonemizer
4
+ 3. save out a *.npy dict for all text
5
+ my_dict = {"utt_id1": text1, "utt_id2": text2}
6
+ np.save(output_filename, my_dict)
7
+ my_dict = np.load(output_filename, allow_pickle=True).item()
8
+ """
9
+ import argparse
10
+ import os
+ import traceback
11
+ from concurrent.futures import ThreadPoolExecutor
12
+ from operator import itemgetter
13
+ from pathlib import Path
14
+ from typing import List
15
+
16
+ import numpy as np
17
+ import tqdm
18
+ from AR.text_processing.phonemizer import GruutPhonemizer
19
+
20
+
21
+ def read_txt(txt_file):
22
+ utt_name = txt_file.stem
23
+ utt_id = utt_name.split('.')[0]
24
+ try:
25
+ with open(txt_file, 'r') as file:
26
+ txt = file.readline()
27
+ record = {"utt_id": utt_id, "txt": txt}
28
+ except Exception:
29
+ print("occur Exception")
30
+ traceback.print_exc()
31
+ return None
32
+ return record
33
+
34
+
35
+ def read_txts(txt_files: List[Path], nprocs: int=1):
36
+ if nprocs == 1:
37
+ results = []
38
+ for txt_file in tqdm.tqdm(txt_files, total=len(txt_files)):
39
+ record = read_txt(txt_file=txt_file)
40
+ if record:
41
+ results.append(record)
42
+ else:
43
+ with ThreadPoolExecutor(nprocs) as pool:
44
+ futures = []
45
+ with tqdm.tqdm(total=len(txt_files)) as progress:
46
+ for txt_file in txt_files:
47
+ future = pool.submit(read_txt, txt_file)
48
+ future.add_done_callback(lambda p: progress.update())
49
+ futures.append(future)
50
+
51
+ results = []
52
+ for ft in futures:
53
+ record = ft.result()
54
+ if record:
55
+ results.append(record)
56
+
57
+ results.sort(key=itemgetter("utt_id"))
58
+ return_list = []
59
+ for item in results:
60
+ return_list.append((item["utt_id"], item["txt"]))
61
+ return return_list
62
+
63
+
64
+ def process_sentence(item, phonemizer):
65
+ utt_id, text = item
66
+ try:
67
+ phonemes = phonemizer.phonemize(text, espeak=False)
68
+ record = {"utt_id": utt_id, "phonemes": phonemes}
69
+ except Exception:
70
+ print("occur Exception")
71
+ traceback.print_exc()
72
+ return None
73
+ return record
74
+
75
+
76
+ def process_sentences(items, phonemizer, output_dir, nprocs: int=1):
77
+ if nprocs == 1:
78
+ results = []
79
+ for item in tqdm.tqdm(items, total=len(items)):
80
+ record = process_sentence(item=item, phonemizer=phonemizer)
81
+ if record:
82
+ results.append(record)
83
+ else:
84
+ with ThreadPoolExecutor(nprocs) as pool:
85
+ futures = []
86
+ with tqdm.tqdm(total=len(items)) as progress:
87
+ for item in items:
88
+ future = pool.submit(process_sentence, item, phonemizer)
89
+ future.add_done_callback(lambda p: progress.update())
90
+ futures.append(future)
91
+
92
+ results = []
93
+ for ft in futures:
94
+ record = ft.result()
95
+ if record:
96
+ results.append(record)
97
+ results.sort(key=itemgetter("utt_id"))
98
+ npy_dict = {}
99
+ for item in results:
100
+ utt_id = item["utt_id"]
101
+ phonemes = item["phonemes"]
102
+ npy_dict[utt_id] = phonemes
103
+ filename = output_dir / 'phonemes.npy'
104
+ np.save(filename, npy_dict)
105
+ print(f"npy file '{filename}' write down")
106
+
107
+
108
+ def main():
109
+ # parse config and args
110
+ parser = argparse.ArgumentParser(description="Get phones for datasets")
111
+
112
+ parser.add_argument(
113
+ "--dataset",
114
+ default="ljspeech",
115
+ type=str,
116
+ help="name of dataset, should in {ljspeech, libritts} now")
117
+
118
+ parser.add_argument(
119
+ "--data_dir", default=None, type=str, help="directory to dataset.")
120
+
121
+ parser.add_argument(
122
+ "--dump_dir",
123
+ type=str,
124
+ required=True,
125
+ help="directory to dump feature files.")
126
+ parser.add_argument(
127
+ "--num-cpu", type=int, default=1, help="number of process.")
128
+
129
+ args = parser.parse_args()
130
+
131
+ data_dir = Path(args.data_dir).expanduser()
132
+ dump_dir = Path(args.dump_dir).expanduser()
133
+ # use absolute path
134
+ dump_dir = dump_dir.resolve()
135
+ dump_dir.mkdir(parents=True, exist_ok=True)
136
+
137
+ assert data_dir.is_dir()
138
+
139
+ if args.dataset == "ljspeech":
140
+ data_dict = {}
141
+ text_path = data_dir / 'metadata.csv'
142
+ with open(text_path, 'r') as rf:
143
+ for line in rf:
144
+ line_list = line.strip().split('|')
145
+ utt_id = line_list[0]
146
+ raw_text = line_list[-1]
147
+ data_dict[utt_id] = raw_text
148
+
149
+ sorted_dict = sorted(data_dict.items())
150
+
151
+ num_train = 12900
152
+ num_dev = 100
153
+ # (utt_id, txt)
154
+ train_txts = sorted_dict[:num_train]
155
+ dev_txts = sorted_dict[num_train:num_train + num_dev]
156
+ test_txts = sorted_dict[num_train + num_dev:]
157
+
158
+ elif args.dataset == "libritts":
159
+ '''
160
+ we use train-clean-100、train-clean-360、train-other-500 here
161
+ and split dev and test from them, don't use test-* and dev-* cause the speakers are disjoint
162
+ the file structure is LibriTTS_R/train-clean-100/spkid/*/*.wav
163
+ there are about 2311 in these subsets, we split 1 dev and 1 test wav out from each speaker
164
+ '''
165
+ txt_files = []
166
+ train_txt_files = []
167
+ dev_txt_files = []
168
+ test_txt_files = []
169
+ sub_num_dev = 1
170
+ for sub_dataset_name in {
171
+ "train-clean-100", "train-clean-360", "train-other-500"
172
+ }:
173
+ sub_dataset_dir = data_dir / sub_dataset_name
174
+ # filter out hidden files
175
+ speaker_list = [
176
+ file for file in os.listdir(sub_dataset_dir)
177
+ if not file.startswith('.')
178
+ ]
179
+ for speaker in speaker_list:
180
+ txt_files = sorted(
181
+ list((sub_dataset_dir / speaker).rglob(
182
+ "*/*.normalized.txt")))
183
+ # filter out ._*.txt files
184
+ txt_files = [
185
+ file for file in txt_files if not file.name.startswith('._')
186
+ ]
187
+ train_txt_files += txt_files[:-sub_num_dev * 2]
188
+ dev_txt_files += txt_files[-sub_num_dev * 2:-sub_num_dev]
189
+ test_txt_files += txt_files[-sub_num_dev:]
190
+ print("len(train_txt_files):", len(train_txt_files))
191
+ print("len(dev_txt_files):", len(dev_txt_files))
192
+ print("len(test_txt_files):", len(test_txt_files))
193
+
194
+ train_txts = read_txts(train_txt_files)
195
+ dev_txts = read_txts(dev_txt_files)
196
+ test_txts = read_txts(test_txt_files)
197
+
198
+ else:
199
+ print("dataset should in {ljspeech, libritts} now!")
200
+
201
+ train_dump_dir = dump_dir / "train"
202
+ train_dump_dir.mkdir(parents=True, exist_ok=True)
203
+ dev_dump_dir = dump_dir / "dev"
204
+ dev_dump_dir.mkdir(parents=True, exist_ok=True)
205
+ test_dump_dir = dump_dir / "test"
206
+ test_dump_dir.mkdir(parents=True, exist_ok=True)
207
+
208
+ phonemizer = GruutPhonemizer(language='en-us')
209
+
210
+ # process for the 3 sections
211
+ if train_txts:
212
+ process_sentences(
213
+ items=train_txts,
214
+ output_dir=train_dump_dir,
215
+ phonemizer=phonemizer,
216
+ nprocs=args.num_cpu)
217
+ if dev_txts:
218
+ process_sentences(
219
+ items=dev_txts,
220
+ output_dir=dev_dump_dir,
221
+ phonemizer=phonemizer,
222
+ nprocs=args.num_cpu)
223
+ if test_txts:
224
+ process_sentences(
225
+ items=test_txts,
226
+ output_dir=test_dump_dir,
227
+ phonemizer=phonemizer,
228
+ nprocs=args.num_cpu)
229
+
230
+
231
+ if __name__ == "__main__":
232
+ main()
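A small sketch (hypothetical dump layout) of reading the result back, following the save/load pattern in the module docstring:

    import numpy as np

    phonemes = np.load('dump/train/phonemes.npy', allow_pickle=True).item()
    utt_id, ipa = next(iter(phonemes.items()))   # utt_id -> IPA phoneme string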
AR/exps/get_phones_librilight.py ADDED
@@ -0,0 +1,198 @@
1
+ """
2
+ 1. read text of dataset; for LibriLight, read txt_*.npy -> needs to be arranged as a list of (utt_id, txt)
3
+ 2. text -> IPA by GruutPhonemizer
4
+ 3. save out a *.npy dict for all text
5
+ 4. each split of LibriLight is processed separately
6
+ my_dict = {"utt_id1": text1, "utt_id2": text2}
7
+ np.save(output_filename, my_dict)
8
+ my_dict = np.load(output_filename, allow_pickle=True).item()
9
+ """
10
+ import argparse
11
+ import os
12
+ import time
13
+ import traceback
14
+ from concurrent.futures import ThreadPoolExecutor
15
+ from operator import itemgetter
16
+ from pathlib import Path
17
+
18
+ import numpy as np
19
+ import tqdm
20
+ from AR.text_processing.phonemizer import GruutPhonemizer
21
+ from soundstorm.utils import check_txt_file
22
+
23
+
24
+ def read_txts(txt_file: Path, nprocs: int=1):
25
+ '''
26
+ txt_file: path of npy dict, {"utt_id1": text1, "utt_id2": text2}
27
+ '''
28
+ txt_dict = np.load(txt_file, allow_pickle=True).item()
29
+ #[(utt_id, txt), ...]
30
+ return_list = list(txt_dict.items())
31
+ return return_list
32
+
33
+
34
+ def process_sentence(item, phonemizer, output_dir):
35
+ utt_id, text = item
36
+ phonemes_dir = output_dir / "phonemes"
37
+ phonemes_dir.mkdir(parents=True, exist_ok=True)
38
+ phonemes_path = phonemes_dir / (utt_id + ".txt")
39
+ try:
40
+ if os.path.exists(phonemes_path) and check_txt_file(phonemes_path):
41
+ # print(phonemes_path, 'exits!')
42
+ pass
43
+ else:
44
+ phonemes = phonemizer.phonemize(text, espeak=False)
45
+ with open(phonemes_path, 'w') as f:
46
+ f.write(phonemes)
47
+ record = {"utt_id": utt_id, "phonemes_path": phonemes_path}
48
+ except Exception:
49
+ print("occur Exception")
50
+ traceback.print_exc()
51
+ return None
52
+ return record
53
+
54
+
55
+ def process_sentences(args, items, phonemizer, output_dir, nprocs: int=1):
56
+ print("nprocs:", nprocs)
57
+ if nprocs == 1:
58
+ results = []
59
+ for item in tqdm.tqdm(items, total=len(items)):
60
+ record = process_sentence(
61
+ item=item, phonemizer=phonemizer, output_dir=output_dir)
62
+ if record:
63
+ results.append(record)
64
+ else:
65
+ with ThreadPoolExecutor(nprocs) as pool:
66
+ futures = []
67
+ with tqdm.tqdm(total=len(items)) as progress:
68
+ for item in items:
69
+ future = pool.submit(process_sentence, item, phonemizer,
70
+ output_dir)
71
+ future.add_done_callback(lambda p: progress.update())
72
+ futures.append(future)
73
+
74
+ results = []
75
+ for ft in futures:
76
+ record = ft.result()
77
+ if record:
78
+ results.append(record)
79
+
80
+ results.sort(key=itemgetter("utt_id"))
81
+
82
+ npy_dict = {}
83
+ print(f"start to save {args.rank}_{args.nshard}.npy ...")
84
+ save_start_time = time.time()
85
+ for item in tqdm.tqdm(results, total=len(results), colour='green'):
86
+ # wrap in try, because the txt file may be corrupted
87
+ try:
88
+ utt_id = item["utt_id"]
89
+ phonemes = check_txt_file(item["phonemes_path"])
90
+ if phonemes is not False:
91
+ npy_dict[utt_id] = phonemes
92
+ else:
93
+ print(f'phonemes of {utt_id} is False')
94
+ except Exception:
95
+ print(f"{utt_id} occur Exception")
96
+ traceback.print_exc()
97
+ continue
98
+
99
+ filename = output_dir / f'phonemes_{args.rank}_{args.nshard}.npy'
100
+ np.save(filename, npy_dict)
101
+ print(f"npy file '{filename}' write down")
102
+ print('time of save stage:', time.time() - save_start_time)
103
+
104
+
105
+ def main():
106
+ # parse config and args
107
+ parser = argparse.ArgumentParser(
108
+ description="Get phones for LibriLight dataset from txt_*.npy")
109
+
110
+ parser.add_argument(
111
+ "--dump_dir",
112
+ type=str,
113
+ required=True,
114
+ help="directory to dump feature files.")
115
+ parser.add_argument(
116
+ "--num-cpu", type=int, default=1, help="number of process.")
117
+
118
+ parser.add_argument(
119
+ '--train_txt_dir',
120
+ type=str,
121
+ default='dump/small/train/',
122
+ help='dir of train txt files')
123
+ parser.add_argument(
124
+ '--dev_txt_dir',
125
+ type=str,
126
+ default='dump/small/dev/',
127
+ help='dir of dev txt files')
128
+ parser.add_argument(
129
+ '--test_txt_dir',
130
+ type=str,
131
+ default='dump/small/test/',
132
+ help='dir of test txt files')
133
+
134
+ parser.add_argument(
135
+ "--sub_dataset",
136
+ default="small",
137
+ type=str,
138
+ help="name of sub dataset of LibriLight",
139
+ choices=['small', 'medium', 'large', 'duplicate'], )
140
+ parser.add_argument("--nshard", type=int, default=3)
141
+ parser.add_argument("--rank", type=int, default=0)
142
+
143
+ args = parser.parse_args()
144
+ print(f"nshard: {args.nshard}, rank: {args.rank}")
145
+
146
+ train_txt_dir = Path(args.train_txt_dir)
147
+ dev_txt_dir = Path(args.dev_txt_dir)
148
+ test_txt_dir = Path(args.test_txt_dir)
149
+
150
+ dump_dir = Path(args.dump_dir).expanduser()
151
+ # use absolute path
152
+ dump_dir = dump_dir.resolve()
153
+ dump_dir.mkdir(parents=True, exist_ok=True)
154
+
155
+ train_txt_file = train_txt_dir / f'txt_{args.rank}_{args.nshard}.npy'
156
+ dev_txt_file = dev_txt_dir / f'txt_{args.rank}_{args.nshard}.npy'
157
+ test_txt_file = test_txt_dir / f'txt_{args.rank}_{args.nshard}.npy'
158
+
159
+ train_txts = read_txts(train_txt_file)
160
+ dev_txts = read_txts(dev_txt_file)
161
+ test_txts = read_txts(test_txt_file)
162
+
163
+ sub_dataset_dump_dir = dump_dir / args.sub_dataset
164
+ sub_dataset_dump_dir.mkdir(parents=True, exist_ok=True)
165
+ train_dump_dir = sub_dataset_dump_dir / "train"
166
+ train_dump_dir.mkdir(parents=True, exist_ok=True)
167
+ dev_dump_dir = sub_dataset_dump_dir / "dev"
168
+ dev_dump_dir.mkdir(parents=True, exist_ok=True)
169
+ test_dump_dir = sub_dataset_dump_dir / "test"
170
+ test_dump_dir.mkdir(parents=True, exist_ok=True)
171
+ phonemizer = GruutPhonemizer(language='en-us')
172
+
173
+ # process for the 3 sections
174
+ if train_txts:
175
+ process_sentences(
176
+ args=args,
177
+ items=train_txts,
178
+ output_dir=train_dump_dir,
179
+ phonemizer=phonemizer,
180
+ nprocs=args.num_cpu)
181
+ if dev_txts:
182
+ process_sentences(
183
+ args=args,
184
+ items=dev_txts,
185
+ output_dir=dev_dump_dir,
186
+ phonemizer=phonemizer,
187
+ nprocs=args.num_cpu)
188
+ if test_txts:
189
+ process_sentences(
190
+ args=args,
191
+ items=test_txts,
192
+ output_dir=test_dump_dir,
193
+ phonemizer=phonemizer,
194
+ nprocs=args.num_cpu)
195
+
196
+
197
+ if __name__ == "__main__":
198
+ main()
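The phoneme dumps written above follow the same `.npy`-dict convention described in the module docstring. A minimal sketch of that round-trip (the utt_ids and phoneme strings are made up for illustration):

import numpy as np

phoneme_dict = {"utt_0001": "h ə l oʊ", "utt_0002": "w ɜː l d"}
np.save("phonemes_0_3.npy", phoneme_dict)  # same naming scheme as phonemes_{rank}_{nshard}.npy
loaded = np.load("phonemes_0_3.npy", allow_pickle=True).item()
assert loaded == phoneme_dict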
AR/exps/get_txt_librilight.py ADDED
@@ -0,0 +1,255 @@
1
+ import argparse
2
+ import os
3
+ import time
4
+ import traceback
5
+ from concurrent.futures import ThreadPoolExecutor
6
+ from pathlib import Path
7
+
8
+ import librosa
9
+ import numpy as np
10
+ import tqdm
11
+ import whisper
12
+ from soundstorm.s2.exps.hubert.feature_utils import get_shard_range
13
+ from soundstorm.utils import check_txt_file
14
+
15
+
16
+ def process_sentence(args,
17
+ fp: Path,
18
+ train_dump_dir: Path,
19
+ dev_dump_dir: Path,
20
+ test_dump_dir: Path,
21
+ VAD_dict):
22
+ asr_model = whisper.load_model("tiny.en")
23
+ utt_id = fp.stem
24
+ sr = args.sr
25
+ record = []
26
+ train_txt_dir = train_dump_dir / "txt"
27
+ train_txt_dir.mkdir(parents=True, exist_ok=True)
28
+
29
+ dev_txt_dir = dev_dump_dir / "txt"
30
+ dev_txt_dir.mkdir(parents=True, exist_ok=True)
31
+
32
+ test_txt_dir = test_dump_dir / "txt"
33
+ test_txt_dir.mkdir(parents=True, exist_ok=True)
34
+
35
+ try:
36
+ # get info for path
37
+ wav_path_list = str(fp).strip().split('/')
38
+ sub_dataset, spk_id, book_name = wav_path_list[-4], wav_path_list[
39
+ -3], wav_path_list[-2]
40
+ wav_name = wav_path_list[-1][:-5]
41
+ assert wav_name == utt_id
42
+ # key_name for big wav
43
+ key_name = f'{wav_name}#{sub_dataset}#{spk_id}#{book_name}'
44
+ # handle the case where this audio is missing from the VAD dict
45
+ if key_name not in VAD_dict.keys():
46
+ print(key_name, 'not in VAD_dict !')
47
+ return record
48
+ wav = None
49
+ sorted_split_VAD_dict = sorted(VAD_dict[key_name].items())
50
+ len_dict = len(sorted_split_VAD_dict)
51
+ for index, item in enumerate(sorted_split_VAD_dict):
52
+ split_name, value = item
53
+ start, end = value
54
+ # train | dev | test
55
+ if index == len_dict - 1:
56
+ subset = 'test'
57
+ txt_path = test_txt_dir / (split_name + ".txt")
58
+ elif index == len_dict - 2:
59
+ subset = 'dev'
60
+ txt_path = dev_txt_dir / (split_name + ".txt")
61
+ else:
62
+ subset = 'train'
63
+ txt_path = train_txt_dir / (split_name + ".txt")
64
+
65
+ if os.path.exists(txt_path) and check_txt_file(txt_path):
66
+ # print(txt_path, 'exists!')
67
+ pass
68
+ else:
69
+ # this check ensures the big wav is loaded only once inside the sub-wav loop
70
+ if wav is None:
71
+ # load big wav
72
+ # loading it at the outermost level would waste time when all sub-wav features already exist
73
+ wav, _ = librosa.load(str(fp), sr=sr)
74
+ sub_wav = wav[int(start * sr):int(end * sr)]
75
+ asr_result = asr_model.transcribe(sub_wav)["text"]
76
+ with open(txt_path, 'w') as f:
77
+ f.write(asr_result)
78
+
79
+ sub_record = {
80
+ "utt_id": split_name,
81
+ "txt_path": txt_path,
82
+ "subset": subset
83
+ }
84
+ # record becomes a List of Dict
85
+ record.append(sub_record)
86
+ except Exception:
87
+ print("occur Exception")
88
+ traceback.print_exc()
89
+ # record may be an incomplete list
90
+ return record
91
+ return record
92
+
93
+
94
+ def process_sentences(args,
95
+ fps: Path,
96
+ train_dump_dir: Path,
97
+ dev_dump_dir: Path,
98
+ test_dump_dir: Path,
99
+ VAD_dict,
100
+ nprocs: int=1):
101
+ print("nprocs:", nprocs)
102
+ if nprocs == 1:
103
+ results = []
104
+ for fp in tqdm.tqdm(fps, total=len(fps)):
105
+ record = process_sentence(
106
+ args=args,
107
+ fp=fp,
108
+ train_dump_dir=train_dump_dir,
109
+ dev_dump_dir=dev_dump_dir,
110
+ test_dump_dir=test_dump_dir,
111
+ VAD_dict=VAD_dict)
112
+ if record:
113
+ results.append(record)
114
+ else:
115
+ with ThreadPoolExecutor(nprocs) as pool:
116
+ futures = []
117
+ with tqdm.tqdm(total=len(fps)) as progress:
118
+ for fp in fps:
119
+ future = pool.submit(process_sentence, args, fp,
120
+ train_dump_dir, dev_dump_dir,
121
+ test_dump_dir, VAD_dict)
122
+ future.add_done_callback(lambda p: progress.update())
123
+ futures.append(future)
124
+
125
+ results = []
126
+ for ft in futures:
127
+ record = ft.result()
128
+ if record:
129
+ results.append(record)
130
+
131
+ # torch.save() to a large `.pth` file
132
+ txt_dict = dict()
133
+ txt_dict['train'] = {}
134
+ txt_dict['dev'] = {}
135
+ txt_dict['test'] = {}
136
+ # each record is a List of Dict: one record per big wav, one sub_record per small wav
137
+ print(f"start to save {args.rank}_{args.nshard}.npy ...")
138
+ save_start_time = time.time()
139
+ for record in tqdm.tqdm(results, total=len(results), colour='green'):
140
+ for sub_record in record:
141
+ # wrap in try, because the txt file may be corrupted
142
+ try:
143
+ utt_id = sub_record["utt_id"]
144
+ subset = sub_record["subset"]
145
+ asr_result = check_txt_file(sub_record["txt_path"])
146
+ if asr_result is not False:
147
+ txt_dict[subset][utt_id] = asr_result
148
+ else:
149
+ print(f'asr result of {utt_id} is False')
150
+ except Exception:
151
+ print(f"{utt_id} occur Exception")
152
+ traceback.print_exc()
153
+ continue
154
+
155
+ train_filename = train_dump_dir / f'txt_{args.rank}_{args.nshard}.npy'
156
+ dev_filename = dev_dump_dir / f'txt_{args.rank}_{args.nshard}.npy'
157
+ test_filename = test_dump_dir / f'txt_{args.rank}_{args.nshard}.npy'
158
+ np.save(train_filename, txt_dict['train'])
159
+ print(f"npy file '{train_filename}' write down")
160
+
161
+ np.save(dev_filename, txt_dict['dev'])
162
+ print(f"npy file '{dev_filename}' write down")
163
+
164
+ np.save(test_filename, txt_dict['test'])
165
+ print(f"npy file '{test_filename}' write down")
166
+ print('time of save stage:', time.time() - save_start_time)
167
+
168
+
169
+ def main():
170
+ # parse config and args
171
+ parser = argparse.ArgumentParser(
172
+ description="Preprocess audio and then extract features for LibriLight.")
173
+
174
+ parser.add_argument(
175
+ "--data_dir", default=None, type=str, help="directory to dataset.")
176
+
177
+ parser.add_argument(
178
+ "--dump_dir",
179
+ type=str,
180
+ required=True,
181
+ help="directory to dump feature files.")
182
+
183
+ parser.add_argument(
184
+ "--num-cpu", type=int, default=1, help="number of process.")
185
+
186
+ parser.add_argument(
187
+ '--sr', type=int, default=16000, help='sample rate of model')
188
+
189
+ # For LibriLight dataset
190
+ parser.add_argument(
191
+ "--sub_dataset",
192
+ default="small",
193
+ type=str,
194
+ help="name of sub dataset of LibriLight",
195
+ choices=['small', 'medium', 'large', 'duplicate'], )
196
+ parser.add_argument(
197
+ "--VAD_path", type=str, default='./VAD/librilight_segment_dict.npy')
198
+ parser.add_argument("--nshard", type=int, default=3)
199
+ parser.add_argument("--rank", type=int, default=0)
200
+
201
+ args = parser.parse_args()
202
+
203
+ data_dir = Path(args.data_dir).expanduser()
204
+ dump_dir = Path(args.dump_dir).expanduser()
205
+ # use absolute path
206
+ dump_dir = dump_dir.resolve()
207
+ dump_dir.mkdir(parents=True, exist_ok=True)
208
+
209
+ assert data_dir.is_dir()
210
+
211
+ # sub_dataset here
212
+ sub_dataset_dir = data_dir / args.sub_dataset
213
+ # only spk_id in the list, sorted in lexicographical order
214
+ speaker_list = sorted(os.listdir(sub_dataset_dir))
215
+ start, end = get_shard_range(len(speaker_list), args.nshard, args.rank)
216
+ # speaker_list for this rank
217
+ speaker_list = speaker_list[start:end]
218
+
219
+ all_wav_files = []
220
+
221
+ for speaker in speaker_list:
222
+ wav_files = sorted(list((sub_dataset_dir / speaker).rglob("*/*.flac")))
223
+ # filter out ._*.flac
224
+ wav_files = [
225
+ file for file in wav_files if not file.name.startswith('._')
226
+ ]
227
+ all_wav_files += wav_files
228
+
229
+ print(f"num of wav files in rank {args.rank}:", len(all_wav_files))
230
+ # get VAD info
231
+ VAD_dict = np.load(args.VAD_path, allow_pickle=True).item()
232
+
233
+ sub_dataset_dump_dir = dump_dir / args.sub_dataset
234
+ sub_dataset_dump_dir.mkdir(parents=True, exist_ok=True)
235
+ train_dump_dir = sub_dataset_dump_dir / "train"
236
+ train_dump_dir.mkdir(parents=True, exist_ok=True)
237
+ dev_dump_dir = sub_dataset_dump_dir / "dev"
238
+ dev_dump_dir.mkdir(parents=True, exist_ok=True)
239
+ test_dump_dir = sub_dataset_dump_dir / "test"
240
+ test_dump_dir.mkdir(parents=True, exist_ok=True)
241
+
242
+ # each big wav contributes one dev and one test segment; the split ratio is roughly 96:2:2
243
+ if all_wav_files:
244
+ process_sentences(
245
+ args=args,
246
+ fps=all_wav_files,
247
+ train_dump_dir=train_dump_dir,
248
+ dev_dump_dir=dev_dump_dir,
249
+ test_dump_dir=test_dump_dir,
250
+ VAD_dict=VAD_dict,
251
+ nprocs=args.num_cpu)
252
+
253
+
254
+ if __name__ == "__main__":
255
+ main()
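A small sketch of the per-wav split rule implemented in process_sentence() above: the segments of one big wav are sorted, the last one goes to test, the second-to-last to dev, and the rest to train. The key mirrors the `{wav_name}#{sub_dataset}#{spk_id}#{book_name}` format; segment names and times below are invented for illustration.

VAD_dict = {
    "book01_00#small#100#book01": {
        "book01_00_seg0": (0.0, 5.2),
        "book01_00_seg1": (5.9, 11.3),
        "book01_00_seg2": (12.0, 17.8),
    }
}

segments = sorted(VAD_dict["book01_00#small#100#book01"].items())
for index, (split_name, (start, end)) in enumerate(segments):
    if index == len(segments) - 1:
        subset = "test"
    elif index == len(segments) - 2:
        subset = "dev"
    else:
        subset = "train"
    print(split_name, subset, start, end)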
AR/exps/split_train_val.py ADDED
@@ -0,0 +1,35 @@
1
+ import numpy
2
+ import pandas
3
+
4
+ semantic_path = 'dump/semantic.tsv'
5
+ phoneme_path = 'dump/phoneme.npy'
6
+ train_semantic_path = 'dump/semantic_train.tsv'
7
+ train_phoneme_path = 'dump/phoneme_train.npy'
8
+ dev_semantic_path = 'dump/semantic_dev.tsv'
9
+ dev_phoneme_path = 'dump/phoneme_dev.npy'
10
+
11
+ # read dump/semantic.tsv
12
+ semantic_df = pandas.read_csv(semantic_path, sep='\t')
13
+ # pd.DataFrame(columns=["item_name", "semantic_audio"])
14
+ # read dump/phoneme.npy
15
+ phoneme_dict = numpy.load(phoneme_path, allow_pickle=True).item()
16
+
17
+ dev_num = 20
18
+ # randomly sample dev_num rows from semantic_df
19
+ dev_df = semantic_df.sample(n=dev_num)
20
+ # the remaining rows are train
21
+ train_df = semantic_df.drop(dev_df.index)
22
+ # save
23
+ dev_df.to_csv(dev_semantic_path, sep='\t', index=False)
24
+ train_df.to_csv(train_semantic_path, sep='\t', index=False)
25
+
26
+ # take the item_name values from dev_df as the keys of dev_phoneme_dict
27
+ dev_item_names = dev_df['item_name'].tolist()
28
+ dev_phoneme_dict = {k: phoneme_dict[k] for k in dev_item_names if k in phoneme_dict}
29
+ train_phoneme_dict = {k: phoneme_dict[k] for k in phoneme_dict.keys() if k not in dev_item_names}
30
+
31
+ numpy.save(dev_phoneme_path, dev_phoneme_dict)
32
+ numpy.save(train_phoneme_path, train_phoneme_dict)
33
+
34
+
35
+
AR/exps/t2s.py ADDED
@@ -0,0 +1,197 @@
1
+ # text to semantic
2
+ import argparse
3
+ import os
4
+ import re
5
+ import time
6
+ from pathlib import Path
7
+
8
+ import librosa
9
+ import numpy as np
10
+ import torch
11
+ import whisper
12
+ from AR.models.t2s_lightning_module import Text2SemanticLightningModule
13
+ from AR.text_processing.phonemizer import GruutPhonemizer
14
+ from AR.utils.io import load_yaml_config
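+ # NOTE: SemanticTokenizer used in main() below is not imported in this file; an import from the SoundStorm package (exact module path not shown here) is required for the script to run.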
15
+
16
+
17
+ def get_batch(text, phonemizer):
18
+ # phoneme_ids and phoneme_ids_len are what we need
19
+ phoneme = phonemizer.phonemize(text, espeak=False)
20
+ phoneme_ids = phonemizer.transform(phoneme)
21
+ phoneme_ids_len = len(phoneme_ids)
22
+ phoneme_ids = np.array(phoneme_ids)
23
+ # add batch axis here
24
+ phoneme_ids = torch.tensor(phoneme_ids).unsqueeze(0)
25
+ phoneme_ids_len = torch.tensor([phoneme_ids_len])
26
+ print("phoneme:", phoneme)
27
+ batch = {
28
+ # torch.Tensor (B, max_phoneme_length)
29
+ "phoneme_ids": phoneme_ids,
30
+ # torch.Tensor (B)
31
+ "phoneme_ids_len": phoneme_ids_len
32
+ }
33
+ return batch
34
+
35
+
36
+ def get_prompt(prompt_wav_path, asr_model, phonemizer, semantic_tokenizer):
37
+ sample_rate = 16000
38
+ # to get prompt
39
+ prompt_name = os.path.basename(prompt_wav_path).split('.')[0]
40
+ wav, _ = librosa.load(prompt_wav_path, sr=sample_rate)
41
+ # take the last 3 s, excluding the final 0.1 s, to keep AR S1 inference from stopping early
43
+ wav = wav[-sample_rate * 3:-int(sample_rate * 0.1)]
44
+ # trailing silence must be trimmed from the wav, otherwise inference may also stop early
45
+ prompt_text = asr_model.transcribe(wav)["text"]
46
+ # remove the trailing period to keep AR S1 inference from stopping early; keeping it may introduce a pause
46
+ prompt_text = prompt_text.replace(".", "")
47
+ prompt_phoneme = phonemizer.phonemize(prompt_text, espeak=False)
48
+ prompt_phoneme_ids = phonemizer.transform(prompt_phoneme)
49
+ prompt_phoneme_ids_len = len(prompt_phoneme_ids)
50
+ # get prompt_semantic
51
+ # (T) -> (1, T)
52
+ wav = torch.tensor(wav).unsqueeze(0)
53
+ wav = wav.cuda()
54
+ # (1, T)
55
+ prompt_semantic_tokens = semantic_tokenizer.tokenize(wav).to(torch.int32)
56
+ prompt_phoneme_ids = torch.tensor(prompt_phoneme_ids).unsqueeze(0)
57
+ prompt_phoneme_ids_len = torch.tensor([prompt_phoneme_ids_len])
58
+
59
+ result = {
60
+ 'prompt_name': prompt_name,
61
+ 'prompt_phoneme_ids': prompt_phoneme_ids,
62
+ 'prompt_semantic_tokens': prompt_semantic_tokens,
63
+ 'prompt_phoneme_ids_len': prompt_phoneme_ids_len
64
+ }
65
+
66
+ return result
67
+
68
+
69
+ def parse_args():
70
+ # parse args and config
71
+ parser = argparse.ArgumentParser(
72
+ description="Run SoundStorm AR S1 model for input text file")
73
+
74
+ parser.add_argument(
75
+ '--config_file',
76
+ type=str,
77
+ default='conf/default.yaml',
78
+ help='path of config file')
79
+
80
+ parser.add_argument(
81
+ "--text_file",
82
+ type=str,
83
+ help="text file to be convert to semantic tokens, a 'utt_id sentence' pair per line."
84
+ )
85
+
86
+ parser.add_argument(
87
+ '--ckpt_path',
88
+ type=str,
89
+ default='exp/default/ckpt/epoch=99-step=49000.ckpt',
90
+ help='Checkpoint file of SoundStorm AR S1 model.')
91
+
92
+ parser.add_argument(
93
+ '--prompt_wav_path',
94
+ type=str,
95
+ default=None,
96
+ help='extract prompt semantic and prompt phonemes from prompt wav')
97
+
98
+ # to get semantic tokens from prompt_wav
99
+ parser.add_argument("--hubert_path", type=str, default=None)
100
+ parser.add_argument("--quantizer_path", type=str, default=None)
101
+
102
+ parser.add_argument("--output_dir", type=str, help="output dir.")
103
+
104
+ args = parser.parse_args()
105
+ return args
106
+
107
+
108
+ def main():
109
+ args = parse_args()
110
+ config = load_yaml_config(args.config_file)
111
+
112
+ output_dir = Path(args.output_dir)
113
+ output_dir.mkdir(parents=True, exist_ok=True)
114
+
115
+ hz = 50
116
+ max_sec = config['data']['max_sec']
117
+
118
+ # get models
119
+ t2s_model = Text2SemanticLightningModule.load_from_checkpoint(
120
+ checkpoint_path=args.ckpt_path, config=config)
121
+ t2s_model.cuda()
122
+ t2s_model.eval()
123
+
124
+ phonemizer: GruutPhonemizer = GruutPhonemizer(language='en-us')
125
+
126
+ # models for prompt
127
+ asr_model = whisper.load_model("tiny.en")
128
+
129
+ semantic_tokenizer = SemanticTokenizer(
130
+ hubert_path=args.hubert_path,
131
+ quantizer_path=args.quantizer_path,
132
+ duplicate=True)
133
+
134
+ prompt_result = get_prompt(
135
+ prompt_wav_path=args.prompt_wav_path,
136
+ asr_model=asr_model,
137
+ phonemizer=phonemizer,
138
+ semantic_tokenizer=semantic_tokenizer)
139
+
140
+ # zero prompt => the output semantic content is correct, but the timbre is scrambled
141
+ # (B, 1)
142
+ # prompt = torch.ones(
143
+ # batch['phoneme_ids'].size(0), 1, dtype=torch.int32) * 0
144
+
145
+ prompt = prompt_result['prompt_semantic_tokens']
146
+ prompt_phoneme_ids_len = prompt_result['prompt_phoneme_ids_len']
147
+ prompt_phoneme_ids = prompt_result['prompt_phoneme_ids']
148
+
149
+ sentences = []
150
+ with open(args.text_file, 'rt', encoding='utf-8') as f:
151
+ for line in f:
152
+ if line.strip() != "":
153
+ items = re.split(r"\s+", line.strip(), 1)
154
+ utt_id = items[0]
155
+ sentence = " ".join(items[1:])
156
+ sentences.append((utt_id, sentence))
157
+ semantic_data = [['item_name', 'semantic_audio']]
158
+ for utt_id, sentence in sentences[1:]:
159
+ # build a pseudo batch manually as model input
160
+ batch = get_batch(sentence, phonemizer)
161
+ # concatenate the prompt with the real input
162
+ all_phoneme_ids = torch.cat(
163
+ [prompt_phoneme_ids, batch['phoneme_ids']], dim=1)
164
+ # alternatively, just take shape[-1] of all_phoneme_ids
165
+ all_phoneme_len = prompt_phoneme_ids_len + batch['phoneme_ids_len']
166
+ st = time.time()
167
+ with torch.no_grad():
168
+ pred_semantic = t2s_model.model.infer(
169
+ all_phoneme_ids.cuda(),
170
+ all_phoneme_len.cuda(),
171
+ prompt.cuda(),
172
+ top_k=config['inference']['top_k'],
173
+ early_stop_num=hz * max_sec)
174
+ print(f'{time.time() - st} sec used in T2S')
175
+
176
+ # drop the part corresponding to the prompt
177
+ prompt_len = prompt.shape[-1]
178
+ pred_semantic = pred_semantic[:, prompt_len:]
179
+
180
+ # bs = 1
181
+ pred_semantic = pred_semantic[0]
182
+ semantic_token = pred_semantic.detach().cpu().numpy().tolist()
183
+ semantic_token_str = ' '.join(str(x) for x in semantic_token)
184
+ semantic_data.append([utt_id, semantic_token_str])
185
+
186
+ delimiter = '\t'
187
+ filename = output_dir / f'{utt_id}_p_{prompt_result["prompt_name"]}_semantic_token.tsv'
188
+ with open(filename, 'w', encoding='utf-8') as writer:
189
+ for row in semantic_data:
190
+ line = delimiter.join(row)
191
+ writer.write(line + '\n')
192
+ # clear semantic tokens for the next sentence
193
+ semantic_data = [['item_name', 'semantic_audio']]
194
+
195
+
196
+ if __name__ == "__main__":
197
+ main()
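As a sanity check on the prompt-trimming rule used in get_prompt() above (keep the last 3 s of the reference wav, minus the final 0.1 s), a self-contained sketch assuming the wav is at least 3 s long:

import numpy as np

sample_rate = 16000
wav = np.zeros(sample_rate * 10)  # stand-in for a reference wav of >= 3 s
prompt = wav[-sample_rate * 3:-int(sample_rate * 0.1)]
assert prompt.shape[0] == int(sample_rate * 2.9)  # 2.9 s -> 46400 samples at 16 kHz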
AR/exps/test.py ADDED
@@ -0,0 +1,139 @@
1
+ # test from dump file
2
+ import argparse
3
+ import time
4
+ from pathlib import Path
5
+
6
+ import numpy as np
7
+ import torch
8
+ from AR.data.dataset import Text2SemanticDataset
9
+ from AR.models.t2s_lightning_module import Text2SemanticLightningModule
10
+ from AR.utils.io import load_yaml_config
11
+ from torch.utils.data import DataLoader
12
+
13
+
14
+ def parse_args():
15
+ # parse args and config
16
+ parser = argparse.ArgumentParser(
17
+ description="Run SoundStorm AR S1 model for test set.")
18
+
19
+ parser.add_argument(
20
+ '--config_file',
21
+ type=str,
22
+ default='conf/default.yaml',
23
+ help='path of config file')
24
+
25
+ # args for dataset
26
+ parser.add_argument(
27
+ '--test_semantic_path',
28
+ type=str,
29
+ default='dump/test/semantic_token.tsv')
30
+ parser.add_argument(
31
+ '--test_phoneme_path', type=str, default='dump/test/phonemes.npy')
32
+
33
+ parser.add_argument(
34
+ '--ckpt_path',
35
+ type=str,
36
+ default='exp/default/ckpt/epoch=99-step=49000.ckpt',
37
+ help='Checkpoint file of SoundStorm AR S1 model.')
38
+
39
+ parser.add_argument("--output_dir", type=str, help="output dir.")
40
+
41
+ args = parser.parse_args()
42
+ return args
43
+
44
+
45
+ def main():
46
+ args = parse_args()
47
+
48
+ config = load_yaml_config(args.config_file)
49
+
50
+ output_dir = Path(args.output_dir)
51
+ output_dir.mkdir(parents=True, exist_ok=True)
52
+
53
+ batch_size = 1
54
+ hz = 50
55
+ max_sec = config['data']['max_sec']
56
+
57
+ # get dataset
58
+ test_dataset = Text2SemanticDataset(
59
+ phoneme_path=args.test_phoneme_path,
60
+ semantic_path=args.test_semantic_path,
61
+ # max_sec should match the value used in training, otherwise quality may degrade (repetition, dropped words, etc.)
63
+ # but setting it too small here directly filters out long samples; to avoid that, truncate at inference time instead
63
+ max_sec=100,
64
+ max_sample=8,
65
+ pad_val=config['data']['pad_val'])
66
+ # get model
67
+ t2s_model = Text2SemanticLightningModule.load_from_checkpoint(
68
+ checkpoint_path=args.ckpt_path, config=config)
69
+ t2s_model.cuda()
70
+ t2s_model.eval()
71
+
72
+ # fetch batch_size items at a time
74
+ # create the DataLoader with the dataset's collate_fn
74
+ dataloader = DataLoader(
75
+ test_dataset,
76
+ batch_size=batch_size,
77
+ shuffle=False,
78
+ collate_fn=test_dataset.collate)
79
+
80
+ item_names = test_dataset.__get_item_names__()
81
+
82
+ # read data batch by batch; with bs=1 and shuffle=False, indices line up with __get_item_names__
83
+ semantic_data = [['item_name', 'semantic_audio']]
84
+ for i, batch in enumerate(dataloader):
85
+ # bs must be 1 here
86
+ utt_id = item_names[i]
87
+ if i == 0:
88
+ print("utt_id:", utt_id)
89
+ # zero padding happens when bs > 1
91
+ # keep consistent with validation_step()
91
+ semantic_len = batch['semantic_ids'].size(1)
92
+ # use the first 150 tokens of batch['semantic_ids'] as the prompt
94
+ # across repeated syntheses, the first prompt_len tokens are identical and equal to the prompt
94
+ prompt_len = min(int(semantic_len * 0.5), 150)
95
+ # what should the prompt be when the input is pure text? => see t2s.py
96
+ prompt = batch['semantic_ids'][:, :prompt_len]
97
+ # # zero prompt => still yields semantic tokens with the correct text content, but the timbre is scrambled,
99
+ # which shows that the semantic tokens still carry timbre information
99
+ # prompt = torch.ones(
100
+ # batch['semantic_ids'].size(0), 1, dtype=torch.int32) * 0
101
+ # print("prompt:", prompt)
102
+ # print("prompt.shape:", prompt.shape)
103
+ np.save(output_dir / 'prompt.npy', prompt.detach().cpu().numpy())
104
+
105
+ st = time.time()
106
+ with torch.no_grad():
107
+ # calculate acc for test
108
+ loss, acc = t2s_model.model.forward(
109
+ batch['phoneme_ids'].cuda(),
110
+ batch['phoneme_ids_len'].cuda(),
111
+ batch['semantic_ids'].cuda(),
112
+ batch['semantic_ids_len'].cuda())
113
+ print("top_3_acc of this batch:", acc)
114
+ pred_semantic = t2s_model.model.infer(
115
+ batch['phoneme_ids'].cuda(),
116
+ batch['phoneme_ids_len'].cuda(),
117
+ prompt.cuda(),
118
+ top_k=config['inference']['top_k'],
119
+ # hz * max_sec in train dataloader
120
+ # the generated length is 1002, so there is probably some padding
121
+ early_stop_num=hz * max_sec)
122
+ # bs = 1
123
+ pred_semantic = pred_semantic[0]
124
+ print(f'{time.time() - st} sec used in T2S')
125
+ semantic_token = pred_semantic.detach().cpu().numpy().tolist()
126
+ semantic_token_str = ' '.join(str(x) for x in semantic_token)
127
+ semantic_data.append([utt_id, semantic_token_str])
128
+ else:
129
+ break
130
+ delimiter = '\t'
131
+ filename = output_dir / "semantic_token.tsv"
132
+ with open(filename, 'w', encoding='utf-8') as writer:
133
+ for row in semantic_data:
134
+ line = delimiter.join(row)
135
+ writer.write(line + '\n')
136
+
137
+
138
+ if __name__ == "__main__":
139
+ main()
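The prompt-length rule used above (half the utterance's semantic tokens, capped at 150, i.e. 3 s at 50 tokens per second) in isolation:

hz = 50
for semantic_len in (120, 400, 1000):
    prompt_len = min(int(semantic_len * 0.5), 150)
    print(semantic_len, "->", prompt_len, "prompt tokens =", prompt_len / hz, "s")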
AR/exps/text.txt ADDED
@@ -0,0 +1,10 @@
1
+ 001 Life was like a box of chocolates, you never know what you're gonna get.
2
+ 002 With great power there must come great responsibility.
3
+ 003 To be or not to be, that’s a question.
4
+ 004 A man can be destroyed but not defeated
5
+ 005 Do not, for one repulse, give up the purpose that you resolved to effort.
6
+ 006 Death is just a part of life, something we're all destined to do.
7
+ 007 I think it's hard winning a war with words.
8
+ 008 Don’t argue with the people of strong determination, because they may change the fact!
9
+ 009 Love you three thousand times.
10
+ 010 tidy tiger tied a tie tighter to tidy her tiny tall.
AR/exps/train.py ADDED
@@ -0,0 +1,103 @@
1
+ # modified from https://github.com/feng-yufei/shared_debugging_code/blob/main/train_t2s.py
2
+ import argparse
3
+ import logging
4
+ import os
5
+ from pathlib import Path
6
+
7
+ import torch
8
+ from pytorch_lightning import seed_everything
9
+ from pytorch_lightning import Trainer
10
+ from pytorch_lightning.callbacks import ModelCheckpoint
11
+ from pytorch_lightning.loggers import WandbLogger
12
+ from pytorch_lightning.strategies import DDPStrategy
13
+ from AR.data.data_module import Text2SemanticDataModule
14
+ from AR.models.t2s_lightning_module import Text2SemanticLightningModule
15
+ from soundstorm.utils.io import load_yaml_config
16
+ logging.getLogger('numba').setLevel(logging.WARNING)
17
+ logging.getLogger('matplotlib').setLevel(logging.WARNING)
18
+ torch.set_float32_matmul_precision('high')
19
+ from soundstorm.utils import get_newest_ckpt
20
+
21
+
22
+ def main(args):
23
+ output_dir = Path(args.output_dir)
24
+ output_dir.mkdir(parents=True, exist_ok=True)
25
+
26
+ ckpt_dir = output_dir / 'ckpt'
27
+ ckpt_dir.mkdir(parents=True, exist_ok=True)
28
+
29
+ config = load_yaml_config(args.config_file)
30
+
31
+ seed_everything(config["train"]["seed"], workers=True)
32
+ ckpt_callback: ModelCheckpoint = ModelCheckpoint(
33
+ save_top_k=-1,
34
+ save_on_train_epoch_end=False,
35
+ every_n_epochs=config["train"]["save_every_n_epoch"],
36
+ dirpath=ckpt_dir)
37
+ logger = WandbLogger(
38
+ project="AR_S1",
39
+ name=output_dir.stem,
40
+ save_dir=output_dir,
41
+ # resume the loss curve
42
+ resume=True,
43
+ # id='k19kvsq8'
44
+ )
45
+ trainer: Trainer = Trainer(
46
+ max_epochs=config["train"]["epochs"],
47
+ accelerator='gpu',
48
+ devices=-1,
49
+ benchmark=False,
50
+ fast_dev_run=False,
51
+ strategy=DDPStrategy(find_unused_parameters=True),
52
+ precision=config["train"]["precision"],
53
+ logger=logger,
54
+ callbacks=[ckpt_callback])
55
+
56
+ model: Text2SemanticLightningModule = Text2SemanticLightningModule(
57
+ config, output_dir)
58
+
59
+ data_module: Text2SemanticDataModule = Text2SemanticDataModule(
60
+ config,
61
+ train_semantic_path=args.train_semantic_path,
62
+ train_phoneme_path=args.train_phoneme_path,
63
+ dev_semantic_path=args.dev_semantic_path,
64
+ dev_phoneme_path=args.dev_phoneme_path)
65
+
66
+ try:
67
+ # use a regex to match the numeric parts of the filenames and sort numerically
68
+ newest_ckpt_name = get_newest_ckpt(os.listdir(ckpt_dir))
69
+ ckpt_path = ckpt_dir / newest_ckpt_name
70
+ except Exception:
71
+ ckpt_path = None
72
+ print("ckpt_path:", ckpt_path)
73
+ trainer.fit(model, data_module, ckpt_path=ckpt_path)
74
+
75
+
76
+ # srun --gpus-per-node=1 --ntasks-per-node=1 python train.py --path-to-configuration configurations/default.yaml
77
+ if __name__ == '__main__':
78
+ parser = argparse.ArgumentParser()
79
+ parser.add_argument(
80
+ '--config_file',
81
+ type=str,
82
+ default='conf/default.yaml',
83
+ help='path of config file')
84
+ # args for dataset
85
+ parser.add_argument(
86
+ '--train_semantic_path',
87
+ type=str,
88
+ default='dump/train/semantic_token.tsv')
89
+ parser.add_argument(
90
+ '--train_phoneme_path', type=str, default='dump/train/phonemes.npy')
91
+ parser.add_argument(
92
+ '--dev_semantic_path', type=str, default='dump/dev/semantic_token.tsv')
93
+ parser.add_argument(
94
+ '--dev_phoneme_path', type=str, default='dump/dev/phonemes.npy')
95
+ parser.add_argument(
96
+ '--output_dir',
97
+ type=str,
98
+ default='exp/default',
99
+ help='directory to save the results')
100
+
101
+ args = parser.parse_args()
102
+ logging.info(str(args))
103
+ main(args)
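The checkpoint-resume step relies on get_newest_ckpt from soundstorm.utils, which is not shown in this diff. A regex-based stand-in that matches the comment above ("match the numeric parts of the filenames and sort numerically") might look like the sketch below; the real helper may differ, and the file names are hypothetical.

import re

def newest_ckpt(names):
    # sort by the numbers embedded in names such as "epoch=99-step=49000.ckpt"
    def key(name):
        return [int(n) for n in re.findall(r"\d+", name)]
    return max(names, key=key)

print(newest_ckpt(["epoch=9-step=4900.ckpt", "epoch=99-step=49000.ckpt"]))
# -> epoch=99-step=49000.ckpt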
AR/exps/train_librilight_6k.py ADDED
@@ -0,0 +1,170 @@
1
+ # modified from https://github.com/feng-yufei/shared_debugging_code/blob/main/train_t2s.py
2
+ import argparse
3
+ import logging
4
+ import os
5
+ from pathlib import Path
6
+
7
+ import torch
8
+ from pytorch_lightning import seed_everything
9
+ from pytorch_lightning import Trainer
10
+ from pytorch_lightning.callbacks import ModelCheckpoint
11
+ from pytorch_lightning.loggers import WandbLogger
12
+ from pytorch_lightning.strategies import DDPStrategy
13
+ from AR.data.data_module_librilight_6k import Text2SemanticDataModule
14
+ from AR.models.t2s_lightning_module import Text2SemanticLightningModule
15
+ from soundstorm.utils import get_newest_ckpt
16
+ from soundstorm.utils.io import load_yaml_config
17
+
18
+ logging.getLogger('numba').setLevel(logging.WARNING)
19
+ logging.getLogger('matplotlib').setLevel(logging.WARNING)
20
+ torch.set_float32_matmul_precision('high')
21
+
22
+
23
+ def main(args):
24
+ output_dir = Path(args.output_dir)
25
+ output_dir.mkdir(parents=True, exist_ok=True)
26
+
27
+ ckpt_dir = output_dir / 'ckpt'
28
+ ckpt_dir.mkdir(parents=True, exist_ok=True)
29
+
30
+ config = load_yaml_config(args.config_file)
31
+
32
+ seed_everything(config["train"]["seed"], workers=True)
33
+
34
+ ckpt_callback: ModelCheckpoint = ModelCheckpoint(
35
+ save_top_k=-1,
36
+ save_on_train_epoch_end=False,
37
+ every_n_train_steps=config["train"]["every_n_train_steps"],
38
+ dirpath=ckpt_dir)
39
+ logger = WandbLogger(
40
+ project="AR_S1_LibriLight",
41
+ name=output_dir.stem,
42
+ save_dir=output_dir,
43
+ # resume the loss curve
44
+ resume=True,
45
+ # id='k19kvsq8'
46
+ )
47
+ trainer: Trainer = Trainer(
48
+ max_epochs=config["train"]["epochs"],
49
+ accelerator='gpu',
50
+ devices=-1,
51
+ benchmark=False,
52
+ fast_dev_run=False,
53
+ strategy=DDPStrategy(find_unused_parameters=True),
54
+ precision=config["train"]["precision"],
55
+ logger=logger,
56
+ callbacks=[ckpt_callback])
57
+
58
+ model: Text2SemanticLightningModule = Text2SemanticLightningModule(
59
+ config, output_dir)
60
+
61
+ data_module: Text2SemanticDataModule = Text2SemanticDataModule(
62
+ config,
63
+ train_semantic_dirs=args.train_semantic_dirs,
64
+ train_phoneme_dirs=args.train_phoneme_dirs,
65
+ dev_semantic_dirs=args.dev_semantic_dirs,
66
+ dev_phoneme_dirs=args.dev_phoneme_dirs,
67
+ train_non_speech_dirs=args.train_non_speech_dirs,
68
+ dev_non_speech_dirs=args.dev_non_speech_dirs)
69
+ try:
70
+ newest_ckpt_name = get_newest_ckpt(os.listdir(ckpt_dir))
71
+ ckpt_path = ckpt_dir / newest_ckpt_name
72
+ except Exception:
73
+ ckpt_path = None
74
+
75
+ print("ckpt_path:", ckpt_path)
76
+ trainer.fit(model, data_module, ckpt_path=ckpt_path)
77
+
78
+
79
+ # srun --gpus-per-node=1 --ntasks-per-node=1 python train.py --path-to-configuration configurations/default.yaml
80
+ if __name__ == '__main__':
81
+ parser = argparse.ArgumentParser()
82
+ parser.add_argument(
83
+ '--config_file',
84
+ type=str,
85
+ default='conf/default.yaml',
86
+ help='path of config file')
87
+ # args for dataset
88
+ parser.add_argument(
89
+ '--train_semantic_dirs',
90
+ type=list,
91
+ nargs='+',
92
+ default=["dump/small/train/"],
93
+ help='dirs of train semantic')
94
+ parser.add_argument(
95
+ '--train_phoneme_dirs',
96
+ type=list,
97
+ nargs='+',
98
+ default=["dump/small/train/"],
99
+ help='dirs of train phoneme')
100
+ parser.add_argument(
101
+ '--dev_semantic_dirs',
102
+ type=list,
103
+ nargs='+',
104
+ default=["dump/small/dev/"],
105
+ help='dirs of dev semantic')
106
+ parser.add_argument(
107
+ '--dev_phoneme_dirs',
108
+ type=list,
109
+ nargs='+',
110
+ default=["dump/small/dev/"],
111
+ help='dirs of dev phoneme')
112
+ parser.add_argument(
113
+ '--output_dir',
114
+ type=str,
115
+ default='exp/default',
116
+ help='directory to save the results')
117
+
118
+ parser.add_argument(
119
+ '--train_non_speech_dirs',
120
+ type=list,
121
+ nargs='+',
122
+ default=None,
123
+ help='dirs of train non_speech data')
124
+
125
+ parser.add_argument(
126
+ '--dev_non_speech_dirs',
127
+ type=list,
128
+ nargs='+',
129
+ default=None,
130
+ help='dirs of dev non_speech data')
131
+
132
+ args = parser.parse_args()
133
+
134
+ new_train_semantic_dirs = []
135
+ new_train_phoneme_dirs = []
136
+ new_dev_semantic_dirs = []
137
+ new_dev_phoneme_dirs = []
138
+
139
+ new_train_non_speech_dirs = []
140
+ new_dev_non_speech_dirs = []
141
+
142
+ # format dataset dirs
143
+ for item in args.train_semantic_dirs:
144
+ new_train_semantic_dirs.append(''.join(item))
145
+ args.train_semantic_dirs = new_train_semantic_dirs
146
+
147
+ for item in args.train_phoneme_dirs:
148
+ new_train_phoneme_dirs.append(''.join(item))
149
+ args.train_phoneme_dirs = new_train_phoneme_dirs
150
+
151
+ for item in args.dev_semantic_dirs:
152
+ new_dev_semantic_dirs.append(''.join(item))
153
+ args.dev_semantic_dirs = new_dev_semantic_dirs
154
+
155
+ for item in args.dev_phoneme_dirs:
156
+ new_dev_phoneme_dirs.append(''.join(item))
157
+ args.dev_phoneme_dirs = new_dev_phoneme_dirs
158
+
159
+ if args.train_non_speech_dirs is not None:
160
+ for item in args.train_non_speech_dirs:
161
+ new_train_non_speech_dirs.append(''.join(item))
162
+ args.train_non_speech_dirs = new_train_non_speech_dirs
163
+
164
+ if args.dev_non_speech_dirs is not None:
165
+ for item in args.dev_non_speech_dirs:
166
+ new_dev_non_speech_dirs.append(''.join(item))
167
+ args.dev_non_speech_dirs = new_dev_non_speech_dirs
168
+
169
+ logging.info(str(args))
170
+ main(args)
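The `''.join(item)` loops above exist because the *_dirs arguments are declared with `type=list` and `nargs='+'`: argparse applies `list()` to each token, so a path string arrives as a list of single characters. A minimal demonstration:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--train_semantic_dirs', type=list, nargs='+',
                    default=["dump/small/train/"])
args = parser.parse_args(['--train_semantic_dirs', 'dump/small/train/'])
print(args.train_semantic_dirs[0])           # ['d', 'u', 'm', 'p', ...]
print(''.join(args.train_semantic_dirs[0]))  # 'dump/small/train/'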
AR/models/__init__.py ADDED
File without changes
AR/models/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (142 Bytes). View file
 
AR/models/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (145 Bytes). View file
 
AR/models/__pycache__/t2s_lightning_module.cpython-310.pyc ADDED
Binary file (3.14 kB). View file
 
AR/models/__pycache__/t2s_lightning_module.cpython-39.pyc ADDED
Binary file (3.15 kB). View file
 
AR/models/__pycache__/t2s_model.cpython-310.pyc ADDED
Binary file (6.84 kB). View file
 
AR/models/__pycache__/t2s_model.cpython-39.pyc ADDED
Binary file (6.84 kB). View file
 
AR/models/__pycache__/utils.cpython-310.pyc ADDED
Binary file (4.5 kB). View file
 
AR/models/__pycache__/utils.cpython-39.pyc ADDED
Binary file (4.48 kB). View file
 
AR/models/t2s_lightning_module.py ADDED
@@ -0,0 +1,128 @@
1
+ # modified from https://github.com/feng-yufei/shared_debugging_code/blob/main/model/t2s_lightning_module.py
2
+ import os,sys
3
+ now_dir = os.getcwd()
4
+ sys.path.append(now_dir)
5
+ from typing import Dict
6
+
7
+ import torch
8
+ from pytorch_lightning import LightningModule
9
+ from AR.models.t2s_model import Text2SemanticDecoder
10
+ from AR.modules.lr_schedulers import WarmupCosineLRSchedule
11
+ from AR.modules.optim import ScaledAdam
12
+
13
+
14
+ class Text2SemanticLightningModule(LightningModule):
15
+ def __init__(self, config, output_dir,is_train=True):
16
+ super().__init__()
17
+ self.config = config
18
+ self.top_k = 3
19
+ self.model = Text2SemanticDecoder(config=config, top_k=self.top_k)
20
+ pretrained_s1=config.get("pretrained_s1")
21
+ if(pretrained_s1 and is_train):
22
+ # print(self.load_state_dict(torch.load(pretrained_s1,map_location="cpu")["state_dict"]))
23
+ print(self.load_state_dict(torch.load(pretrained_s1,map_location="cpu")["weight"]))
24
+ if is_train:
25
+ self.automatic_optimization = False
26
+ self.save_hyperparameters()
27
+ self.eval_dir = output_dir / 'eval'
28
+ self.eval_dir.mkdir(parents=True, exist_ok=True)
29
+
30
+ def training_step(self, batch: Dict, batch_idx: int):
31
+
32
+ opt = self.optimizers()
33
+ scheduler = self.lr_schedulers()
34
+ loss, acc = self.model.forward(
35
+ batch['phoneme_ids'], batch['phoneme_ids_len'],
36
+ batch['semantic_ids'], batch['semantic_ids_len'],
37
+ batch['bert_feature'])
38
+ self.manual_backward(loss)
39
+ if batch_idx > 0 and batch_idx % 4 == 0:
40
+ opt.step()
41
+ opt.zero_grad()
42
+ scheduler.step()
43
+
44
+ self.log(
45
+ "total_loss",
46
+ loss,
47
+ on_step=True,
48
+ on_epoch=True,
49
+ prog_bar=True,
50
+ sync_dist=True)
51
+ self.log(
52
+ "lr",
53
+ scheduler.get_last_lr()[0],
54
+ on_epoch=True,
55
+ prog_bar=True,
56
+ sync_dist=True)
57
+ self.log(
58
+ f"top_{self.top_k}_acc",
59
+ acc,
60
+ on_step=True,
61
+ on_epoch=True,
62
+ prog_bar=True,
63
+ sync_dist=True)
64
+
65
+ def validation_step(self, batch: Dict, batch_idx: int):return
66
+ # # get loss
67
+ # loss, acc = self.model.forward(
68
+ # batch['phoneme_ids'], batch['phoneme_ids_len'],
69
+ # batch['semantic_ids'], batch['semantic_ids_len'],
70
+ # batch['bert_feature']
71
+ # )
72
+ #
73
+ # self.log(
74
+ # "val_total_loss",
75
+ # loss,
76
+ # on_step=True,
77
+ # on_epoch=True,
78
+ # prog_bar=True,
79
+ # sync_dist=True)
80
+ # self.log(
81
+ # f"val_top_{self.top_k}_acc",
82
+ # acc,
83
+ # on_step=True,
84
+ # on_epoch=True,
85
+ # prog_bar=True,
86
+ # sync_dist=True)
87
+ #
88
+ # # get infer output
89
+ # semantic_len = batch['semantic_ids'].size(1)
90
+ # prompt_len = min(int(semantic_len * 0.5), 150)
91
+ # prompt = batch['semantic_ids'][:, :prompt_len]
92
+ # pred_semantic = self.model.infer(batch['phoneme_ids'],
93
+ # batch['phoneme_ids_len'], prompt,
94
+ # batch['bert_feature']
95
+ # )
96
+ # save_name = f'semantic_toks_{batch_idx}.pt'
97
+ # save_path = os.path.join(self.eval_dir, save_name)
98
+ # torch.save(pred_semantic.detach().cpu(), save_path)
99
+
100
+ def configure_optimizers(self):
101
+ model_parameters = self.model.parameters()
102
+ parameters_names = []
103
+ parameters_names.append([
104
+ name_param_pair[0]
105
+ for name_param_pair in self.model.named_parameters()
106
+ ])
107
+ lm_opt = ScaledAdam(
108
+ model_parameters,
109
+ lr=0.01,
110
+ betas=(0.9, 0.95),
111
+ clipping_scale=2.0,
112
+ parameters_names=parameters_names,
113
+ show_dominant_parameters=False,
114
+ clipping_update_period=1000, )
115
+
116
+ return {
117
+ "optimizer": lm_opt,
118
+ "lr_scheduler": {
119
+ "scheduler":
120
+ WarmupCosineLRSchedule(
121
+ lm_opt,
122
+ init_lr=self.config['optimizer']['lr_init'],
123
+ peak_lr=self.config['optimizer']['lr'],
124
+ end_lr=self.config['optimizer']['lr_end'],
125
+ warmup_steps=self.config['optimizer']['warmup_steps'],
126
+ total_steps=self.config['optimizer']['decay_steps'])
127
+ }
128
+ }
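training_step() above uses manual optimization with gradient accumulation: manual_backward() runs on every batch, while the optimizer step, zero_grad and LR scheduler step only happen every 4th batch. A plain-PyTorch sketch of the same pattern (toy model and data, not the Lightning module itself):

import torch

model = torch.nn.Linear(8, 1)
opt = torch.optim.SGD(model.parameters(), lr=0.01)
scheduler = torch.optim.lr_scheduler.StepLR(opt, step_size=10)
accum = 4
for batch_idx in range(16):
    x = torch.randn(2, 8)
    loss = model(x).pow(2).mean()
    loss.backward()                          # plays the role of manual_backward()
    if batch_idx > 0 and batch_idx % accum == 0:
        opt.step()
        opt.zero_grad()
        scheduler.step()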
AR/models/t2s_model.py ADDED
@@ -0,0 +1,298 @@
1
+ # modified from https://github.com/feng-yufei/shared_debugging_code/blob/main/model/t2s_model.py
2
+ import torch
3
+ from tqdm import tqdm
4
+
5
+ from AR.models.utils import make_pad_mask
6
+ from AR.models.utils import topk_sampling,sample,logits_to_probs,multinomial_sample_one_no_sync
7
+ from AR.modules.embedding import SinePositionalEmbedding
8
+ from AR.modules.embedding import TokenEmbedding
9
+ from AR.modules.transformer import LayerNorm
10
+ from AR.modules.transformer import TransformerEncoder
11
+ from AR.modules.transformer import TransformerEncoderLayer
12
+ from torch import nn
13
+ from torch.nn import functional as F
14
+ from torchmetrics.classification import MulticlassAccuracy
15
+
16
+ default_config = {
17
+ "embedding_dim": 512,
18
+ "hidden_dim": 512,
19
+ "num_head": 8,
20
+ "num_layers": 12,
21
+ "num_codebook": 8,
22
+ "p_dropout": 0.0,
23
+ "vocab_size": 1024 + 1,
24
+ "phoneme_vocab_size": 512,
25
+ "EOS": 1024
26
+ }
27
+
28
+
29
+ class Text2SemanticDecoder(nn.Module):
30
+ def __init__(self, config, norm_first=False, top_k=3):
31
+ super(Text2SemanticDecoder, self).__init__()
32
+ self.model_dim = config['model']["hidden_dim"]
33
+ self.embedding_dim = config['model']["embedding_dim"]
34
+ self.num_head = config['model']["head"]
35
+ self.num_layers = config['model']["n_layer"]
36
+ self.norm_first = norm_first
37
+ self.vocab_size = config['model']["vocab_size"]
38
+ self.phoneme_vocab_size = config['model']["phoneme_vocab_size"]
39
+ self.p_dropout = config['model']["dropout"]
40
+ self.EOS = config['model']["EOS"]
41
+ self.norm_first = norm_first
42
+ assert self.EOS == self.vocab_size - 1
43
+ # should be same as num of kmeans bin
44
+ # assert self.EOS == 1024
45
+ self.bert_proj = nn.Linear(1024, self.embedding_dim)
46
+ self.ar_text_embedding = TokenEmbedding(
47
+ self.embedding_dim, self.phoneme_vocab_size, self.p_dropout)
48
+ self.ar_text_position = SinePositionalEmbedding(
49
+ self.embedding_dim, dropout=0.1, scale=False, alpha=True)
50
+ self.ar_audio_embedding = TokenEmbedding(
51
+ self.embedding_dim, self.vocab_size, self.p_dropout)
52
+ self.ar_audio_position = SinePositionalEmbedding(
53
+ self.embedding_dim, dropout=0.1, scale=False, alpha=True)
54
+
55
+ self.h = TransformerEncoder(
56
+ TransformerEncoderLayer(
57
+ d_model=self.model_dim,
58
+ nhead=self.num_head,
59
+ dim_feedforward=self.model_dim * 4,
60
+ dropout=0.1,
61
+ batch_first=True,
62
+ norm_first=norm_first, ),
63
+ num_layers=self.num_layers,
64
+ norm=LayerNorm(self.model_dim) if norm_first else None, )
65
+
66
+ self.ar_predict_layer = nn.Linear(
67
+ self.model_dim, self.vocab_size, bias=False)
68
+ self.loss_fct = nn.CrossEntropyLoss(reduction='sum')
69
+
70
+ self.ar_accuracy_metric = MulticlassAccuracy(
71
+ self.vocab_size,
72
+ top_k=top_k,
73
+ average="micro",
74
+ multidim_average="global",
75
+ ignore_index=self.EOS, )
76
+
77
+ def forward(self, x, x_lens, y, y_lens, bert_feature):
78
+ '''
79
+ x: phoneme_ids
80
+ y: semantic_ids
81
+ '''
82
+ x = self.ar_text_embedding(x)
83
+ x = x + self.bert_proj(bert_feature.transpose(1,2))
84
+ x = self.ar_text_position(x)
85
+ x_mask = make_pad_mask(x_lens)
86
+
87
+ y_mask = make_pad_mask(y_lens)
88
+ y_mask_int = y_mask.type(torch.int64)
89
+ codes = y.type(torch.int64) * (1 - y_mask_int)
90
+
91
+ # Training
92
+ # AR Decoder
93
+ y, targets = self.pad_y_eos(codes, y_mask_int, eos_id=self.EOS)
94
+ x_len = x_lens.max()
95
+ y_len = y_lens.max()
96
+ y_emb = self.ar_audio_embedding(y)
97
+ y_pos = self.ar_audio_position(y_emb)
98
+
99
+ xy_padding_mask = torch.concat([x_mask, y_mask], dim=1)
100
+ ar_xy_padding_mask = xy_padding_mask
101
+
102
+ x_attn_mask = F.pad(
103
+ torch.zeros((x_len, x_len), dtype=torch.bool, device=x.device),
104
+ (0, y_len),
105
+ value=True, )
106
+ y_attn_mask = F.pad(
107
+ torch.triu(
108
+ torch.ones(y_len, y_len, dtype=torch.bool, device=x.device),
109
+ diagonal=1, ),
110
+ (x_len, 0),
111
+ value=False, )
112
+ xy_attn_mask = torch.concat([x_attn_mask, y_attn_mask], dim=0)
113
+ bsz, src_len = x.shape[0], x_len + y_len
114
+ _xy_padding_mask = (ar_xy_padding_mask.view(bsz, 1, 1, src_len)
115
+ .expand(-1, self.num_head, -1, -1)
116
+ .reshape(bsz * self.num_head, 1, src_len))
117
+ xy_attn_mask = xy_attn_mask.logical_or(_xy_padding_mask)
118
+ new_attn_mask = torch.zeros_like(xy_attn_mask, dtype=x.dtype)
119
+ new_attn_mask.masked_fill_(xy_attn_mask, float("-inf"))
120
+ xy_attn_mask = new_attn_mask
121
+ # feed x and the full y to the model in a single pass
122
+ xy_pos = torch.concat([x, y_pos], dim=1)
123
+ xy_dec, _ = self.h(
124
+ (xy_pos, None),
125
+ mask=xy_attn_mask, )
126
+ logits = self.ar_predict_layer(xy_dec[:, x_len:]).permute(0, 2, 1)
127
+ # loss
128
+ # from feiteng: the longer the duration, the larger the gradient update should be, hence reduction='sum'
129
+ loss = F.cross_entropy(logits, targets, reduction='sum')
130
+ acc = self.ar_accuracy_metric(logits.detach(), targets).item()
131
+ return loss, acc
132
+
133
+ # need to check how this differs from forward(), and what prompts should be when no semantic tokens are available
134
+ def infer(self,
135
+ x,
136
+ x_lens,
137
+ prompts,
138
+ bert_feature,
139
+ top_k: int=-100,
140
+ early_stop_num: int=-1,
141
+ temperature: float=1.0):
142
+
143
+ x = self.ar_text_embedding(x)
144
+ x = x + self.bert_proj(bert_feature.transpose(1,2))
145
+ x = self.ar_text_position(x)
146
+
147
+ # AR Decoder
148
+ y = prompts
149
+ prefix_len = y.shape[1]
150
+ x_len = x.shape[1]
151
+ x_attn_mask = torch.zeros((x_len, x_len), dtype=torch.bool)
152
+ stop = False
153
+ for _ in tqdm(range(1500)):
154
+ y_emb = self.ar_audio_embedding(y)
155
+ y_pos = self.ar_audio_position(y_emb)
156
+ # feed x together with the gradually growing y to the model
157
+ xy_pos = torch.concat([x, y_pos], dim=1)
158
+ y_len = y.shape[1]
159
+ x_attn_mask_pad = F.pad(
160
+ x_attn_mask,
161
+ (0, y_len),
162
+ value=True, )
163
+ y_attn_mask = F.pad(
164
+ torch.triu(
165
+ torch.ones(y_len, y_len, dtype=torch.bool), diagonal=1),
166
+ (x_len, 0),
167
+ value=False, )
168
+ xy_attn_mask = torch.concat(
169
+ [x_attn_mask_pad, y_attn_mask], dim=0).to(y.device)
170
+
171
+ xy_dec, _ = self.h(
172
+ (xy_pos, None),
173
+ mask=xy_attn_mask, )
174
+ logits = self.ar_predict_layer(xy_dec[:, -1])
175
+ samples = topk_sampling(
176
+ logits, top_k=top_k, top_p=1.0, temperature=temperature)
177
+
178
+ if early_stop_num != -1 and (y.shape[1] - prefix_len
179
+ ) > early_stop_num:
180
+ print("use early stop num:", early_stop_num)
181
+ stop = True
182
+
183
+ if torch.argmax(
184
+ logits, dim=-1)[0] == self.EOS or samples[0, 0] == self.EOS:
185
+ # print(torch.argmax(logits, dim=-1)[0] == self.EOS, samples[0, 0] == self.EOS)
186
+ stop = True
187
+ if stop:
188
+ if prompts.shape[1] == y.shape[1]:
189
+ y = torch.concat([y, torch.zeros_like(samples)], dim=1)
190
+ print('bad zero prediction')
191
+ print(f"T2S Decoding EOS [{prefix_len} -> {y.shape[1]}]")
192
+ break
193
+ # the newly generated semantic_ids are appended to the previous y to form the new y
194
+ # print(samples.shape)  # [1, 1]; the first 1 is the batch size
195
+ # import os
196
+ # os._exit(2333)
197
+ y = torch.concat([y, samples], dim=1)
198
+ return y
199
+
200
+ def pad_y_eos(self, y, y_mask_int, eos_id):
201
+ targets = F.pad(
202
+ y, (0, 1), value=0) + eos_id * F.pad(
203
+ y_mask_int, (0, 1), value=1)
204
+ # shift by one position (teacher-forcing offset)
205
+ return targets[:, :-1], targets[:, 1:]
206
+
207
+ def infer_panel(self,
208
+ x,  # all text tokens
209
+ x_lens,
210
+ prompts,  # reference audio tokens
211
+ bert_feature,
212
+ top_k: int=-100,
213
+ early_stop_num: int=-1,
214
+ temperature: float=1.0):
215
+
216
+ x = self.ar_text_embedding(x)
217
+ x = x + self.bert_proj(bert_feature.transpose(1,2))
218
+ x = self.ar_text_position(x)
219
+
220
+ # AR Decoder
221
+ y = prompts
222
+ prefix_len = y.shape[1]
223
+ x_len = x.shape[1]
224
+ x_attn_mask = torch.zeros((x_len, x_len), dtype=torch.bool)
225
+ stop = False
226
+ # print(1111111,self.num_layers)
227
+ cache={
228
+ "all_stage":self.num_layers,
229
+ "k":[None]*self.num_layers,###根据配置自己手写
230
+ "v":[None]*self.num_layers,
231
+ # "xy_pos":None,##y_pos位置编码每次都不一样的没法缓存,每次都要重新拼xy_pos.主要还是写法原因,其实是可以历史统一一样的,但也没啥计算量就不管了
232
+ "y_emb":None,##只需要对最新的samples求emb,再拼历史的就行
233
+ # "logits":None,###原版就已经只对结尾求再拼接了,不用管
234
+ # "xy_dec":None,###不需要,本来只需要最后一个做logits
235
+ "first_infer":1,
236
+ "stage":0
237
+ }
238
+ for idx in tqdm(range(1500)):
239
+ if(cache["first_infer"]==1):
240
+ y_emb = self.ar_audio_embedding(y)
241
+ else:
242
+ y_emb = torch.cat([cache["y_emb"],self.ar_audio_embedding(y[:,-1:])],1)
243
+ cache["y_emb"]=y_emb
244
+ y_pos = self.ar_audio_position(y_emb)
245
+ # feed x together with the gradually growing y to the model
246
+ if(cache["first_infer"]==1):
247
+ xy_pos = torch.concat([x, y_pos], dim=1)
248
+ else:
249
+ xy_pos=y_pos[:,-1:]
250
+ y_len = y_pos.shape[1]
251
+ ### the following 3 are not cached
252
+ if (cache["first_infer"] == 1):
253
+ x_attn_mask_pad = F.pad(
254
+ x_attn_mask,
255
+ (0, y_len),  # extend the all-zero xx block with all-one xy, shape (x, x+y)
256
+ value=True, )
257
+ y_attn_mask = F.pad(  # extend yy's upper-triangular ones with zeros for xy on the left, shape (y, x+y)
258
+ torch.triu(
259
+ torch.ones(y_len, y_len, dtype=torch.bool), diagonal=1),
260
+ (x_len, 0),
261
+ value=False, )
262
+ xy_attn_mask = torch.concat(
263
+ [x_attn_mask_pad, y_attn_mask], dim=0).to(y.device)
264
+ else:
265
+ ### right-most column only (this is wrong)
267
+ # xy_attn_mask=torch.ones((1, x_len+y_len), dtype=torch.bool,device=xy_pos.device)
268
+ # xy_attn_mask[:,-1]=False
269
+ ### bottom row only (this is correct)
270
+ xy_attn_mask = torch.zeros((1, x_len + y_len), dtype=torch.bool, device=xy_pos.device)
271
+ # pdb.set_trace()
272
+ ### the core of the caching logic
272
+ # print(1111,xy_pos.shape,xy_attn_mask.shape,x_len,y_len)
273
+ xy_dec, _ = self.h(
274
+ (xy_pos, None),
275
+ mask=xy_attn_mask,cache=cache )
276
+ logits = self.ar_predict_layer(xy_dec[:, -1])  # no change needed: with the cache there is only one frame anyway, same as taking the last frame
277
+ # samples = topk_sampling(logits, top_k=top_k, top_p=1.0, temperature=temperature)
278
+ samples = sample(logits[0], y, top_k=top_k, top_p=1.0, repetition_penalty=1.35)[0].unsqueeze(0)
279
+ if early_stop_num != -1 and (y.shape[1] - prefix_len
280
+ ) > early_stop_num:
281
+ print("use early stop num:", early_stop_num)
282
+ stop = True
283
+
284
+ if torch.argmax(
285
+ logits, dim=-1)[0] == self.EOS or samples[0, 0] == self.EOS:
286
+ # print(torch.argmax(logits, dim=-1)[0] == self.EOS, samples[0, 0] == self.EOS)
287
+ stop = True
288
+ if stop:
289
+ if prompts.shape[1] == y.shape[1]:
290
+ y = torch.concat([y, torch.zeros_like(samples)], dim=1)
291
+ print('bad zero prediction')
292
+ print(f"T2S Decoding EOS [{prefix_len} -> {y.shape[1]}]")
293
+ break
294
+ # the newly generated semantic_ids are appended to the previous y to form the new y
295
+ # print(samples.shape)  # [1, 1]; the first 1 is the batch size
296
+ y = torch.concat([y, samples], dim=1)
297
+ cache["first_infer"]=0
298
+ return y,idx
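A tiny, standalone rendering of the attention mask assembled in forward() and infer() above: text positions attend only to text, audio positions attend to all text plus earlier audio (causal), and True marks positions that are masked out. The lengths are toy values for illustration.

import torch
import torch.nn.functional as F

x_len, y_len = 3, 4
x_attn_mask = F.pad(
    torch.zeros((x_len, x_len), dtype=torch.bool), (0, y_len), value=True)
y_attn_mask = F.pad(
    torch.triu(torch.ones(y_len, y_len, dtype=torch.bool), diagonal=1),
    (x_len, 0), value=False)
xy_attn_mask = torch.concat([x_attn_mask, y_attn_mask], dim=0)
print(xy_attn_mask.int())  # (x_len + y_len, x_len + y_len) combined mask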
AR/models/utils.py ADDED
@@ -0,0 +1,164 @@
1
+ # modified from https://github.com/feng-yufei/shared_debugging_code/blob/main/model/utils.py\
2
+ import torch
3
+ import torch.nn.functional as F
4
+ import torchaudio
5
+
6
+
7
+ def sequence_mask(length, max_length=None):
8
+ if max_length is None:
9
+ max_length = length.max()
10
+ x = torch.arange(max_length, dtype=length.dtype, device=length.device)
11
+ return x.unsqueeze(0) < length.unsqueeze(1)
12
+
13
+
14
+ def make_pad_mask(lengths: torch.Tensor, max_len: int=0) -> torch.Tensor:
15
+ """
16
+ Args:
17
+ lengths:
18
+ A 1-D tensor containing sentence lengths.
19
+ max_len:
20
+ The length of masks.
21
+ Returns:
22
+ Return a 2-D bool tensor, where masked positions
23
+ are filled with `True` and non-masked positions are
24
+ filled with `False`.
25
+
26
+ #>>> lengths = torch.tensor([1, 3, 2, 5])
27
+ #>>> make_pad_mask(lengths)
28
+ tensor([[False, True, True, True, True],
29
+ [False, False, False, True, True],
30
+ [False, False, True, True, True],
31
+ [False, False, False, False, False]])
32
+ """
33
+ assert lengths.ndim == 1, lengths.ndim
34
+ max_len = max(max_len, lengths.max())
35
+ n = lengths.size(0)
36
+ seq_range = torch.arange(0, max_len, device=lengths.device)
37
+ expaned_lengths = seq_range.unsqueeze(0).expand(n, max_len)
38
+
39
+ return expaned_lengths >= lengths.unsqueeze(-1)
40
+
41
+
42
+ # https://github.com/microsoft/unilm/blob/master/xtune/src/transformers/modeling_utils.py
43
+ def top_k_top_p_filtering(logits,
44
+ top_k=0,
45
+ top_p=1.0,
46
+ filter_value=-float("Inf"),
47
+ min_tokens_to_keep=1):
48
+ """Filter a distribution of logits using top-k and/or nucleus (top-p) filtering
49
+ Args:
50
+ logits: logits distribution shape (batch size, vocabulary size)
51
+ if top_k > 0: keep only top k tokens with highest probability (top-k filtering).
52
+ if top_p < 1.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering).
53
+ Nucleus filtering is described in Holtzman et al. (http://arxiv.org/abs/1904.09751)
54
+ Make sure we keep at least min_tokens_to_keep per batch example in the output
55
+ From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317
56
+ """
57
+ if top_k > 0:
58
+ top_k = min(max(top_k, min_tokens_to_keep),
59
+ logits.size(-1)) # Safety check
60
+ # Remove all tokens with a probability less than the last token of the top-k
61
+ indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
62
+ logits[indices_to_remove] = filter_value
63
+
64
+ if top_p < 1.0:
65
+ sorted_logits, sorted_indices = torch.sort(logits, descending=True)
66
+ cumulative_probs = torch.cumsum(
67
+ F.softmax(sorted_logits, dim=-1), dim=-1)
68
+
69
+ # Remove tokens with cumulative probability above the threshold (token with 0 are kept)
70
+ sorted_indices_to_remove = cumulative_probs > top_p
71
+ if min_tokens_to_keep > 1:
72
+ # Keep at least min_tokens_to_keep (set to min_tokens_to_keep-1 because we add the first one below)
73
+ sorted_indices_to_remove[..., :min_tokens_to_keep] = 0
74
+ # Shift the indices to the right to keep also the first token above the threshold
75
+ sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[
76
+ ..., :-1].clone()
77
+ sorted_indices_to_remove[..., 0] = 0
78
+
79
+ # scatter sorted tensors to original indexing
80
+ indices_to_remove = sorted_indices_to_remove.scatter(
81
+ 1, sorted_indices, sorted_indices_to_remove)
82
+ logits[indices_to_remove] = filter_value
83
+ return logits
84
+
85
+
86
+ def topk_sampling(logits, top_k=10, top_p=1.0, temperature=1.0):
87
+ # temperature: (`optional`) float
88
+ # The value used to module the next token probabilities. Must be strictly positive. Default to 1.0.
89
+ # top_k: (`optional`) int
90
+ # The number of highest probability vocabulary tokens to keep for top-k-filtering. Between 1 and infinity. Default to 50.
91
+ # top_p: (`optional`) float
92
+ # The cumulative probability of parameter highest probability vocabulary tokens to keep for nucleus sampling. Must be between 0 and 1. Default to 1.
93
+
94
+ # Temperature (higher temperature => more likely to sample low probability tokens)
95
+ if temperature != 1.0:
96
+ logits = logits / temperature
97
+ # Top-p/top-k filtering
98
+ logits = top_k_top_p_filtering(logits, top_k=top_k, top_p=top_p)
99
+ # Sample
100
+ token = torch.multinomial(F.softmax(logits, dim=-1), num_samples=1)
101
+ return token
102
+
103
+
104
+ from typing import Optional, Tuple
105
+ def multinomial_sample_one_no_sync(
106
+ probs_sort,
107
+ ): # Does multinomial sampling without a cuda synchronization
108
+ q = torch.empty_like(probs_sort).exponential_(1)
109
+ return torch.argmax(probs_sort / q, dim=-1, keepdim=True).to(dtype=torch.int)
110
+
111
+
112
+ def logits_to_probs(
113
+ logits,
114
+ previous_tokens: Optional[torch.Tensor] = None,
115
+ temperature: float = 1.0,
116
+ top_k: Optional[int] = None,
117
+ top_p: Optional[int] = None,
118
+ repetition_penalty: float = 1.0,
119
+ ):
120
+ previous_tokens=previous_tokens.squeeze()
121
+ # print(logits.shape,previous_tokens.shape)
122
+ # pdb.set_trace()
123
+ if previous_tokens is not None and repetition_penalty != 1.0:
124
+ previous_tokens = previous_tokens.long()
125
+ score = torch.gather(logits, dim=0, index=previous_tokens)
126
+ score = torch.where(
127
+ score < 0, score * repetition_penalty, score / repetition_penalty
128
+ )
129
+ logits.scatter_(dim=0, index=previous_tokens, src=score)
130
+
131
+ if top_p is not None and top_p < 1.0:
132
+ sorted_logits, sorted_indices = torch.sort(logits, descending=True)
133
+ cum_probs = torch.cumsum(
134
+ torch.nn.functional.softmax(sorted_logits, dim=-1), dim=-1
135
+ )
136
+ sorted_indices_to_remove = cum_probs > top_p
137
+ sorted_indices_to_remove[0] = False # keep at least one option
138
+ indices_to_remove = sorted_indices_to_remove.scatter(
139
+ dim=0, index=sorted_indices, src=sorted_indices_to_remove
140
+ )
141
+ logits = logits.masked_fill(indices_to_remove, -float("Inf"))
142
+
143
+ logits = logits / max(temperature, 1e-5)
144
+
145
+ if top_k is not None:
146
+ v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
147
+ pivot = v.select(-1, -1).unsqueeze(-1)
148
+ logits = torch.where(logits < pivot, -float("Inf"), logits)
149
+
150
+ probs = torch.nn.functional.softmax(logits, dim=-1)
151
+ return probs
152
+
153
+
154
+ def sample(
155
+ logits,
156
+ previous_tokens: Optional[torch.Tensor] = None,
157
+ **sampling_kwargs,
158
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
159
+ probs = logits_to_probs(
160
+ logits=logits, previous_tokens=previous_tokens, **sampling_kwargs
161
+ )
162
+ idx_next = multinomial_sample_one_no_sync(probs)
163
+ return idx_next, probs
164
+
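A minimal usage sketch for the sampling helpers above, mirroring how infer() and infer_panel() call them (toy logits and a made-up token history; assumes the AR package is on PYTHONPATH, and .clone() is used because top_k_top_p_filtering edits logits in place):

import torch
from AR.models.utils import sample, topk_sampling

vocab_size = 8
logits = torch.randn(1, vocab_size)          # (B=1, vocab)
previous_tokens = torch.tensor([[1, 3, 3]])  # (B=1, T) decoding history

token = topk_sampling(logits.clone(), top_k=3, temperature=1.0)
print("topk_sampling ->", token.shape)       # torch.Size([1, 1])

idx_next, probs = sample(
    logits[0], previous_tokens, top_k=3, top_p=1.0, repetition_penalty=1.35)
print("sample ->", idx_next.shape, probs.shape)  # torch.Size([1]), torch.Size([8])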
AR/modules/__init__.py ADDED
File without changes
AR/modules/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (143 Bytes). View file
 
AR/modules/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (146 Bytes). View file