root committed on
Commit f9e2d84 · 1 Parent(s): 410c1c2

update v1.5-beta

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. app.py +5 -7
  2. codeclm/models/builders.py +5 -4
  3. codeclm/modules/conditioners.py +13 -3
  4. codeclm/tokenizer/Flow1dVAE/cal_token_stat.py +0 -19
  5. codeclm/tokenizer/Flow1dVAE/compare_model_weight.py +0 -13
  6. codeclm/tokenizer/Flow1dVAE/extract_codes_stereo_7_1x1_and_sep_npy.py +0 -121
  7. codeclm/tokenizer/Flow1dVAE/extract_codes_stereo_7_1x1_sep.py +0 -94
  8. codeclm/tokenizer/Flow1dVAE/extract_codes_stereo_7_1x2.py +0 -70
  9. codeclm/tokenizer/Flow1dVAE/extract_codes_stereo_7_1x4.py +0 -46
  10. codeclm/tokenizer/Flow1dVAE/extract_codes_stereo_7_1x4_ds.py +0 -86
  11. codeclm/tokenizer/Flow1dVAE/generate_1rvq.py +3 -32
  12. codeclm/tokenizer/Flow1dVAE/generate_2rvq.py +0 -293
  13. codeclm/tokenizer/Flow1dVAE/generate_4rvq.py +0 -292
  14. codeclm/tokenizer/Flow1dVAE/libs/datasets/MusicSoundMixedDataset.py +0 -1278
  15. codeclm/tokenizer/Flow1dVAE/libs/datasets/dataset_429.py +0 -372
  16. codeclm/tokenizer/Flow1dVAE/libs/datasets/dataset_combined.py +0 -830
  17. codeclm/tokenizer/Flow1dVAE/libs/datasets/dataset_combined_withset.py +0 -994
  18. codeclm/tokenizer/Flow1dVAE/libs/datasets/dataset_song.py +0 -313
  19. codeclm/tokenizer/Flow1dVAE/libs/datasets/dataset_song_20s.py +0 -313
  20. codeclm/tokenizer/Flow1dVAE/libs/datasets/dataset_song_new_429.py +0 -313
  21. codeclm/tokenizer/Flow1dVAE/libs/datasets/dataset_stock.py +0 -461
  22. codeclm/tokenizer/Flow1dVAE/model_1rvq.py +0 -2
  23. codeclm/tokenizer/Flow1dVAE/model_2rvq.py +0 -774
  24. codeclm/tokenizer/Flow1dVAE/model_4rvq.py +0 -774
  25. codeclm/tokenizer/Flow1dVAE/model_septoken.py +0 -2
  26. codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/config/pretrain/EAT_pretraining_AS2M.yaml +0 -122
  27. codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/config/pretrain/EAT_pretraining_music_multinodes.yaml +0 -125
  28. codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/config/pretrain/MERT_RVQ-VAE_CQT_330M.yaml +0 -137
  29. codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/config/pretrain/MERT_RVQ-VAE_CQT_330M_multinodes.yaml +0 -139
  30. codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/config/pretrain/MERT_RVQ-VAE_CQT_330M_multinodes_debug1node.yaml +0 -138
  31. codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/config/pretrain/MERT_RVQ-VAE_CQT_330M_multinodes_debug2node.yaml +0 -139
  32. codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/config/pretrain/MERT_RVQ-VAE_CQT_330M_orig.yaml +0 -135
  33. codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/config/pretrain/MERT_RVQ-VAE_CQT_330M_tune.yaml +0 -137
  34. codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/config/pretrain/MERT_RVQ-VAE_CQT_95M.yaml +0 -116
  35. codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/config/pretrain/MERT_RVQ-VAE_CQT_95M_bestrq.yaml +0 -125
  36. codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/config/pretrain/MERT_RVQ-VAE_CQT_95M_bestrq_chroma_multinodes.yaml +0 -128
  37. codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/config/pretrain/MERT_RVQ-VAE_CQT_95M_bestrq_multinodes.yaml +0 -126
  38. codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/config/pretrain/MERT_RVQ-VAE_CQT_95M_bestrq_norm_multinodes.yaml +0 -128
  39. codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/config/pretrain/MERT_RVQ-VAE_CQT_95M_bestrq_norm_speech_multinodes.yaml +0 -128
  40. codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/config/pretrain/MERT_RVQ-VAE_CQT_95M_bestrvq_multinodes.yaml +0 -121
  41. codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/config/pretrain/MERT_RVQ-VAE_CQT_95M_dac.yaml +0 -0
  42. codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/config/pretrain/MERT_RVQ-VAE_CQT_95M_dac_multinodes.yaml +0 -121
  43. codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/config/pretrain/MERT_RVQ-VAE_CQT_95M_groupbestrq_multinodes.yaml +0 -125
  44. codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/config/pretrain/MERT_RVQ-VAE_CQT_95M_mel_multinodes.yaml +0 -124
  45. codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/config/pretrain/MusicFM_95M_bestrvq_multinodes.yaml +0 -108
  46. codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/config/pretrain/MusicFM_95M_multinodes.yaml +0 -105
  47. codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/config/pretrain/MusicFM_95M_speech_multinodes.yaml +0 -106
  48. codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/config/pretrain/run/submitit_reg.yaml +0 -20
  49. codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/data/__init__.py +0 -2
  50. codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/data/ark_dataset.py +0 -115
app.py CHANGED
@@ -16,14 +16,12 @@ from download import download_model
16
  # Download the model
17
  APP_DIR = op.dirname(op.abspath(__file__))
18
  download_model(APP_DIR)
19
- base_full_path = op.join(APP_DIR, "ckpt", "songgeneration-base-full")
20
- os.makedirs(base_full_path, exist_ok=True)
21
- download_model(base_full_path, repo_id="lglg666/SongGeneration-base-full", revision="19ebdb6")
22
  print("Successful downloaded model.")
23
 
24
  # Model initialization
25
  from levo_inference import LeVoInference
26
- MODEL = LeVoInference(base_full_path)
27
 
28
  EXAMPLE_LYRICS = """
29
  [intro-medium]
@@ -225,7 +223,7 @@ lyrics
225
  minimum=0.1,
226
  maximum=2.0,
227
  step=0.1,
228
- value=0.75,
229
  interactive=True,
230
  elem_id="temperature",
231
  )
@@ -268,12 +266,12 @@ lyrics
268
  # Generate-button click handlers
269
  generate_btn.click(
270
  fn=generate_song,
271
- inputs=[lyric, description, prompt_audio, genre, cfg_coef, temperature, gr.State(-1)],
272
  outputs=[output_audio, output_json]
273
  )
274
  generate_bgm_btn.click(
275
  fn=generate_song,
276
- inputs=[lyric, description, prompt_audio, genre, cfg_coef, temperature, gr.State(-1), gr.State("bgm")],
277
  outputs=[output_audio, output_json]
278
  )
279
 
 
16
  # Download the model
17
  APP_DIR = op.dirname(op.abspath(__file__))
18
  download_model(APP_DIR)
19
+ download_model(op.join(APP_DIR, "ckpt"), repo_id="waytan22/SongGeneration-v1.5-beta", revision="db10f47")
 
 
20
  print("Successful downloaded model.")
21
 
22
  # Model initialization
23
  from levo_inference import LeVoInference
24
+ MODEL = LeVoInference(op.join(APP_DIR, "ckpt", "SongGeneration-v1.5-beta"))
25
 
26
  EXAMPLE_LYRICS = """
27
  [intro-medium]
 
223
  minimum=0.1,
224
  maximum=2.0,
225
  step=0.1,
226
+ value=0.8,
227
  interactive=True,
228
  elem_id="temperature",
229
  )
 
266
  # Generate-button click handlers
267
  generate_btn.click(
268
  fn=generate_song,
269
+ inputs=[lyric, description, prompt_audio, genre, cfg_coef, temperature, gr.State(50)],
270
  outputs=[output_audio, output_json]
271
  )
272
  generate_bgm_btn.click(
273
  fn=generate_song,
274
+ inputs=[lyric, description, prompt_audio, genre, cfg_coef, temperature, gr.State(50), gr.State("bgm")],
275
  outputs=[output_audio, output_json]
276
  )
277
 
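The app.py changes above replace the v1.0 "songgeneration-base-full" checkpoint with the pinned v1.5-beta checkpoint, raise the default sampling temperature from 0.75 to 0.8, and swap the `gr.State(-1)` extra input for `gr.State(50)`, which Gradio passes to `generate_song` as a constant additional argument on every click. Below is a minimal sketch of the new model setup, assuming `download_model` and `LeVoInference` behave as in this repository.

```python
# Sketch of the v1.5 setup in app.py (mirrors the diff above; assumes
# download_model() and LeVoInference() behave as in this repository).
import os.path as op

from download import download_model
from levo_inference import LeVoInference

APP_DIR = op.dirname(op.abspath(__file__))

download_model(APP_DIR)  # base assets
download_model(op.join(APP_DIR, "ckpt"),
               repo_id="waytan22/SongGeneration-v1.5-beta",
               revision="db10f47")  # pinned v1.5-beta checkpoint

MODEL = LeVoInference(op.join(APP_DIR, "ckpt", "SongGeneration-v1.5-beta"))
```

Note that `gr.State(50)` simply injects the literal value 50 into the inputs list; how `generate_song` interprets it (e.g. as a step count) is not shown in this diff.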
codeclm/models/builders.py CHANGED
@@ -52,7 +52,7 @@ def get_audio_tokenizer_model_cpu(checkpoint_path: str, cfg: omegaconf.DictConfi
52
  return AudioTokenizer.get_pretrained(name, cfg.vae_config, cfg.vae_model, 'cpu', mode=cfg.mode, tango_device='cpu')
53
 
54
 
55
- def get_lm_model(cfg: omegaconf.DictConfig): #-> LMModel:
56
  """Instantiate a LM."""
57
  lm_kwargs = dict_from_config(getattr(cfg, 'lm'))
58
 
@@ -61,8 +61,8 @@ def get_lm_model(cfg: omegaconf.DictConfig): #-> LMModel:
61
  q_modeling = lm_kwargs.pop('q_modeling', None)
62
 
63
  # conditioner
64
- condition_provider = get_conditioner_provider(lm_kwargs["dim"], cfg)
65
-
66
  # codebook pattern: delay
67
  codebooks_pattern_cfg = getattr(cfg, 'codebooks_pattern')
68
  if codebooks_pattern_cfg.modeling is None:
@@ -97,7 +97,7 @@ def get_lm_model(cfg: omegaconf.DictConfig): #-> LMModel:
97
  raise KeyError(f"Unexpected LM model {lm_type}")
98
 
99
 
100
- def get_conditioner_provider(output_dim: int, cfg: omegaconf.DictConfig) -> ConditionerProvider:
101
  """Instantiate a conditioning model."""
102
  cfg = getattr(cfg, 'conditioners')
103
  dict_cfg = {} if cfg is None else dict_from_config(cfg)
@@ -115,6 +115,7 @@ def get_conditioner_provider(output_dim: int, cfg: omegaconf.DictConfig) -> Cond
115
  elif model_type == "QwTextTokenizer":
116
  conditioners[str(cond)] = QwTextConditioner(
117
  output_dim=output_dim,
 
118
  **model_args
119
  )
120
  elif model_type == "qt_embedding":
 
52
  return AudioTokenizer.get_pretrained(name, cfg.vae_config, cfg.vae_model, 'cpu', mode=cfg.mode, tango_device='cpu')
53
 
54
 
55
+ def get_lm_model(cfg: omegaconf.DictConfig, version: str = 'v1.0'): #-> LMModel:
56
  """Instantiate a LM."""
57
  lm_kwargs = dict_from_config(getattr(cfg, 'lm'))
58
 
 
61
  q_modeling = lm_kwargs.pop('q_modeling', None)
62
 
63
  # conditioner
64
+ condition_provider = get_conditioner_provider(lm_kwargs["dim"], cfg, version=version)
65
+
66
  # codebook pattern: delay
67
  codebooks_pattern_cfg = getattr(cfg, 'codebooks_pattern')
68
  if codebooks_pattern_cfg.modeling is None:
 
97
  raise KeyError(f"Unexpected LM model {lm_type}")
98
 
99
 
100
+ def get_conditioner_provider(output_dim: int, cfg: omegaconf.DictConfig, version: str = 'v1.0') -> ConditionerProvider:
101
  """Instantiate a conditioning model."""
102
  cfg = getattr(cfg, 'conditioners')
103
  dict_cfg = {} if cfg is None else dict_from_config(cfg)
 
115
  elif model_type == "QwTextTokenizer":
116
  conditioners[str(cond)] = QwTextConditioner(
117
  output_dim=output_dim,
118
+ version=version,
119
  **model_args
120
  )
121
  elif model_type == "qt_embedding":
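The builders.py change threads a new `version` argument from `get_lm_model` into `get_conditioner_provider` and on into `QwTextConditioner`, so the same config can build either the v1.0 or the v1.5 text conditioner. An illustrative call, assuming `cfg` is an `omegaconf.DictConfig` shaped like the repository's model configs (the config path below is hypothetical):

```python
# Illustrative only: how the new `version` flag propagates through the builders.
import omegaconf
from codeclm.models.builders import get_lm_model

cfg = omegaconf.OmegaConf.load("conf/songgeneration_v1_5.yaml")  # hypothetical path

# The default keeps the v1.0 behaviour; 'v1.5' is forwarded to
# get_conditioner_provider() and then to QwTextConditioner(version=...),
# which extends the Qwen2 vocabulary with the Musicality tokens.
lm = get_lm_model(cfg, version="v1.5")
```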
codeclm/modules/conditioners.py CHANGED
@@ -188,10 +188,13 @@ class QwTokenizerConditioner(TextConditioner):
188
  class QwTextConditioner(TextConditioner):
189
  def __init__(self, output_dim: int,
190
  token_path = "",
191
- max_len = 300): #""
 
192
 
193
  from transformers import Qwen2Tokenizer
194
- self.text_tokenizer = Qwen2Tokenizer.from_pretrained(token_path)
 
 
195
  voc_size = len(self.text_tokenizer.get_vocab())
196
  # here initialize a output_proj (nn.Embedding) layer
197
  super().__init__(voc_size, output_dim, input_token=True, padding_idx=151643)
@@ -636,7 +639,14 @@ class ClassifierFreeGuidanceDropoutInference(ClassifierFreeGuidanceDropout):
636
  sample.audio[condition] = self.get_null_wav(audio_cond.wav, sr=audio_cond.sample_rate[0])
637
  else:
638
  if customized is None:
639
- sample.text[condition] = None
 
 
 
 
 
 
 
640
  else:
641
  text_cond = deepcopy(sample.text[condition])
642
  if "structure" in customized:
 
188
  class QwTextConditioner(TextConditioner):
189
  def __init__(self, output_dim: int,
190
  token_path = "",
191
+ max_len = 300,
192
+ version: str = 'v1.0'): #""
193
 
194
  from transformers import Qwen2Tokenizer
195
+ self.text_tokenizer = Qwen2Tokenizer.from_pretrained(token_path)
196
+ if version == 'v1.5':
197
+ self.text_tokenizer.add_tokens(['[Musicality-very-high]', '[Musicality-high]', '[Musicality-medium]', '[Musicality-low]', '[Musicality-very-low]'], special_tokens=True)
198
  voc_size = len(self.text_tokenizer.get_vocab())
199
  # here initialize a output_proj (nn.Embedding) layer
200
  super().__init__(voc_size, output_dim, input_token=True, padding_idx=151643)
 
639
  sample.audio[condition] = self.get_null_wav(audio_cond.wav, sr=audio_cond.sample_rate[0])
640
  else:
641
  if customized is None:
642
+ if condition in ['type_info'] and sample.text[condition] is not None:
643
+ if "[Musicality-very-high]" in sample.text[condition]:
644
+ sample.text[condition] = "[Musicality-very-low], ."
645
+ print(f"cfg unconditioning: change sample.text[condition] to [Musicality-very-low]")
646
+ else:
647
+ sample.text[condition] = None
648
+ else:
649
+ sample.text[condition] = None
650
  else:
651
  text_cond = deepcopy(sample.text[condition])
652
  if "structure" in customized:
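In conditioners.py, the v1.5 `QwTextConditioner` registers five `[Musicality-*]` control tokens as special tokens on the Qwen2 tokenizer before sizing its embedding table, and the classifier-free-guidance dropout now treats the `type_info` condition specially: when the conditioned text contains `[Musicality-very-high]`, the "unconditional" branch is set to `[Musicality-very-low], .` instead of `None`, so guidance pushes generations away from low musicality. A small sketch of the token registration, assuming `token_path` points to a local Qwen2 tokenizer directory (the path below is hypothetical):

```python
# Sketch of the v1.5 tokenizer extension used by QwTextConditioner.
from transformers import Qwen2Tokenizer

MUSICALITY_TOKENS = [
    "[Musicality-very-high]", "[Musicality-high]", "[Musicality-medium]",
    "[Musicality-low]", "[Musicality-very-low]",
]

tok = Qwen2Tokenizer.from_pretrained("ckpt/qwen2_tokenizer")  # hypothetical path
tok.add_tokens(MUSICALITY_TOKENS, special_tokens=True)

# The conditioner builds its nn.Embedding from the extended vocabulary size,
# so the five new control tokens embed alongside the original Qwen2 tokens.
print(len(tok.get_vocab()))
```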
codeclm/tokenizer/Flow1dVAE/cal_token_stat.py DELETED
@@ -1,19 +0,0 @@
1
- import kaldiio
2
- from tqdm import tqdm
3
- import torch
4
-
5
- if __name__ == "__main__":
6
- bar = torch.zeros(1, 16384)
7
- with open('token.scp', 'r') as f:
8
- for item_idx, line in tqdm(enumerate(f)):
9
- idx, pos = line.strip().split()
10
- codes = kaldiio.load_mat(pos)
11
- for i0 in range(codes.shape[-1]):
12
- bar[0, codes[0, 0, i0]] += 1
13
- if(item_idx % 1000 == 0):
14
- print("=========")
15
- print(1 - (bar[0]==0).sum() / bar.shape[-1])
16
- print("=========")
17
- print("=========")
18
- print(1 - (bar[0]==0).sum() / bar.shape[-1])
19
- print("=========")
codeclm/tokenizer/Flow1dVAE/compare_model_weight.py DELETED
@@ -1,13 +0,0 @@
1
- import torch
2
- import sys
3
- from safetensors.torch import load_file
4
-
5
- if __name__ == "__main__":
6
- m0, m1 = sys.argv[1], sys.argv[2]
7
- m0 = load_file(m0)
8
- m1 = load_file(m1)
9
-
10
- ks = [k for k in m0.keys() if 'bestrq' in k]
11
- for k in ks:
12
- print(k, (m0[k] - m1[k]).abs().sum())
13
-
codeclm/tokenizer/Flow1dVAE/extract_codes_stereo_7_1x1_and_sep_npy.py DELETED
@@ -1,121 +0,0 @@
1
- import torch,torchaudio
2
- import os,sys,json
3
- from tqdm import tqdm
4
- import numpy as np
5
-
6
- #from codeclm_song_v1.codeclm.semantic_extractor.SpeechDecoder_v01.generate import Tango
7
- from generate_septoken import Tango as Tango_sep
8
- from generate_2rvq import Tango as Tango_1x2
9
- import kaldiio
10
- from kaldiio import WriteHelper
11
- from audio import AudioFile
12
-
13
- from demucs.models.pretrained import get_model_from_yaml
14
- from filelock import FileLock
15
-
16
- # os.path.join(args.model_dir, "htdemucs.pth"), os.path.join(args.model_dir, "htdemucs.yaml")
17
- class Separator:
18
- def __init__(self, dm_model_path='demucs/ckpt/htdemucs.pth', dm_config_path='demucs/ckpt/htdemucs.yaml', gpu_id=0) -> None:
19
- if torch.cuda.is_available() and gpu_id < torch.cuda.device_count():
20
- self.device = torch.device(f"cuda:{gpu_id}")
21
- else:
22
- self.device = torch.device("cpu")
23
- self.demucs_model = self.init_demucs_model(dm_model_path, dm_config_path)
24
-
25
- def init_demucs_model(self, model_path, config_path):
26
- model = get_model_from_yaml(config_path, model_path)
27
- model.to(self.device)
28
- model.eval()
29
- return model
30
-
31
- def load_audio(self, f):
32
- a, fs = torchaudio.load(f)
33
- if (fs != 48000):
34
- a = torchaudio.functional.resample(a, fs, 48000)
35
- # if a.shape[-1] >= 48000*10:
36
- # a = a[..., :48000*10]
37
- # else:
38
- # a = torch.cat([a, a], -1)
39
- # return a[:, 0:48000*10]
40
- return a
41
-
42
- def run(self, audio_path, output_dir='demucs/test_output', ext=".flac"):
43
- name, _ = os.path.splitext(os.path.split(audio_path)[-1])
44
- output_paths = []
45
- # lock_path = os.path.join(output_dir, f"{name}.lock")
46
- # with FileLock(lock_path): # added to avoid deadlocks when multiple GPUs access it concurrently
47
- for stem in self.demucs_model.sources:
48
- output_path = os.path.join(output_dir, f"{name}_{stem}{ext}")
49
- if os.path.exists(output_path):
50
- output_paths.append(output_path)
51
- if len(output_paths) == 1: # 4
52
- # drums_path, bass_path, other_path, vocal_path = output_paths
53
- vocal_path = output_paths[0]
54
- else:
55
- lock_path = os.path.join(output_dir, f"{name}_separate.lock")
56
- with FileLock(lock_path):
57
- drums_path, bass_path, other_path, vocal_path = self.demucs_model.separate(audio_path, output_dir, device=self.device)
58
- full_audio = self.load_audio(audio_path)
59
- vocal_audio = self.load_audio(vocal_path)
60
- minlen = min(full_audio.shape[-1], vocal_audio.shape[-1])
61
- # bgm_audio = full_audio[:, 0:minlen] - vocal_audio[:, 0:minlen]
62
- bgm_audio = self.load_audio(drums_path) + self.load_audio(bass_path) + self.load_audio(other_path)
63
- for path in [drums_path, bass_path, other_path, vocal_path]:
64
- os.remove(path)
65
- return full_audio, vocal_audio, bgm_audio
66
-
67
- def read_wav(fname, sample_rate=48_000):
68
- try:
69
- orig_samples, fs = torchaudio.load(fname)
70
- except:
71
- af = AudioFile(fname)
72
- orig_samples = af.read()
73
- fs = af.samplerate()
74
- orig_samples = orig_samples[0]
75
- if(fs!=sample_rate):
76
- orig_samples = torchaudio.functional.resample(orig_samples, fs, sample_rate)
77
- fs = sample_rate
78
- if orig_samples.shape[0] == 1:
79
- orig_samples = torch.cat([orig_samples, orig_samples], 0)
80
- return orig_samples
81
-
82
- if __name__ == "__main__":
83
- # Define Model
84
- json_path = sys.argv[1]
85
-
86
- mus_infos = []
87
- with open(json_path) as f:
88
- for line in f:
89
- item = json.loads(line)
90
- mus_infos.append(item)
91
-
92
- tango_sep = Tango_sep(model_path="./saved/model_septoken/model_2.safetensors")
93
- tango_1x2 = Tango_1x2(model_path = './saved/model_2rvq/model_2_fixed.safetensors', rvq_num=2)
94
- separator = Separator()
95
-
96
- # Feature extraction loop
97
- # for i in tqdm(range(2000)):
98
- first_time = True
99
- for item in tqdm(mus_infos):
100
- if(os.path.exists(item['path'])):
101
- full_path = item['path']
102
- else:
103
- full_path = '/mnt/share/' + item['path']
104
-
105
- full_tensor, vocal_tensor, bgm_tensor = separator.run(full_path)
106
-
107
- # full_tensor = read_wav(full_path)
108
- # vocal_tensor = read_wav(vocal_path)
109
- # length = min(full_tensor.shape[-1], vocal_tensor.shape[-1])
110
- # full_tensor, vocal_tensor = full_tensor[:, 0:length], vocal_tensor[:, 0:length]
111
- # bgm_tensor = full_tensor - vocal_tensor
112
- codes_1x2 = tango_1x2.sound2code(full_tensor)
113
- codes_vocal, codes_bgm = tango_sep.sound2code(vocal_tensor, bgm_tensor)
114
- codes = torch.cat([codes_1x2[:,[0],:], codes_vocal, codes_bgm], 1).cpu().numpy()
115
- save_path = full_path.replace('.wav', '.1x1_and_sep.npy').replace('.mp3', '.1x1_and_sep.npy').replace('.flac', '.1x1_and_sep.npy').replace('.ogg', '.1x1_and_sep.npy')
116
- assert save_path != full_path, (save_path, full_path)
117
- np.save(save_path, codes)
118
-
119
- if(first_time):
120
- first_time = False
121
- print(codes_vocal.shape, codes_bgm.shape)
codeclm/tokenizer/Flow1dVAE/extract_codes_stereo_7_1x1_sep.py DELETED
@@ -1,94 +0,0 @@
1
- import torch,torchaudio
2
- import os,sys,json
3
- from tqdm import tqdm
4
-
5
- #from codeclm_song_v1.codeclm.semantic_extractor.SpeechDecoder_v01.generate import Tango
6
- from generate_septoken import Tango
7
- import kaldiio
8
- from kaldiio import WriteHelper
9
- from audio import AudioFile
10
-
11
- def read_wav(fname, sample_rate=48_000):
12
- try:
13
- orig_samples, fs = torchaudio.load(fname)
14
- except:
15
- af = AudioFile(fname)
16
- orig_samples = af.read()
17
- fs = af.samplerate()
18
- orig_samples = orig_samples[0]
19
- if(fs!=sample_rate):
20
- orig_samples = torchaudio.functional.resample(orig_samples, fs, sample_rate)
21
- fs = sample_rate
22
- if orig_samples.shape[0] == 1:
23
- orig_samples = torch.cat([orig_samples, orig_samples], 0)
24
- return orig_samples
25
-
26
- if __name__ == "__main__":
27
- # Define Model
28
- json_path = sys.argv[1]
29
- outdir = sys.argv[2]
30
-
31
- mus_infos = []
32
- with open(json_path) as f:
33
- for line in f:
34
- item = json.loads(line)
35
- mus_infos.append(item)
36
-
37
- tango = Tango(model_path="./saved/model_septoken/model_2.safetensors")
38
-
39
-
40
- # Feature extraction loop
41
- # for i in tqdm(range(2000)):
42
- first_time = True
43
- with WriteHelper('ark,scp:{}/token_vocal.ark,{}/token_vocal.scp'.format(outdir, outdir), write_function="pickle") as writer_vocal, WriteHelper('ark,scp:{}/token_bgm.ark,{}/token_bgm.scp'.format(outdir, outdir), write_function="pickle") as writer_bgm:
44
- print('ark,scp:{}/token_vocal.ark,{}/token_vocal.scp'.format(outdir, outdir))
45
- print('ark,scp:{}/token_bgm.ark,{}/token_bgm.scp'.format(outdir, outdir))
46
- for item in tqdm(mus_infos):
47
- try:
48
- # if True:
49
- idx = item['idx']
50
- # print(idx)
51
- if(os.path.exists(item['path'])):
52
- full_path = item['path']
53
- else:
54
- full_path = '/mnt/share/' + item['path']
55
- if(os.path.exists(item['vocal_path'])):
56
- vocal_path = item['vocal_path']
57
- bgm_paths = item['bgm_path']
58
- else:
59
- vocal_path = '/mnt/share/' + item['vocal_path']
60
- bgm_paths = ['/mnt/share/' + p for p in item['bgm_path']]
61
- vocal_tensor = read_wav(vocal_path)
62
- # full_tensor = read_wav(full_path)
63
- # length = min(full_tensor.shape[-1], vocal_tensor.shape[-1])
64
- # full_tensor, vocal_tensor = full_tensor[:, 0:length], vocal_tensor[:, 0:length]
65
- # bgm_tensor = full_tensor - vocal_tensor
66
- bgm_tensor = sum([read_wav(p) for p in bgm_paths])
67
- codes_vocal, codes_bgm = tango.sound2code(vocal_tensor, bgm_tensor)
68
- writer_vocal(str(idx), codes_vocal.cpu())
69
- writer_bgm(str(idx), codes_bgm.cpu())
70
- if(first_time):
71
- first_time = False
72
- print(codes_vocal.shape, codes_bgm.shape)
73
- except:
74
- print(item['vocal_path'])
75
- print(item['bgm_path'])
76
- continue
77
-
78
- # idx = item['idx']
79
- # # print(idx)
80
- # full_path = item['path']
81
- # vocal_path = item['vocal_path']
82
- # bgm_paths = item['bgm_path']
83
- # full_tensor = read_wav(full_path)
84
- # vocal_tensor = read_wav(vocal_path)
85
- # length = min(full_tensor.shape[-1], vocal_tensor.shape[-1])
86
- # full_tensor, vocal_tensor = full_tensor[:, 0:length], vocal_tensor[:, 0:length]
87
- # bgm_tensor = full_tensor - vocal_tensor
88
- # codes_vocal, codes_bgm = tango.sound2code(vocal_tensor, bgm_tensor)
89
- # writer_vocal(str(idx), codes_vocal.cpu())
90
- # writer_bgm(str(idx), codes_bgm.cpu())
91
- # if(first_time):
92
- # first_time = False
93
- # print(codes_vocal.shape, codes_bgm.shape)
94
-
codeclm/tokenizer/Flow1dVAE/extract_codes_stereo_7_1x2.py DELETED
@@ -1,70 +0,0 @@
1
- import torch,torchaudio
2
- import os,sys,json
3
- from tqdm import tqdm
4
-
5
- #from codeclm_song_v1.codeclm.semantic_extractor.SpeechDecoder_v01.generate import Tango
6
- from generate_2rvq import Tango
7
- import kaldiio
8
- from kaldiio import WriteHelper
9
- import torch
10
- import subprocess
11
- import time
12
- import sys
13
-
14
- def get_gpu_memory():
15
- _output_to_list = lambda x: x.decode('ascii').split('\n')[:-1]
16
-
17
- ACCEPTABLE_AVAILABLE_MEMORY = 1024
18
- COMMAND = "nvidia-smi --query-gpu=memory.free --format=csv"
19
- memory_free_info = _output_to_list(subprocess.check_output(COMMAND.split()))[1:]
20
- memory_free_values = [int(x.split()[0]) for i, x in enumerate(memory_free_info)]
21
- return memory_free_values
22
-
23
- if __name__ == "__main__":
24
- # Define Model
25
- json_path = sys.argv[1]
26
- outdir = sys.argv[2]
27
-
28
- gpu_idx = int(os.environ['CUDA_VISIBLE_DEVICES'])
29
- while True:
30
- free_mem = get_gpu_memory()
31
- free_mem = free_mem[gpu_idx]
32
- if(free_mem > 25_000):
33
- print("GPU memory {}, run matrix cal".format(free_mem))
34
- break
35
- else:
36
- print("GPU memory {}, sleep 1min".format(free_mem))
37
- time.sleep(60)
38
-
39
- mus_infos = []
40
- with open(json_path) as f:
41
- for line in f:
42
- item = json.loads(line)
43
- mus_infos.append(item)
44
-
45
- tango = Tango(model_path = './saved/model_2rvq/model_2_fixed.safetensors', rvq_num=2)
46
-
47
-
48
- # Feature extraction loop
49
- # for i in tqdm(range(2000)):
50
- with WriteHelper('ark,scp:{}/token.ark,{}/token.scp'.format(outdir, outdir), write_function="pickle") as writer:
51
- print('ark,scp:{}/token.ark,{}/token.scp'.format(outdir, outdir))
52
- for item in tqdm(mus_infos):
53
- try:
54
- # if True:
55
- idx = item['idx']
56
- # print(idx)
57
- with torch.autocast(device_type="cuda", dtype=torch.float16):
58
- if(os.path.exists(item['path'])):
59
- codes = tango.file2code(item['path'])
60
- else:
61
- codes = tango.file2code('/mnt/share/' + item['path'])
62
- writer(str(idx), codes.cpu())
63
- except:
64
- print(item['path'])
65
- continue
66
- # idx = item['idx']
67
- # # print(idx)
68
- # with torch.autocast(device_type="cuda", dtype=torch.float16):
69
- # codes = tango.file2code(item['path'])
70
- # writer(str(idx), codes.cpu())
codeclm/tokenizer/Flow1dVAE/extract_codes_stereo_7_1x4.py DELETED
@@ -1,46 +0,0 @@
1
- import torch,torchaudio
2
- import os,sys,json
3
- from tqdm import tqdm
4
-
5
- #from codeclm_song_v1.codeclm.semantic_extractor.SpeechDecoder_v01.generate import Tango
6
- from generate_4rvq import Tango
7
- import kaldiio
8
- from kaldiio import WriteHelper
9
-
10
- if __name__ == "__main__":
11
- # Define Model
12
- json_path = sys.argv[1]
13
- outdir = sys.argv[2]
14
-
15
- mus_infos = []
16
- with open(json_path) as f:
17
- for line in f:
18
- item = json.loads(line)
19
- mus_infos.append(item)
20
-
21
- tango = Tango(model_path = './saved/model_4rvq/model_2_fixed.safetensors', rvq_num=4)
22
-
23
-
24
- # Feature extraction loop
25
- # for i in tqdm(range(2000)):
26
- with WriteHelper('ark,scp:{}/token.ark,{}/token.scp'.format(outdir, outdir), write_function="pickle") as writer:
27
- print('ark,scp:{}/token.ark,{}/token.scp'.format(outdir, outdir))
28
- for item in tqdm(mus_infos):
29
- try:
30
- # if True:
31
- idx = item['idx']
32
- # print(idx)
33
- with torch.autocast(device_type="cuda", dtype=torch.float16):
34
- if(os.path.exists(item['path'])):
35
- codes = tango.file2code(item['path'])
36
- else:
37
- codes = tango.file2code('/mnt/share/' + item['path'])
38
- writer(str(idx), codes.cpu())
39
- except:
40
- print(item['path'])
41
- continue
42
- # idx = item['idx']
43
- # # print(idx)
44
- # with torch.autocast(device_type="cuda", dtype=torch.float16):
45
- # codes = tango.file2code(item['path'])
46
- # writer(str(idx), codes.cpu())
codeclm/tokenizer/Flow1dVAE/extract_codes_stereo_7_1x4_ds.py DELETED
@@ -1,86 +0,0 @@
1
- import torch,torchaudio
2
- import os,sys,json
3
- from tqdm import tqdm
4
-
5
- #from codeclm_song_v1.codeclm.semantic_extractor.SpeechDecoder_v01.generate import Tango
6
- from generate_4rvq import Tango
7
- import kaldiio
8
- from kaldiio import WriteHelper
9
- import torch
10
- import subprocess
11
- import time
12
- import sys
13
-
14
- def get_gpu_memory():
15
- _output_to_list = lambda x: x.decode('ascii').split('\n')[:-1]
16
-
17
- ACCEPTABLE_AVAILABLE_MEMORY = 1024
18
- COMMAND = "nvidia-smi --query-gpu=memory.free --format=csv"
19
- memory_free_info = _output_to_list(subprocess.check_output(COMMAND.split()))[1:]
20
- memory_free_values = [int(x.split()[0]) for i, x in enumerate(memory_free_info)]
21
- return memory_free_values
22
-
23
- if __name__ == "__main__":
24
- # Define Model
25
- json_path = sys.argv[1]
26
- outdir = sys.argv[2]
27
- ds = int(sys.argv[3])
28
-
29
- gpu_idx = int(os.environ['CUDA_VISIBLE_DEVICES'])
30
- while True:
31
- free_mem = get_gpu_memory()
32
- free_mem = free_mem[gpu_idx]
33
- if(free_mem > 25_000):
34
- print("GPU memory {}, run matrix cal".format(free_mem))
35
- break
36
- else:
37
- print("GPU memory {}, sleep 1min".format(free_mem))
38
- time.sleep(60)
39
-
40
- mus_infos = []
41
- with open(json_path) as f:
42
- for line in f:
43
- item = json.loads(line)
44
- mus_infos.append(item)
45
-
46
- tango = Tango(model_path = './saved/model_4rvq/model_2_fixed.safetensors', rvq_num=4)
47
-
48
-
49
- # Feature extraction loop
50
- # for i in tqdm(range(2000)):
51
- with WriteHelper('ark,scp:{}/token.ark,{}/token.scp'.format(outdir, outdir), write_function="pickle") as writer:
52
- print('ark,scp:{}/token.ark,{}/token.scp'.format(outdir, outdir))
53
- bar = torch.zeros(4, 16384)
54
- for item_idx, item in tqdm(enumerate(mus_infos)):
55
- try:
56
- # if True:
57
- idx = item['idx']
58
- # print(idx)
59
- with torch.autocast(device_type="cuda", dtype=torch.float16):
60
- if(os.path.exists(item['path'])):
61
- codes = tango.file2code_ds(item['path'], ds)
62
- else:
63
- codes = tango.file2code_ds('/mnt/share/' + item['path'], ds)
64
- codes = codes.cpu()
65
- writer(str(idx), codes)
66
- for i0 in range(codes.shape[-1]):
67
- bar[0, codes[0, 0, i0]] += 1
68
- bar[1, codes[0, 1, i0]] += 1
69
- bar[2, codes[0, 2, i0]] += 1
70
- bar[3, codes[0, 3, i0]] += 1
71
- except Exception as e:
72
- print(item['path'])
73
- # print(e.message, e.args)
74
- # exit(1)
75
- continue
76
-
77
- if(item_idx % 1000 == 0):
78
- print("=========")
79
- print(1 - (bar[0]==0).sum() / bar.shape[-1])
80
- print("=========")
81
-
82
- # idx = item['idx']
83
- # # print(idx)
84
- # with torch.autocast(device_type="cuda", dtype=torch.float16):
85
- # codes = tango.file2code(item['path'])
86
- # writer(str(idx), codes.cpu())
codeclm/tokenizer/Flow1dVAE/generate_1rvq.py CHANGED
@@ -8,7 +8,6 @@ import librosa
8
  import os
9
  import math
10
  import numpy as np
11
- from tools.get_1dvae_large import get_model
12
  import tools.torch_tools as torch_tools
13
  from safetensors.torch import load_file
14
 
@@ -24,9 +23,9 @@ class Tango:
24
  scheduler_name = "configs/scheduler/stable_diffusion_2.1_largenoise_sample.json"
25
  self.device = device
26
 
27
- self.vae = get_model(vae_config, vae_model)
28
- self.vae = self.vae.to(device)
29
- self.vae=self.vae.eval()
30
  self.layer_num = layer_num
31
 
32
  self.MAX_DURATION = 360
@@ -254,37 +253,9 @@ class Tango:
254
  # print(fname, wave.shape)
255
  return wave
256
 
257
- @torch.no_grad()
258
- def sound2sound_vae(self, sound, prompt=None, steps=50, disable_progress=False):
259
- min_samples = int(40 * 25) # 40ms per frame
260
- hop_samples = min_samples // 4 * 3
261
- ovlp_samples = min_samples - hop_samples
262
- dur = 20
263
-
264
- latent_list = []
265
- for i in range(0, sound.shape[-1], dur*48000):
266
- if(i+dur*2*48000 > sound.shape[-1]):
267
- latent = tango.vae.encode_audio(sound.cuda()[None,:,i:])
268
- break
269
- else:
270
- latent = tango.vae.encode_audio(sound.cuda()[None,:,i:i+dur*48000])
271
- latent_list.append(latent)
272
-
273
- output = None
274
- for i in range(len(latent_list)):
275
- print(i)
276
- latent = latent_list[i]
277
- cur_output = self.vae.decode_audio(latent)[0].detach().cpu()
278
- if output is None:
279
- output = cur_output
280
- else:
281
- output = torch.cat([output, cur_output], -1)
282
- return output
283
-
284
  def to(self, device=None, dtype=None, non_blocking=False):
285
  if device is not None:
286
  self.device = device
287
  self.model.device = device
288
- self.vae = self.vae.to(device, dtype, non_blocking)
289
  self.model = self.model.to(device, dtype, non_blocking)
290
  return self
 
8
  import os
9
  import math
10
  import numpy as np
 
11
  import tools.torch_tools as torch_tools
12
  from safetensors.torch import load_file
13
 
 
23
  scheduler_name = "configs/scheduler/stable_diffusion_2.1_largenoise_sample.json"
24
  self.device = device
25
 
26
+ # self.vae = get_model(vae_config, vae_model)
27
+ # self.vae = self.vae.to(device)
28
+ # self.vae=self.vae.eval()
29
  self.layer_num = layer_num
30
 
31
  self.MAX_DURATION = 360
 
253
  # print(fname, wave.shape)
254
  return wave
255
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
256
  def to(self, device=None, dtype=None, non_blocking=False):
257
  if device is not None:
258
  self.device = device
259
  self.model.device = device
 
260
  self.model = self.model.to(device, dtype, non_blocking)
261
  return self
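In generate_1rvq.py the Tango class no longer constructs the 1-D VAE at init (the `get_model` import and the `self.vae` setup are commented out) and the `to()` method no longer moves a VAE, so this module now only manages the token model. A hedged usage sketch, assuming `Tango.sound2code` keeps the same interface as in the (now deleted) 2-RVQ/4-RVQ variants; the constructor arguments and paths below are illustrative:

```python
# Hypothetical usage after this change (constructor arguments are illustrative;
# generate_1rvq.Tango keeps whatever signature the repository defines).
# Assumes sound2code() matches the interface of the removed 2-RVQ/4-RVQ variants.
import torchaudio
from generate_1rvq import Tango

tango = Tango(model_path="./saved/model_1rvq/model.safetensors")  # illustrative args

wav, sr = torchaudio.load("song.flac")  # the related extractors assume 48 kHz stereo
codes = tango.sound2code(wav)           # token ids only; decoding now needs a VAE elsewhere
```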
codeclm/tokenizer/Flow1dVAE/generate_2rvq.py DELETED
@@ -1,293 +0,0 @@
1
- import json
2
- import torch
3
- from tqdm import tqdm
4
- from model_2rvq import PromptCondAudioDiffusion
5
- from diffusers import DDIMScheduler, DDPMScheduler
6
- import torchaudio
7
- import librosa
8
- import os
9
- import math
10
- import numpy as np
11
- # from tools.get_mulan import get_mulan
12
- from tools.get_1dvae_large import get_model
13
- import tools.torch_tools as torch_tools
14
- from safetensors.torch import load_file
15
- from audio import AudioFile
16
- import kaldiio
17
-
18
- class Tango:
19
- def __init__(self, \
20
- model_path, \
21
- layer_num=6, \
22
- rvq_num=1, \
23
- device="cuda:0"):
24
-
25
- self.sample_rate = 48000
26
- scheduler_name = "configs/scheduler/stable_diffusion_2.1_largenoise_sample.json"
27
- self.device = device
28
-
29
- self.vae = get_model()
30
- self.vae = self.vae.to(device)
31
- self.vae=self.vae.eval()
32
- self.layer_num = layer_num
33
-
34
- self.MAX_DURATION = 360
35
- main_config = {
36
- "num_channels":32,
37
- "unet_model_name":None,
38
- "unet_model_config_path":"configs/models/transformer2D_wocross_inch112_1x4_multi_large.json",
39
- "snr_gamma":None,
40
- }
41
- self.rvq_num = rvq_num
42
- # print("rvq_num: ", self.rvq_num)
43
- # exit()
44
- self.model = PromptCondAudioDiffusion(**main_config).to(device)
45
- if model_path.endswith(".safetensors"):
46
- main_weights = load_file(model_path)
47
- else:
48
- main_weights = torch.load(model_path, map_location=device)
49
- self.model.load_state_dict(main_weights, strict=False)
50
- print ("Successfully loaded checkpoint from:", model_path)
51
-
52
- self.model.eval()
53
- self.model.init_device_dtype(torch.device(device), torch.float32)
54
-
55
- # self.scheduler = DDIMScheduler.from_pretrained( \
56
- # scheduler_name, subfolder="scheduler")
57
- # self.scheduler = DDPMScheduler.from_pretrained( \
58
- # scheduler_name, subfolder="scheduler")
59
- print("Successfully loaded inference scheduler from {}".format(scheduler_name))
60
-
61
-
62
-
63
- @torch.no_grad()
64
- @torch.autocast(device_type="cuda", dtype=torch.float32)
65
- def sound2code(self, orig_samples, batch_size=8):
66
- if(orig_samples.ndim == 2):
67
- audios = orig_samples.unsqueeze(0).to(self.device)
68
- elif(orig_samples.ndim == 3):
69
- audios = orig_samples.to(self.device)
70
- else:
71
- assert orig_samples.ndim in (2,3), orig_samples.shape
72
- audios = self.preprocess_audio(audios)
73
- audios = audios.squeeze(0)
74
- orig_length = audios.shape[-1]
75
- min_samples = int(40 * self.sample_rate)
76
- # 40 seconds corresponds to 10 tokens
77
- output_len = int(orig_length / float(self.sample_rate) * 25) + 1
78
- # print("output_len: ", output_len)
79
-
80
- while(audios.shape[-1] < min_samples):
81
- audios = torch.cat([audios, audios], -1)
82
- int_max_len=audios.shape[-1]//min_samples+1
83
- audios = torch.cat([audios, audios], -1)
84
- audios=audios[:,:int(int_max_len*(min_samples))]
85
- codes_list=[]
86
-
87
- audio_input = audios.reshape(2, -1, min_samples).permute(1, 0, 2).reshape(-1, 2, min_samples)
88
-
89
- for audio_inx in range(0, audio_input.shape[0], batch_size):
90
- # import pdb; pdb.set_trace()
91
- codes, _, spk_embeds = self.model.fetch_codes_batch((audio_input[audio_inx:audio_inx+batch_size]), additional_feats=[],layer=self.layer_num, rvq_num=self.rvq_num)
92
- # print("codes",codes[0].shape)
93
-
94
- codes_list.append(torch.cat(codes, 1))
95
- # print("codes_list",codes_list[0].shape)
96
-
97
- codes = torch.cat(codes_list, 0).permute(1,0,2).reshape(self.rvq_num, -1)[None] # B 3 T -> 3 B T
98
- codes=codes[:,:,:output_len]
99
-
100
- return codes
101
-
102
- @torch.no_grad()
103
- @torch.autocast(device_type="cuda", dtype=torch.float32)
104
- def sound2code_ds(self, orig_samples, ds, batch_size=8):
105
- if(orig_samples.ndim == 2):
106
- audios = orig_samples.unsqueeze(0).to(self.device)
107
- elif(orig_samples.ndim == 3):
108
- audios = orig_samples.to(self.device)
109
- else:
110
- assert orig_samples.ndim in (2,3), orig_samples.shape
111
- audios = self.preprocess_audio(audios)
112
- audios = audios.squeeze(0)
113
- orig_length = audios.shape[-1]
114
- min_samples = int(40 * self.sample_rate)
115
- # 40 seconds corresponds to 10 tokens
116
- output_len = int(orig_length / float(self.sample_rate) * 25) + 1
117
- # print("output_len: ", output_len)
118
-
119
- while(audios.shape[-1] < min_samples):
120
- audios = torch.cat([audios, audios], -1)
121
- int_max_len=audios.shape[-1]//min_samples+1
122
- audios = torch.cat([audios, audios], -1)
123
- audios=audios[:,:int(int_max_len*(min_samples))]
124
- codes_list=[]
125
-
126
- audio_input = audios.reshape(2, -1, min_samples).permute(1, 0, 2).reshape(-1, 2, min_samples)
127
-
128
- for audio_inx in range(0, audio_input.shape[0], batch_size):
129
- # import pdb; pdb.set_trace()
130
- codes, _, spk_embeds = self.model.fetch_codes_batch_ds((audio_input[audio_inx:audio_inx+batch_size]), additional_feats=[],layer=self.layer_num, rvq_num=self.rvq_num, ds=ds)
131
- # print("codes",codes[0].shape)
132
-
133
- codes_list.append(torch.cat(codes, 1))
134
- # print("codes_list",codes_list[0].shape)
135
-
136
- codes = torch.cat(codes_list, 0).permute(1,0,2).reshape(self.rvq_num, -1)[None] # B 3 T -> 3 B T
137
- codes=codes[:,:,:output_len]
138
-
139
- return codes
140
-
141
- @torch.no_grad()
142
- def code2sound(self, codes, prompt=None, duration=40, guidance_scale=1.5, num_steps=20, disable_progress=False):
143
- codes = codes.to(self.device)
144
-
145
- min_samples = duration * 25 # 40ms per frame
146
- hop_samples = min_samples // 4 * 3
147
- ovlp_samples = min_samples - hop_samples
148
- hop_frames = hop_samples
149
- ovlp_frames = ovlp_samples
150
- first_latent = torch.randn(codes.shape[0], min_samples, 64).to(self.device)
151
- first_latent_length = 0
152
- first_latent_codes_length = 0
153
-
154
- if(isinstance(prompt, torch.Tensor)):
155
- # prepare prompt
156
- prompt = prompt.to(self.device)
157
- if(prompt.ndim == 3):
158
- assert prompt.shape[0] == 1, prompt.shape
159
- prompt = prompt[0]
160
- elif(prompt.ndim == 1):
161
- prompt = prompt.unsqueeze(0).repeat(2,1)
162
- elif(prompt.ndim == 2):
163
- if(prompt.shape[0] == 1):
164
- prompt = prompt.repeat(2,1)
165
-
166
- if(prompt.shape[-1] < int(30 * self.sample_rate)):
167
- # if less than 30s, just choose the first 10s
168
- prompt = prompt[:,:int(10*self.sample_rate)] # limit max length to 10.24
169
- else:
170
- # else choose from 20.48s which might includes verse or chorus
171
- prompt = prompt[:,int(20*self.sample_rate):int(30*self.sample_rate)] # limit max length to 10.24
172
-
173
- true_latent = self.vae.encode_audio(prompt).permute(0,2,1)
174
- # print("true_latent.shape", true_latent.shape)
175
- # print("first_latent.shape", first_latent.shape)
176
- #true_latent.shape torch.Size([1, 250, 64])
177
- # first_latent.shape torch.Size([1, 1000, 64])
178
-
179
- first_latent[:,0:true_latent.shape[1],:] = true_latent
180
- first_latent_length = true_latent.shape[1]
181
- first_latent_codes = self.sound2code(prompt)
182
- first_latent_codes_length = first_latent_codes.shape[-1]
183
- codes = torch.cat([first_latent_codes, codes], -1)
184
-
185
- codes_len= codes.shape[-1]
186
- target_len = int((codes_len - first_latent_codes_length) / 100 * 4 * self.sample_rate)
187
- # target_len = int(codes_len / 100 * 4 * self.sample_rate)
188
- # code repeat
189
- if(codes_len < min_samples):
190
- while(codes.shape[-1] < min_samples):
191
- codes = torch.cat([codes, codes], -1)
192
- codes = codes[:,:,0:min_samples]
193
- codes_len = codes.shape[-1]
194
- if((codes_len - ovlp_samples) % hop_samples > 0):
195
- len_codes=math.ceil((codes_len - ovlp_samples) / float(hop_samples)) * hop_samples + ovlp_samples
196
- while(codes.shape[-1] < len_codes):
197
- codes = torch.cat([codes, codes], -1)
198
- codes = codes[:,:,0:len_codes]
199
- latent_length = min_samples
200
- latent_list = []
201
- spk_embeds = torch.zeros([1, 32, 1, 32], device=codes.device)
202
- with torch.autocast(device_type="cuda", dtype=torch.float16):
203
- for sinx in range(0, codes.shape[-1]-hop_samples, hop_samples):
204
- codes_input=[]
205
- codes_input.append(codes[:,:,sinx:sinx+min_samples])
206
- if(sinx == 0):
207
- # print("Processing {} to {}".format(sinx/self.sample_rate, (sinx + min_samples)/self.sample_rate))
208
- incontext_length = first_latent_length
209
- latents = self.model.inference_codes(codes_input, spk_embeds, first_latent, latent_length, incontext_length=incontext_length, additional_feats=[], guidance_scale=1.5, num_steps = num_steps, disable_progress=disable_progress, scenario='other_seg')
210
- latent_list.append(latents)
211
- else:
212
- # print("Processing {} to {}".format(sinx/self.sample_rate, (sinx + min_samples)/self.sample_rate))
213
- true_latent = latent_list[-1][:,:,-ovlp_frames:].permute(0,2,1)
214
- print("true_latent.shape", true_latent.shape)
215
- len_add_to_1000 = 1000 - true_latent.shape[-2]
216
- # print("len_add_to_1000", len_add_to_1000)
217
- # exit()
218
- incontext_length = true_latent.shape[-2]
219
- true_latent = torch.cat([true_latent, torch.randn(true_latent.shape[0], len_add_to_1000, true_latent.shape[-1]).to(self.device)], -2)
220
- latents = self.model.inference_codes(codes_input, spk_embeds, true_latent, latent_length, incontext_length=incontext_length, additional_feats=[], guidance_scale=1.5, num_steps = num_steps, disable_progress=disable_progress, scenario='other_seg')
221
- latent_list.append(latents)
222
-
223
- latent_list = [l.float() for l in latent_list]
224
- latent_list[0] = latent_list[0][:,:,first_latent_length:]
225
- min_samples = int(min_samples * self.sample_rate // 1000 * 40)
226
- hop_samples = int(hop_samples * self.sample_rate // 1000 * 40)
227
- ovlp_samples = min_samples - hop_samples
228
- with torch.no_grad():
229
- output = None
230
- for i in range(len(latent_list)):
231
- latent = latent_list[i]
232
- cur_output = self.vae.decode_audio(latent)[0].detach().cpu()
233
-
234
- if output is None:
235
- output = cur_output
236
- else:
237
- ov_win = torch.from_numpy(np.linspace(0, 1, ovlp_samples)[None, :])
238
- ov_win = torch.cat([ov_win, 1 - ov_win], -1)
239
- print("output.shape", output.shape)
240
- print("ov_win.shape", ov_win.shape)
241
- output[:, -ovlp_samples:] = output[:, -ovlp_samples:] * ov_win[:, -ovlp_samples:] + cur_output[:, 0:ovlp_samples] * ov_win[:, 0:ovlp_samples]
242
- output = torch.cat([output, cur_output[:, ovlp_samples:]], -1)
243
- output = output[:, 0:target_len]
244
- return output
245
-
246
- @torch.no_grad()
247
- def preprocess_audio(self, input_audios, threshold=0.8):
248
- assert len(input_audios.shape) == 3, input_audios.shape
249
- nchan = input_audios.shape[1]
250
- input_audios = input_audios.reshape(input_audios.shape[0], -1)
251
- norm_value = torch.ones_like(input_audios[:,0])
252
- max_volume = input_audios.abs().max(dim=-1)[0]
253
- norm_value[max_volume>threshold] = max_volume[max_volume>threshold] / threshold
254
- return input_audios.reshape(input_audios.shape[0], nchan, -1)/norm_value.unsqueeze(-1).unsqueeze(-1)
255
-
256
- @torch.no_grad()
257
- def sound2sound(self, sound, prompt=None, steps=50, disable_progress=False):
258
- codes = self.sound2code(sound)
259
- # print(codes.shape)
260
- # exit()
261
- wave = self.code2sound(codes, prompt, guidance_scale=1.5, num_steps=steps, disable_progress=disable_progress)
262
- # print(fname, wave.shape)
263
- return wave
264
-
265
- def file2code(self, fname):
266
- try:
267
- orig_samples, fs = torchaudio.load(fname)
268
- except:
269
- af = AudioFile(fname)
270
- orig_samples = af.read()
271
- fs = af.samplerate()
272
- orig_samples = orig_samples[0]
273
- if(fs!=self.sample_rate):
274
- orig_samples = torchaudio.functional.resample(orig_samples, fs, self.sample_rate)
275
- fs = self.sample_rate
276
- if orig_samples.shape[0] == 1:
277
- orig_samples = torch.cat([orig_samples, orig_samples], 0)
278
- return self.sound2code(orig_samples)
279
-
280
- def file2code_ds(self, fname, ds):
281
- try:
282
- orig_samples, fs = torchaudio.load(fname)
283
- except:
284
- af = AudioFile(fname)
285
- orig_samples = af.read()
286
- fs = af.samplerate()
287
- orig_samples = orig_samples[0]
288
- if(fs!=self.sample_rate):
289
- orig_samples = torchaudio.functional.resample(orig_samples, fs, self.sample_rate)
290
- fs = self.sample_rate
291
- if orig_samples.shape[0] == 1:
292
- orig_samples = torch.cat([orig_samples, orig_samples], 0)
293
- return self.sound2code_ds(orig_samples, ds)
codeclm/tokenizer/Flow1dVAE/generate_4rvq.py DELETED
@@ -1,292 +0,0 @@
1
- import json
2
- import torch
3
- from tqdm import tqdm
4
- from model_4rvq import PromptCondAudioDiffusion
5
- from diffusers import DDIMScheduler, DDPMScheduler
6
- import torchaudio
7
- import librosa
8
- import os
9
- import math
10
- import numpy as np
11
- # from tools.get_mulan import get_mulan
12
- from tools.get_1dvae_large import get_model
13
- import tools.torch_tools as torch_tools
14
- from safetensors.torch import load_file
15
- from audio import AudioFile
16
-
17
- class Tango:
18
- def __init__(self, \
19
- model_path, \
20
- layer_num=6, \
21
- rvq_num=1, \
22
- device="cuda:0"):
23
-
24
- self.sample_rate = 48000
25
- scheduler_name = "configs/scheduler/stable_diffusion_2.1_largenoise_sample.json"
26
- self.device = device
27
-
28
- self.vae = get_model()
29
- self.vae = self.vae.to(device)
30
- self.vae=self.vae.eval()
31
- self.layer_num = layer_num
32
-
33
- self.MAX_DURATION = 360
34
- main_config = {
35
- "num_channels":32,
36
- "unet_model_name":None,
37
- "unet_model_config_path":"configs/models/transformer2D_wocross_inch112_1x4_multi_large.json",
38
- "snr_gamma":None,
39
- }
40
- self.rvq_num = rvq_num
41
- # print("rvq_num: ", self.rvq_num)
42
- # exit()
43
- self.model = PromptCondAudioDiffusion(**main_config).to(device)
44
- if model_path.endswith(".safetensors"):
45
- main_weights = load_file(model_path)
46
- else:
47
- main_weights = torch.load(model_path, map_location=device)
48
- self.model.load_state_dict(main_weights, strict=False)
49
- print ("Successfully loaded checkpoint from:", model_path)
50
-
51
- self.model.eval()
52
- self.model.init_device_dtype(torch.device(device), torch.float32)
53
-
54
- # self.scheduler = DDIMScheduler.from_pretrained( \
55
- # scheduler_name, subfolder="scheduler")
56
- # self.scheduler = DDPMScheduler.from_pretrained( \
57
- # scheduler_name, subfolder="scheduler")
58
- print("Successfully loaded inference scheduler from {}".format(scheduler_name))
59
-
60
-
61
-
62
- @torch.no_grad()
63
- @torch.autocast(device_type="cuda", dtype=torch.float32)
64
- def sound2code(self, orig_samples, batch_size=8):
65
- if(orig_samples.ndim == 2):
66
- audios = orig_samples.unsqueeze(0).to(self.device)
67
- elif(orig_samples.ndim == 3):
68
- audios = orig_samples.to(self.device)
69
- else:
70
- assert orig_samples.ndim in (2,3), orig_samples.shape
71
- audios = self.preprocess_audio(audios)
72
- audios = audios.squeeze(0)
73
- orig_length = audios.shape[-1]
74
- min_samples = int(40 * self.sample_rate)
75
- # 40 seconds corresponds to 10 tokens
76
- output_len = int(orig_length / float(self.sample_rate) * 25) + 1
77
- # print("output_len: ", output_len)
78
-
79
- while(audios.shape[-1] < min_samples):
80
- audios = torch.cat([audios, audios], -1)
81
- int_max_len=audios.shape[-1]//min_samples+1
82
- audios = torch.cat([audios, audios], -1)
83
- audios=audios[:,:int(int_max_len*(min_samples))]
84
- codes_list=[]
85
-
86
- audio_input = audios.reshape(2, -1, min_samples).permute(1, 0, 2).reshape(-1, 2, min_samples)
87
-
88
- for audio_inx in range(0, audio_input.shape[0], batch_size):
89
- # import pdb; pdb.set_trace()
90
- codes, _, spk_embeds = self.model.fetch_codes_batch((audio_input[audio_inx:audio_inx+batch_size]), additional_feats=[],layer=self.layer_num, rvq_num=self.rvq_num)
91
- # print("codes",codes[0].shape)
92
-
93
- codes_list.append(torch.cat(codes, 1))
94
- # print("codes_list",codes_list[0].shape)
95
-
96
- codes = torch.cat(codes_list, 0).permute(1,0,2).reshape(self.rvq_num, -1)[None] # B 3 T -> 3 B T
97
- codes=codes[:,:,:output_len]
98
-
99
- return codes
100
-
101
- @torch.no_grad()
102
- @torch.autocast(device_type="cuda", dtype=torch.float32)
103
- def sound2code_ds(self, orig_samples, ds, batch_size=6):
104
- if(orig_samples.ndim == 2):
105
- audios = orig_samples.unsqueeze(0).to(self.device)
106
- elif(orig_samples.ndim == 3):
107
- audios = orig_samples.to(self.device)
108
- else:
109
- assert orig_samples.ndim in (2,3), orig_samples.shape
110
- audios = self.preprocess_audio(audios)
111
- audios = audios.squeeze(0)
112
- orig_length = audios.shape[-1]
113
- min_samples = int(40 * self.sample_rate)
114
- # 40 seconds corresponds to 10 tokens
115
- output_len = int(orig_length / float(self.sample_rate) * 25) + 1
116
- # print("output_len: ", output_len)
117
-
118
- while(audios.shape[-1] < min_samples):
119
- audios = torch.cat([audios, audios], -1)
120
- int_max_len=audios.shape[-1]//min_samples+1
121
- audios = torch.cat([audios, audios], -1)
122
- audios=audios[:,:int(int_max_len*(min_samples))]
123
- codes_list=[]
124
-
125
- audio_input = audios.reshape(2, -1, min_samples).permute(1, 0, 2).reshape(-1, 2, min_samples)
126
-
127
- for audio_inx in range(0, audio_input.shape[0], batch_size):
128
- # import pdb; pdb.set_trace()
129
- codes, _, spk_embeds = self.model.fetch_codes_batch_ds((audio_input[audio_inx:audio_inx+batch_size]), additional_feats=[],layer=self.layer_num, rvq_num=self.rvq_num, ds=ds)
130
- # print("codes",codes[0].shape)
131
-
132
- codes_list.append(torch.cat(codes, 1))
133
- # print("codes_list",codes_list[0].shape)
134
-
135
- codes = torch.cat(codes_list, 0).permute(1,0,2).reshape(self.rvq_num, -1)[None] # B 3 T -> 3 B T
136
- codes=codes[:,:,:output_len]
137
-
138
- return codes
139
-
140
- @torch.no_grad()
141
- def code2sound(self, codes, prompt=None, duration=40, guidance_scale=1.5, num_steps=20, disable_progress=False):
142
- codes = codes.to(self.device)
143
-
144
- min_samples = duration * 25 # 40ms per frame
145
- hop_samples = min_samples // 4 * 3
146
- ovlp_samples = min_samples - hop_samples
147
- hop_frames = hop_samples
148
- ovlp_frames = ovlp_samples
149
- first_latent = torch.randn(codes.shape[0], min_samples, 64).to(self.device)
150
- first_latent_length = 0
151
- first_latent_codes_length = 0
152
-
153
- if(isinstance(prompt, torch.Tensor)):
154
- # prepare prompt
155
- prompt = prompt.to(self.device)
156
- if(prompt.ndim == 3):
157
- assert prompt.shape[0] == 1, prompt.shape
158
- prompt = prompt[0]
159
- elif(prompt.ndim == 1):
160
- prompt = prompt.unsqueeze(0).repeat(2,1)
161
- elif(prompt.ndim == 2):
162
- if(prompt.shape[0] == 1):
163
- prompt = prompt.repeat(2,1)
164
-
165
- if(prompt.shape[-1] < int(30 * self.sample_rate)):
166
- # if less than 30s, just choose the first 10s
167
- prompt = prompt[:,:int(10*self.sample_rate)] # limit max length to 10.24
168
- else:
169
- # else choose from 20.48s which might includes verse or chorus
170
- prompt = prompt[:,int(20*self.sample_rate):int(30*self.sample_rate)] # limit max length to 10.24
171
-
172
- true_latent = self.vae.encode_audio(prompt).permute(0,2,1)
173
- # print("true_latent.shape", true_latent.shape)
174
- # print("first_latent.shape", first_latent.shape)
175
- #true_latent.shape torch.Size([1, 250, 64])
176
- # first_latent.shape torch.Size([1, 1000, 64])
177
-
178
- first_latent[:,0:true_latent.shape[1],:] = true_latent
179
- first_latent_length = true_latent.shape[1]
180
- first_latent_codes = self.sound2code(prompt)
181
- first_latent_codes_length = first_latent_codes.shape[-1]
182
- codes = torch.cat([first_latent_codes, codes], -1)
183
-
184
- codes_len= codes.shape[-1]
185
- target_len = int((codes_len - first_latent_codes_length) / 100 * 4 * self.sample_rate)
186
- # target_len = int(codes_len / 100 * 4 * self.sample_rate)
187
- # code repeat
188
- if(codes_len < min_samples):
189
- while(codes.shape[-1] < min_samples):
190
- codes = torch.cat([codes, codes], -1)
191
- codes = codes[:,:,0:min_samples]
192
- codes_len = codes.shape[-1]
193
- if((codes_len - ovlp_samples) % hop_samples > 0):
194
- len_codes=math.ceil((codes_len - ovlp_samples) / float(hop_samples)) * hop_samples + ovlp_samples
195
- while(codes.shape[-1] < len_codes):
196
- codes = torch.cat([codes, codes], -1)
197
- codes = codes[:,:,0:len_codes]
198
- latent_length = min_samples
199
- latent_list = []
200
- spk_embeds = torch.zeros([1, 32, 1, 32], device=codes.device)
201
- with torch.autocast(device_type="cuda", dtype=torch.float16):
202
- for sinx in range(0, codes.shape[-1]-hop_samples, hop_samples):
203
- codes_input=[]
204
- codes_input.append(codes[:,:,sinx:sinx+min_samples])
205
- if(sinx == 0):
206
- # print("Processing {} to {}".format(sinx/self.sample_rate, (sinx + min_samples)/self.sample_rate))
207
- incontext_length = first_latent_length
208
- latents = self.model.inference_codes(codes_input, spk_embeds, first_latent, latent_length, incontext_length=incontext_length, additional_feats=[], guidance_scale=1.5, num_steps = num_steps, disable_progress=disable_progress, scenario='other_seg')
209
- latent_list.append(latents)
210
- else:
211
- # print("Processing {} to {}".format(sinx/self.sample_rate, (sinx + min_samples)/self.sample_rate))
212
- true_latent = latent_list[-1][:,:,-ovlp_frames:].permute(0,2,1)
213
- print("true_latent.shape", true_latent.shape)
214
- len_add_to_1000 = 1000 - true_latent.shape[-2]
215
- # print("len_add_to_1000", len_add_to_1000)
216
- # exit()
217
- incontext_length = true_latent.shape[-2]
218
- true_latent = torch.cat([true_latent, torch.randn(true_latent.shape[0], len_add_to_1000, true_latent.shape[-1]).to(self.device)], -2)
219
- latents = self.model.inference_codes(codes_input, spk_embeds, true_latent, latent_length, incontext_length=incontext_length, additional_feats=[], guidance_scale=1.5, num_steps = num_steps, disable_progress=disable_progress, scenario='other_seg')
220
- latent_list.append(latents)
221
-
222
- latent_list = [l.float() for l in latent_list]
223
- latent_list[0] = latent_list[0][:,:,first_latent_length:]
224
- min_samples = int(min_samples * self.sample_rate // 1000 * 40)
225
- hop_samples = int(hop_samples * self.sample_rate // 1000 * 40)
226
- ovlp_samples = min_samples - hop_samples
227
- with torch.no_grad():
228
- output = None
229
- for i in range(len(latent_list)):
230
- latent = latent_list[i]
231
- cur_output = self.vae.decode_audio(latent)[0].detach().cpu()
232
-
233
- if output is None:
234
- output = cur_output
235
- else:
236
- ov_win = torch.from_numpy(np.linspace(0, 1, ovlp_samples)[None, :])
237
- ov_win = torch.cat([ov_win, 1 - ov_win], -1)
238
- print("output.shape", output.shape)
239
- print("ov_win.shape", ov_win.shape)
240
- output[:, -ovlp_samples:] = output[:, -ovlp_samples:] * ov_win[:, -ovlp_samples:] + cur_output[:, 0:ovlp_samples] * ov_win[:, 0:ovlp_samples]
241
- output = torch.cat([output, cur_output[:, ovlp_samples:]], -1)
242
- output = output[:, 0:target_len]
243
- return output
244
-
245
- @torch.no_grad()
246
- def preprocess_audio(self, input_audios, threshold=0.8):
247
- assert len(input_audios.shape) == 3, input_audios.shape
248
- nchan = input_audios.shape[1]
249
- input_audios = input_audios.reshape(input_audios.shape[0], -1)
250
- norm_value = torch.ones_like(input_audios[:,0])
251
- max_volume = input_audios.abs().max(dim=-1)[0]
252
- norm_value[max_volume>threshold] = max_volume[max_volume>threshold] / threshold
253
- return input_audios.reshape(input_audios.shape[0], nchan, -1)/norm_value.unsqueeze(-1).unsqueeze(-1)
254
-
255
- @torch.no_grad()
256
- def sound2sound(self, sound, prompt=None, steps=50, disable_progress=False):
257
- codes = self.sound2code(sound)
258
- # print(codes.shape)
259
- # exit()
260
- wave = self.code2sound(codes, prompt, guidance_scale=1.5, num_steps=steps, disable_progress=disable_progress)
261
- # print(fname, wave.shape)
262
- return wave
263
-
264
- def file2code(self, fname):
265
- try:
266
- orig_samples, fs = torchaudio.load(fname)
267
- except:
268
- af = AudioFile(fname)
269
- orig_samples = af.read()
270
- fs = af.samplerate()
271
- orig_samples = orig_samples[0]
272
- if(fs!=self.sample_rate):
273
- orig_samples = torchaudio.functional.resample(orig_samples, fs, self.sample_rate)
274
- fs = self.sample_rate
275
- if orig_samples.shape[0] == 1:
276
- orig_samples = torch.cat([orig_samples, orig_samples], 0)
277
- return self.sound2code(orig_samples)
278
-
279
- def file2code_ds(self, fname, ds):
280
- try:
281
- orig_samples, fs = torchaudio.load(fname)
282
- except:
283
- af = AudioFile(fname)
284
- orig_samples = af.read()
285
- fs = af.samplerate()
286
- orig_samples = orig_samples[0]
287
- if(fs!=self.sample_rate):
288
- orig_samples = torchaudio.functional.resample(orig_samples, fs, self.sample_rate)
289
- fs = self.sample_rate
290
- if orig_samples.shape[0] == 1:
291
- orig_samples = torch.cat([orig_samples, orig_samples], 0)
292
- return self.sound2code_ds(orig_samples, ds)
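Note on the chunked decoding above: code2sound decodes each latent chunk separately and blends consecutive chunks with a linear crossfade over their overlapping samples before concatenating. The snippet below is a minimal, self-contained sketch of that blending step for reference only; the helper name crossfade_concat and the fixed linear window are illustrative assumptions, not part of this repository.

import torch

def crossfade_concat(segments, ovlp_samples):
    # segments: list of (channels, samples) tensors; consecutive segments are
    # assumed to share `ovlp_samples` samples of overlapping audio (assumption).
    fade_in = torch.linspace(0.0, 1.0, ovlp_samples)
    fade_out = 1.0 - fade_in
    output = segments[0]
    for seg in segments[1:]:
        # Fade out the tail of the accumulated audio, fade in the head of the
        # new segment, then append the non-overlapping remainder.
        blended = output[:, -ovlp_samples:] * fade_out + seg[:, :ovlp_samples] * fade_in
        output = torch.cat([output[:, :-ovlp_samples], blended, seg[:, ovlp_samples:]], dim=-1)
    return output

# Example: stitch two one-second stereo chunks that share a 0.1 s overlap.
# wave = crossfade_concat([torch.randn(2, 48000), torch.randn(2, 48000)], 4800)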
codeclm/tokenizer/Flow1dVAE/libs/datasets/MusicSoundMixedDataset.py DELETED
@@ -1,1278 +0,0 @@
1
- from torch.utils.data import Dataset
2
- from beartype.typing import Sequence, Callable, Optional, Dict, Tuple, List, Union
3
- from beartype import beartype
4
- from beartype.door import is_bearable
5
- import random
6
- import pandas as pd
7
- import os
8
- from torchaudio.functional import resample
9
- import torch
10
- import typing as tp
11
- from pathlib import Path
12
- import torchaudio as ta
13
- import torch.nn.functional as F
14
- import numpy as np
15
- import json
16
- import yaml
17
- import torchaudio
18
- import math
19
- import re
20
- from loguru import logger
21
- import ffmpeg
22
-
23
- class Read_and_PadCrop_Normalized_T(torch.nn.Module):
24
- def __init__(self, n_samples: int, sample_rate: int, randomize: bool = True):
25
-
26
- super().__init__()
27
-
28
- self.n_samples = n_samples
29
- self.sample_rate = sample_rate
30
- self.randomize = randomize
31
-
32
- def __call__(self, filename: str, duration: float, cur_sample_rate: int) -> Tuple[torch.Tensor, float, float, int, int]:
33
- if self.n_samples < 0: # means do not clip
34
- chunk, _ = torchaudio.load(filename, frame_offset=0, num_frames=-1)
35
- t_start = 0.
36
- t_end = 1.0
37
- offset = 0
38
- else:
39
- if(duration<(float(self.n_samples)/self.sample_rate+1)):
40
- # print(duration,(float(self.n_samples)/self.sample_rate+1))
41
- chunk, _ = torchaudio.load(filename, frame_offset=0, num_frames=-1)
42
- t_start = 0.
43
- t_end = min(1.0, float(self.n_samples) / float(self.sample_rate) / duration)
44
- offset = 0
45
- # print('c1:',chunk.shape)
46
- else:
47
- offset = np.random.randint(0,int(duration*cur_sample_rate)-int(float(self.n_samples)/self.sample_rate*cur_sample_rate))
48
- t_start = offset / float(cur_sample_rate) / duration
49
- t_end = t_start + float(self.n_samples) / float(self.sample_rate) / duration
50
- chunk, _ = torchaudio.load(filename, frame_offset=offset, num_frames=int(float(self.n_samples)/self.sample_rate*cur_sample_rate))
51
- # print('offset:',offset)
52
- # print('c0:',chunk.shape)
53
- # Pad with silence if necessary.
54
- if(chunk.shape[0]>1):
55
- chunk = chunk[torch.randint(chunk.shape[0], size=(1,)),:].float()
56
- else:
57
- chunk = chunk[[0],:].float()
58
- if(cur_sample_rate!=self.sample_rate):
59
- # print('a:',cur_sample_rate,chunk.shape)
60
- chunk = torchaudio.functional.resample(chunk, cur_sample_rate, self.sample_rate)
61
- # print('b:',self.sample_rate,chunk.shape)
62
-
63
- if self.n_samples > 0:
64
- if chunk.shape[-1] < self.n_samples:
65
- chunk = torch.cat([chunk, torch.zeros((1, self.n_samples - chunk.shape[-1],))],-1)
66
- else:
67
- chunk = chunk[:,0:self.n_samples]
68
- seconds_start = math.floor(offset / cur_sample_rate)
69
- seconds_total = math.floor(duration)
70
-
71
- return (
72
- chunk,
73
- t_start,
74
- t_end,
75
- seconds_start,
76
- seconds_total
77
- )
78
-
79
- class Read_and_PadCrop_Normalized_T_Avoid_Watermark(torch.nn.Module):
80
- def __init__(self, n_samples: int, sample_rate: int, randomize: bool = True, w_start = 0, w_interval = 11.3):
81
-
82
- super().__init__()
83
-
84
- self.n_samples = n_samples
85
- self.sample_rate = sample_rate
86
- self.randomize = randomize
87
-
88
- self.w_start = w_start
89
- self.w_interval = w_interval
90
-
91
- def __call__(self, filename: str, duration: float, cur_sample_rate: int) -> Tuple[torch.Tensor, float, float, int, int]:
92
- if self.n_samples < 0: # means do not clip
93
- chunk, _ = torchaudio.load(filename, frame_offset=0, num_frames=-1)
94
- t_start = 0.
95
- t_end = 1.0
96
- offset = 0
97
- else:
98
- if(duration<(float(self.n_samples)/self.sample_rate+1)):
99
- # print(duration,(float(self.n_samples)/self.sample_rate+1))
100
- chunk, _ = torchaudio.load(filename, frame_offset=0, num_frames=-1)
101
- t_start = 0.
102
- t_end = min(1.0, float(self.n_samples) / float(self.sample_rate) / duration)
103
- offset = 0
104
- # print('c1:',chunk.shape)
105
- else:
106
- n_offset_option = (duration - self.w_start) // self.w_interval
107
- if n_offset_option <= 1:
108
- offset = 0
109
- else:
110
- offset = int((random.randint(0,n_offset_option-1) * self.w_interval + self.w_start) * cur_sample_rate)
111
- # offset = np.random.randint(0,int(duration*cur_sample_rate)-int(float(self.n_samples)/self.sample_rate*cur_sample_rate))
112
- t_start = offset / float(cur_sample_rate) / duration
113
- t_end = t_start + float(self.n_samples) / float(self.sample_rate) / duration
114
- chunk, _ = torchaudio.load(filename, frame_offset=offset, num_frames=int(float(self.n_samples)/self.sample_rate*cur_sample_rate))
115
- # print('offset:',offset)
116
- # print('c0:',chunk.shape)
117
- # Pad with silence if necessary.
118
- if(chunk.shape[0]>1):
119
- chunk = chunk[torch.randint(chunk.shape[0], size=(1,)),:].float()
120
- else:
121
- chunk = chunk[[0],:].float()
122
- if(cur_sample_rate!=self.sample_rate):
123
- # print('a:',cur_sample_rate,chunk.shape)
124
- chunk = torchaudio.functional.resample(chunk, cur_sample_rate, self.sample_rate)
125
- # print('b:',self.sample_rate,chunk.shape)
126
-
127
- if self.n_samples > 0:
128
- if chunk.shape[-1] < self.n_samples:
129
- chunk = torch.cat([chunk, torch.zeros((1, self.n_samples - chunk.shape[-1],))],-1)
130
- else:
131
- chunk = chunk[:,0:self.n_samples]
132
- seconds_start = math.floor(offset / cur_sample_rate)
133
- seconds_total = math.floor(duration)
134
-
135
- return (
136
- chunk,
137
- t_start,
138
- t_end,
139
- seconds_start,
140
- seconds_total
141
- )
142
-
143
- USE_DUMMY_AUDIO = False # When testing the code, set this to True so that no real data is read and generated silent audio is used instead
144
- if USE_DUMMY_AUDIO:
145
- logger.warning("USE_DUMMY_AUDIO flag is True, don't use it when train or test!")
146
-
147
- class SafeAudioReader:
148
- """
149
- This class is an adaptor to Read_and_PadCrop_Normalized_T, making it safe to read audio data.
150
- """
151
- def __init__(self,
152
- duration: float, # duration of the returned audio
153
- sample_rate: int, # sample rate of the returned audio; if it differs from the file's actual sample rate, the audio is resampled
154
- randomize: bool = True,
155
- use_avoid_watermark_policy = False,
156
- ):
157
- self.n_samples = int(sample_rate * duration)
158
- self.reader = (
159
- Read_and_PadCrop_Normalized_T_Avoid_Watermark if use_avoid_watermark_policy \
160
- else Read_and_PadCrop_Normalized_T
161
- )(n_samples=self.n_samples, sample_rate=sample_rate, randomize=randomize)
162
-
163
- # NOTE: this is the core function; every dataset reads audio through this call!
164
- def __call__(self,
165
- filepath: os.PathLike, # path to the audio file
166
- origin_sample_rate: Optional[int] = None, # actual sample rate read from the json metadata; if not given, it is probed from the file header
167
- origin_duration: float = None, # actual duration read from the json metadata; if not given, it is probed from the file header
168
- ) -> torch.Tensor:
169
- if USE_DUMMY_AUDIO:
170
- wav = torch.zeros(self.n_samples, dtype=torch.float32)
171
- return wav
172
- try:
173
- if origin_sample_rate is None or origin_duration is None:
174
- # audio_info = torchaudio.info(filepath)
175
- # origin_sample_rate = audio_info.sample_rate
176
- # origin_duration = audio_info.num_frames / origin_sample_rate
177
- info = ffmpeg.probe(filepath)
178
- origin_duration = float(info['format']['duration'])
179
- origin_sample_rate = int(info['streams'][0]['sample_rate'])
180
- wav, *ignored = self.reader(filepath, origin_duration, origin_sample_rate)
181
- wav = wav.squeeze_(0)
182
- except Exception as e:
183
- logger.error(f"Error reading {filepath}: {e}")
184
- wav = torch.zeros(self.n_samples, dtype=torch.float32)
185
- return wav
186
-
187
-
188
- class PromptTemplate:
189
- def __init__(self, template_text: str, tag_map: Dict[str, str], lang:str ='en'):
190
- self.template_text = template_text
191
- self.tag_map = tag_map
192
- self.lang = lang
193
-
194
- @property
195
- def tags(self):
196
- return tuple(self.tag_map.keys())
197
-
198
- def apply(self, **kwargs):
199
- for tag in list(kwargs.keys()):
200
- if kwargs[tag] == '':
201
- kwargs.pop(tag)
202
- for tag in self.tags:
203
- if tag in kwargs:
204
- kwargs[tag] = self.tag_map[tag].format(**{tag: kwargs[tag]}).strip('[]')
205
- else:
206
- kwargs[tag] = ''
207
- prompt = self.template_text.format(**kwargs)
208
-
209
- return self.beautify(prompt)
210
-
211
- def beautify(self, text):
212
- if self.lang == 'en':
213
- return self._beautify_en(text)
214
- elif self.lang == 'zh':
215
- return self._beautify_zh(text)
216
- else:
217
- raise ValueError(f'Unknown language {self.lang}')
218
-
219
- @staticmethod
220
- def _beautify_en(text):
221
- # no continuous commas without content between them
222
- text = re.sub(r'[,\s]*,[,\s]*', r', ', text)
223
- # no continuous whitespace
224
- text = re.sub(r'\s+', ' ', text)
225
- # the comma is NOT followed by whitespace, and should be followed by ONE whitespace
226
- text = re.sub(r'\s+,', r',', text)
227
- text = re.sub(r',\s+', r', ', text)
228
- # no whitespace before the full stop
229
- text = re.sub(r'\s+\.', r'.', text)
230
- # strip whitespace, comma, and replace ',.'
231
- text = text.strip(' ,')
232
- text = text.replace(',.', '.')
233
- return text
234
-
235
- @staticmethod
236
- def _beautify_zh(text):
237
- # no continuous commas without content between them
238
- text = re.sub(r'[,、\s]*,[,、\s]*', r',', text)
239
- text = re.sub(r'[,、\s]*、[,、\s]*', r'、', text)
240
- # assume there should be NO whitespace in Chinese
241
- text = re.sub(r'\s+', r'', text)
242
- # strip whitespace, comma, and replace ',。'
243
- text = text.strip(', 、')
244
- text = text.replace(',。', '。')
245
- return text
246
-
247
- def __repr__(self):
248
- return f'PromptTemplate({self.template_text!r}, {self.tag_map!r})'
249
-
250
- __str__ = __repr__
251
-
252
- def parse_prompt_template(prompt_template_text, lang='en'):
253
- span_pattern = re.compile(r'\[.*?{.+?}.*?\]', re.DOTALL)
254
- tag_pattern = re.compile(r'{.+?}', re.DOTALL)
255
-
256
- template_text = prompt_template_text.strip()
257
- span_texts = span_pattern.findall(prompt_template_text)
258
- tag_map = {}
259
- for span_text in span_texts:
260
- tag = tag_pattern.findall(span_text)[0].strip('{}')
261
- tag_map[tag] = span_text
262
- template_text = template_text.replace(span_text, '{'+tag+'}')
263
-
264
- return PromptTemplate(template_text=template_text, tag_map=tag_map, lang=lang)
265
-
266
- def load_prompt_templates(path, num = 5, lang='en') -> List[PromptTemplate]:
267
- with open(path, 'r') as f:
268
- lines = f.readlines()
269
- cnt = 0
270
- pts = []
271
- for line in lines:
272
- pt = parse_prompt_template(line, lang=lang)
273
- cnt += 1
274
- if len(pt.tags) < num:
275
- logger.error(f'Not enough tags on {path} in line {cnt}: {pt.tags}')
276
- pts.append(pt)
277
-
278
- return pts
279
-
280
-
281
- def get_base_dir_file(key: os.PathLike):
282
- base = os.path.basename(key)
283
- dirname = os.path.basename(os.path.dirname(key))
284
- return os.path.join(dirname, base)
285
-
286
- def read_jsonlike(path: os.PathLike):
287
- #json or jsonl
288
- if str(path).endswith(".json"):
289
- with open(path, 'r', encoding='utf8') as f:
290
- data = json.load(f)
291
- return data
292
- elif str(path).endswith(".jsonl"):
293
- with open(path, 'r', encoding='utf8') as f:
294
- data = [json.loads(line) for line in f.readlines()]
295
- return data
296
- else:
297
- raise ValueError("Unknown file format")
298
-
299
- dist_prob_map = {
300
- 1: (1.0,),
301
- 2: (0.5, 0.5),
302
- 3: (0.3, 0.4, 0.3),
303
- 4: (0.2, 0.3, 0.3, 0.2),
304
- 5: (0.2, 0.2, 0.3, 0.2, 0.1),
305
- 6: (0.1, 0.15, 0.2, 0.2, 0.2, 0.15),
306
- 7: (0.05, 0.1, 0.1, 0.2, 0.25, 0.2, 0.1),
307
- 8: (0.03, 0.05, 0.1, 0.15, 0.25, 0.2, 0.1, 0.12),
308
- 9: (0.02, 0.1, 0.1, 0.1, 0.15, 0.2, 0.15, 0.1, 0.08),
309
- 10: (0.01, 0.1, 0.1, 0.15, 0.2, 0.15, 0.1, 0.05, 0.05, 0.09)
310
- }
311
-
312
- '''
313
- # Alternative scheme more biased towards short prompts
314
- dist_prob_map = {
315
- 1: (1.0,),
316
- 2: (0.7, 0.3),
317
- 3: (0.7, 0.2, 0.1),
318
- 4: (0.6, 0.2, 0.1, 0.1),
319
- 5: (0.6, 0.2, 0.1, 0.05, 0.05),
320
- 6: (0.6, 0.15, 0.1, 0.05, 0.05, 0.05),
321
- 7: (0.05, 0.1, 0.1, 0.2, 0.25, 0.2, 0.1),
322
- 8: (0.03, 0.05, 0.1, 0.15, 0.25, 0.2, 0.1, 0.12),
323
- 9: (0.02, 0.1, 0.1, 0.1, 0.15, 0.2, 0.15, 0.1, 0.08),
324
- 10: (0.01, 0.1, 0.1, 0.15, 0.2, 0.15, 0.1, 0.05, 0.05, 0.09)
325
- }
326
- '''
327
-
328
- # Alternative scheme that uses all tags
329
- # dist_prob_map = {
330
- # 1: (1.0,),
331
- # 2: (0, 1.0),
332
- # 3: (0, 0, 1.0),
333
- # 4: (0, 0, 0, 1.0),
334
- # 5: (0, 0, 0, 0, 1.0),
335
- # 6: (0, 0, 0, 0, 0, 1.0),
336
- # 7: (0, 0, 0, 0, 0, 0, 1.0),
337
- # 8: (0, 0, 0, 0, 0, 0, 0, 1.0),
338
- # 9: (0, 0, 0, 0, 0, 0, 0, 0, 1.0),
339
- # 10: (0, 0, 0, 0, 0, 0, 0, 0, 0, 1.0)
340
- # }
341
-
342
- dist_prob_map_low = {
343
- 1: (1.0,),
344
- 2: (0.8, 0.2),
345
- 3: (0.8, 0.1, 0.1),
346
- 4: (0.7, 0.1, 0.1, 0.1),
347
- 5: (0.7, 0.1, 0.1, 0.05, 0.05),
348
- 6: (0.7, 0.1, 0.05, 0.05, 0.05, 0.05),
349
- }
350
-
351
- _bpm_range_rights = (
352
- (40, '20-40'),
353
- (60, '40-60'),
354
- (66, '60-66'),
355
- (76, '66-76'),
356
- (108, '76-108'),
357
- (120, '108-120'),
358
- (168, '120-168'),
359
- (176, '168-176'),
360
- (200, '176-200')
361
- )
362
- _bpm_desc_map = {
363
- '20-40': ("glacial pace", "extremely slow tempo", "crawl-like speed", "snail's pace", "almost motionless rhythm", "Larghissimo"),
364
- '40-60': ("broad and slow", "spacious tempo", "unhurried pace", "calm rhythm", "relaxed speed", "Largo"),
365
- '60-66': ("gentle tempo", "leisurely pace", "easy-going rhythm", "unrushed speed", "smooth and slow", 'Larghetto'),
366
- '66-76': ("slow and steady", "deliberate tempo", "unhurried pace", "relaxed rhythm", "easy speed", 'Adagio'),
367
- '76-108': ("walking pace", "moderate tempo", "steady rhythm", "balanced speed", "easy-flowing tempo", "Andante"),
368
- '108-120': ("medium pace", "comfortable tempo", "even rhythm", "measured speed", "controlled tempo", 'Moderato'),
369
- '120-168': ("quick and lively", "brisk pace", "energetic tempo", "upbeat rhythm", "spirited speed", 'Allegro'),
370
- '168-176': ("lively and fast", "bright tempo", "sprightly pace", "vibrant rhythm", "animated speed", 'Vivace'),
371
- '176-200': ("very fast tempo", "rapid pace", "high-speed rhythm", "hurried speed", "accelerated tempo", 'Presto'),
372
- '>200': ("extremely fast", "breakneck speed", "blazing tempo", "lightning-fast rhythm", "supercharged pace", 'Prestissimo')
373
- }
374
- _bpm_desc_map_zh = {
375
- '20-40': ("极度缓慢", "极慢的节奏", "悠长的旋律", "迟缓的节奏", "几乎静止的节奏", "甚缓"),
376
- '40-60': ("宽广而缓慢", "宽敞的节奏", "从容不迫的速度", "平静的节奏", "轻松的速度", "广板"),
377
- '60-66': ("柔和的节奏", "悠闲的速度", "轻松的节奏", "不慌不忙的速度", "平滑而缓慢", '小广板'),
378
- '66-76': ("缓慢而稳定", "沉稳的旋律", "从容不迫的速度", "轻松的节奏", "轻松的速度", '慢板'),
379
- '76-108': ("步行速度", "适中的节奏", "稳定的节奏", "平衡的速度", "流畅的节奏", "行板"),
380
- '108-120': ("中等速度", "舒适的节奏", "均匀的节奏", "有节制的速度", "稳定的氛围", '中板'),
381
- '120-168': ("快速而生动", "轻快的速度", "充满活力的节奏", "欢快的节奏", "富有精神的速度", '快板'),
382
- '168-176': ("生动而快速", "明快的节奏", "活泼的速度", "充满活力的节奏", "生气勃勃的速度", '活泼的'),
383
- '176-200': ("非常快的节奏", "快速的速度", "高速的节奏", "匆忙的速度", "加速的节奏", '急板'),
384
- '>200': ("极快的速度", "极速旋律", "炽热的节奏", "闪电般的节奏", "疾驰的速度", '最急板')
385
- }
386
- def get_bpm_range(bpm):
387
- bpm = int(bpm)
388
- for right, tag in _bpm_range_rights:
389
- if bpm <= right:
390
- return tag
391
- return '>200'
392
-
393
- def gen_bpm_descript(bpm, lang='en'):
394
- bpm_range = get_bpm_range(bpm)
395
- if lang == 'en':
396
- return random.choice(_bpm_desc_map[bpm_range])
397
- elif lang == 'zh':
398
- return random.choice(_bpm_desc_map_zh[bpm_range])
399
- else:
400
- raise ValueError(f"Unknown language {lang}")
401
-
402
- def read_translate(translate: Union[Dict[str, os.PathLike], os.PathLike, None]):
403
- if translate is None:
404
- return None
405
- if isinstance(translate, str):
406
- return read_jsonlike(translate)
407
- return {k: read_jsonlike(path) for k, path in translate.items()}
408
-
409
-
410
- def gen_plain_prompt(key_list, sep=', '):
411
- if len(key_list) == 0:
412
- return 'none'
413
-
414
- key_list = [k.strip() for k in key_list]
415
-
416
- if len(key_list) > 10:
417
- random.shuffle(key_list)
418
- key_list = key_list[:10]
419
-
420
- probs = dist_prob_map[len(key_list)]
421
-
422
- num_tags = random.choices(range(1, len(key_list)+1), probs, k=1)[0]
423
-
424
- random.shuffle(key_list)
425
- tags = key_list[:num_tags]
426
- tags_str = sep.join(tags)
427
- return tags_str
428
-
429
-
430
- class MagnaTagATuneDataset(Dataset):
431
- def __init__(self):
432
- pass
433
-
434
-
435
- def tags_to_desc(tag_list, sep=',') -> str:
436
- if not isinstance(tag_list, Sequence):
437
- return str(tag_list)
438
- if isinstance(tag_list, str):
439
- return tag_list
440
- if len(tag_list) <= 0:
441
- return ''
442
- elif len(tag_list) <= 5:
443
- probs = dist_prob_map[len(tag_list)]
444
- tags_num = random.choices(range(1, len(tag_list)+1), probs)[0]
445
- random.shuffle(tag_list)
446
- tag_list = tag_list[:tags_num]
447
- return sep.join(tag_list)
448
- else:
449
- probs = dist_prob_map[5]
450
- tags_num = random.choices(range(1, 6), probs)[0]
451
- random.shuffle(tag_list)
452
- tag_list = tag_list[:tags_num]
453
- return sep.join(tag_list)
454
-
455
- def get_sr_and_duration_info(item):
456
- return item.get('sample_rate', None), item.get('duration', None)
457
-
458
- class MtgJamendoDatasetFromJson(Dataset):
459
- def __init__(self,
460
- data_dir:str,
461
- json_path:str,
462
- duration:float=10,
463
- sr:int = 0,
464
- lang = 'en',
465
- plain_rate = 0,
466
- return_audio = True,
467
- return_path = False,
468
- prompt_template_path: os.PathLike = None,
469
- tag_types = [],
470
- translate:Optional[Dict[str, os.PathLike]] = None,
471
- use_literal_none = True,
472
- ):
473
- self.audio_reader = SafeAudioReader(duration, sr)
474
-
475
- self.data_dir = data_dir
476
- self._load_metadata_json(json_path)
477
- self.sr = sr
478
- self.duration = duration
479
- self.plain_rate = plain_rate
480
- self.return_audio = return_audio
481
- self.return_path = return_path
482
- self.use_literal_none = use_literal_none
483
- self.lang = lang
484
-
485
- self.use_dynamic_prompt = prompt_template_path is not None and plain_rate < 1.0
486
- if self.use_dynamic_prompt:
487
- self.prompt_templates = load_prompt_templates(prompt_template_path, num = len(tag_types))
488
- self.tag_types = tag_types
489
-
490
- self.translate = read_translate(translate)
491
-
492
- # These tags are considered weakly semantic; prompts containing only these tags are avoided
493
- WEAK_TAG_LIST = ["title", "artist"]
494
-
495
- def _load_metadata_json(self, json_path):
496
- with open(json_path) as fp:
497
- self.data = json.load(fp)
498
-
499
- def convert_key_to_path(self, key):
500
- return os.path.join(self.data_dir, get_base_dir_file(key))
501
-
502
- def __len__(self):
503
- return len(self.data)
504
-
505
- def __getitem__(self, idx):
506
- item = self.data[idx]
507
- path = self.convert_key_to_path(item['key'])
508
- description = self.generate_description(item)
509
-
510
- if self.return_audio:
511
- sr, duration = get_sr_and_duration_info(item)
512
- audio = self.audio_reader(path, sr, duration)
513
- else:
514
- audio = None
515
-
516
- if self.return_path:
517
- return audio, description, path
518
- return audio, description
519
-
520
- def tags_to_desc(self, tag_list, tag_type) -> str:
521
- if self.lang == 'en':
522
- return tags_to_desc(tag_list)
523
- elif self.lang == 'zh':
524
- translator = self.translate[tag_type]
525
- translated_tag_list = [translator[tag] for tag in tag_list if tag in translator ]
526
- return tags_to_desc(translated_tag_list, sep='、')
527
-
528
- def generate_description(self, item):
529
- if random.random() > self.plain_rate:
530
- # dynamically generate prompt from given prompt template
531
- prompt_template = random.choice(self.prompt_templates)
532
- description = self.generate_description_dynamic(item, prompt_template)
533
- else:
534
- # use plain prompt, i.e. tags sequence separated by comma
535
- description = self.generate_description_plain(item)
536
- return description
537
-
538
- def generate_description_dynamic(self, data, prompt_template: PromptTemplate):
539
- exists_tag = [key for key in data if (key in self.tag_types) and (data[key] is not None) and (len(data[key]) > 0)]
540
- exists_weak_tag = list(filter(lambda t: t in self.WEAK_TAG_LIST, exists_tag))
541
- exists_strong_tag = list(filter(lambda t: t not in self.WEAK_TAG_LIST, exists_tag))
542
-
543
- if len(exists_strong_tag) > 0:
544
- probs = dist_prob_map[len(exists_strong_tag)]
545
- tags_num = random.choices(range(1, len(exists_strong_tag)+1), probs)[0]
546
- random.shuffle(exists_strong_tag)
547
- tags = exists_strong_tag[:tags_num]
548
- weak_probs = dist_prob_map_low[len(exists_weak_tag) + 1]
549
- weak_tags_num = random.choices(range(0, len(exists_weak_tag) + 1), weak_probs)[0]
550
- random.shuffle(exists_weak_tag)
551
- weak_tags = exists_weak_tag[:weak_tags_num]
552
- tags += weak_tags
553
- tags_args = {tag: self.tags_to_desc(data[tag], tag) for tag in tags}
554
- prompt = prompt_template.apply(**tags_args)
555
- else:
556
- # no strong tags, use all weak tags instead
557
- tags_args = {tag: self.tags_to_desc(data[tag], tag) for tag in exists_weak_tag}
558
- prompt = prompt_template.apply(**tags_args)
559
-
560
- if self.use_literal_none and len(tags_args) == 0:
561
- return 'none'
562
-
563
- return prompt
564
-
565
- def generate_description_plain(self, item):
566
- keywords = []
567
- for tag_t in self.tag_types:
568
- this_key = item[tag_t]
569
- if this_key is None:
570
- continue
571
- if isinstance(this_key, str):
572
- this_key = [this_key]
573
- if self.lang != 'en':
574
- this_key = [self.get_translation(tag_t, k) for k in this_key]
575
- keywords += this_key
576
- return gen_plain_prompt(keywords, sep=self.keysep)
577
-
578
- def get_translation(self, tag_t, k):
579
- k = k.strip()
580
- if k in self.translate[tag_t]:
581
- return self.translate[tag_t][k]
582
- else:
583
- return k
584
-
585
- @property
586
- def keysep(self):
587
- if self.lang == 'zh':
588
- return ',' if random.random() > 0.5 else '、'
589
- elif self.lang == 'en':
590
- return ', '
591
-
592
- class AudioStockDataset(Dataset):
593
- def __init__(self,
594
- metadata_path:str,
595
- duration:float=10,
596
- sr:int = 0,
597
- plain_rate = 0,
598
- return_path = False,
599
- return_audio = True,
600
- prompt_template_path: os.PathLike = None,
601
- tag_types = [],
602
- lang = 'en',
603
- translate:Optional[Dict[str, os.PathLike]] = None,
604
- use_literal_none = True,
605
- ):
606
- self.audio_reader = SafeAudioReader(duration, sr)
607
-
608
- self._load_metadata(metadata_path)
609
- self.sr = sr
610
- self.duration = duration
611
- self.plain_rate = plain_rate
612
- self.return_path = return_path
613
- self.return_audio = return_audio
614
- self.use_literal_none = use_literal_none
615
-
616
- self.use_dynamic_prompt = prompt_template_path is not None and plain_rate < 1.0
617
- if self.use_dynamic_prompt:
618
- self.prompt_templates = load_prompt_templates(prompt_template_path, num = len(tag_types), lang = lang)
619
- self.tag_types = tag_types
620
-
621
- self.lang = lang
622
- self.translate = read_translate(translate)
623
-
624
- def _load_metadata(self, metadata_path):
625
- with open(metadata_path) as fp:
626
- lines = fp.readlines()
627
- self.data = []
628
- for line in lines:
629
- item = json.loads(line)
630
- self.data.append(item)
631
- self.is_info_recorded = bool('Tags' in self.data[0])
632
-
633
- def __len__(self):
634
- return len(self.data)
635
-
636
- def __getitem__(self, idx):
637
- path:str = self.data[idx]["path"]
638
- json_path = path[:path.rfind('.')] + ".json"
639
- if self.is_info_recorded:
640
- item = self.data[idx]
641
- else:
642
- try:
643
- with open(json_path) as fp:
644
- item:dict = json.load(fp)
645
- except Exception as e:
646
- print(f"Error loading json file {json_path} :\n{e}")
647
- item = {}
648
- description = self.generate_description(item)
649
- if self.return_audio:
650
- sr, duration = get_sr_and_duration_info(item)
651
- audio = self.audio_reader(path, sr, duration)
652
- else:
653
- audio = None
654
- if self.return_path:
655
- return audio, description, path
656
- return audio, description
657
-
658
- def generate_description(self, item):
659
- if random.random() > self.plain_rate:
660
- # dynamically generate prompt from given prompt template
661
- prompt_template = random.choice(self.prompt_templates)
662
- description = self.generate_description_dynamic(item, prompt_template)
663
- else:
664
- # use plain prompt, i.e. tags sequence separated by comma
665
- description = self.generate_description_plain(item)
666
- return description
667
-
668
- def generate_description_dynamic(self, data, prompt_template: PromptTemplate):
669
- exists_tag = [key for key in data if (key in self.tag_types) and (data[key] is not None) and (len(data[key]) > 0)]
670
-
671
- if len(exists_tag) > 0:
672
- probs = dist_prob_map[len(exists_tag)]
673
- tags_num = random.choices(range(1, len(exists_tag)+1), probs)[0]
674
- random.shuffle(exists_tag)
675
- tags = exists_tag[:tags_num]
676
- tags_args = {tag: self.tags_to_desc(data[tag], tag) for tag in tags}
677
- tags_args = self.handle_BPM_tag(tags_args)
678
- prompt = prompt_template.apply(**tags_args)
679
- else:
680
- return 'none'
681
-
682
- if self.use_literal_none and len(tags_args) == 0:
683
- return 'none'
684
-
685
- return prompt
686
-
687
- def get_translation(self, tag_t, k):
688
- k = k.strip()
689
- if k in self.translate[tag_t]:
690
- return self.translate[tag_t][k]
691
- else:
692
- return k
693
-
694
- def generate_description_plain(self, item):
695
- keywords = []
696
- for tag_t in self.tag_types:
697
- if tag_t == 'BPMDescript':
698
- bpm = item['BPM']
699
- if bpm is None or bpm.strip() == '' or bpm.strip() == '0':
700
- continue
701
- this_key = gen_bpm_descript(bpm.strip(), lang=self.lang)
702
- elif tag_t == 'BPM':
703
- bpm = item['BPM']
704
- if bpm is None or bpm.strip() == '' or bpm.strip() == '0':
705
- continue
706
- this_key = f"{bpm.strip()} bpm"
707
- else:
708
- this_key = item[tag_t]
709
- if this_key is None:
710
- continue
711
- if isinstance(this_key, str):
712
- this_key = [this_key]
713
- if self.lang != 'en':
714
- this_key = [self.get_translation(tag_t, k) for k in this_key]
715
- if this_key is None:
716
- continue
717
- if isinstance(this_key, str):
718
- this_key = [this_key]
719
- keywords += this_key
720
- return gen_plain_prompt(keywords, sep=self.keysep)
721
-
722
- @property
723
- def keysep(self):
724
- if self.lang == 'zh':
725
- return ',' if random.random() > 0.5 else '、'
726
- elif self.lang == 'en':
727
- return ', '
728
-
729
- def tags_to_desc(self, tag_list, tag_type) -> str:
730
- if self.lang == 'en':
731
- return tags_to_desc(tag_list)
732
- elif self.lang == 'zh':
733
- if tag_type == 'BPM':
734
- return tags_to_desc(tag_list, sep='、')
735
- translator = self.translate[tag_type]
736
- translated_tag_list = [translator[tag] for tag in tag_list if tag in translator ]
737
- return tags_to_desc(translated_tag_list, sep='、')
738
-
739
- def handle_BPM_tag(self, tags_args):
740
- if "BPM" in tags_args and 'BPMDescript' in self.tag_types:
741
- bpm = tags_args["BPM"]
742
- del tags_args["BPM"]
743
- tag_types_used = random.choice((('BPM',), ('BPMDescript',), ('BPM', 'BPMDescript')))
744
- for tag_type in tag_types_used:
745
- tags_args[tag_type] = bpm if tag_type == 'BPM' else gen_bpm_descript(bpm, lang=self.lang)
746
- return tags_args
747
-
748
- def mp3_path_to_id(mp3_path):
749
- return int(
750
- mp3_path[mp3_path.rindex('/') + 1 : mp3_path.rindex('.')]
751
- )
752
-
753
- class TmeDataset(Dataset):
754
- def __init__(self,
755
- data_index:str,
756
- music_info:str = None,
757
- duration:float = 10,
758
- sr:int = 0,
759
- plain_rate = 0,
760
- return_path = False,
761
- return_audio = True,
762
- return_ID = False,
763
- prompt_format_path: os.PathLike = None,
764
- tag_types = ['*'],
765
- lang = 'zh',
766
- translate: Optional[os.PathLike] = None,
767
- prompt_dir: os.PathLike = None, # use pre-generated prompts produced by GPT
768
- ):
769
- if plain_rate > 0:
770
- print("Tme Dataset do not support plain rate > 0, use plain_rate = 0 instead.")
771
- plain_rate = 0
772
- self.audio_reader = SafeAudioReader(duration, sr)
773
-
774
- self.sr = sr
775
- self.duration = duration
776
- self.plain_rate = plain_rate
777
- self.return_path = return_path
778
- self.return_audio = return_audio
779
- self.return_ID = return_ID
780
- self.lang = lang
781
-
782
- self.use_ready_prompt = prompt_dir is not None
783
-
784
- data_index = read_jsonlike(data_index)
785
- self.data_index_dict = {mp3_path_to_id(d['path']) : d for d in data_index}
786
- self.data_ids = list(self.data_index_dict.keys())
787
-
788
- if not self.use_ready_prompt:
789
- # Read the music metadata file
790
- music_info = read_jsonlike(music_info)
791
- if 'music' in music_info:
792
- music_info = music_info['music']
793
- self.music_info_dict = {d["歌曲ID"]:d for d in music_info}
794
- self.data_index_dict = {k:v for k,v in self.data_index_dict.items() if k in self.music_info_dict}
795
- self.data_ids = list(self.data_index_dict.keys())
796
-
797
- with open(prompt_format_path) as fp:
798
- self.prompt_formats = yaml.load(fp, Loader=yaml.FullLoader)
799
-
800
- # Load tag types and split them into regular tag_types and the key key_tag_types
801
- if '*' in tag_types:
802
- self.tag_types = ['歌曲名', 'bpm', '专辑名', '歌手名', '作曲', 'tag']
803
- else:
804
- self.tag_types = tag_types
805
-
806
- self.key_tag_types = []
807
- if 'tag' in self.tag_types:
808
- self.tag_types.remove('tag')
809
- self.key_tag_types = list(self.prompt_formats['tag'].keys())
810
-
811
- # Load the translation table
812
- if translate is not None:
813
- self.translator = read_jsonlike(translate)
814
- else:
815
- data_ids_set = set(self.data_ids)
816
- self.prompts_dict = {}
817
- for fname in os.listdir(prompt_dir):
818
- items = read_jsonlike(os.path.join(prompt_dir, fname))
819
- for item in items:
820
- if item['ID'] not in data_ids_set or not self.is_valid_prompt_text(item['Text']):
821
- continue
822
- if item['ID'] not in self.prompts_dict:
823
- self.prompts_dict[item['ID']] = []
824
- self.prompts_dict[item['ID']].append(item['Text'])
825
- self.data_index_dict = {k:v for k,v in self.data_index_dict.items() if k in self.prompts_dict}
826
- self.data_ids = list(self.data_index_dict.keys())
827
-
828
- def tags_to_desc(self, tag_list) -> str:
829
- if is_bearable(tag_list, int):
830
- return str(tag_list)
831
- if self.lang == 'zh':
832
- return tags_to_desc(tag_list, sep=self.sep)
833
- else:
834
- translated_tag_list = [self.translator[tag] for tag in tag_list if tag in self.translator ]
835
- return tags_to_desc(translated_tag_list, sep=self.sep)
836
-
837
- def gen_desc_of_tag(self, formats, tags):
838
- fmt = random.choice(formats)
839
- return fmt.format(self.tags_to_desc(tags))
840
-
841
- @staticmethod
842
- def check_valid(value):
843
- if isinstance(value, int) or isinstance(value, float):
844
- return value > 0
845
- if (value is not None) and (not isinstance(value, Sequence) or len(value) > 0):
846
- return True
847
- return False
848
-
849
- @staticmethod
850
- def remove_repeat(data):
851
- # If the album name equals the song title, keep only the latter
852
- album_name = data.get('专辑名', None)
853
- if album_name is not None and album_name == data.get('歌曲名', None):
854
- del data['专辑名']
855
- return data
856
-
857
- @property
858
- def comma(self):
859
- if self.lang == 'zh':
860
- return ','
861
- elif self.lang == 'en':
862
- return ', '
863
-
864
- @property
865
- def sep(self):
866
- if self.lang == 'zh':
867
- return '、'
868
- elif self.lang == 'en':
869
- return ', '
870
-
871
-
872
- def generate_description(self, item):
873
- if random.random() > self.plain_rate:
874
- # dynamically generate prompt from given prompt template
875
- description = self.generate_description_dynamic(item)
876
- else:
877
- # use plain prompt, i.e. tags sequence separated by comma
878
- description = self.generate_description_plain(item)
879
- return description
880
-
881
- def generate_description_dynamic(self, data):
882
- data = self.remove_repeat(data)
883
-
884
- weak_tags = [key for key in data if (key in self.tag_types and self.check_valid(data[key]))] # weakly semantic tags; their sampling probability is reduced
885
-
886
- key_tags = [key for key in data['tag'] if (key in self.key_tag_types and self.check_valid(data['tag'][key]))] # key tags; at least one of them must appear
887
-
888
- prompts = []
889
- if len(weak_tags) > 0:
890
- probs = dist_prob_map_low[len(weak_tags)]
891
- if len(key_tags) > 0:
892
- tags_num = random.choices(range(0, len(weak_tags)), probs)[0]
893
- else:
894
- tags_num = random.choices(range(1, len(weak_tags) + 1), probs)[0]
895
- random.shuffle(weak_tags)
896
- tags = weak_tags[:tags_num]
897
- for tag_type in tags:
898
- tag_desc = self.gen_desc_of_tag(self.prompt_formats[tag_type], int(data[tag_type]) if tag_type == 'bpm' else data[tag_type])
899
- prompts.append(tag_desc)
900
-
901
- if len(key_tags) > 0:
902
- probs = dist_prob_map[len(key_tags)]
903
- tags_num = random.choices(range(1, len(key_tags) + 1), probs)[0]
904
- random.shuffle(key_tags)
905
- tags = key_tags[:tags_num]
906
- for tag_type in tags:
907
- tag_desc = self.gen_desc_of_tag(self.prompt_formats['tag'][tag_type], data['tag'][tag_type])
908
- prompts.append(tag_desc)
909
-
910
- random.shuffle(prompts)
911
- return self.comma.join(prompts)
912
-
913
- def generate_description_plain(self, item):
914
- keywords = item['tag']
915
- if self.lang != 'en':
916
- keywords = [self.translator[k.strip()] for k in keywords]
917
- return gen_plain_prompt(keywords, sep=self.keysep)
918
-
919
- @property
920
- def keysep(self):
921
- if self.lang == 'zh':
922
- return ',' if random.random() > 0.5 else '、'
923
- elif self.lang == 'en':
924
- return ', '
925
-
926
- def is_valid_prompt_text(self, text):
927
- for bad in ('抱歉','sorry', 'Sorry'):
928
- if bad in text:
929
- return False
930
- return True
931
-
932
- def get_ready_prompt(self, path):
933
- sid = mp3_path_to_id(path)
934
- return random.choice(self.prompts_dict[sid])
935
-
936
- def __len__(self):
937
- return len(self.data_ids)
938
-
939
- def __getitem__(self, idx):
940
- data_id = self.data_ids[idx]
941
- item = self.data_index_dict[data_id]
942
- path = item['path']
943
- if not self.use_ready_prompt:
944
- info = self.music_info_dict[data_id]
945
- description = self.generate_description(info)
946
- else:
947
- description = self.get_ready_prompt(path)
948
- if self.return_audio:
949
- sr, duration = get_sr_and_duration_info(item)
950
- audio = self.audio_reader(path, sr, duration)
951
- else:
952
- audio = None
953
- if self.return_path:
954
- if self.return_ID:
955
- return audio, description, path, info['歌曲ID']
956
- return audio, description, path
957
- if self.return_ID:
958
- return audio, description, info['歌曲ID']
959
- return audio, description
960
-
961
-
962
- class Pond5Dataset(Dataset):
963
- MAX_PROMPT_LEN = 200
964
- def __init__(self,
965
- metadata_path:str,
966
- index_path:str,
967
- duration:float=10,
968
- sr:int = 0,
969
- plain_rate = 0,
970
- return_path = False,
971
- return_audio = True,
972
- lang = 'en',
973
- translate:Optional[Dict[str, os.PathLike]] = None,
974
- use_literal_none = True,
975
- use_avoid_watermark_policy = None,
976
- ):
977
-
978
- if use_avoid_watermark_policy is None:
979
- raise ValueError("`use_avoid_watermark_policy` is an important param, you need to explicitly specify it with bool type")
980
- self.use_avoid_watermark_policy = use_avoid_watermark_policy
981
- self.audio_reader = SafeAudioReader(duration, sr, use_avoid_watermark_policy=use_avoid_watermark_policy)
982
-
983
- self._load_metadata(metadata_path, index_path)
984
- self.sr = sr
985
- self.duration = duration
986
- self.plain_rate = plain_rate
987
- self.return_path = return_path
988
- self.return_audio = return_audio
989
- self.use_literal_none = use_literal_none
990
-
991
- self.lang = lang
992
- self.translate = read_translate(translate)
993
-
994
- def _load_metadata(self, metadata_path, index_path):
995
- data_index = read_jsonlike(index_path)
996
- data_ids = set([item['id'] for item in data_index])
997
-
998
- with open(metadata_path) as fp:
999
- lines = fp.readlines()
1000
-
1001
- append_ids = set()
1002
-
1003
- self.data = []
1004
- for line in lines:
1005
- item = json.loads(line)
1006
- if item['id'] in data_ids and item['id'] not in append_ids:
1007
- self.data.append(item)
1008
- append_ids.add(item['id'])
1009
-
1010
- def __len__(self):
1011
- return len(self.data)
1012
-
1013
- def __getitem__(self, idx):
1014
- item = self.data[idx]
1015
- path:str = item["path"]
1016
- description = self.generate_description(item)
1017
- if self.return_audio:
1018
- sr, duration = get_sr_and_duration_info(item)
1019
- audio = self.audio_reader(path, sr, duration)
1020
- else:
1021
- audio = None
1022
- if self.return_path:
1023
- return audio, description, path
1024
- return audio, description
1025
-
1026
- @property
1027
- def keysep(self):
1028
- if self.lang == 'zh':
1029
- return ',' if random.random() > 0.5 else '、'
1030
- elif self.lang == 'en':
1031
- return ', '
1032
-
1033
- def generate_description(self, item):
1034
- if random.random() > self.plain_rate:
1035
- # dynamically generate prompt from given prompt template
1036
- description = self.generate_description_dynamic(item)
1037
- else:
1038
- # use plain prompt, i.e. tags sequence separated by comma
1039
- description = self.generate_description_plain(item)
1040
- return description
1041
-
1042
- def get_translation(self, k):
1043
- k = k.strip()
1044
- if k in self.translate:
1045
- return self.translate[k]
1046
- else:
1047
- return k
1048
-
1049
- def generate_description_plain(self, item):
1050
- keywords = item['keywords']
1051
- if self.lang != 'en':
1052
- keywords = [self.get_translation(k) for k in keywords]
1053
- return gen_plain_prompt(keywords, sep=self.keysep)
1054
-
1055
- def generate_description_dynamic(self,item):
1056
- desc = item.get('desc', 'none')
1057
- if desc is None:
1058
- desc = 'none'
1059
- desc = desc.strip()
1060
- if len(desc) > self.MAX_PROMPT_LEN:
1061
- shorter_desc = desc[:self.MAX_PROMPT_LEN]
1062
- # find last stop
1063
- stop_idx = shorter_desc.rfind('.')
1064
- if stop_idx == -1:
1065
- stop_idx = shorter_desc.rfind('!')
1066
- if stop_idx == -1:
1067
- stop_idx = shorter_desc.rfind(',')
1068
- if stop_idx == -1:
1069
- stop_idx = self.MAX_PROMPT_LEN - 1
1070
- desc = desc[:stop_idx+1]
1071
- return desc
1072
-
1073
- class SoundDataset(Dataset):
1074
- def __init__(self,
1075
- metadata_index: str,
1076
- duration:float = 10,
1077
- min_non_silent_duration:float = 3,
1078
- sr:int = 0,
1079
- return_path = False,
1080
- return_audio = True,
1081
- ):
1082
- self.data = read_jsonlike(metadata_index)
1083
- self.sr = sr
1084
- self.reader = SafeAudioReader(duration, sr)
1085
- self.duration = duration
1086
- self.min_non_silent_duration = min_non_silent_duration
1087
- self.return_audio = return_audio
1088
- self.return_path = return_path
1089
-
1090
- def __getitem__(self, index):
1091
- item = self.data[index]
1092
- if self.return_audio:
1093
- origin_duration = item['duration']
1094
- if origin_duration < self.min_non_silent_duration:
1095
- audio = self.read_and_repeat_and_pad(item)
1096
- else:
1097
- audio = self.reader(item['path'], item['sample_rate'], origin_duration)
1098
- else:
1099
- audio = None
1100
- desc = item['caption']
1101
- if self.return_path:
1102
- return audio, desc, item['path']
1103
- else:
1104
- return audio, desc
1105
-
1106
- def __len__(self):
1107
- return len(self.data)
1108
-
1109
- def read_and_repeat_and_pad(self, item):
1110
- path = item['path']
1111
- try:
1112
- # read
1113
- clip, sr = torchaudio.load(path)
1114
- if len(clip.shape) > 1:
1115
- clip = torch.mean(clip, dim=0, keepdim=True)
1116
- clip = resample(clip, sr, self.sr)
1117
- #repeat
1118
- n_repeats = math.ceil(self.min_non_silent_duration/item['duration'])
1119
- clip = torch.repeat_interleave(clip, n_repeats, dim=0).reshape(-1)
1120
- #pad
1121
- n_samples = int(self.duration * self.sr)
1122
- if clip.shape[0] >= n_samples:
1123
- audio = clip[:n_samples]
1124
- else:
1125
- audio = torch.zeros(int(self.duration * self.sr), dtype=clip.dtype)
1126
- start_pos = np.random.randint(0, max(0,(n_samples - clip.shape[0])))
1127
- audio[start_pos:start_pos+clip.shape[0]] = clip
1128
- return audio
1129
-
1130
- except Exception as e:
1131
- logger.error(f"Error reading {path}: {e}")
1132
- wav = torch.zeros(int(self.duration * self.sr), dtype=torch.float32)
1133
- return wav
1134
-
1135
- class CombinedDataset(Dataset):
1136
- @beartype
1137
- def __init__(self, datasets: Sequence[Dataset], ratios: Sequence[int]):
1138
- self.datasets = datasets
1139
- self.datasets_index = []
1140
-
1141
- for i,dataset in enumerate(datasets):
1142
- if dataset is None:
1143
- continue
1144
- for dup in range(ratios[i]):
1145
- for j in range(len(dataset)):
1146
- self.datasets_index.append((i,j))
1147
-
1148
- def __len__(self):
1149
- return len(self.datasets_index)
1150
-
1151
- def __getitem__(self, idx):
1152
- index = self.datasets_index[idx]
1153
- i,j = index
1154
- return self.datasets[i][j]
1155
-
1156
- class CombinedDataset_random(Dataset):
1157
- @beartype
1158
- def __init__(self, num_examples:int, datasets: Sequence[Dataset], ratios: Sequence[int]):
1159
- self.datasets = datasets
1160
- self.datasets_index = []
1161
-
1162
- for i,dataset in enumerate(datasets):
1163
- if dataset is None:
1164
- continue
1165
- for dup in range(ratios[i]):
1166
- for j in range(len(dataset)):
1167
- self.datasets_index.append((i,j))
1168
-
1169
- if num_examples > 0:
1170
- self.random_choose = True
1171
- self.dataset_len = num_examples
1172
- else:
1173
- self.random_choose = False
1174
- self.dataset_len = len(self.datasets_index)
1175
-
1176
- def __len__(self):
1177
- return self.dataset_len
1178
-
1179
- def __getitem__(self, idx):
1180
- first_try = True
1181
- try_cnt = 0
1182
- while True:
1183
- try:
1184
- if(self.random_choose or not first_try):
1185
- index2 = []
1186
- index2.append(np.random.randint(0,len(self.datasets)))
1187
- index2.append(np.random.randint(0,len(self.datasets[index2[-1]])))
1188
- else:
1189
- index2 = self.datasets_index[idx]
1190
- first_try = False
1191
- out = list(self.datasets[index2[0]][index2[1]])
1192
- return out
1193
- except:
1194
- print("Error loadding ", index2)
1195
- try_cnt += 1
1196
- if(try_cnt>10):
1197
- raise ValueError()
1198
-
1199
- class SoundMixedDataset(Dataset):
1200
- @staticmethod
1201
- def music_desc(desc):
1202
- return f'Music:<{desc}>'
1203
- @staticmethod
1204
- def sound_desc(desc):
1205
- return f'Effect:<{desc}>'
1206
-
1207
- def __init__(self,
1208
- music_dataset: Dataset,
1209
- sound_dataset: Dataset,
1210
- mixed_ratios: Tuple[float, float, float] = (0.3, 0.3, 0.4) # ratio of music-only : sound-only : music+sound mixtures
1211
- ) -> None:
1212
- self.music_dataset = music_dataset
1213
- self.sound_dataset = sound_dataset
1214
- music_r, sound_r, mix_r = [r/sum(mixed_ratios) for r in mixed_ratios] # normalize the ratios to the 0-1 range
1215
- # left endpoints of the three probability intervals
1216
- self.music_anchor = 0
1217
- self.sound_anchor = music_r
1218
- self.mix_anchor = music_r + sound_r
1219
-
1220
- def __len__(self):
1221
- return len(self.music_dataset)
1222
-
1223
- def get_random_sound_data(self):
1224
- idx = random.randint(0, len(self.sound_dataset)-1)
1225
- return self.sound_dataset[idx]
1226
-
1227
- def __getitem__(self, idx):
1228
- p = random.random()
1229
- if p >= self.mix_anchor:
1230
- music, m_desc = self.music_dataset[idx]
1231
- sound, s_desc = self.get_random_sound_data()
1232
- audio = music + sound
1233
- if(audio.abs().max()>1.0):
1234
- music = music / audio.abs().max() * 0.95
1235
- audio = audio / audio.abs().max() * 0.95
1236
- desc = self.music_desc(m_desc) + self.sound_desc(s_desc)
1237
- return audio[None,:], music[None,:], desc
1238
- elif p >= self.sound_anchor:
1239
- audio, desc = self.get_random_sound_data()
1240
- return audio[None,:], torch.zeros_like(audio[None,:]), self.sound_desc(desc)
1241
- else:
1242
- audio, desc = self.music_dataset[idx]
1243
- return audio[None,:], audio[None,:], self.music_desc(desc)
1244
-
1245
-
1246
- class DecoTagDataset(Dataset):
1247
- '''Wraps an ordinary dataset into one suitable for tag-decoupled learning'''
1248
-
1249
- TAG_TYPES = ('genre', 'mood', 'instrument')
1250
-
1251
- def __init__(self, dataset_class: type, tag_map: Dict[str, str], *args, **kwargs):
1252
- self.datasets = []
1253
- for i, tag_t in enumerate(self.TAG_TYPES):
1254
- kwargs['tag_types'] = [tag_map[tag_t]]
1255
- kwargs['return_audio'] = (i == 0) # only the 0-th dataset returns audio and text; the others return text only
1256
- self.datasets.append(dataset_class(*args, **kwargs))
1257
-
1258
- def __len__(self):
1259
- return len(self.datasets[0])
1260
-
1261
- def __getitem__(self, idx):
1262
- audio, text = self.datasets[0][idx]
1263
- texts = (text, self.datasets[1][idx][1], self.datasets[2][idx][1])
1264
- return audio, texts
1265
-
1266
-
1267
- class DecoTagWrapper:
1268
- '''A wrapper that makes it easy to toggle tag-decoupled learning on or off'''
1269
- def __init__(self, dataset_class: Dataset, deco_tag_types: List[str] = list(), switch_on: bool = False):
1270
- self.dataset_class = dataset_class
1271
- self.tag_map = dict(zip(DecoTagDataset.TAG_TYPES, deco_tag_types))
1272
- self.switch_on = switch_on
1273
-
1274
- def __call__(self, *args, **kwargs):
1275
- if self.switch_on:
1276
- return DecoTagDataset(self.dataset_class, self.tag_map, *args, **kwargs)
1277
- else:
1278
- return self.dataset_class(*args, **kwargs)
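A note on the prompt templates consumed throughout this file: a template line keeps a bracketed span such as [with {instrument}] only when the corresponding tag has a value, drops the span otherwise, and then lightly normalizes punctuation. The helper below is a simplified, hedged re-implementation for illustration only (the function name fill_template and the sample template are invented here); it is not the parse_prompt_template / PromptTemplate code removed in this diff.

import re

def fill_template(template_line, **tags):
    # Keep a bracketed span (brackets stripped) when its tag has a value,
    # drop the span entirely when the tag is missing or empty.
    def repl(match):
        span = match.group(0)
        tag = re.search(r'\{(.+?)\}', span).group(1)
        value = tags.get(tag, '')
        return span.strip('[]').format(**{tag: value}) if value else ''
    text = re.sub(r'\[.*?\{.+?\}.*?\]', repl, template_line)
    text = re.sub(r'\s+', ' ', text)    # collapse repeated whitespace
    text = re.sub(r'\s+\.', '.', text)  # no space before a full stop
    return text.strip(' ,')

# fill_template('A [{genre}] track [with {instrument}].', genre='jazz')
# -> 'A jazz track.'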
codeclm/tokenizer/Flow1dVAE/libs/datasets/dataset_429.py DELETED
@@ -1,372 +0,0 @@
1
- import re
2
- import sys
3
- import json
4
- from typing import List, Union
5
-
6
- from torch.utils.data import Dataset
7
- import torchaudio
8
- from torchaudio.functional import resample
9
- import torch
10
- import numpy as np
11
-
12
- from torch.nn.utils.rnn import pad_sequence
13
-
14
- PARAGRAPH_GAP = 6
15
- MIN_MUSIC_LEN = 3
16
-
17
- def check_lryics(lyric):
18
- _FILTER_STRING = [
19
- '作词', '作曲', '编曲', '【', '策划',
20
- '录音', '混音', '母带', ':', '制作',
21
- '版权', '校对', '演奏', '制作', '伴奏'
22
- ]
23
- for item in _FILTER_STRING:
24
- if item in lyric:
25
- return True
26
-
27
- return False
28
-
29
-
30
-
31
- def process_lyrics(lines):
32
- lyric_part = []
33
- timestamp_part = []
34
-
35
- timestamp_pattern = re.compile(r'\[\d+:\d+(\.\d+)?\]')
36
-
37
- for i, line in enumerate(lines):
38
-
39
- # Remove the specific metadata/credit lines that appear in the first few lines
40
- if i<10 and check_lryics(line):
41
- continue
42
-
43
- # Check whether the line contains a valid timestamp and lyric content
44
- if timestamp_pattern.match(line):
45
- timestamp_end = line.rfind(']')
46
- lyrics = line[timestamp_end + 1:].strip()
47
- timestamps = line[:timestamp_end + 1]
48
-
49
- if ':' in lyrics:
50
- if len(lyrics.split(":")[0]) <=5:
51
- lyrics = "".join(lyrics.split(":")[1:])
52
- # if lyrics: # 确保歌词部分不是空的
53
- # lyric_part.append(lyrics)
54
- # timestamp_part.append(timestamps)
55
- # print(processed_lyrics)
56
- return timestamp_part, lyric_part
57
-
58
- def get_timestamps(timestamp_part):
59
-
60
- # Convert to seconds
61
-
62
- timestamps = []
63
-
64
- for line in timestamp_part:
65
- match = re.match(r'\[(\d+):(\d+)(\.\d+)?\]', line)
66
- if match:
67
- minutes = int(match.group(1))
68
- seconds = float(match.group(2))
69
- millis = float(match.group(3)) if match.group(3) else 0
70
- total_seconds = minutes * 60 + seconds + millis
71
- timestamps.append(total_seconds)
72
-
73
-
74
- return timestamps
75
-
76
- def process_lyrics_lrc(lyrics):
77
- timestamp_part, lyric_part = process_lyrics(lyrics)
78
- # print(timestamp_part)
79
- # print(lyric_part)
80
- timestamps = get_timestamps(timestamp_part)
81
- # print(timestamps)
82
- if len(timestamps) == 0:
83
- # print(f'{lyric_path}')
84
- return []
85
-
86
- slice_start = timestamps[0]
87
- slice_start_idx = 0
88
-
89
- output_list = []
90
- for i in range(1, len(timestamps)):
91
- # Slice once the accumulated time exceeds 30 seconds; if the whole lyric is under 30s, the entire piece is dropped
92
- if timestamps[i] - slice_start > 30:
93
- output_list.append(f'[{str(slice_start)}:{str(timestamps[i])}]' + ", ".join(lyric_part[slice_start_idx:i]))
94
-
95
- slice_start = timestamps[i]
96
- slice_start_idx = i
97
-
98
- return output_list
99
-
100
-
101
-
102
- def process_lyrics_yrc(lyrics):
103
-
104
- timestamps, lyric_part = extract_lrc(lyrics)
105
-
106
- # timestamp_part, lyric_part = process_lyrics(lyrics)
107
- # import pdb; pdb.set_trace()
108
- # print(timestamp_part)
109
- # print(lyric_part)
110
- # timestamps = get_timestamps(timestamp_part)
111
- # print(timestamps)
112
- if len(timestamps) == 0:
113
- # print(f'{lyric_path}')
114
- return []
115
-
116
- slice_start = timestamps[0]
117
- slice_start_idx = 0
118
-
119
- output_list = []
120
- for i in range(1, len(timestamps)):
121
- # Slice once the accumulated time exceeds 30 seconds
122
- if timestamps[i] - slice_start > 30:
123
- output_list.append(f'[{str(slice_start)}:{str(timestamps[i])}]' + ", ".join(lyric_part[slice_start_idx:i]))
124
-
125
- slice_start = timestamps[i]
126
- slice_start_idx = i
127
- # import pdb; pdb.set_trace()
128
- return output_list
129
-
130
- def extract_lrc(lyrics):
131
- timestamp_part, lyric_part = [], []
132
-
133
- for i, text in enumerate(lyrics):
134
- # Extract the content inside the square brackets
135
- bracket_content = re.search(r'\[(.*?)\]', text).group(1)
136
- bracket_content = bracket_content.split(',')
137
- # Extract the content inside the parentheses
138
- parentheses_content = re.findall(r'\((.*?)\)', text)
139
- # Extract the remaining content
140
- other_content = re.sub(r'\[(.*?)\]|\((.*?)\)', '', text).strip()
141
-
142
- # How should this data be handled?
143
- if i<10 and check_lryics(other_content):
144
- continue
145
- timestamp_part.append(float(bracket_content[0])/1000)
146
- lyric_part.append(other_content)
147
- return timestamp_part, lyric_part
148
-
149
-
150
-
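For orientation, the helpers above turn LRC-style lyric lines ('[mm:ss.xx]text') into absolute times and then slice them into windows of at most 30 seconds. A compact sketch of the timestamp conversion is given below for reference; it is a simplified illustration, not the code removed in this diff.

import re

def lrc_time_to_seconds(line):
    # Parse a leading '[mm:ss.xx]' tag and return its offset in seconds.
    m = re.match(r'\[(\d+):(\d+(?:\.\d+)?)\]', line)
    if m is None:
        raise ValueError(f'no timestamp in {line!r}')
    return int(m.group(1)) * 60 + float(m.group(2))

# lrc_time_to_seconds('[01:23.45]some lyric')  ->  83.45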
151
- class WYYSongDataset(Dataset):
152
- def __init__(self,
153
- metadata_path: Union[str, List[str]],
154
- sr:int = 0,
155
- use_lang = ['en', 'zh-cn'],
156
- num_examples = -1,
157
- max_dur = 20,
158
- min_dur=0,
159
- add_music=False,
160
- pad_to_max= True,
161
- ):
162
-
163
- self.sr = sr
164
- self.use_lang = use_lang
165
- self.data = []
166
- if type(metadata_path) == str:
167
- metadata_path = [metadata_path]
168
- for _meta in metadata_path:
169
- self._load_metadata(_meta)
170
- self.max_dur = max_dur
171
- self.min_dur = min_dur
172
- self.pad_to_max = pad_to_max
173
- self.add_music = add_music
174
-
175
- # buffer
176
- self.lyric_buffer = {}
177
-
178
- if(num_examples<=0):
179
- self.dataset_len = len(self.data)
180
- self.random_slc = False
181
- else:
182
- self.dataset_len = num_examples
183
- self.random_slc = True
184
-
185
-
186
- # Read the jsonl file
187
- def _load_metadata(self, metadata_path):
188
- with open(metadata_path) as fp:
189
- lines = fp.readlines()
190
- for line in lines:
191
- item = json.loads(line)
192
- if '伴奏' not in item['path']:
193
- # if "lang_type" in item and item['lang_type'] == 'en':
194
- if "lang_type" in item:
195
- self.data.append(item)
196
-
197
-
198
- def __len__(self):
199
- return self.dataset_len
200
-
201
-
202
- def __getitem__(self, idx):
203
- try_cnt = 0
204
- while True:
205
- if(self.random_slc):
206
- idx = np.random.randint(0, len(self.data))
207
- yrc_lyrics = []
208
- lrc_lyrics = []
209
- try:
210
- info = self.data[idx]
211
-
212
- # audio path
213
- path = info["path"]
214
- lang_type = info["lang_type"]
215
- lyrics = info['lyrics'] # chinese
216
- # lyrics = info['lyrics_phone']
217
-
218
- # Randomly pick one lyric paragraph
219
-
220
- parsed_lyrics = []
221
- # st_idx = np.random.randint(0, len(lyrics))
222
- for ly_id in range(len(lyrics)):
223
- lyric = lyrics[ly_id].strip()
224
- st, et, lyric = self.parse_lyric(lyric)
225
-
226
- if et - st >= self.max_dur:
227
- continue # TODO: handle the leading/trailing [MUSIC] margins
228
-
229
- if parsed_lyrics != []:
230
- if st - parsed_lyrics[-1][1] >= PARAGRAPH_GAP: # large gap
231
- parsed_lyrics.append((parsed_lyrics[-1][1], st, '[GAP]'))
232
- elif self.add_music and st - parsed_lyrics[-1][1] >= MIN_MUSIC_LEN:
233
- parsed_lyrics.append((parsed_lyrics[-1][1], st, '[MUSIC]'))
234
-
235
- lyric = lyric.replace("\xa0", " ")
236
- lyric = " ".join(lyric.split())
237
- parsed_lyrics.append((st, et, lyric))
238
-
239
- assert parsed_lyrics != []
240
- # if parsed_lyrics[-1][1] - parsed_lyrics[0][0] > self.max_dur:
241
- # print(f"{parsed_lyrics[0][0]}-{parsed_lyrics[-1][1]} {parsed_lyrics}", file=open('tmp.txt', 'a'))
242
-
243
- parsed_lyrics = [(0, parsed_lyrics[0][0], '[GAP]')] + parsed_lyrics
244
-
245
- possible_starts = [e for e,i in enumerate(parsed_lyrics) if i[2]=='[GAP]']
246
- st_idx = np.random.choice(possible_starts)
247
-
248
- paraphrase = []
249
- for i in parsed_lyrics[st_idx+1:]:
250
- if i[2] == '[GAP]':
251
- break
252
- paraphrase.append(i)
253
- # print(paraphrase, lyrics)
254
-
255
- while paraphrase[-1][1] - paraphrase[0][0] > self.max_dur:
256
- if np.random.rand() > 0.2:
257
- paraphrase.pop(-1) # with high probability, truncate from the end
258
- else:
259
- paraphrase.pop(0) # with small probability, truncate from the front
260
-
261
- st, et, lyric = paraphrase[0][0], paraphrase[-1][1], ', '.join([i[2] for i in paraphrase]) # [SEP]
262
- # print(st, et, lyric)
263
- # import pdb; pdb.set_trace()
264
- assert self.min_dur < et - st < self.max_dur, f"{st}-{et} {lyric}"
265
- # print(et-st, lyric)
266
- # import pdb; pdb.set_trace()
267
-
268
- if info["lang_type"] == 'en':
269
- # print(len(lyric.split())/(et-st))
270
- char_num = sum([len(lrc[-1].split()) for lrc in paraphrase])
271
- assert 6 > char_num / (et-st) > 1
272
- else:
273
- # print(len(lyric.split())/(et-st))
274
- char_num = sum([len(lrc[-1]) for lrc in paraphrase])
275
- assert 6 > char_num / (et-st) > 1
276
-
277
-                 # Load the audio file
278
- cur_sample_rate = torchaudio.info(path).sample_rate
279
- offset = int(cur_sample_rate*st)
280
- num_frames = int(cur_sample_rate * (et -st))
281
- chunk, _ = torchaudio.load(path, frame_offset=offset, num_frames=num_frames)
282
- # chunk = torch.zeros(1, 48000*15)
283
-                 if abs(chunk.shape[-1] - num_frames) > num_frames * 0.05:  # audio length does not match the lyrics
284
- print(f"fail to load {path} from {st} to {et} !")
285
- raise FileNotFoundError
286
-                 # Randomly pick one channel
287
- if(chunk.shape[0]>1):
288
- chunk = chunk[torch.randint(chunk.shape[0], size=(1,)),:].float()
289
- else:
290
- chunk = chunk[[0],:].float()
291
-
292
- if(cur_sample_rate!=self.sr):
293
- # print('a:',cur_sample_rate,chunk.shape)
294
- chunk = torchaudio.functional.resample(chunk, cur_sample_rate, self.sr)
295
-
296
- if self.pad_to_max:
297
- chunk = self.pad_2d_tensor(chunk, int(self.max_dur * self.sr), 0)
298
-
299
- # print(self.sz_cnt)
300
- return chunk, lyric, [st, et], path, lang_type
301
-             except (AssertionError, FileNotFoundError, RuntimeError) as e:  # any other error type should still propagate
302
-                 # print("Error loading ", info["path"])
303
- try_cnt += 1
304
- idx = np.random.randint(0, len(self.data))
305
- if(try_cnt>100):
306
- raise e
307
-
308
- def parse_lyric(self, lyric):
309
- pattern = r'\[(\d+\.\d+):(\d+\.\d+)\](.*)'
310
- match = re.search(pattern, lyric)
311
-
312
- start_time = float(match.group(1))
313
- end_time = float(match.group(2))
314
- content = match.group(3)
315
- return start_time, end_time, content
316
-
317
- def pad_2d_tensor(self, x, max_len, pad_id):
318
-         # Get the shape of the input tensor
319
- batch_size, seq_len = x.size()
320
- max_len = max(max_len, seq_len)
321
-         # Compute how much padding is needed
322
- pad_len = max_len - seq_len
323
-
324
-         # If padding is needed
325
- if pad_len > 0:
326
-             # Create the padding tensor
327
- pad_tensor = torch.full((batch_size, pad_len), pad_id, dtype=x.dtype, device=x.device)
328
-
329
-             # Concatenate the input tensor and the padding tensor along the second (column) dimension
330
- padded_tensor = torch.cat([x, pad_tensor], dim=1)
331
- else:
332
-             # No padding needed, return the input tensor as-is
333
- padded_tensor = x
334
-
335
- return padded_tensor
336
-
337
- def collect_data(data_list):
338
- audios = pad_sequence([data[0].t() for data in data_list], batch_first=True, padding_value=0).transpose(1,2)
339
- lyrics = [data[1] for data in data_list]
340
- st_et = [data[2] for data in data_list]
341
- paths = [data[3] for data in data_list]
342
- lang_types = [data[4] for data in data_list]
343
- return audios, lyrics, st_et
344
- # return audios, lyrics, st_et
345
-
346
-
347
- def build_dataset(train_jsonl_list, val_jsonl_list, min_dur=0, max_dur=20, add_music=False):
348
- print(min_dur,max_dur)
349
- print(train_jsonl_list)
350
- # ["exp/wyy3_20240418_v2f.jsonl",
351
- # "exp/tme_lyric_baokuan.jsonl"]
352
- train_dataset = WYYSongDataset(
353
- metadata_path = train_jsonl_list,
354
- sr = 48000,
355
- use_lang = ['zh-cn', 'en'],
356
- num_examples = 10*10000,
357
- min_dur=min_dur,
358
- max_dur=max_dur,
359
- add_music=add_music
360
- )
361
-
362
- valid_dataset = WYYSongDataset(
363
- metadata_path = val_jsonl_list,
364
- sr = 48000,
365
- use_lang = ['zh-cn', 'en'],
366
- num_examples = 500,
367
- min_dur=min_dur,
368
- max_dur=max_dur,
369
- add_music=add_music
370
- )
371
- print(train_jsonl_list, "\t total_song = ", len(train_dataset.data))
372
- return train_dataset, valid_dataset
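For reference, a minimal sketch of the timed-lyric format that `parse_lyric` above expects: each line is `[start:end]text` with times in seconds. The sample line below is illustrative, not taken from the real metadata.

```python
import re

line = "[12.340:15.870]hello world"  # made-up lyric line in the expected format
pattern = r'\[(\d+\.\d+):(\d+\.\d+)\](.*)'
match = re.search(pattern, line)
st, et, content = float(match.group(1)), float(match.group(2)), match.group(3)
print(st, et, content)  # 12.34 15.87 hello world
```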
 
codeclm/tokenizer/Flow1dVAE/libs/datasets/dataset_combined.py DELETED
@@ -1,830 +0,0 @@
1
- from torch.utils.data import Dataset
2
- from beartype.typing import Sequence, Callable, Optional, Dict, Tuple, List
3
- from beartype import beartype
4
- from beartype.door import is_bearable
5
- import random
6
- import pandas as pd
7
- import os
8
- from torchaudio.functional import resample
9
- import torch
10
- import typing as tp
11
- from pathlib import Path
12
- import torchaudio as ta
13
- import torch.nn.functional as F
14
- import numpy as np
15
- import json
16
- import yaml
17
- import torchaudio
18
- import math
19
- import re
20
- from loguru import logger
21
-
22
- class Read_and_PadCrop_Normalized_T(torch.nn.Module):
23
- def __init__(self, n_samples: int, sample_rate: int, randomize: bool = True):
24
-
25
- super().__init__()
26
-
27
- self.n_samples = n_samples
28
- self.sample_rate = sample_rate
29
- self.randomize = randomize
30
-
31
- def __call__(self, filename: str, duration: float, cur_sample_rate: int) -> Tuple[torch.Tensor, float, float, int, int]:
32
- if(duration<(float(self.n_samples)/self.sample_rate+1)):
33
- # print(duration,(float(self.n_samples)/self.sample_rate+1))
34
- chunk, _ = torchaudio.load(filename, frame_offset=0, num_frames=-1)
35
- t_start = 0.
36
- t_end = min(1.0, float(self.n_samples) / float(self.sample_rate) / duration)
37
- offset = 0
38
- # print('c1:',chunk.shape)
39
- else:
40
- offset = np.random.randint(0,int(duration*cur_sample_rate)-int(float(self.n_samples)/self.sample_rate*cur_sample_rate))
41
- t_start = offset / float(cur_sample_rate) / duration
42
- t_end = t_start + float(self.n_samples) / float(self.sample_rate) / duration
43
- chunk, _ = torchaudio.load(filename, frame_offset=offset, num_frames=int(float(self.n_samples)/self.sample_rate*cur_sample_rate))
44
- # print('offset:',offset)
45
- # print('c0:',chunk.shape)
46
- # Pad with silence if necessary.
47
- if(chunk.shape[0]>1):
48
- chunk = chunk[torch.randint(chunk.shape[0], size=(1,)),:].float()
49
- else:
50
- chunk = chunk[[0],:].float()
51
- if(cur_sample_rate!=self.sample_rate):
52
- # print('a:',cur_sample_rate,chunk.shape)
53
- chunk = torchaudio.functional.resample(chunk, cur_sample_rate, self.sample_rate)
54
- # print('b:',self.sample_rate,chunk.shape)
55
- if chunk.shape[-1] < self.n_samples:
56
- chunk = torch.cat([chunk, torch.zeros((1, self.n_samples - chunk.shape[-1],))],-1)
57
- else:
58
- chunk = chunk[:,0:self.n_samples]
59
- seconds_start = math.floor(offset / cur_sample_rate)
60
- seconds_total = math.floor(duration)
61
-
62
- return (
63
- chunk,
64
- t_start,
65
- t_end,
66
- seconds_start,
67
- seconds_total
68
- )
69
-
70
-
71
- USE_DUMMY_AUDIO = False  # Set to True only when testing the code: real audio is not read and generated silent audio is returned instead
72
- if USE_DUMMY_AUDIO:
73
- logger.warning("USE_DUMMY_AUDIO flag is True, don't use it when train or test!")
74
-
75
- class SafeAudioReader:
76
- """
77
-     This class is an adaptor around Read_and_PadCrop_Normalized_T that makes audio reading safe.
78
- """
79
- def __init__(self,
80
-                  duration: float,  # length (in seconds) of the returned audio
81
-                  sample_rate: int,  # sample rate of the returned audio; resampled if it differs from the source sample rate
82
- randomize: bool = True
83
- ):
84
- self.n_samples = int(sample_rate * max(duration, 0))
85
- self.reader = Read_and_PadCrop_Normalized_T(n_samples=self.n_samples, sample_rate=sample_rate, randomize=randomize)
86
-
87
-     # NOTE: this is the core function; every dataset reads audio through it!
88
- def __call__(self,
89
-                  filepath: os.PathLike,  # path to the audio file
90
-                  origin_sample_rate: Optional[int] = None,  # actual sample rate read from the json metadata; if not given, it is read from the file header
91
-                  origin_duration: float = None,  # actual duration read from the json metadata; if not given, it is read from the file header
92
- ) -> torch.Tensor:
93
- if USE_DUMMY_AUDIO:
94
- wav = torch.zeros(self.n_samples, dtype=torch.float32)
95
- return wav
96
- try:
97
- if origin_sample_rate is None or origin_duration is None:
98
- audio_info = torchaudio.info(filepath)
99
- origin_sample_rate = audio_info.sample_rate
100
- origin_duration = audio_info.num_frames / origin_sample_rate
101
- wav, *ignored = self.reader(filepath, origin_duration, origin_sample_rate)
102
- except Exception as e:
103
- logger.error(f"Error reading {filepath}: {e}")
104
- wav = torch.zeros(self.n_samples, dtype=torch.float32)
105
- return wav
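A minimal usage sketch of `SafeAudioReader` as defined above; the file path is a placeholder. On any read error the reader logs it and falls back to a silent clip of the requested length.

```python
# Hypothetical usage; "song.flac" is a placeholder path.
reader = SafeAudioReader(duration=10.0, sample_rate=48000)
wav = reader("song.flac")  # sample rate / duration are read from the file header if not given
# `wav` always covers 10 s at 48 kHz; it is all zeros if loading failed.
```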
106
-
107
-
108
- class PromptTemplate:
109
- def __init__(self, template_text: str, tag_map: Dict[str, str], lang:str ='en'):
110
- self.template_text = template_text
111
- self.tag_map = tag_map
112
- self.lang = lang
113
-
114
- @property
115
- def tags(self):
116
- return tuple(self.tag_map.keys())
117
-
118
- def apply(self, **kwargs):
119
- for tag in list(kwargs.keys()):
120
- if kwargs[tag] == '':
121
- kwargs.pop(tag)
122
- for tag in self.tags:
123
- if tag in kwargs:
124
- kwargs[tag] = self.tag_map[tag].format(**{tag: kwargs[tag]}).strip('[]')
125
- else:
126
- kwargs[tag] = ''
127
- prompt = self.template_text.format(**kwargs)
128
-
129
- return self.beautify(prompt)
130
-
131
- def beautify(self, text):
132
- if self.lang == 'en':
133
- return self._beautify_en(text)
134
- elif self.lang == 'zh':
135
- return self._beautify_zh(text)
136
- else:
137
- raise ValueError(f'Unknown language {self.lang}')
138
-
139
- @staticmethod
140
- def _beautify_en(text):
141
- # no continuous commas without content between them
142
- text = re.sub(r'[,\s]*,[,\s]*', r', ', text)
143
- # no continuous whitespace
144
- text = re.sub(r'\s+', ' ', text)
145
- # the comma is NOT followed by whitespace, and should be followed by ONE whitespace
146
- text = re.sub(r'\s+,', r',', text)
147
- text = re.sub(r',\s+', r', ', text)
148
- # no whitespace before the full stop
149
- text = re.sub(r'\s+\.', r'.', text)
150
- # strip whitespace, comma, and replace ',.'
151
- text = text.strip(' ,')
152
- text = text.replace(',.', '.')
153
- return text
154
-
155
- @staticmethod
156
- def _beautify_zh(text):
157
- # no continuous commas without content between them
158
- text = re.sub(r'[,、\s]*,[,、\s]*', r',', text)
159
- text = re.sub(r'[,、\s]*、[,、\s]*', r'、', text)
160
- # assume there should be NO whitespace in Chinese
161
- text = re.sub(r'\s+', r'', text)
162
- # strip whitespace, comma, and replace ',。'
163
- text = text.strip(', 、')
164
- text = text.replace(',。', '。')
165
- return text
166
-
167
- def __repr__(self):
168
- return f'PromptTemplate({self.template_text!r}, {self.tag_map!r})'
169
-
170
- __str__ = __repr__
171
-
172
- def parse_prompt_template(prompt_template_text, lang='en'):
173
- span_pattern = re.compile(r'\[.*?{.+?}.*?\]', re.DOTALL)
174
- tag_pattern = re.compile(r'{.+?}', re.DOTALL)
175
-
176
- template_text = prompt_template_text.strip()
177
- span_texts = span_pattern.findall(prompt_template_text)
178
- tag_map = {}
179
- for span_text in span_texts:
180
- tag = tag_pattern.findall(span_text)[0].strip('{}')
181
- tag_map[tag] = span_text
182
- template_text = template_text.replace(span_text, '{'+tag+'}')
183
-
184
- return PromptTemplate(template_text=template_text, tag_map=tag_map, lang=lang)
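To make the template syntax concrete: bracketed spans are optional fragments keyed by the placeholder inside them, and `apply` drops any span whose tag was not supplied. A small illustrative example (the template string below is made up, not from the repository's prompt files):

```python
pt = parse_prompt_template("This is [a {genre} track][, with a {mood} mood].", lang='en')
print(pt.tags)                               # ('genre', 'mood')
print(pt.apply(genre='rock'))                # This is a rock track.
print(pt.apply(genre='jazz', mood='calm'))   # This is a jazz track, with a calm mood.
```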
185
-
186
- def load_prompt_templates(path, num = 5, lang='en') -> List[PromptTemplate]:
187
- with open(path, 'r') as f:
188
- lines = f.readlines()
189
- cnt = 0
190
- pts = []
191
- for line in lines:
192
- pt = parse_prompt_template(line, lang=lang)
193
- cnt += 1
194
- if len(pt.tags) < num:
195
- logger.error(f'Not enough tags on {path} in line {cnt}: {pt.tags}')
196
- pts.append(pt)
197
-
198
- return pts
199
-
200
-
201
- def get_base_dir_file(key: os.PathLike):
202
- base = os.path.basename(key)
203
- dirname = os.path.basename(os.path.dirname(key))
204
- return os.path.join(dirname, base)
205
-
206
- def read_jsonlike(path: os.PathLike):
207
- #json or jsonl
208
- if str(path).endswith(".json"):
209
- with open(path, 'r', encoding='utf8') as f:
210
- data = json.load(f)
211
- return data
212
- elif str(path).endswith(".jsonl"):
213
- with open(path, 'r', encoding='utf8') as f:
214
- data = [json.loads(line) for line in f.readlines()]
215
- return data
216
- else:
217
- raise ValueError("Unknown file format")
218
-
219
- dist_prob_map = {
220
- 1: (1.0,),
221
- 2: (0.5, 0.5),
222
- 3: (0.3, 0.4, 0.3),
223
- 4: (0.2, 0.3, 0.3, 0.2),
224
- 5: (0.2, 0.2, 0.3, 0.2, 0.1),
225
- 6: (0.1, 0.15, 0.2, 0.2, 0.2, 0.15),
226
- 7: (0.05, 0.1, 0.1, 0.2, 0.25, 0.2, 0.1),
227
- 8: (0.03, 0.05, 0.1, 0.15, 0.25, 0.2, 0.1, 0.12),
228
- 9: (0.02, 0.1, 0.1, 0.1, 0.15, 0.2, 0.15, 0.1, 0.08),
229
- 10: (0.01, 0.1, 0.1, 0.15, 0.2, 0.15, 0.1, 0.05, 0.05, 0.09)
230
- }
231
-
232
- dist_prob_map_low = {
233
- 1: (1.0,),
234
- 2: (0.8, 0.2),
235
- 3: (0.8, 0.1, 0.1),
236
- 4: (0.7, 0.1, 0.1, 0.1),
237
- 5: (0.7, 0.1, 0.1, 0.05, 0.05),
238
- 6: (0.7, 0.1, 0.05, 0.05, 0.05, 0.05),
239
- }
240
-
241
- _bpm_range_rights = (
242
- (40, '20-40'),
243
- (60, '40-60'),
244
- (66, '60-66'),
245
- (76, '66-76'),
246
- (108, '76-108'),
247
- (120, '108-120'),
248
- (168, '120-168'),
249
- (176, '168-176'),
250
- (200, '176-200')
251
- )
252
- _bpm_desc_map = {
253
- '20-40': ("glacial pace", "extremely slow tempo", "crawl-like speed", "snail's pace", "almost motionless rhythm", "Larghissimo"),
254
- '40-60': ("broad and slow", "spacious tempo", "unhurried pace", "calm rhythm", "relaxed speed", "Largo"),
255
- '60-66': ("gentle tempo", "leisurely pace", "easy-going rhythm", "unrushed speed", "smooth and slow", 'Larghetto'),
256
- '66-76': ("slow and steady", "deliberate tempo", "unhurried pace", "relaxed rhythm", "easy speed", 'Adagio'),
257
- '76-108': ("walking pace", "moderate tempo", "steady rhythm", "balanced speed", "easy-flowing tempo", "Andante"),
258
- '108-120': ("medium pace", "comfortable tempo", "even rhythm", "measured speed", "controlled tempo", 'Moderato'),
259
- '120-168': ("quick and lively", "brisk pace", "energetic tempo", "upbeat rhythm", "spirited speed", 'Allegro'),
260
- '168-176': ("lively and fast", "bright tempo", "sprightly pace", "vibrant rhythm", "animated speed", 'Vivace'),
261
- '176-200': ("very fast tempo", "rapid pace", "high-speed rhythm", "hurried speed", "accelerated tempo", 'Presto'),
262
- '>200': ("extremely fast", "breakneck speed", "blazing tempo", "lightning-fast rhythm", "supercharged pace", 'Prestissimo')
263
- }
264
- _bpm_desc_map_zh = {
265
- '20-40': ("极度缓慢", "极慢的节奏", "悠长的旋律", "迟缓的节奏", "几乎静止的节奏", "甚缓"),
266
- '40-60': ("宽广而缓慢", "宽敞的节奏", "从容不迫的速度", "平静的节奏", "轻松的速度", "广板"),
267
- '60-66': ("柔和的节奏", "悠闲的速度", "轻松的节奏", "不慌不忙的速度", "平滑而缓慢", '小广板'),
268
- '66-76': ("缓慢而稳定", "沉稳的旋律", "从容不迫的速度", "轻松的节奏", "轻松的速度", '慢板'),
269
- '76-108': ("步行速度", "适中的节奏", "稳定的节奏", "平衡的速度", "流畅的节奏", "行板"),
270
- '108-120': ("中等速度", "舒适的节奏", "均匀的节奏", "有节制的速度", "稳定的氛围", '中板'),
271
- '120-168': ("快速而生动", "轻快的速度", "充满活力的节奏", "欢快的节奏", "富有精神的速度", '快板'),
272
- '168-176': ("生动而快速", "明快的节奏", "活泼的速度", "充满活力的节奏", "生气勃勃的速度", '活泼的'),
273
- '176-200': ("非常快的节奏", "快速的速度", "高速的节奏", "匆忙的速度", "加速的节奏", '急板'),
274
- '>200': ("极快的速度", "极速旋律", "炽热的节奏", "闪电般的节奏", "疾驰的速度", '最急板')
275
- }
276
- def get_bpm_range(bpm):
277
- bpm = int(bpm)
278
- for right, tag in _bpm_range_rights:
279
- if bpm <= right:
280
- return tag
281
- return '>200'
282
-
283
- def gen_bpm_descript(bpm, lang='en'):
284
- bpm_range = get_bpm_range(bpm)
285
- if lang == 'en':
286
- return random.choice(_bpm_desc_map[bpm_range])
287
- elif lang == 'zh':
288
- return random.choice(_bpm_desc_map_zh[bpm_range])
289
- else:
290
- raise ValueError(f"Unknown language {lang}")
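The two helpers above bucket a BPM value into a tempo range and then sample one of several synonymous descriptions for that range, for example:

```python
print(get_bpm_range(128))                # '120-168'
print(gen_bpm_descript(128))             # e.g. 'brisk pace' or 'Allegro' (random choice)
print(gen_bpm_descript(128, lang='zh'))  # e.g. '快板'
```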
291
-
292
- def read_translate(translate: Optional[Dict[str, os.PathLike]]):
293
- if translate is None:
294
- return None
295
- return {k: read_jsonlike(path) for k, path in translate.items()}
296
-
297
-
298
- class MagnaTagATuneDataset(Dataset):
299
- def __init__(self):
300
- pass
301
-
302
-
303
- def tags_to_desc(tag_list, sep=',') -> str:
304
- if not isinstance(tag_list, Sequence):
305
- return str(tag_list)
306
- if isinstance(tag_list, str):
307
- return tag_list
308
- if len(tag_list) <= 0:
309
- return ''
310
- elif len(tag_list) <= 5:
311
- probs = dist_prob_map[len(tag_list)]
312
- tags_num = random.choices(range(1, len(tag_list)+1), probs)[0]
313
- random.shuffle(tag_list)
314
- tag_list = tag_list[:tags_num]
315
- return sep.join(tag_list)
316
- else:
317
- probs = dist_prob_map[5]
318
- tags_num = random.choices(range(1, 6), probs)[0]
319
- random.shuffle(tag_list)
320
- tag_list = tag_list[:tags_num]
321
- return sep.join(tag_list)
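`tags_to_desc` keeps at most five tags, drawing how many to keep from `dist_prob_map` and shuffling the list in place, so repeated calls on the same input can yield different strings:

```python
tags = ["piano", "ambient", "calm", "night", "rain", "lofi"]
print(tags_to_desc(list(tags)))          # e.g. "calm,rain,piano" (1-5 tags, random order)
print(tags_to_desc("already a string"))  # strings are returned unchanged
print(tags_to_desc(120))                 # non-sequence values are stringified -> "120"
```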
322
-
323
- def get_sr_and_duration_info(item):
324
- return item.get('sample_rate', None), item.get('duration', None)
325
-
326
- class MtgJamendoDatasetFromJson(Dataset):
327
- def __init__(self,
328
- data_dir:str,
329
- json_path:str,
330
- duration:float=10,
331
- sr:int = 0,
332
- *,
333
- lang = 'en',
334
- return_path = False,
335
- prompt_template_path: os.PathLike = None,
336
- tag_types = [],
337
- translate:Optional[Dict[str, os.PathLike]] = None,
338
- ):
339
- self.audio_reader = SafeAudioReader(duration, sr)
340
-
341
- self.data_dir = data_dir
342
- self._load_metadata_json(json_path)
343
- self.sr = sr
344
- self.duration = duration
345
- self.return_path = return_path
346
- self.lang = lang
347
-
348
- self.use_dynamic_prompt = prompt_template_path is not None
349
- if self.use_dynamic_prompt:
350
- self.prompt_templates = load_prompt_templates(prompt_template_path, num = len(tag_types))
351
- self.tag_types = tag_types
352
-
353
- self.translate = read_translate(translate)
354
- if not self.use_dynamic_prompt and self.lang != 'en':
355
- raise NotImplementedError
356
-
357
-     # These tags are considered weakly semantic; prompts that contain only these tags are avoided
358
- WEAK_TAG_LIST = ["title", "artist"]
359
-
360
- def _load_metadata_json(self, json_path):
361
- with open(json_path) as fp:
362
- self.data = json.load(fp)
363
-
364
- def convert_key_to_path(self, key):
365
- return os.path.join(self.data_dir, get_base_dir_file(key))
366
-
367
- def __len__(self):
368
- return len(self.data)
369
-
370
- def __getitem__(self, idx):
371
- item = self.data[idx]
372
- path = self.convert_key_to_path(item['key'])
373
- description = self.generate_description(item)
374
-
375
- sr, duration = get_sr_and_duration_info(item)
376
- audio = self.audio_reader(path, sr, duration)
377
-
378
- if self.return_path:
379
- return audio, description, path
380
- return audio, description
381
-
382
- def tags_to_desc(self, tag_list, tag_type) -> str:
383
- if self.lang == 'en':
384
- return tags_to_desc(tag_list)
385
- elif self.lang == 'zh':
386
- translator = self.translate[tag_type]
387
- translated_tag_list = [translator[tag] for tag in tag_list if tag in translator ]
388
- return tags_to_desc(translated_tag_list, sep='、')
389
-
390
- def generate_description(self, item):
391
- if self.use_dynamic_prompt:
392
- # dynamically generate prompt from given prompt template
393
- prompt_template = random.choice(self.prompt_templates)
394
- description = self.generate_description_dynamic(item, prompt_template)
395
-
396
- else:
397
- # use ordinary static prompt instead
398
- description = self.generate_description_ordinary(item)
399
- return description
400
-
401
- def generate_description_dynamic(self, data, prompt_template: PromptTemplate):
402
- exists_tag = [key for key in data if (key in self.tag_types) and (data[key] is not None) and (len(data[key]) > 0)]
403
- exists_weak_tag = list(filter(lambda t: t in self.WEAK_TAG_LIST, exists_tag))
404
- exists_strong_tag = list(filter(lambda t: t not in self.WEAK_TAG_LIST, exists_tag))
405
-
406
- if len(exists_strong_tag) > 0:
407
- probs = dist_prob_map[len(exists_strong_tag)]
408
- tags_num = random.choices(range(1, len(exists_strong_tag)+1), probs)[0]
409
- random.shuffle(exists_strong_tag)
410
- tags = exists_strong_tag[:tags_num]
411
- weak_probs = dist_prob_map_low[len(exists_weak_tag) + 1]
412
- weak_tags_num = random.choices(range(0, len(exists_weak_tag) + 1), weak_probs)[0]
413
- random.shuffle(exists_weak_tag)
414
- weak_tags = exists_weak_tag[:weak_tags_num]
415
- tags += weak_tags
416
- tags_args = {tag: self.tags_to_desc(data[tag], tag) for tag in tags}
417
- prompt = prompt_template.apply(**tags_args)
418
- else:
419
- # no strong tags, use all weak tags instead
420
- tags_args = {tag: self.tags_to_desc(data[tag], tag) for tag in exists_weak_tag}
421
- prompt = prompt_template.apply(**tags_args)
422
-
423
- return prompt
424
-
425
- def generate_description_ordinary(self, data, thresh = 0.3):
426
- # Initialize the description with title and artist
427
- description = f'"{data["title"]+" is " if random.random() > thresh else ""}"a piece of music by {data["artist"]}'
428
-
429
- # Add genre if available
430
- if data["genre"] and random.random() > thresh:
431
- genres = ', '.join(data["genre"])
432
- description += f', belonging to the {genres} genres'
433
-
434
- # Add moods if available
435
- if data["moods"] and random.random() > thresh:
436
- moods = ', '.join(data["moods"])
437
- description += f'. This track conveys a {moods} mood'
438
-
439
- # Add instruments if available
440
- if data["instrument"] and random.random() > thresh:
441
- instruments = ', '.join(data["instrument"])
442
- description += f', and primarily features the following instruments: {instruments}'
443
-
444
- # Add a period to end the description
445
- description += '.'
446
-
447
- return description
448
-
449
- class AudioStockDataset(Dataset):
450
- def __init__(self,
451
- metadata_path:str,
452
- duration:float=10,
453
- sr:int = 0,
454
- return_path = False,
455
- return_audio = True,
456
- prompt_template_path: os.PathLike = None,
457
- tag_types = [],
458
- lang = 'en',
459
- translate:Optional[Dict[str, os.PathLike]] = None
460
- ):
461
- self.audio_reader = SafeAudioReader(duration, sr)
462
-
463
- self._load_metadata(metadata_path)
464
- self.sr = sr
465
- self.duration = duration
466
- self.return_path = return_path
467
- self.return_audio = return_audio
468
-
469
- self.use_dynamic_prompt = prompt_template_path is not None
470
- if self.use_dynamic_prompt:
471
- self.prompt_templates = load_prompt_templates(prompt_template_path, num = len(tag_types), lang = lang)
472
- self.tag_types = tag_types
473
-
474
- self.lang = lang
475
- self.translate = read_translate(translate)
476
-
477
- def _load_metadata(self, metadata_path):
478
- with open(metadata_path) as fp:
479
- lines = fp.readlines()
480
- self.data = []
481
- for line in lines:
482
- item = json.loads(line)
483
- self.data.append(item)
484
- self.is_info_recorded = bool('Tags' in self.data[0])
485
-
486
- def __len__(self):
487
- return len(self.data)
488
-
489
- def __getitem__(self, idx):
490
- path:str = self.data[idx]["path"]
491
- json_path = path[:path.rfind('.')] + ".json"
492
- if self.is_info_recorded:
493
- item = self.data[idx]
494
- else:
495
- try:
496
- with open(json_path) as fp:
497
- item:dict = json.load(fp)
498
- except Exception as e:
499
- print(f"Error loading json file {json_path} :\n{e}")
500
- item = {}
501
- description = self.generate_description(item)
502
- if self.return_audio:
503
- sr, duration = get_sr_and_duration_info(item)
504
- audio = self.audio_reader(path, sr, duration)
505
- else:
506
- audio = None
507
- if self.return_path:
508
- return audio, description, path
509
- return audio, description
510
-
511
- def generate_description(self, item):
512
- if self.use_dynamic_prompt:
513
- # dynamically generate prompt from given prompt template
514
- prompt_template = random.choice(self.prompt_templates)
515
- description = self.generate_description_dynamic(item, prompt_template)
516
- else:
517
- # use ordinary static prompt instead
518
- description = self.generate_description_ordinary(item)
519
- return description
520
-
521
- def generate_description_dynamic(self, data, prompt_template: PromptTemplate):
522
- exists_tag = [key for key in data if (key in self.tag_types) and (data[key] is not None) and (len(data[key]) > 0)]
523
-
524
- if len(exists_tag) > 0:
525
- probs = dist_prob_map[len(exists_tag)]
526
- tags_num = random.choices(range(1, len(exists_tag)+1), probs)[0]
527
- random.shuffle(exists_tag)
528
- tags = exists_tag[:tags_num]
529
- tags_args = {tag: self.tags_to_desc(data[tag], tag) for tag in tags}
530
- tags_args = self.handle_BPM_tag(tags_args)
531
- prompt = prompt_template.apply(**tags_args)
532
- else:
533
- # no strong tags, use all weak tags instead
534
- prompt = prompt_template.apply()
535
-
536
- return prompt
537
-
538
- def tags_to_desc(self, tag_list, tag_type) -> str:
539
- if self.lang == 'en':
540
- return tags_to_desc(tag_list)
541
- elif self.lang == 'zh':
542
- if tag_type == 'BPM':
543
- return tags_to_desc(tag_list, sep='、')
544
- translator = self.translate[tag_type]
545
- translated_tag_list = [translator[tag] for tag in tag_list if tag in translator ]
546
- return tags_to_desc(translated_tag_list, sep='、')
547
-
548
- def handle_BPM_tag(self, tags_args):
549
- if "BPM" in tags_args and 'BPMDescript' in self.tag_types:
550
- bpm = tags_args["BPM"]
551
- del tags_args["BPM"]
552
- tag_types_used = random.choice((('BPM',), ('BPMDescript',), ('BPM', 'BPMDescript')))
553
- for tag_type in tag_types_used:
554
- tags_args[tag_type] = bpm if tag_type == 'BPM' else gen_bpm_descript(bpm, lang=self.lang)
555
- return tags_args
556
-
557
- def generate_description_ordinary(self, data, thresh = 0.3):
558
- if self.lang != 'en':
559
- raise ValueError(f'Language {self.lang} is not supported for ordinary description generation')
560
- description = f'a piece of music by {data["Artist"]}'
561
-
562
- # Add genre if available
563
- if data["Genre"] and random.random() > thresh:
564
- genres = ', '.join(data["Genre"])
565
- description += f', belonging to the {genres} genres'
566
-
567
- # Add moods if available
568
- if data["Tags"] and random.random() > thresh:
569
- tags = ', '.join(data["Tags"])
570
- description += f'. This track contains the tags:{tags}'
571
-
572
- # Add moods if available
573
- if data["Mood"] and random.random() > thresh:
574
- moods = ', '.join(data["Mood"])
575
- description += f'. This track conveys a {moods} mood.'
576
-
577
- # Add instruments if available
578
- if data["Instrument"] and random.random() > thresh:
579
- instruments = ', '.join(data["Instrument"])
580
- description += f'. and primarily features the following instruments: {instruments}'
581
-
582
- # Add a period to end the description
583
- description += '.'
584
-
585
- return description
586
-
587
- def mp3_path_to_id(mp3_path):
588
- return int(
589
- mp3_path[mp3_path.rindex('/') + 1 : mp3_path.rindex('.mp3')]
590
- )
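`mp3_path_to_id` simply strips the directory and the `.mp3` suffix and casts the numeric stem to an int; the path below is illustrative:

```python
print(mp3_path_to_id("/data/tme/123456.mp3"))  # 123456
```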
591
-
592
- class TmeDataset(Dataset):
593
- def __init__(self,
594
- data_index:str,
595
- music_info:str = None,
596
- duration:float = 10,
597
- sr:int = 0,
598
- return_path = False,
599
- return_audio = True,
600
- prompt_format_path: os.PathLike = None,
601
- tag_types = ['*'],
602
- lang = 'zh',
603
- translate: Optional[os.PathLike] = None,
604
- prompt_dir: os.PathLike = None,
605
- ):
606
- self.audio_reader = SafeAudioReader(duration, sr)
607
-
608
- self.sr = sr
609
- self.duration = duration
610
- self.return_path = return_path
611
- self.return_audio = return_audio
612
- self.lang = lang
613
-
614
- self.use_ready_prompt = prompt_dir is not None
615
-
616
- data_index = read_jsonlike(data_index)
617
- self.data_index_dict = {mp3_path_to_id(d['path']) : d for d in data_index}
618
- self.data_ids = list(self.data_index_dict.keys())
619
-
620
- if not self.use_ready_prompt:
621
-             # Read the music info file
622
- music_info = read_jsonlike(music_info)
623
- if 'music' in music_info:
624
- music_info = music_info['music']
625
- self.music_info_dict = {d["歌曲ID"]:d for d in music_info}
626
- self.data_index_dict = {k:v for k,v in self.data_index_dict.items() if k in self.music_info_dict}
627
- self.data_ids = list(self.data_index_dict.keys())
628
-
629
- with open(prompt_format_path) as fp:
630
- self.prompt_formats = yaml.load(fp, Loader=yaml.FullLoader)
631
-
632
-             # Load the tag types and split them into ordinary tag_types and key key_tag_types
633
- if '*' in tag_types:
634
- self.tag_types = ['歌曲名', 'bpm', '专辑名', '歌手名', '作曲', 'tag']
635
- else:
636
- self.tag_types = tag_types
637
-
638
- self.key_tag_types = []
639
- if 'tag' in self.tag_types:
640
- self.tag_types.remove('tag')
641
- self.key_tag_types = list(self.prompt_formats['tag'].keys())
642
-
643
-             # Load the translation mapping
644
- if translate is not None:
645
- self.translator = read_jsonlike(translate)
646
- else:
647
- data_ids_set = set(self.data_ids)
648
- self.prompts_dict = {}
649
- for fname in os.listdir(prompt_dir):
650
- items = read_jsonlike(os.path.join(prompt_dir, fname))
651
- for item in items:
652
- if item['ID'] not in data_ids_set or not self.is_valid_prompt_text(item['Text']):
653
- continue
654
- if item['ID'] not in self.prompts_dict:
655
- self.prompts_dict[item['ID']] = []
656
- self.prompts_dict[item['ID']].append(item['Text'])
657
- self.data_index_dict = {k:v for k,v in self.data_index_dict.items() if k in self.prompts_dict}
658
- self.data_ids = list(self.data_index_dict.keys())
659
-
660
- def tags_to_desc(self, tag_list) -> str:
661
- if is_bearable(tag_list, int):
662
- return str(tag_list)
663
- if self.lang == 'zh':
664
- return tags_to_desc(tag_list, sep=self.sep)
665
- else:
666
- translated_tag_list = [self.translator[tag] for tag in tag_list if tag in self.translator ]
667
- return tags_to_desc(translated_tag_list, sep=self.sep)
668
-
669
- def gen_desc_of_tag(self, formats, tags):
670
- fmt = random.choice(formats)
671
- return fmt.format(self.tags_to_desc(tags))
672
-
673
- @staticmethod
674
- def check_valid(value):
675
- if isinstance(value, int) or isinstance(value, float):
676
- return value > 0
677
- if (value is not None) and (not isinstance(value, Sequence) or len(value) > 0):
678
- return True
679
- return False
680
-
681
- @staticmethod
682
- def remove_repeat(data):
683
-         # If the album name is identical to the song name, keep only the latter
684
- album_name = data.get('专辑名', None)
685
- if album_name is not None and album_name == data.get('歌曲名', None):
686
- del data['专辑名']
687
- return data
688
-
689
- @property
690
- def comma(self):
691
- if self.lang == 'zh':
692
- return ','
693
- elif self.lang == 'en':
694
- return ', '
695
-
696
- @property
697
- def sep(self):
698
- if self.lang == 'zh':
699
- return '、'
700
- elif self.lang == 'en':
701
- return ', '
702
-
703
- def generate_description(self, data):
704
- data = self.remove_repeat(data)
705
-         weak_tags = [key for key in data if (key in self.tag_types and self.check_valid(data[key]))]  # weakly semantic tags; they are sampled with a lower probability
706
-
707
-         key_tags = [key for key in data['tag'] if (key in self.key_tag_types and self.check_valid(data['tag'][key]))]  # key tags; at least one of them must appear
708
-
709
- prompts = []
710
- if len(weak_tags) > 0:
711
- probs = dist_prob_map_low[len(weak_tags)]
712
- if len(key_tags) > 0:
713
- tags_num = random.choices(range(0, len(weak_tags)), probs)[0]
714
- else:
715
- tags_num = random.choices(range(1, len(weak_tags) + 1), probs)[0]
716
- random.shuffle(weak_tags)
717
- tags = weak_tags[:tags_num]
718
- for tag_type in tags:
719
- tag_desc = self.gen_desc_of_tag(self.prompt_formats[tag_type], int(data[tag_type]) if tag_type == 'bpm' else data[tag_type])
720
- prompts.append(tag_desc)
721
-
722
- if len(key_tags) > 0:
723
- probs = dist_prob_map[len(key_tags)]
724
- tags_num = random.choices(range(1, len(key_tags) + 1), probs)[0]
725
- random.shuffle(key_tags)
726
- tags = key_tags[:tags_num]
727
- for tag_type in tags:
728
- tag_desc = self.gen_desc_of_tag(self.prompt_formats['tag'][tag_type], data['tag'][tag_type])
729
- prompts.append(tag_desc)
730
-
731
- random.shuffle(prompts)
732
- return self.comma.join(prompts)
733
-
734
- def is_valid_prompt_text(self, text):
735
- for bad in ('抱歉','sorry', 'Sorry'):
736
- if bad in text:
737
- return False
738
- return True
739
-
740
- def get_ready_prompt(self, path):
741
- sid = mp3_path_to_id(path)
742
- return random.choice(self.prompts_dict[sid])
743
-
744
- def __len__(self):
745
- return len(self.data_ids)
746
-
747
- def __getitem__(self, idx):
748
- data_id = self.data_ids[idx]
749
- item = self.data_index_dict[data_id]
750
- path = item['path']
751
- if not self.use_ready_prompt:
752
- info = self.music_info_dict[data_id]
753
- description = self.generate_description(info)
754
- else:
755
- description = self.get_ready_prompt(path)
756
- if self.return_audio:
757
- sr, duration = get_sr_and_duration_info(item)
758
- audio = self.audio_reader(path, sr, duration)
759
- else:
760
- audio = None
761
- if self.return_path:
762
- return audio, description, path
763
- return audio, description
764
-
765
- class CombinedDataset(Dataset):
766
- @beartype
767
- def __init__(self, datasets: Sequence[Dataset], ratios: Sequence[int]):
768
- self.datasets = datasets
769
- self.datasets_index = []
770
-
771
- for i,dataset in enumerate(datasets):
772
- if dataset is None:
773
- continue
774
- for dup in range(ratios[i]):
775
- for j in range(len(dataset)):
776
- self.datasets_index.append((i,j))
777
-
778
- def __len__(self):
779
- return len(self.datasets_index)
780
-
781
- def __getitem__(self, idx):
782
- index = self.datasets_index[idx]
783
- i,j = index
784
- return self.datasets[i][j]
785
-
786
- class CombinedDataset_random(Dataset):
787
- @beartype
788
- def __init__(self,
789
- num_examples:int,
790
- datasets: Sequence[Dataset], ratios: Sequence[int]
791
- ):
792
- self.datasets = datasets
793
- self.datasets_index = []
794
-
795
- for i,dataset in enumerate(datasets):
796
- if dataset is None:
797
- continue
798
- for dup in range(ratios[i]):
799
- for j in range(len(dataset)):
800
- self.datasets_index.append((i,j))
801
- if num_examples > 0:
802
- self.random_choose = True
803
- self.dataset_len = num_examples
804
- else:
805
- self.random_choose = False
806
- self.dataset_len = len(self.datasets_index)
807
-
808
- def __len__(self):
809
- return self.dataset_len
810
-
811
- def __getitem__(self, idx):
812
- first_try = True
813
- try_cnt = 0
814
- while True:
815
- try:
816
- if(self.random_choose or not first_try):
817
- index2 = []
818
- index2.append(np.random.randint(0,len(self.datasets)))
819
- index2.append(np.random.randint(0,len(self.datasets[index2[-1]])))
820
- else:
821
- index2 = self.datasets_index[idx]
822
- first_try = False
823
- out = self.datasets[index2[0]][index2[1]]
824
- if(len(out[0].shape)==1):out[0]=out[0][None,:]
825
- return out
826
- except:
827
-                 print("Error loading ", index2)
828
- try_cnt += 1
829
- if(try_cnt>10):
830
- raise ValueError()
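A small sketch of how the ratio-based index expansion in `CombinedDataset` behaves, using toy in-memory datasets (the sizes and ratios are illustrative); `CombinedDataset_random` builds the same index but can also draw random samples up to `num_examples`:

```python
import torch
from torch.utils.data import TensorDataset

ds_a = TensorDataset(torch.zeros(3, 1))   # 3 items
ds_b = TensorDataset(torch.ones(2, 1))    # 2 items
combined = CombinedDataset([ds_a, ds_b], ratios=[2, 1])
# ds_a is traversed twice and ds_b once: 2*3 + 1*2 = 8 indexable items.
print(len(combined))  # 8
```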
 
codeclm/tokenizer/Flow1dVAE/libs/datasets/dataset_combined_withset.py DELETED
@@ -1,994 +0,0 @@
1
- from torch.utils.data import Dataset
2
- from beartype.typing import Sequence, Callable, Optional, Dict, Tuple, List
3
- from beartype import beartype
4
- from beartype.door import is_bearable
5
- import random
6
- import pandas as pd
7
- import os
8
- from torchaudio.functional import resample
9
- import torch
10
- import typing as tp
11
- from pathlib import Path
12
- import torchaudio as ta
13
- import torch.nn.functional as F
14
- import numpy as np
15
- import json
16
- import yaml
17
- import torchaudio
18
- import math
19
- import re
20
- from loguru import logger
21
-
22
- def gen_plain_prompt(key_list, sep=', '):
23
- if len(key_list) == 0:
24
- return 'none'
25
-
26
- key_list = [k.strip() for k in key_list]
27
-
28
- if len(key_list) > 10:
29
- random.shuffle(key_list)
30
- key_list = key_list[:10]
31
-
32
- probs = dist_prob_map[len(key_list)]
33
-
34
- num_tags = random.choices(range(1, len(key_list)+1), probs, k=1)[0]
35
-
36
- random.shuffle(key_list)
37
- tags = key_list[:num_tags]
38
- tags_str = sep.join(tags)
39
- return tags_str
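`gen_plain_prompt` caps the key list at ten entries, draws how many to keep from `dist_prob_map` (defined further down in this file), and joins a shuffled subset; an illustrative call:

```python
print(gen_plain_prompt([]))                           # 'none'
print(gen_plain_prompt(['piano ', 'calm', 'night']))  # e.g. 'calm, piano' (random subset)
```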
40
-
41
- class Read_and_PadCrop_Normalized_T(torch.nn.Module):
42
-
43
- def __init__(self, n_samples: int, sample_rate: int, randomize: bool = True):
44
-
45
- super().__init__()
46
-
47
- self.n_samples = n_samples
48
- self.sample_rate = sample_rate
49
- self.randomize = randomize
50
- self.prob = {"is_start":0.2, "is_end":0.9}
51
- self.shift_secs = 5
52
-
53
- def __call__(self, filename: str, duration: float, cur_sample_rate: int) -> Tuple[torch.Tensor, float, float, int, int]:
54
- if(duration<(float(self.n_samples)/self.sample_rate+1)):
55
- raise ValueError(duration,float(self.n_samples),self.sample_rate)
56
- chunk, _ = torchaudio.load(filename, frame_offset=0, num_frames=-1)
57
- t_start = 0.
58
- t_end = min(1.0, float(self.n_samples) / float(self.sample_rate) / duration)
59
- offset = 0
60
- is_start = True
61
- is_end = True
62
- else:
63
- prob = random.uniform(0,1)
64
- if(prob<self.prob['is_start']):
65
- is_start = True
66
- is_end = False
67
- offset = 0
68
- elif(prob>self.prob['is_end']):
69
- is_start = False
70
- is_end = True
71
- offset = int(duration*cur_sample_rate)-int(float(self.n_samples)/self.sample_rate*cur_sample_rate)
72
- else:
73
- is_start = False
74
- is_end = False
75
- offset = np.random.randint(self.shift_secs*cur_sample_rate, \
76
- int(duration*cur_sample_rate)-int(float(self.n_samples)/self.sample_rate*cur_sample_rate)-self.shift_secs*cur_sample_rate)
77
- t_start = offset / float(cur_sample_rate) / duration
78
- t_end = t_start + float(self.n_samples) / float(self.sample_rate) / duration
79
- chunk, _ = torchaudio.load(filename, frame_offset=offset, num_frames=int(float(self.n_samples)/self.sample_rate*cur_sample_rate))
80
- if(chunk.shape[0]>1):
81
- chunk = chunk[torch.randint(chunk.shape[0], size=(1,)),:].float()
82
- else:
83
- chunk = chunk[[0],:].float()
84
- if(cur_sample_rate!=self.sample_rate):
85
- # print('a:',cur_sample_rate,chunk.shape)
86
- chunk = torchaudio.functional.resample(chunk, cur_sample_rate, self.sample_rate)
87
- # print('b:',self.sample_rate,chunk.shape)
88
- if chunk.shape[-1] != self.n_samples:
89
- raise ValueError(chunk.shape, self.n_samples, offset, int(float(self.n_samples)/self.sample_rate*cur_sample_rate))
90
- # if chunk.shape[-1] < self.n_samples:
91
- # chunk = torch.cat([chunk, torch.zeros((1, self.n_samples - chunk.shape[-1],))],-1)
92
- # else:
93
- # chunk = chunk[:,0:self.n_samples]
94
- seconds_start = math.floor(offset / cur_sample_rate)
95
- seconds_total = math.floor(duration)
96
-
97
- # # In this dataset, we do not introduce zeros
98
- # if(is_start):
99
- # chunk = torch.cat([torch.zeros(1, self.shift_secs*self.sample_rate), chunk],1)[:,0:self.n_samples]
100
- # elif(is_end):
101
- # chunk = torch.cat([chunk, torch.zeros(1, self.shift_secs*self.sample_rate)],1)[:,self.shift_secs*self.sample_rate:]
102
-
103
- return (
104
- chunk,
105
- t_start,
106
- t_end,
107
- seconds_start,
108
- seconds_total,
109
- is_start,
110
- is_end,
111
- )
112
-
113
-
114
- USE_DUMMY_AUDIO = False  # Set to True only when testing the code: real audio is not read and generated silent audio is returned instead
115
- if USE_DUMMY_AUDIO:
116
- logger.warning("USE_DUMMY_AUDIO flag is True, don't use it when train or test!")
117
-
118
- class SafeAudioReader:
119
- """
120
-     This class is an adaptor around Read_and_PadCrop_Normalized_T that makes audio reading safe.
121
- """
122
- def __init__(self,
123
-                  duration: float,  # length (in seconds) of the returned audio
124
-                  sample_rate: int,  # sample rate of the returned audio; resampled if it differs from the source sample rate
125
- randomize: bool = True
126
- ):
127
- self.n_samples = int(sample_rate * max(duration, 0))
128
- self.reader = Read_and_PadCrop_Normalized_T(n_samples=self.n_samples, sample_rate=sample_rate, randomize=randomize)
129
-
130
-     # NOTE: this is the core function; every dataset reads audio through it!
131
- def __call__(self,
132
-                  filepath: os.PathLike,  # path to the audio file
133
-                  origin_sample_rate: Optional[int] = None,  # actual sample rate read from the json metadata; if not given, it is read from the file header
134
-                  origin_duration: float = None,  # actual duration read from the json metadata; if not given, it is read from the file header
135
- ) -> torch.Tensor:
136
- if USE_DUMMY_AUDIO:
137
- wav = torch.zeros(self.n_samples, dtype=torch.float32)
138
- return wav
139
- try:
140
- # if origin_sample_rate is None or origin_duration is None:
141
- # audio_info = torchaudio.info(filepath)
142
- # origin_sample_rate = audio_info.sample_rate
143
- # origin_duration = audio_info.num_frames / origin_sample_rate
144
- audio_info = torchaudio.info(filepath)
145
- origin_sample_rate = audio_info.sample_rate
146
- origin_duration = audio_info.num_frames / origin_sample_rate
147
- wav, *ignored, is_start, is_end = self.reader(filepath, origin_duration, origin_sample_rate)
148
- except Exception as e:
149
- logger.error(f"Error reading {filepath}: {e}")
150
- raise FileNotFoundError(filepath)
151
- return wav, is_start, is_end
152
-
153
-
154
- class PromptTemplate:
155
- def __init__(self, template_text: str, tag_map: Dict[str, str], lang:str ='en'):
156
- self.template_text = template_text
157
- self.tag_map = tag_map
158
- self.lang = lang
159
-
160
- @property
161
- def tags(self):
162
- return tuple(self.tag_map.keys())
163
-
164
- def apply(self, **kwargs):
165
- for tag in list(kwargs.keys()):
166
- if kwargs[tag] == '':
167
- kwargs.pop(tag)
168
- for tag in self.tags:
169
- if tag in kwargs:
170
- kwargs[tag] = self.tag_map[tag].format(**{tag: kwargs[tag]}).strip('[]')
171
- else:
172
- kwargs[tag] = ''
173
- prompt = self.template_text.format(**kwargs)
174
-
175
- return self.beautify(prompt)
176
-
177
- def beautify(self, text):
178
- if self.lang == 'en':
179
- return self._beautify_en(text)
180
- elif self.lang == 'zh':
181
- return self._beautify_zh(text)
182
- else:
183
- raise ValueError(f'Unknown language {self.lang}')
184
-
185
- @staticmethod
186
- def _beautify_en(text):
187
- # no continuous commas without content between them
188
- text = re.sub(r'[,\s]*,[,\s]*', r', ', text)
189
- # no continuous whitespace
190
- text = re.sub(r'\s+', ' ', text)
191
- # the comma is NOT followed by whitespace, and should be followed by ONE whitespace
192
- text = re.sub(r'\s+,', r',', text)
193
- text = re.sub(r',\s+', r', ', text)
194
- # no whitespace before the full stop
195
- text = re.sub(r'\s+\.', r'.', text)
196
- # strip whitespace, comma, and replace ',.'
197
- text = text.strip(' ,')
198
- text = text.replace(',.', '.')
199
- return text
200
-
201
- @staticmethod
202
- def _beautify_zh(text):
203
- # no continuous commas without content between them
204
- text = re.sub(r'[,、\s]*,[,、\s]*', r',', text)
205
- text = re.sub(r'[,、\s]*、[,、\s]*', r'、', text)
206
- # assume there should be NO whitespace in Chinese
207
- text = re.sub(r'\s+', r'', text)
208
- # strip whitespace, comma, and replace ',。'
209
- text = text.strip(', 、')
210
- text = text.replace(',。', '。')
211
- return text
212
-
213
- def __repr__(self):
214
- return f'PromptTemplate({self.template_text!r}, {self.tag_map!r})'
215
-
216
- __str__ = __repr__
217
-
218
- def parse_prompt_template(prompt_template_text, lang='en'):
219
- span_pattern = re.compile(r'\[.*?{.+?}.*?\]', re.DOTALL)
220
- tag_pattern = re.compile(r'{.+?}', re.DOTALL)
221
-
222
- template_text = prompt_template_text.strip()
223
- span_texts = span_pattern.findall(prompt_template_text)
224
- tag_map = {}
225
- for span_text in span_texts:
226
- tag = tag_pattern.findall(span_text)[0].strip('{}')
227
- tag_map[tag] = span_text
228
- template_text = template_text.replace(span_text, '{'+tag+'}')
229
-
230
- return PromptTemplate(template_text=template_text, tag_map=tag_map, lang=lang)
231
-
232
- def load_prompt_templates(path, num = 5, lang='en') -> List[PromptTemplate]:
233
- with open(path, 'r') as f:
234
- lines = f.readlines()
235
- cnt = 0
236
- pts = []
237
- for line in lines:
238
- pt = parse_prompt_template(line, lang=lang)
239
- cnt += 1
240
- if len(pt.tags) < num:
241
- logger.error(f'Not enough tags on {path} in line {cnt}: {pt.tags}')
242
- pts.append(pt)
243
-
244
- return pts
245
-
246
-
247
- def get_base_dir_file(key: os.PathLike):
248
- base = os.path.basename(key)
249
- dirname = os.path.basename(os.path.dirname(key))
250
- return os.path.join(dirname, base)
251
-
252
- def read_jsonlike(path: os.PathLike):
253
- #json or jsonl
254
- if str(path).endswith(".json"):
255
- with open(path, 'r', encoding='utf8') as f:
256
- data = json.load(f)
257
- return data
258
- elif str(path).endswith(".jsonl"):
259
- with open(path, 'r', encoding='utf8') as f:
260
- data = [json.loads(line) for line in f.readlines()]
261
- return data
262
- else:
263
- raise ValueError("Unknown file format")
264
-
265
- dist_prob_map = {
266
- 1: (1.0,),
267
- 2: (0.5, 0.5),
268
- 3: (0.3, 0.4, 0.3),
269
- 4: (0.2, 0.3, 0.3, 0.2),
270
- 5: (0.2, 0.2, 0.3, 0.2, 0.1),
271
- 6: (0.1, 0.15, 0.2, 0.2, 0.2, 0.15),
272
- 7: (0.05, 0.1, 0.1, 0.2, 0.25, 0.2, 0.1),
273
- 8: (0.03, 0.05, 0.1, 0.15, 0.25, 0.2, 0.1, 0.12),
274
- 9: (0.02, 0.1, 0.1, 0.1, 0.15, 0.2, 0.15, 0.1, 0.08),
275
- 10: (0.01, 0.1, 0.1, 0.15, 0.2, 0.15, 0.1, 0.05, 0.05, 0.09)
276
- }
277
-
278
- dist_prob_map_low = {
279
- 1: (1.0,),
280
- 2: (0.8, 0.2),
281
- 3: (0.8, 0.1, 0.1),
282
- 4: (0.7, 0.1, 0.1, 0.1),
283
- 5: (0.7, 0.1, 0.1, 0.05, 0.05),
284
- 6: (0.7, 0.1, 0.05, 0.05, 0.05, 0.05),
285
- }
286
-
287
- _bpm_range_rights = (
288
- (40, '20-40'),
289
- (60, '40-60'),
290
- (66, '60-66'),
291
- (76, '66-76'),
292
- (108, '76-108'),
293
- (120, '108-120'),
294
- (168, '120-168'),
295
- (176, '168-176'),
296
- (200, '176-200')
297
- )
298
- _bpm_desc_map = {
299
- '20-40': ("glacial pace", "extremely slow tempo", "crawl-like speed", "snail's pace", "almost motionless rhythm", "Larghissimo"),
300
- '40-60': ("broad and slow", "spacious tempo", "unhurried pace", "calm rhythm", "relaxed speed", "Largo"),
301
- '60-66': ("gentle tempo", "leisurely pace", "easy-going rhythm", "unrushed speed", "smooth and slow", 'Larghetto'),
302
- '66-76': ("slow and steady", "deliberate tempo", "unhurried pace", "relaxed rhythm", "easy speed", 'Adagio'),
303
- '76-108': ("walking pace", "moderate tempo", "steady rhythm", "balanced speed", "easy-flowing tempo", "Andante"),
304
- '108-120': ("medium pace", "comfortable tempo", "even rhythm", "measured speed", "controlled tempo", 'Moderato'),
305
- '120-168': ("quick and lively", "brisk pace", "energetic tempo", "upbeat rhythm", "spirited speed", 'Allegro'),
306
- '168-176': ("lively and fast", "bright tempo", "sprightly pace", "vibrant rhythm", "animated speed", 'Vivace'),
307
- '176-200': ("very fast tempo", "rapid pace", "high-speed rhythm", "hurried speed", "accelerated tempo", 'Presto'),
308
- '>200': ("extremely fast", "breakneck speed", "blazing tempo", "lightning-fast rhythm", "supercharged pace", 'Prestissimo')
309
- }
310
- _bpm_desc_map_zh = {
311
- '20-40': ("极度缓慢", "极慢的节奏", "悠长的旋律", "迟缓的节奏", "几乎静止的节奏", "甚缓"),
312
- '40-60': ("宽广而缓慢", "宽敞的节奏", "从容不迫的速度", "平静的节奏", "轻松的速度", "广板"),
313
- '60-66': ("柔和的节奏", "悠闲的速度", "轻松的节奏", "不慌不忙的速度", "平滑而缓慢", '小广板'),
314
- '66-76': ("缓慢而稳定", "沉稳的旋律", "从容不迫的速度", "轻松的节奏", "轻松的速度", '慢板'),
315
- '76-108': ("步行速度", "适中的节奏", "稳定的节奏", "平衡的速度", "流畅的节奏", "行板"),
316
- '108-120': ("中等速度", "舒适的节奏", "均匀的节奏", "有节制的速度", "稳定的氛围", '中板'),
317
- '120-168': ("快速而生动", "轻快的速度", "充满活力的节奏", "欢快的节奏", "富有精神的速度", '快板'),
318
- '168-176': ("生动而快速", "明快的节奏", "活泼的速度", "充满活力的节奏", "生气勃勃的速度", '活泼的'),
319
- '176-200': ("非常快的节奏", "快速的速度", "高速的节奏", "匆忙的速度", "加速的节奏", '急板'),
320
- '>200': ("极快的速度", "极速旋律", "炽热的节奏", "闪电般的节奏", "疾驰的速度", '最急板')
321
- }
322
- def get_bpm_range(bpm):
323
- bpm = int(bpm)
324
- for right, tag in _bpm_range_rights:
325
- if bpm <= right:
326
- return tag
327
- return '>200'
328
-
329
- def gen_bpm_descript(bpm, lang='en'):
330
- bpm_range = get_bpm_range(bpm)
331
- if lang == 'en':
332
- return random.choice(_bpm_desc_map[bpm_range])
333
- elif lang == 'zh':
334
- return random.choice(_bpm_desc_map_zh[bpm_range])
335
- else:
336
- raise ValueError(f"Unknown language {lang}")
337
-
338
- def read_translate(translate: Optional[Dict[str, os.PathLike]]):
339
- if translate is None:
340
- return None
341
- if isinstance(translate, str):
342
- return read_jsonlike(translate)
343
- return {k: read_jsonlike(path) for k, path in translate.items()}
344
-
345
-
346
- class MagnaTagATuneDataset(Dataset):
347
- def __init__(self):
348
- pass
349
-
350
-
351
- def tags_to_desc(tag_list, sep=',') -> str:
352
- if not isinstance(tag_list, Sequence):
353
- return str(tag_list)
354
- if isinstance(tag_list, str):
355
- return tag_list
356
- if len(tag_list) <= 0:
357
- return ''
358
- elif len(tag_list) <= 5:
359
- probs = dist_prob_map[len(tag_list)]
360
- tags_num = random.choices(range(1, len(tag_list)+1), probs)[0]
361
- random.shuffle(tag_list)
362
- tag_list = tag_list[:tags_num]
363
- return sep.join(tag_list)
364
- else:
365
- probs = dist_prob_map[5]
366
- tags_num = random.choices(range(1, 6), probs)[0]
367
- random.shuffle(tag_list)
368
- tag_list = tag_list[:tags_num]
369
- return sep.join(tag_list)
370
-
371
- def get_sr_and_duration_info(item):
372
- return item.get('sample_rate', None), item.get('duration', None)
373
-
374
- class MtgJamendoDatasetFromJson(Dataset):
375
- def __init__(self,
376
- data_dir:str,
377
- json_path:str,
378
- duration:float=10,
379
- sr:int = 0,
380
- *,
381
- lang = 'en',
382
- return_path = False,
383
- prompt_template_path: os.PathLike = None,
384
- tag_types = [],
385
- translate:Optional[Dict[str, os.PathLike]] = None,
386
- ):
387
- self.audio_reader = SafeAudioReader(duration, sr)
388
-
389
- self.data_dir = data_dir
390
- self._load_metadata_json(json_path)
391
- self.sr = sr
392
- self.duration = duration
393
- self.return_path = return_path
394
- self.lang = lang
395
-
396
- self.use_dynamic_prompt = prompt_template_path is not None
397
- if self.use_dynamic_prompt:
398
- self.prompt_templates = load_prompt_templates(prompt_template_path, num = len(tag_types))
399
- self.tag_types = tag_types
400
-
401
- self.translate = read_translate(translate)
402
- if not self.use_dynamic_prompt and self.lang != 'en':
403
- raise NotImplementedError
404
-
405
-     # These tags are considered weakly semantic; prompts that contain only these tags are avoided
406
- WEAK_TAG_LIST = ["title", "artist"]
407
-
408
- def _load_metadata_json(self, json_path):
409
- with open(json_path) as fp:
410
- self.data = json.load(fp)
411
-
412
- def convert_key_to_path(self, key):
413
- return os.path.join(self.data_dir, get_base_dir_file(key))
414
-
415
- def __len__(self):
416
- return len(self.data)
417
-
418
- def __getitem__(self, idx):
419
- item = self.data[idx]
420
- path = self.convert_key_to_path(item['key'])
421
- description = self.generate_description(item)
422
-
423
- sr, duration = get_sr_and_duration_info(item)
424
- audio, is_start, is_end = self.audio_reader(path, sr, duration)
425
-
426
- if self.return_path:
427
- return audio, description, path
428
- return audio, description, is_start, is_end
429
-
430
- def tags_to_desc(self, tag_list, tag_type) -> str:
431
- if self.lang == 'en':
432
- return tags_to_desc(tag_list)
433
- elif self.lang == 'zh':
434
- translator = self.translate[tag_type]
435
- translated_tag_list = [translator[tag] for tag in tag_list if tag in translator ]
436
- return tags_to_desc(translated_tag_list, sep='、')
437
-
438
- def generate_description(self, item):
439
- if self.use_dynamic_prompt:
440
- # dynamically generate prompt from given prompt template
441
- prompt_template = random.choice(self.prompt_templates)
442
- description = self.generate_description_dynamic(item, prompt_template)
443
-
444
- else:
445
- # use ordinary static prompt instead
446
- description = self.generate_description_ordinary(item)
447
- return description
448
-
449
- def generate_description_dynamic(self, data, prompt_template: PromptTemplate):
450
- exists_tag = [key for key in data if (key in self.tag_types) and (data[key] is not None) and (len(data[key]) > 0)]
451
- exists_weak_tag = list(filter(lambda t: t in self.WEAK_TAG_LIST, exists_tag))
452
- exists_strong_tag = list(filter(lambda t: t not in self.WEAK_TAG_LIST, exists_tag))
453
-
454
- if len(exists_strong_tag) > 0:
455
- probs = dist_prob_map[len(exists_strong_tag)]
456
- tags_num = random.choices(range(1, len(exists_strong_tag)+1), probs)[0]
457
- random.shuffle(exists_strong_tag)
458
- tags = exists_strong_tag[:tags_num]
459
- weak_probs = dist_prob_map_low[len(exists_weak_tag) + 1]
460
- weak_tags_num = random.choices(range(0, len(exists_weak_tag) + 1), weak_probs)[0]
461
- random.shuffle(exists_weak_tag)
462
- weak_tags = exists_weak_tag[:weak_tags_num]
463
- tags += weak_tags
464
- tags_args = {tag: self.tags_to_desc(data[tag], tag) for tag in tags}
465
- prompt = prompt_template.apply(**tags_args)
466
- else:
467
- # no strong tags, use all weak tags instead
468
- tags_args = {tag: self.tags_to_desc(data[tag], tag) for tag in exists_weak_tag}
469
- prompt = prompt_template.apply(**tags_args)
470
-
471
- return prompt
472
-
473
- def generate_description_ordinary(self, data, thresh = 0.3):
474
- # Initialize the description with title and artist
475
- description = f'"{data["title"]+" is " if random.random() > thresh else ""}"a piece of music by {data["artist"]}'
476
-
477
- # Add genre if available
478
- if data["genre"] and random.random() > thresh:
479
- genres = ', '.join(data["genre"])
480
- description += f', belonging to the {genres} genres'
481
-
482
- # Add moods if available
483
- if data["moods"] and random.random() > thresh:
484
- moods = ', '.join(data["moods"])
485
- description += f'. This track conveys a {moods} mood'
486
-
487
- # Add instruments if available
488
- if data["instrument"] and random.random() > thresh:
489
- instruments = ', '.join(data["instrument"])
490
- description += f', and primarily features the following instruments: {instruments}'
491
-
492
- # Add a period to end the description
493
- description += '.'
494
-
495
- return description
496
-
497
- class AudioStockDataset(Dataset):
498
- def __init__(self,
499
- metadata_path:str,
500
- duration:float=10,
501
- sr:int = 0,
502
- return_path = False,
503
- return_audio = True,
504
- prompt_template_path: os.PathLike = None,
505
- tag_types = [],
506
- lang = 'en',
507
- translate:Optional[Dict[str, os.PathLike]] = None
508
- ):
509
- self.audio_reader = SafeAudioReader(duration, sr)
510
-
511
- self.duration = duration
512
- self._load_metadata(metadata_path)
513
- self.sr = sr
514
- self.return_path = return_path
515
- self.return_audio = return_audio
516
-
517
- self.use_dynamic_prompt = prompt_template_path is not None
518
- if self.use_dynamic_prompt:
519
- self.prompt_templates = load_prompt_templates(prompt_template_path, num = len(tag_types), lang = lang)
520
- self.tag_types = tag_types
521
-
522
- self.lang = lang
523
- self.translate = read_translate(translate)
524
-
525
- def _load_metadata(self, metadata_path):
526
- with open(metadata_path) as fp:
527
- lines = fp.readlines()
528
- self.data = []
529
- for line in lines:
530
- item = json.loads(line)
531
- if(item['duration']>self.duration+10):
532
- self.data.append(item)
533
- self.is_info_recorded = bool('Tags' in self.data[0])
534
-
535
- def __len__(self):
536
- return len(self.data)
537
-
538
- def __getitem__(self, idx):
539
- path:str = self.data[idx]["path"]
540
- json_path = path[:path.rfind('.')] + ".json"
541
- if self.is_info_recorded:
542
- item = self.data[idx]
543
- else:
544
- try:
545
- with open(json_path) as fp:
546
- item:dict = json.load(fp)
547
- except Exception as e:
548
- print(f"Error loading json file {json_path} :\n{e}")
549
- item = {}
550
- description = self.generate_description(item)
551
- if self.return_audio:
552
- sr, duration = get_sr_and_duration_info(item)
553
- audio, is_start, is_end = self.audio_reader(path, sr, duration)
554
- else:
555
- audio = None
556
- if self.return_path:
557
- return audio, description, path, is_start, is_end
558
- else:
559
- return audio, description, is_start, is_end
560
-
561
- def generate_description(self, item):
562
- if self.use_dynamic_prompt:
563
- # dynamically generate prompt from given prompt template
564
- prompt_template = random.choice(self.prompt_templates)
565
- description = self.generate_description_dynamic(item, prompt_template)
566
- else:
567
- # use ordinary static prompt instead
568
- description = self.generate_description_ordinary(item)
569
- return description
570
-
571
- def generate_description_dynamic(self, data, prompt_template: PromptTemplate):
572
- exists_tag = [key for key in data if (key in self.tag_types) and (data[key] is not None) and (len(data[key]) > 0)]
573
-
574
- if len(exists_tag) > 0:
575
- probs = dist_prob_map[len(exists_tag)]
576
- tags_num = random.choices(range(1, len(exists_tag)+1), probs)[0]
577
- random.shuffle(exists_tag)
578
- tags = exists_tag[:tags_num]
579
- tags_args = {tag: self.tags_to_desc(data[tag], tag) for tag in tags}
580
- tags_args = self.handle_BPM_tag(tags_args)
581
- prompt = prompt_template.apply(**tags_args)
582
- else:
583
- # no strong tags, use all weak tags instead
584
- prompt = prompt_template.apply()
585
-
586
- return prompt
587
-
588
- def tags_to_desc(self, tag_list, tag_type) -> str:
589
- if self.lang == 'en':
590
- return tags_to_desc(tag_list)
591
- elif self.lang == 'zh':
592
- if tag_type == 'BPM':
593
- return tags_to_desc(tag_list, sep='、')
594
- translator = self.translate[tag_type]
595
- translated_tag_list = [translator[tag] for tag in tag_list if tag in translator ]
596
- return tags_to_desc(translated_tag_list, sep='、')
597
-
598
- def handle_BPM_tag(self, tags_args):
599
- if "BPM" in tags_args and 'BPMDescript' in self.tag_types:
600
- bpm = tags_args["BPM"]
601
- del tags_args["BPM"]
602
- tag_types_used = random.choice((('BPM',), ('BPMDescript',), ('BPM', 'BPMDescript')))
603
- for tag_type in tag_types_used:
604
- tags_args[tag_type] = bpm if tag_type == 'BPM' else gen_bpm_descript(bpm, lang=self.lang)
605
- return tags_args
606
-
607
- def generate_description_ordinary(self, data, thresh = 0.3):
608
- if self.lang != 'en':
609
- raise ValueError(f'Language {self.lang} is not supported for ordinary description generation')
610
- description = f'a piece of music by {data["Artist"]}'
611
-
612
- # Add genre if available
613
- if data["Genre"] and random.random() > thresh:
614
- genres = ', '.join(data["Genre"])
615
- description += f', belonging to the {genres} genres'
616
-
617
- # Add tags if available
618
- if data["Tags"] and random.random() > thresh:
619
- tags = ', '.join(data["Tags"])
620
- description += f'. This track contains the tags: {tags}'
621
-
622
- # Add moods if available
623
- if data["Mood"] and random.random() > thresh:
624
- moods = ', '.join(data["Mood"])
625
- description += f'. This track conveys a {moods} mood'
626
-
627
- # Add instruments if available
628
- if data["Instrument"] and random.random() > thresh:
629
- instruments = ', '.join(data["Instrument"])
630
- description += f', and primarily features the following instruments: {instruments}'
631
-
632
- # Add a period to end the description
633
- description += '.'
634
-
635
- return description
636
-
637
- def mp3_path_to_id(mp3_path):
638
- return int(
639
- mp3_path[mp3_path.rindex('/') + 1 : mp3_path.rindex('.mp3')]
640
- )
641
-
642
- class TmeDataset(Dataset):
643
- def __init__(self,
644
- data_index:str,
645
- music_info:str = None,
646
- duration:float = 10,
647
- sr:int = 0,
648
- return_path = False,
649
- return_audio = True,
650
- prompt_format_path: os.PathLike = None,
651
- tag_types = ['*'],
652
- lang = 'zh',
653
- translate: Optional[os.PathLike] = None,
654
- prompt_dir: os.PathLike = None,
655
- ):
656
- self.audio_reader = SafeAudioReader(duration, sr)
657
-
658
- self.sr = sr
659
- self.duration = duration
660
- self.return_path = return_path
661
- self.return_audio = return_audio
662
- self.lang = lang
663
-
664
- self.use_ready_prompt = prompt_dir is not None
665
-
666
- data_index = read_jsonlike(data_index)
667
- data_index = [d for d in data_index if d['duration']>self.duration+10]
668
- self.data_index_dict = {mp3_path_to_id(d['path']) : d for d in data_index}
669
- self.data_ids = list(self.data_index_dict.keys())
670
-
671
- if not self.use_ready_prompt:
672
- # Load the music info file
673
- music_info = read_jsonlike(music_info)
674
- if 'music' in music_info:
675
- music_info = music_info['music']
676
- self.music_info_dict = {d["歌曲ID"]:d for d in music_info}
677
- self.data_index_dict = {k:v for k,v in self.data_index_dict.items() if k in self.music_info_dict}
678
- self.data_ids = list(self.data_index_dict.keys())
679
-
680
- with open(prompt_format_path) as fp:
681
- self.prompt_formats = yaml.load(fp, Loader=yaml.FullLoader)
682
-
683
- # Load tag types and split them into ordinary tag_types and key key_tag_types
684
- if '*' in tag_types:
685
- self.tag_types = ['歌曲名', 'bpm', '专辑名', '歌手名', '作曲', 'tag']
686
- else:
687
- self.tag_types = tag_types
688
-
689
- self.key_tag_types = []
690
- if 'tag' in self.tag_types:
691
- self.tag_types.remove('tag')
692
- self.key_tag_types = list(self.prompt_formats['tag'].keys())
693
-
694
- # Load the translation mapping
695
- if translate is not None:
696
- self.translator = read_jsonlike(translate)
697
- else:
698
- data_ids_set = set(self.data_ids)
699
- self.prompts_dict = {}
700
- for fname in os.listdir(prompt_dir):
701
- items = read_jsonlike(os.path.join(prompt_dir, fname))
702
- for item in items:
703
- if item['ID'] not in data_ids_set or not self.is_valid_prompt_text(item['Text']):
704
- continue
705
- if item['ID'] not in self.prompts_dict:
706
- self.prompts_dict[item['ID']] = []
707
- self.prompts_dict[item['ID']].append(item['Text'])
708
- self.data_index_dict = {k:v for k,v in self.data_index_dict.items() if k in self.prompts_dict}
709
- self.data_ids = list(self.data_index_dict.keys())
710
-
711
- def tags_to_desc(self, tag_list) -> str:
712
- if is_bearable(tag_list, int):
713
- return str(tag_list)
714
- if self.lang == 'zh':
715
- return tags_to_desc(tag_list, sep=self.sep)
716
- else:
717
- translated_tag_list = [self.translator[tag] for tag in tag_list if tag in self.translator ]
718
- return tags_to_desc(translated_tag_list, sep=self.sep)
719
-
720
- def gen_desc_of_tag(self, formats, tags):
721
- fmt = random.choice(formats)
722
- return fmt.format(self.tags_to_desc(tags))
723
-
724
- @staticmethod
725
- def check_valid(value):
726
- if isinstance(value, int) or isinstance(value, float):
727
- return value > 0
728
- if (value is not None) and (not isinstance(value, Sequence) or len(value) > 0):
729
- return True
730
- return False
731
-
732
- @staticmethod
733
- def remove_repeat(data):
734
- # If the album name is the same as the song title, keep only the latter
735
- album_name = data.get('专辑名', None)
736
- if album_name is not None and album_name == data.get('歌曲名', None):
737
- del data['专辑名']
738
- return data
739
-
740
- @property
741
- def comma(self):
742
- if self.lang == 'zh':
743
- return ','
744
- elif self.lang == 'en':
745
- return ', '
746
-
747
- @property
748
- def sep(self):
749
- if self.lang == 'zh':
750
- return '、'
751
- elif self.lang == 'en':
752
- return ', '
753
-
754
- def generate_description(self, data):
755
- data = self.remove_repeat(data)
756
- weak_tags = [key for key in data if (key in self.tag_types and self.check_valid(data[key]))] # weakly informative tags; these are sampled at a lower rate
757
-
758
- key_tags = [key for key in data['tag'] if (key in self.key_tag_types and self.check_valid(data['tag'][key]))] # key tags; at least one of them must appear
759
-
760
- prompts = []
761
- if len(weak_tags) > 0:
762
- probs = dist_prob_map_low[len(weak_tags)]
763
- if len(key_tags) > 0:
764
- tags_num = random.choices(range(0, len(weak_tags)), probs)[0]
765
- else:
766
- tags_num = random.choices(range(1, len(weak_tags) + 1), probs)[0]
767
- random.shuffle(weak_tags)
768
- tags = weak_tags[:tags_num]
769
- for tag_type in tags:
770
- tag_desc = self.gen_desc_of_tag(self.prompt_formats[tag_type], int(data[tag_type]) if tag_type == 'bpm' else data[tag_type])
771
- prompts.append(tag_desc)
772
-
773
- if len(key_tags) > 0:
774
- probs = dist_prob_map[len(key_tags)]
775
- tags_num = random.choices(range(1, len(key_tags) + 1), probs)[0]
776
- random.shuffle(key_tags)
777
- tags = key_tags[:tags_num]
778
- for tag_type in tags:
779
- tag_desc = self.gen_desc_of_tag(self.prompt_formats['tag'][tag_type], data['tag'][tag_type])
780
- prompts.append(tag_desc)
781
-
782
- random.shuffle(prompts)
783
- return self.comma.join(prompts)
784
-
785
- def is_valid_prompt_text(self, text):
786
- for bad in ('抱歉','sorry', 'Sorry'):
787
- if bad in text:
788
- return False
789
- return True
790
-
791
- def get_ready_prompt(self, path):
792
- sid = mp3_path_to_id(path)
793
- return random.choice(self.prompts_dict[sid])
794
-
795
- def __len__(self):
796
- return len(self.data_ids)
797
-
798
- def __getitem__(self, idx):
799
- data_id = self.data_ids[idx]
800
- item = self.data_index_dict[data_id]
801
- path = item['path']
802
- if not self.use_ready_prompt:
803
- info = self.music_info_dict[data_id]
804
- description = self.generate_description(info)
805
- else:
806
- description = self.get_ready_prompt(path)
807
- if self.return_audio:
808
- sr, duration = get_sr_and_duration_info(item)
809
- audio, is_start, is_end = self.audio_reader(path, sr, duration)
810
- else:
811
- audio = None
812
- if self.return_path:
813
- return audio, description, path, is_start, is_end
814
- else:
815
- return audio, description, is_start, is_end
816
-
817
- class Pond5Dataset(Dataset):
818
- MAX_PROMPT_LEN = 200
819
- def __init__(self,
820
- metadata_path:str,
821
- index_path:str,
822
- duration:float=10,
823
- sr:int = 0,
824
- plain_rate = 0,
825
- return_path = False,
826
- return_audio = True,
827
- lang = 'en',
828
- translate:Optional[Dict[str, os.PathLike]] = None,
829
- use_literal_none = True,
830
- use_avoid_watermark_policy = None,
831
- ):
832
-
833
- if use_avoid_watermark_policy is None:
834
- raise ValueError("`use_avoid_watermark_policy` is an important param, you need to explicitly specify it with bool type")
835
- self.use_avoid_watermark_policy = use_avoid_watermark_policy
836
- assert self.use_avoid_watermark_policy is False
837
- self.audio_reader = SafeAudioReader(duration, sr)
838
-
839
- self.duration = duration
840
- self._load_metadata(metadata_path, index_path)
841
- self.sr = sr
842
- self.plain_rate = plain_rate
843
- self.return_path = return_path
844
- self.return_audio = return_audio
845
- self.use_literal_none = use_literal_none
846
-
847
- self.lang = lang
848
- self.translate = read_translate(translate)
849
-
850
- def _load_metadata(self, metadata_path, index_path):
851
- data_index = read_jsonlike(index_path)
852
- data_ids = set([item['id'] for item in data_index])
853
-
854
- with open(metadata_path) as fp:
855
- lines = fp.readlines()
856
-
857
- append_ids = set()
858
-
859
- self.data = []
860
- for line in lines:
861
- item = json.loads(line)
862
- if item['id'] in data_ids and item['id'] not in append_ids and item["details"]["duration"] is not None and item["details"]["duration"]>self.duration+10:
863
- self.data.append(item)
864
- append_ids.add(item['id'])
865
-
866
- def __len__(self):
867
- return len(self.data)
868
-
869
- def __getitem__(self, idx):
870
- item = self.data[idx]
871
- path:str = item["path"]
872
- description = self.generate_description(item)
873
- if self.return_audio:
874
- sr, duration = get_sr_and_duration_info(item)
875
- audio, is_start, is_end = self.audio_reader(path, sr, duration)
876
- else:
877
- audio = None
878
- if self.return_path:
879
- return audio, description, path
880
- return audio, description, is_start, is_end
881
-
882
- @property
883
- def keysep(self):
884
- if self.lang == 'zh':
885
- return ',' if random.random() > 0.5 else '、'
886
- elif self.lang == 'en':
887
- return ', '
888
-
889
- def generate_description(self, item):
890
- if random.random() > self.plain_rate:
891
- # dynamically generate prompt from given prompt template
892
- description = self.generate_description_dynamic(item)
893
- else:
894
- # use plain prompt, i.e. tags sequence separated by comma
895
- description = self.generate_description_plain(item)
896
- return description
897
-
898
- def get_translation(self, k):
899
- k = k.strip()
900
- if k in self.translate:
901
- return self.translate[k]
902
- else:
903
- return k
904
-
905
- def generate_description_plain(self, item):
906
- keywords = item['keywords']
907
- if self.lang != 'en':
908
- keywords = [self.get_translation(k) for k in keywords]
909
- return gen_plain_prompt(keywords, sep=self.keysep)
910
-
911
- def generate_description_dynamic(self,item):
912
- desc = item.get('desc', 'none')
913
- if desc is None:
914
- desc = 'none'
915
- desc = desc.strip()
916
- if len(desc) > self.MAX_PROMPT_LEN:
917
- shorter_desc = desc[:self.MAX_PROMPT_LEN]
918
- # find last stop
919
- stop_idx = shorter_desc.rfind('.')
920
- if stop_idx == -1:
921
- stop_idx = shorter_desc.rfind('!')
922
- if stop_idx == -1:
923
- stop_idx = shorter_desc.rfind(',')
924
- if stop_idx == -1:
925
- stop_idx = self.MAX_PROMPT_LEN - 1
926
- desc = desc[:stop_idx+1]
927
- return desc
928
-
929
- class CombinedDataset(Dataset):
930
- @beartype
931
- def __init__(self, datasets: Sequence[Dataset], ratios: Sequence[int]):
932
- self.datasets = datasets
933
- self.datasets_index = []
934
-
935
- for i,dataset in enumerate(datasets):
936
- if dataset is None:
937
- continue
938
- for dup in range(ratios[i]):
939
- for j in range(len(dataset)):
940
- self.datasets_index.append((i,j))
941
-
942
- def __len__(self):
943
- return len(self.datasets_index)
944
-
945
- def __getitem__(self, idx):
946
- index = self.datasets_index[idx]
947
- i,j = index
948
- return self.datasets[i][j]
949
-
950
- class CombinedDataset_random(Dataset):
951
- @beartype
952
- def __init__(self,
953
- num_examples:int,
954
- datasets: Sequence[Dataset], ratios: Sequence[int]
955
- ):
956
- self.datasets = datasets
957
- self.datasets_index = []
958
-
959
- for i,dataset in enumerate(datasets):
960
- if dataset is None:
961
- continue
962
- for dup in range(ratios[i]):
963
- for j in range(len(dataset)):
964
- self.datasets_index.append((i,j))
965
- if num_examples > 0:
966
- self.random_choose = True
967
- self.dataset_len = num_examples
968
- else:
969
- self.random_choose = False
970
- self.dataset_len = len(self.datasets_index)
971
-
972
- def __len__(self):
973
- return self.dataset_len
974
-
975
- def __getitem__(self, idx):
976
- first_try = True
977
- try_cnt = 0
978
- while True:
979
- try:
980
- if(self.random_choose or not first_try):
981
- index2 = []
982
- index2.append(np.random.randint(0,len(self.datasets)))
983
- index2.append(np.random.randint(0,len(self.datasets[index2[-1]])))
984
- else:
985
- index2 = self.datasets_index[idx]
986
- first_try = False
987
- out = self.datasets[index2[0]][index2[1]]
988
- if(len(out[0].shape)==1):out[0]=out[0][None,:]
989
- return out
990
- except:
991
- print("Error loadding ", index2)
992
- try_cnt += 1
993
- if(try_cnt>10):
994
- raise FileNotFoundError()
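For orientation, a minimal sketch (not part of the diff) of how the ratio-based index expansion in CombinedDataset behaves; the toy ListDataset and the ratio values are hypothetical:

from torch.utils.data import Dataset

class ListDataset(Dataset):
    # hypothetical in-memory dataset, used only to illustrate the indexing
    def __init__(self, items):
        self.items = items
    def __len__(self):
        return len(self.items)
    def __getitem__(self, idx):
        return self.items[idx]

# CombinedDataset repeats every (dataset, index) pair `ratios[i]` times, so a
# ratio of 2 makes each item of that dataset appear twice in the joint index.
small = ListDataset(["a", "b"])
large = ListDataset(["x", "y", "z"])
combined = CombinedDataset([small, large], ratios=[2, 1])
assert len(combined) == 2 * 2 + 3   # 4 entries from `small`, 3 from `large`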
codeclm/tokenizer/Flow1dVAE/libs/datasets/dataset_song.py DELETED
@@ -1,313 +0,0 @@
1
- import re
2
- import sys
3
- import json
4
-
5
- from torch.utils.data import Dataset
6
- import torchaudio
7
- from torchaudio.functional import resample
8
- import torch
9
- import numpy as np
10
-
11
- from torch.nn.utils.rnn import pad_sequence
12
-
13
-
14
-
15
- def check_lryics(lyric):
16
- _FILTER_STRING = [
17
- '作词', '作曲', '编曲', '【', '策划',
18
- '录音', '混音', '母带', ':', '制作',
19
- '版权', '校对', '演奏', '制作', '伴奏'
20
- ]
21
- for item in _FILTER_STRING:
22
- if item in lyric:
23
- return True
24
-
25
- return False
26
-
27
-
28
-
29
- def process_lyrics(lines):
30
- lyric_part = []
31
- timestamp_part = []
32
-
33
- timestamp_pattern = re.compile(r'\[\d+:\d+(\.\d+)?\]')
34
-
35
- for i, line in enumerate(lines):
36
-
37
- # Skip credit-style metadata in the first few lines
38
- if i<10 and check_lryics(line):
39
- continue
40
-
41
- # Check whether the line contains a valid timestamp and lyric content
42
- if timestamp_pattern.match(line):
43
- timestamp_end = line.rfind(']')
44
- lyrics = line[timestamp_end + 1:].strip()
45
- timestamps = line[:timestamp_end + 1]
46
-
47
- if ':' in lyrics:
48
- if len(lyrics.split(":")[0]) <=5:
49
- lyrics = "".join(lyrics.split(":")[1:])
50
- # if lyrics:  # make sure the lyric part is not empty
51
- # lyric_part.append(lyrics)
52
- # timestamp_part.append(timestamps)
53
- # print(processed_lyrics)
54
- return timestamp_part, lyric_part
55
-
56
- def get_timestamps(timestamp_part):
57
-
58
- # Convert to seconds
59
-
60
- timestamps = []
61
-
62
- for line in timestamp_part:
63
- match = re.match(r'\[(\d+):(\d+)(\.\d+)?\]', line)
64
- if match:
65
- minutes = int(match.group(1))
66
- seconds = float(match.group(2))
67
- millis = float(match.group(3)) if match.group(3) else 0
68
- total_seconds = minutes * 60 + seconds + millis
69
- timestamps.append(total_seconds)
70
-
71
-
72
- return timestamps
73
-
74
- def process_lyrics_lrc(lyrics):
75
- timestamp_part, lyric_part = process_lyrics(lyrics)
76
- # print(timestamp_part)
77
- # print(lyric_part)
78
- timestamps = get_timestamps(timestamp_part)
79
- # print(timestamps)
80
- if len(timestamps) == 0:
81
- # print(f'{lyric_path}')
82
- return []
83
-
84
- slice_start = timestamps[0]
85
- slice_start_idx = 0
86
-
87
- output_list = []
88
- for i in range(1, len(timestamps)):
89
- # Split once the accumulated time exceeds 30 s; if the whole song is shorter than 30 s the segment is dropped
90
- if timestamps[i] - slice_start > 30:
91
- output_list.append(f'[{str(slice_start)}:{str(timestamps[i])}]' + ", ".join(lyric_part[slice_start_idx:i]))
92
-
93
- slice_start = timestamps[i]
94
- slice_start_idx = i
95
-
96
- return output_list
97
-
98
-
99
-
100
- def process_lyrics_yrc(lyrics):
101
-
102
- timestamps, lyric_part = extract_lrc(lyrics)
103
-
104
- # timestamp_part, lyric_part = process_lyrics(lyrics)
105
- # import pdb; pdb.set_trace()
106
- # print(timestamp_part)
107
- # print(lyric_part)
108
- # timestamps = get_timestamps(timestamp_part)
109
- # print(timestamps)
110
- if len(timestamps) == 0:
111
- # print(f'{lyric_path}')
112
- return []
113
-
114
- slice_start = timestamps[0]
115
- slice_start_idx = 0
116
-
117
- output_list = []
118
- for i in range(1, len(timestamps)):
119
- # Split once the accumulated time exceeds 30 s
120
- if timestamps[i] - slice_start > 30:
121
- output_list.append(f'[{str(slice_start)}:{str(timestamps[i])}]' + ", ".join(lyric_part[slice_start_idx:i]))
122
-
123
- slice_start = timestamps[i]
124
- slice_start_idx = i
125
- # import pdb; pdb.set_trace()
126
- return output_list
127
-
128
- def extract_lrc(lyrics):
129
- timestamp_part, lyric_part = [], []
130
-
131
- for i, text in enumerate(lyrics):
132
- # Extract the content inside the square brackets
133
- bracket_content = re.search(r'\[(.*?)\]', text).group(1)
134
- bracket_content = bracket_content.split(',')
135
- # Extract the content inside the parentheses
136
- parentheses_content = re.findall(r'\((.*?)\)', text)
137
- # Extract the remaining content
138
- other_content = re.sub(r'\[(.*?)\]|\((.*?)\)', '', text).strip()
139
-
140
- # TODO: how should this data be handled?
141
- # import pdb; pdb.set_trace()
142
- if i<10 and check_lryics(other_content):
143
- continue
144
-
145
- # import pdb; pdb.set_trace()
146
- timestamp_part.append(float(bracket_content[0])/1000)
147
- lyric_part.append(other_content)
148
- # import pdb; pdb.set_trace()
149
- return timestamp_part, lyric_part
150
-
151
-
152
-
153
- class WYYSongDataset(Dataset):
154
- def __init__(self,
155
- metadata_path:str,
156
- sr:int = 0,
157
- use_lang = ['en', 'zh-cn'],
158
- num_examples = -1,
159
- ):
160
-
161
- self.sr = sr
162
- self.use_lang = use_lang
163
- self._load_metadata(metadata_path)
164
-
165
- # buffer
166
- self.lyric_buffer = {}
167
-
168
- if(num_examples<=0):
169
- self.dataset_len = len(self.data)
170
- self.random_slc = False
171
- else:
172
- self.dataset_len = num_examples
173
- self.random_slc = True
174
-
175
- # Read the jsonl metadata file
176
- def _load_metadata(self, metadata_path):
177
- with open(metadata_path) as fp:
178
- lines = fp.readlines()
179
- self.data = []
180
- for line in lines:
181
- item = json.loads(line)
182
- # if item['lrc-lyric'] is not None and item['yrc-lyric'] is not None:
183
- if 'lyrics' in item and 'lang_info' in item:
184
- if len(item['lyrics']) > 0:
185
- for lang in self.use_lang:
186
- if lang in item['lang_info'] and item['lang_info'][lang]['proportion'] > 0.8 and item['lang_info'][lang]['probability'] > 0.9:
187
- # if '伴奏' not in item['path'] and "cloud" in item['path']:
188
- if '伴奏' not in item['path']:
189
- self.data.append(item)
190
-
191
-
192
- def __len__(self):
193
- return self.dataset_len
194
-
195
-
196
- def __getitem__(self, idx):
197
- try_cnt = 0
198
- while True:
199
- if(self.random_slc):
200
- idx = np.random.randint(0, len(self.data))
201
- yrc_lyrics = []
202
- lrc_lyrics = []
203
- try:
204
- info = self.data[idx]
205
-
206
- # audio path
207
- path:str = info["path"]
208
-
209
- # Read the lyric segments
210
- if 'lyrics' not in info:
211
- if idx not in self.lyric_buffer:
212
- # Character-level aligned lyrics
213
- if info['yrc-lyric'] is not None:
214
- with open(info['yrc-lyric']) as f_in:
215
- yrc_lyric = json.load(f_in)
216
- yrc_lyrics = process_lyrics_yrc(yrc_lyric['lyrics'][:-1])
217
-
218
- # Sentence-level aligned lyrics
219
- if info['lrc-lyric'] is not None:
220
- with open(info['lrc-lyric']) as f_in:
221
- lrc_lyric = json.load(f_in)
222
- lrc_lyrics = process_lyrics_lrc(lrc_lyric['lyrics'][:-1])
223
-
224
- # Prefer character-level aligned lyrics
225
- if len(yrc_lyrics) > 0:
226
- lyrics = yrc_lyrics
227
- else:
228
- lyrics = lrc_lyrics
229
- self.lyric_buffer[idx] = lyrics
230
-
231
- # TODO filter each lyric segment by length, dropping songs that are too long or too short
232
- else:
233
- lyrics = self.lyric_buffer[idx]
234
- else:
235
- lyrics = info['lyrics']
236
-
237
- # Randomly pick one lyric segment
238
- ly_id = torch.randint(low=1, high=len(lyrics), size=(1,))[0].item()
239
- # ly_id = 0
240
-
241
- lyric = lyrics[ly_id]
242
-
243
-
244
-
245
- st, et, lyric = self.parse_lyric(lyric)
246
-
247
- assert et - st < 40
248
-
249
- # Text filtering
250
-
251
- lyric = re.sub(r'【.*?】', '', lyric)
252
- if 'zh-cn' in info['lang_info'] and info['lang_info']['zh-cn']['proportion'] > 0.8:
253
- assert 200 > len(lyric.replace(" ", "")) > 30
254
- if ':' in lyrics:
255
- if len(lyrics.split(":")[0]) <=5:
256
- lyrics = "".join(lyrics.split(":")[1:])
257
-
258
- if ':' in lyrics:
259
- if len(lyrics.split(":")[0]) <=5:
260
- lyrics = "".join(lyrics.split(":")[1:])
261
-
262
- if 'en' in info['lang_info'] and info['lang_info']['en']['proportion'] > 0.8:
263
- assert 200 > len(lyric.split()) > 20
264
-
265
- if ':' in lyrics:
266
- if len(lyrics.split(":")[0].split()) <=3:
267
- lyrics = "".join(lyrics.split(":")[1:])
268
-
269
- if ':' in lyrics:
270
- if len(lyrics.split(":")[0].split()) <=3:
271
- lyrics = "".join(lyrics.split(":")[1:])
272
-
273
-
274
-
275
- # Read the audio file
276
- cur_sample_rate = torchaudio.info(path).sample_rate
277
- offset = int(cur_sample_rate*st)
278
- num_frames = int(cur_sample_rate * (et -st))
279
- chunk, _ = torchaudio.load(path, frame_offset=offset, num_frames=num_frames)
280
-
281
- # Randomly pick one channel
282
- if(chunk.shape[0]>1):
283
- chunk = chunk[torch.randint(chunk.shape[0], size=(1,)),:].float()
284
- else:
285
- chunk = chunk[[0],:].float()
286
-
287
- if(cur_sample_rate!=self.sr):
288
- # print('a:',cur_sample_rate,chunk.shape)
289
- chunk = torchaudio.functional.resample(chunk, cur_sample_rate, self.sr)
290
-
291
- return chunk, lyric, [st, et], path
292
- except:
293
- print("Error loadding ", info["path"])
294
- try_cnt += 1
295
- idx = np.random.randint(0, len(self.data))
296
- if(try_cnt>10):
297
- raise FileNotFoundError()
298
-
299
- def parse_lyric(self, lyric):
300
- pattern = r'\[(\d+\.\d+):(\d+\.\d+)\](.*)'
301
- match = re.search(pattern, lyric)
302
-
303
- start_time = float(match.group(1))
304
- end_time = float(match.group(2))
305
- content = match.group(3)
306
- return start_time, end_time, content
307
-
308
- def collect_song(data_list):
309
- audios = pad_sequence([data[0].t() for data in data_list], batch_first=True, padding_value=0).transpose(1,2)
310
- lyrics = [data[1] for data in data_list]
311
- st_et = [data[2] for data in data_list]
312
- paths = [data[3] for data in data_list]
313
- return audios, lyrics, st_et
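As a point of reference, a small standalone sketch (with made-up example data) of the `[start:end]lyrics` segment strings that process_lyrics_lrc/process_lyrics_yrc emit and that parse_lyric later splits back into a time span and text:

import re

# Segments are stored as "[<start_seconds>:<end_seconds>]<comma-joined lyric lines>".
segment = "[12.5:41.2]first line, second line, third line"

match = re.search(r'\[(\d+\.\d+):(\d+\.\d+)\](.*)', segment)
start_time, end_time = float(match.group(1)), float(match.group(2))
content = match.group(3)

assert (start_time, end_time) == (12.5, 41.2)
assert content == "first line, second line, third line"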
codeclm/tokenizer/Flow1dVAE/libs/datasets/dataset_song_20s.py DELETED
@@ -1,313 +0,0 @@
1
- import re
2
- import sys
3
- import json
4
-
5
- from torch.utils.data import Dataset
6
- import torchaudio
7
- from torchaudio.functional import resample
8
- import torch
9
- import numpy as np
10
-
11
- from torch.nn.utils.rnn import pad_sequence
12
-
13
-
14
-
15
- def check_lryics(lyric):
16
- _FILTER_STRING = [
17
- '作词', '作曲', '编曲', '【', '策划',
18
- '录音', '混音', '母带', ':', '制作',
19
- '版权', '校对', '演奏', '制作', '伴奏'
20
- ]
21
- for item in _FILTER_STRING:
22
- if item in lyric:
23
- return True
24
-
25
- return False
26
-
27
-
28
-
29
- def process_lyrics(lines):
30
- lyric_part = []
31
- timestamp_part = []
32
-
33
- timestamp_pattern = re.compile(r'\[\d+:\d+(\.\d+)?\]')
34
-
35
- for i, line in enumerate(lines):
36
-
37
- # Skip credit-style metadata in the first few lines
38
- if i<10 and check_lryics(line):
39
- continue
40
-
41
- # Check whether the line contains a valid timestamp and lyric content
42
- if timestamp_pattern.match(line):
43
- timestamp_end = line.rfind(']')
44
- lyrics = line[timestamp_end + 1:].strip()
45
- timestamps = line[:timestamp_end + 1]
46
-
47
- if ':' in lyrics:
48
- if len(lyrics.split(":")[0]) <=5:
49
- lyrics = "".join(lyrics.split(":")[1:])
50
- # if lyrics:  # make sure the lyric part is not empty
51
- # lyric_part.append(lyrics)
52
- # timestamp_part.append(timestamps)
53
- # print(processed_lyrics)
54
- return timestamp_part, lyric_part
55
-
56
- def get_timestamps(timestamp_part):
57
-
58
- # Convert to seconds
59
-
60
- timestamps = []
61
-
62
- for line in timestamp_part:
63
- match = re.match(r'\[(\d+):(\d+)(\.\d+)?\]', line)
64
- if match:
65
- minutes = int(match.group(1))
66
- seconds = float(match.group(2))
67
- millis = float(match.group(3)) if match.group(3) else 0
68
- total_seconds = minutes * 60 + seconds + millis
69
- timestamps.append(total_seconds)
70
-
71
-
72
- return timestamps
73
-
74
- def process_lyrics_lrc(lyrics):
75
- timestamp_part, lyric_part = process_lyrics(lyrics)
76
- # print(timestamp_part)
77
- # print(lyric_part)
78
- timestamps = get_timestamps(timestamp_part)
79
- # print(timestamps)
80
- if len(timestamps) == 0:
81
- # print(f'{lyric_path}')
82
- return []
83
-
84
- slice_start = timestamps[0]
85
- slice_start_idx = 0
86
-
87
- output_list = []
88
- for i in range(1, len(timestamps)):
89
- # Split once the accumulated time exceeds 30 s; if the whole song is shorter than 30 s the segment is dropped
90
- if timestamps[i] - slice_start > 30:
91
- output_list.append(f'[{str(slice_start)}:{str(timestamps[i])}]' + ", ".join(lyric_part[slice_start_idx:i]))
92
-
93
- slice_start = timestamps[i]
94
- slice_start_idx = i
95
-
96
- return output_list
97
-
98
-
99
-
100
- def process_lyrics_yrc(lyrics):
101
-
102
- timestamps, lyric_part = extract_lrc(lyrics)
103
-
104
- # timestamp_part, lyric_part = process_lyrics(lyrics)
105
- # import pdb; pdb.set_trace()
106
- # print(timestamp_part)
107
- # print(lyric_part)
108
- # timestamps = get_timestamps(timestamp_part)
109
- # print(timestamps)
110
- if len(timestamps) == 0:
111
- # print(f'{lyric_path}')
112
- return []
113
-
114
- slice_start = timestamps[0]
115
- slice_start_idx = 0
116
-
117
- output_list = []
118
- for i in range(1, len(timestamps)):
119
- # Split once the accumulated time exceeds 30 s
120
- if timestamps[i] - slice_start > 30:
121
- output_list.append(f'[{str(slice_start)}:{str(timestamps[i])}]' + ", ".join(lyric_part[slice_start_idx:i]))
122
-
123
- slice_start = timestamps[i]
124
- slice_start_idx = i
125
- # import pdb; pdb.set_trace()
126
- return output_list
127
-
128
- def extract_lrc(lyrics):
129
- timestamp_part, lyric_part = [], []
130
-
131
- for i, text in enumerate(lyrics):
132
- # Extract the content inside the square brackets
133
- bracket_content = re.search(r'\[(.*?)\]', text).group(1)
134
- bracket_content = bracket_content.split(',')
135
- # Extract the content inside the parentheses
136
- parentheses_content = re.findall(r'\((.*?)\)', text)
137
- # Extract the remaining content
138
- other_content = re.sub(r'\[(.*?)\]|\((.*?)\)', '', text).strip()
139
-
140
- # TODO: how should this data be handled?
141
- # import pdb; pdb.set_trace()
142
- if i<10 and check_lryics(other_content):
143
- continue
144
-
145
- # import pdb; pdb.set_trace()
146
- timestamp_part.append(float(bracket_content[0])/1000)
147
- lyric_part.append(other_content)
148
- # import pdb; pdb.set_trace()
149
- return timestamp_part, lyric_part
150
-
151
-
152
-
153
- class WYYSongDataset(Dataset):
154
- def __init__(self,
155
- metadata_path:str,
156
- sr:int = 0,
157
- use_lang = ['en', 'zh-cn'],
158
- num_examples = -1,
159
- ):
160
-
161
- self.sr = sr
162
- self.use_lang = use_lang
163
- self._load_metadata(metadata_path)
164
-
165
- # buffer
166
- self.lyric_buffer = {}
167
-
168
- if(num_examples<=0):
169
- self.dataset_len = len(self.data)
170
- self.random_slc = False
171
- else:
172
- self.dataset_len = num_examples
173
- self.random_slc = True
174
-
175
- # Read the jsonl metadata file
176
- def _load_metadata(self, metadata_path):
177
- with open(metadata_path) as fp:
178
- lines = fp.readlines()
179
- self.data = []
180
- for line in lines:
181
- item = json.loads(line)
182
- # if item['lrc-lyric'] is not None and item['yrc-lyric'] is not None:
183
- if 'lyrics' in item and 'lang_info' in item:
184
- if len(item['lyrics']) > 0:
185
- for lang in self.use_lang:
186
- if lang in item['lang_info'] and item['lang_info'][lang]['proportion'] > 0.8 and item['lang_info'][lang]['probability'] > 0.9:
187
- # if '伴奏' not in item['path'] and "cloud" in item['path']:
188
- if '伴奏' not in item['path']:
189
- self.data.append(item)
190
-
191
-
192
- def __len__(self):
193
- return self.dataset_len
194
-
195
-
196
- def __getitem__(self, idx):
197
- try_cnt = 0
198
- while True:
199
- if(self.random_slc):
200
- idx = np.random.randint(0, len(self.data))
201
- yrc_lyrics = []
202
- lrc_lyrics = []
203
- try:
204
- info = self.data[idx]
205
-
206
- # audio path
207
- path:str = info["path"]
208
-
209
- # Read the lyric segments
210
- if 'lyrics' not in info:
211
- if idx not in self.lyric_buffer:
212
- # Character-level aligned lyrics
213
- if info['yrc-lyric'] is not None:
214
- with open(info['yrc-lyric']) as f_in:
215
- yrc_lyric = json.load(f_in)
216
- yrc_lyrics = process_lyrics_yrc(yrc_lyric['lyrics'][:-1])
217
-
218
- # Sentence-level aligned lyrics
219
- if info['lrc-lyric'] is not None:
220
- with open(info['lrc-lyric']) as f_in:
221
- lrc_lyric = json.load(f_in)
222
- lrc_lyrics = process_lyrics_lrc(lrc_lyric['lyrics'][:-1])
223
-
224
- # Prefer character-level aligned lyrics
225
- if len(yrc_lyrics) > 0:
226
- lyrics = yrc_lyrics
227
- else:
228
- lyrics = lrc_lyrics
229
- self.lyric_buffer[idx] = lyrics
230
-
231
- # TODO filter each lyric segment by length, dropping songs that are too long or too short
232
- else:
233
- lyrics = self.lyric_buffer[idx]
234
- else:
235
- lyrics = info['lyrics']
236
-
237
- # Randomly pick one lyric segment
238
- ly_id = torch.randint(low=1, high=len(lyrics), size=(1,))[0].item()
239
- # ly_id = 0
240
-
241
- lyric = lyrics[ly_id]
242
-
243
-
244
-
245
- st, et, lyric = self.parse_lyric(lyric)
246
-
247
- assert et - st < 20
248
-
249
- # Text filtering
250
-
251
- lyric = re.sub(r'【.*?】', '', lyric)
252
- if 'zh-cn' in info['lang_info'] and info['lang_info']['zh-cn']['proportion'] > 0.8:
253
- assert 100 > len(lyric.replace(" ", "")) > 5
254
- if ':' in lyrics:
255
- if len(lyrics.split(":")[0]) <=5:
256
- lyrics = "".join(lyrics.split(":")[1:])
257
-
258
- if ':' in lyrics:
259
- if len(lyrics.split(":")[0]) <=5:
260
- lyrics = "".join(lyrics.split(":")[1:])
261
-
262
- if 'en' in info['lang_info'] and info['lang_info']['en']['proportion'] > 0.8:
263
- assert 100 > len(lyric.split()) > 5
264
-
265
- if ':' in lyrics:
266
- if len(lyrics.split(":")[0].split()) <=3:
267
- lyrics = "".join(lyrics.split(":")[1:])
268
-
269
- if ':' in lyrics:
270
- if len(lyrics.split(":")[0].split()) <=3:
271
- lyrics = "".join(lyrics.split(":")[1:])
272
-
273
-
274
-
275
- # Read the audio file
276
- cur_sample_rate = torchaudio.info(path).sample_rate
277
- offset = int(cur_sample_rate*st)
278
- num_frames = int(cur_sample_rate * (et -st))
279
- chunk, _ = torchaudio.load(path, frame_offset=offset, num_frames=num_frames)
280
-
281
- # Randomly pick one channel
282
- if(chunk.shape[0]>1):
283
- chunk = chunk[torch.randint(chunk.shape[0], size=(1,)),:].float()
284
- else:
285
- chunk = chunk[[0],:].float()
286
-
287
- if(cur_sample_rate!=self.sr):
288
- # print('a:',cur_sample_rate,chunk.shape)
289
- chunk = torchaudio.functional.resample(chunk, cur_sample_rate, self.sr)
290
-
291
- return chunk, lyric, [st, et], path
292
- except:
293
- print("Error loadding ", info["path"])
294
- try_cnt += 1
295
- idx = np.random.randint(0, len(self.data))
296
- if(try_cnt>10):
297
- raise FileNotFoundError()
298
-
299
- def parse_lyric(self, lyric):
300
- pattern = r'\[(\d+\.\d+):(\d+\.\d+)\](.*)'
301
- match = re.search(pattern, lyric)
302
-
303
- start_time = float(match.group(1))
304
- end_time = float(match.group(2))
305
- content = match.group(3)
306
- return start_time, end_time, content
307
-
308
- def collect_song(data_list):
309
- audios = pad_sequence([data[0].t() for data in data_list], batch_first=True, padding_value=0).transpose(1,2)
310
- lyrics = [data[1] for data in data_list]
311
- st_et = [data[2] for data in data_list]
312
- paths = [data[3] for data in data_list]
313
- return audios, lyrics, st_et
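For context, a hedged usage sketch of how a dataset like this appears meant to be consumed, with collect_song as the DataLoader collate function; the metadata path is a placeholder and the jsonl records are assumed to carry the 'path', 'lyrics' and 'lang_info' fields that _load_metadata filters on:

from torch.utils.data import DataLoader

dataset = WYYSongDataset(
    metadata_path="train.jsonl",   # placeholder path
    sr=48000,
    use_lang=["en", "zh-cn"],
    num_examples=1000,             # >0 switches to random index sampling
)
loader = DataLoader(dataset, batch_size=4, collate_fn=collect_song)
for audios, lyrics, st_et in loader:
    # audios: (batch, 1, samples), zero-padded to the longest clip in the batch
    break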
codeclm/tokenizer/Flow1dVAE/libs/datasets/dataset_song_new_429.py DELETED
@@ -1,313 +0,0 @@
1
- import re
2
- import sys
3
- import json
4
-
5
- from torch.utils.data import Dataset
6
- import torchaudio
7
- from torchaudio.functional import resample
8
- import torch
9
- import numpy as np
10
-
11
- from torch.nn.utils.rnn import pad_sequence
12
-
13
-
14
-
15
- def check_lryics(lyric):
16
- _FILTER_STRING = [
17
- '作词', '作曲', '编曲', '【', '策划',
18
- '录音', '混音', '母带', ':', '制作',
19
- '版权', '校对', '演奏', '制作', '伴奏'
20
- ]
21
- for item in _FILTER_STRING:
22
- if item in lyric:
23
- return True
24
-
25
- return False
26
-
27
-
28
-
29
- def process_lyrics(lines):
30
- lyric_part = []
31
- timestamp_part = []
32
-
33
- timestamp_pattern = re.compile(r'\[\d+:\d+(\.\d+)?\]')
34
-
35
- for i, line in enumerate(lines):
36
-
37
- # Skip credit-style metadata in the first few lines
38
- if i<10 and check_lryics(line):
39
- continue
40
-
41
- # Check whether the line contains a valid timestamp and lyric content
42
- if timestamp_pattern.match(line):
43
- timestamp_end = line.rfind(']')
44
- lyrics = line[timestamp_end + 1:].strip()
45
- timestamps = line[:timestamp_end + 1]
46
-
47
- if ':' in lyrics:
48
- if len(lyrics.split(":")[0]) <=5:
49
- lyrics = "".join(lyrics.split(":")[1:])
50
- # if lyrics:  # make sure the lyric part is not empty
51
- # lyric_part.append(lyrics)
52
- # timestamp_part.append(timestamps)
53
- # print(processed_lyrics)
54
- return timestamp_part, lyric_part
55
-
56
- def get_timestamps(timestamp_part):
57
-
58
- # Convert to seconds
59
-
60
- timestamps = []
61
-
62
- for line in timestamp_part:
63
- match = re.match(r'\[(\d+):(\d+)(\.\d+)?\]', line)
64
- if match:
65
- minutes = int(match.group(1))
66
- seconds = float(match.group(2))
67
- millis = float(match.group(3)) if match.group(3) else 0
68
- total_seconds = minutes * 60 + seconds + millis
69
- timestamps.append(total_seconds)
70
-
71
-
72
- return timestamps
73
-
74
- def process_lyrics_lrc(lyrics):
75
- timestamp_part, lyric_part = process_lyrics(lyrics)
76
- # print(timestamp_part)
77
- # print(lyric_part)
78
- timestamps = get_timestamps(timestamp_part)
79
- # print(timestamps)
80
- if len(timestamps) == 0:
81
- # print(f'{lyric_path}')
82
- return []
83
-
84
- slice_start = timestamps[0]
85
- slice_start_idx = 0
86
-
87
- output_list = []
88
- for i in range(1, len(timestamps)):
89
- # Split once the accumulated time exceeds 30 s; if the whole song is shorter than 30 s the segment is dropped
90
- if timestamps[i] - slice_start > 30:
91
- output_list.append(f'[{str(slice_start)}:{str(timestamps[i])}]' + ", ".join(lyric_part[slice_start_idx:i]))
92
-
93
- slice_start = timestamps[i]
94
- slice_start_idx = i
95
-
96
- return output_list
97
-
98
-
99
-
100
- def process_lyrics_yrc(lyrics):
101
-
102
- timestamps, lyric_part = extract_lrc(lyrics)
103
-
104
- # timestamp_part, lyric_part = process_lyrics(lyrics)
105
- # import pdb; pdb.set_trace()
106
- # print(timestamp_part)
107
- # print(lyric_part)
108
- # timestamps = get_timestamps(timestamp_part)
109
- # print(timestamps)
110
- if len(timestamps) == 0:
111
- # print(f'{lyric_path}')
112
- return []
113
-
114
- slice_start = timestamps[0]
115
- slice_start_idx = 0
116
-
117
- output_list = []
118
- for i in range(1, len(timestamps)):
119
- # Split once the accumulated time exceeds 30 s
120
- if timestamps[i] - slice_start > 30:
121
- output_list.append(f'[{str(slice_start)}:{str(timestamps[i])}]' + ", ".join(lyric_part[slice_start_idx:i]))
122
-
123
- slice_start = timestamps[i]
124
- slice_start_idx = i
125
- # import pdb; pdb.set_trace()
126
- return output_list
127
-
128
- def extract_lrc(lyrics):
129
- timestamp_part, lyric_part = [], []
130
-
131
- for i, text in enumerate(lyrics):
132
- # Extract the content inside the square brackets
133
- bracket_content = re.search(r'\[(.*?)\]', text).group(1)
134
- bracket_content = bracket_content.split(',')
135
- # Extract the content inside the parentheses
136
- parentheses_content = re.findall(r'\((.*?)\)', text)
137
- # Extract the remaining content
138
- other_content = re.sub(r'\[(.*?)\]|\((.*?)\)', '', text).strip()
139
-
140
- # TODO: how should this data be handled?
141
- if i<10 and check_lryics(other_content):
142
- continue
143
- timestamp_part.append(float(bracket_content[0])/1000)
144
- lyric_part.append(other_content)
145
- return timestamp_part, lyric_part
146
-
147
-
148
-
149
- class WYYSongDataset(Dataset):
150
- def __init__(self,
151
- metadata_path:str,
152
- sr:int = 0,
153
- use_lang = ['en', 'zh-cn'],
154
- num_examples = -1,
155
- max_dur = 20,
156
- pad_to_max= True,
157
- ):
158
-
159
- self.sr = sr
160
- self.use_lang = use_lang
161
- self._load_metadata(metadata_path)
162
- self.max_dur = max_dur
163
- self.pad_to_max = pad_to_max
164
-
165
- # buffer
166
- self.lyric_buffer = {}
167
-
168
- if(num_examples<=0):
169
- self.dataset_len = len(self.data)
170
- self.random_slc = False
171
- else:
172
- self.dataset_len = num_examples
173
- self.random_slc = True
174
-
175
- # Read the jsonl metadata file
176
- def _load_metadata(self, metadata_path):
177
- with open(metadata_path) as fp:
178
- lines = fp.readlines()
179
- self.data = []
180
- for line in lines:
181
- item = json.loads(line)
182
- if '伴奏' not in item['path']:
183
- # if "lang_type" in item and item['lang_type'] == 'en':
184
- if "lang_type" in item:
185
- self.data.append(item)
186
-
187
-
188
- def __len__(self):
189
- return self.dataset_len
190
-
191
-
192
- def __getitem__(self, idx):
193
- try_cnt = 0
194
- while True:
195
- if(self.random_slc):
196
- idx = np.random.randint(0, len(self.data))
197
- yrc_lyrics = []
198
- lrc_lyrics = []
199
- try:
200
- info = self.data[idx]
201
-
202
- # audio path
203
- path = info["path"]
204
- lang_type = info["lang_type"]
205
- if info["lang_type"] == 'en':
206
- lyrics = info['lyrics']
207
- else:
208
- lyrics = info['lyrics_phone']
209
-
210
- # Randomly pick one lyric segment
211
- ly_id = torch.randint(low=1, high=len(lyrics), size=(1,))[0].item()
212
- lyric = lyrics[ly_id].strip()
213
-
214
- st, et, lyric = self.parse_lyric(lyric)
215
- lyric = lyric.replace("\xa0", " ")
216
-
217
- lyric = " ".join(lyric.split())
218
-
219
- assert et - st < self.max_dur
220
-
221
-
222
- if info["lang_type"] == 'en':
223
- # print(len(lyric.split())/(et-st))
224
- assert 6 > len(lyric.split())/(et-st) > 1
225
- else:
226
- # print(len(lyric.split())/(et-st))
227
- lyric = lyric.replace("-", "")
228
- assert 6 > len(lyric.split())/(et-st) > 1
229
-
230
-
231
- # Read the audio file
232
- cur_sample_rate = torchaudio.info(path).sample_rate
233
- offset = int(cur_sample_rate*st)
234
- num_frames = int(cur_sample_rate * (et -st))
235
- chunk, _ = torchaudio.load(path, frame_offset=offset, num_frames=num_frames)
236
- # chunk = torch.zeros(1, 48000*15)
237
-
238
- # Randomly pick one channel
239
- if(chunk.shape[0]>1):
240
- chunk = chunk[torch.randint(chunk.shape[0], size=(1,)),:].float()
241
- else:
242
- chunk = chunk[[0],:].float()
243
-
244
- if(cur_sample_rate!=self.sr):
245
- # print('a:',cur_sample_rate,chunk.shape)
246
- chunk = torchaudio.functional.resample(chunk, cur_sample_rate, self.sr)
247
-
248
- if self.pad_to_max:
249
- chunk = self.pad_2d_tensor(chunk, int(self.max_dur * self.sr), 0)
250
-
251
- return chunk, lyric, et-st, path, lang_type
252
- except:
253
- # print("Error loadding ", info["path"])
254
- try_cnt += 1
255
- idx = np.random.randint(0, len(self.data))
256
- if(try_cnt>20):
257
- raise FileNotFoundError()
258
-
259
- def parse_lyric(self, lyric):
260
- pattern = r'\[(\d+\.\d+):(\d+\.\d+)\](.*)'
261
- match = re.search(pattern, lyric)
262
-
263
- start_time = float(match.group(1))
264
- end_time = float(match.group(2))
265
- content = match.group(3)
266
- return start_time, end_time, content
267
-
268
- def pad_2d_tensor(self, x, max_len, pad_id):
269
- # Get the shape of the input tensor
270
- batch_size, seq_len = x.size()
271
- max_len = max(max_len, seq_len)
272
- # Compute the padding length
273
- pad_len = max_len - seq_len
274
-
275
- # If padding is needed
276
- if pad_len > 0:
277
- # Create the padding tensor
278
- pad_tensor = torch.full((batch_size, pad_len), pad_id, dtype=x.dtype, device=x.device)
279
-
280
- # Concatenate the input and padding tensors along the second (column) dimension
281
- padded_tensor = torch.cat([x, pad_tensor], dim=1)
282
- else:
283
- # If no padding is needed, return the input tensor unchanged
284
- padded_tensor = x
285
-
286
- return padded_tensor
287
-
288
- def collect_data(data_list):
289
- audios = pad_sequence([data[0].t() for data in data_list], batch_first=True, padding_value=0).transpose(1,2)
290
- lyrics = [data[1] for data in data_list]
291
- st_et = [data[2] for data in data_list]
292
- paths = [data[3] for data in data_list]
293
- lang_types = [data[4] for data in data_list]
294
- return audios, lyrics, st_et, lang_types
295
- # return audios, lyrics, st_et
296
-
297
-
298
- def build_dataset():
299
- train_dataset = WYYSongDataset(
300
- metadata_path = "train.jsonl",
301
- sr = 48000,
302
- use_lang = ['zh-cn', 'en'],
303
- num_examples = 10*10000
304
- )
305
-
306
- valid_dataset = WYYSongDataset(
307
- metadata_path = "valid.jsonl",
308
- sr = 48000,
309
- use_lang = ['zh-cn', 'en'],
310
- num_examples = 500
311
- )
312
-
313
- return train_dataset, valid_dataset
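A brief sketch (toy values, outside the diff) of what pad_2d_tensor does when pad_to_max is enabled: a clip shorter than max_dur * sr is right-padded with a constant so every item ends up with the same length. The standalone function below mirrors the method's logic:

import torch

def pad_2d_tensor(x, max_len, pad_id):
    # reproduction of the method above, kept standalone for illustration
    batch_size, seq_len = x.size()
    max_len = max(max_len, seq_len)
    pad_len = max_len - seq_len
    if pad_len > 0:
        pad = torch.full((batch_size, pad_len), pad_id, dtype=x.dtype, device=x.device)
        return torch.cat([x, pad], dim=1)
    return x

chunk = torch.randn(1, 48000 * 12)            # 12 s of mono audio at 48 kHz
padded = pad_2d_tensor(chunk, 48000 * 20, 0)  # padded up to 20 s
assert padded.shape == (1, 48000 * 20)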
codeclm/tokenizer/Flow1dVAE/libs/datasets/dataset_stock.py DELETED
@@ -1,461 +0,0 @@
1
- from torch.utils.data import Dataset
2
- from beartype.typing import Sequence, Callable, Optional, Dict, List
3
- from beartype.door import is_bearable
4
- import random
5
- import os
6
- from torchaudio.functional import resample
7
- import torch
8
- import typing as tp
9
- from pathlib import Path
10
- import torchaudio as ta
11
- import torch.nn.functional as F
12
- import soundfile
13
- import numpy as np
14
- import json
15
- import yaml
16
- import random
17
- import librosa
18
- from loguru import logger
19
- import re
20
-
21
-
22
- def _av_read(filepath, seek_time=0, duration=None):
23
- if duration is not None:
24
- sr = librosa.get_samplerate(filepath)
25
- offset = seek_time
26
- num_samples = int(duration * sr)
27
- wav, _ = librosa.load(filepath, sr=sr, offset=offset, duration=duration)
28
- else:
29
- wav, sr = librosa.load(filepath, sr=None, offset=seek_time)
30
-
31
- return wav, sr
32
-
33
- def audio_read(filepath: tp.Union[str, Path], seek_time: float = 0.,
34
- duration: float = -1., pad: bool = True) -> tp.Tuple[torch.Tensor, int]:
35
- """Read audio by picking the most appropriate backend tool based on the audio format.
36
-
37
- Args:
38
- filepath (str or Path): Path to audio file to read.
39
- seek_time (float): Time at which to start reading in the file.
40
- duration (float): Duration to read from the file. If set to -1, the whole file is read.
41
- pad (bool): Pad output audio if not reaching expected duration.
42
- Returns:
43
- tuple of torch.Tensor, int: Tuple containing audio data and sample rate.
44
- """
45
- fp = Path(filepath)
46
- if fp.suffix in ['.flac', '.ogg']: # TODO: check if we can safely use av_read for .ogg
47
- # There is some bug with ffmpeg and reading flac
48
- info = soundfile.info(filepath)
49
- frames = -1 if duration <= 0 else int(duration * info.samplerate)
50
- frame_offset = int(seek_time * info.samplerate)
51
- wav, sr = soundfile.read(filepath, start=frame_offset, frames=frames, dtype=np.float32)
52
- assert info.samplerate == sr, f"Mismatch of sample rates {info.samplerate} {sr}"
53
- wav = torch.from_numpy(wav).t().contiguous()
54
- if len(wav.shape) == 1:
55
- wav = torch.unsqueeze(wav, 0)
56
- elif (
57
- fp.suffix in ['.wav', '.mp3'] and fp.suffix[1:] in ta.utils.sox_utils.list_read_formats()
58
- and duration <= 0 and seek_time == 0
59
- ):
60
- # Loading the whole file at once is the fast path; librosa handles the full-file read here.
61
- wav, sr = librosa.load(fp, sr=None, mono=True)
62
- else:
63
- wav, sr = _av_read(filepath, seek_time, duration)
64
- if pad and duration > 0:
65
- expected_frames = int(duration * sr)
66
- wav = F.pad(torch.tensor(wav), (0, expected_frames - wav.shape[-1]))
67
- if not isinstance(wav, torch.Tensor):
68
- wav = torch.tensor(wav)
69
- return wav, sr
70
-
71
- def random_seek_read(filepath, duration):
72
- if duration > 0:
73
- total_duration = librosa.get_duration(path=filepath)
74
- acceptable_start = max(0, total_duration - duration)
75
- wav, sr = audio_read(filepath, random.uniform(0, acceptable_start), duration, pad=True)
76
- else:
77
- wav, sr = audio_read(filepath, 0, -1, pad=False)
78
- return wav, sr
79
-
80
- def safe_random_seek_read(filepath, duration, sample_rate):
81
- try:
82
- wav, sr = random_seek_read(filepath, duration)
83
- if sr != sample_rate:
84
- wav = resample(wav, sr, sample_rate)
85
- sr = sample_rate
86
- except Exception as e:
87
- logger.error(f"Error reading {filepath}: {e}")
88
- sr = sample_rate
89
- wav = torch.zeros(sr * max(duration, 0), dtype=torch.float32)
90
- return wav, sr
91
-
92
- def read_jsonlike(path: os.PathLike):
93
- #json or jsonl
94
- if str(path).endswith(".json"):
95
- with open(path, 'r', encoding='utf8') as f:
96
- data = json.load(f)
97
- return data
98
- elif str(path).endswith(".jsonl"):
99
- with open(path, 'r', encoding='utf8') as f:
100
- data = [json.loads(line) for line in f.readlines()]
101
- return data
102
- else:
103
- raise ValueError("Unknown file format")
104
-
105
- dist_prob_map = {
106
- 1: (1.0,),
107
- 2: (0.5, 0.5),
108
- 3: (0.3, 0.4, 0.3),
109
- 4: (0.2, 0.3, 0.3, 0.2),
110
- 5: (0.2, 0.2, 0.3, 0.2, 0.1),
111
- 6: (0.1, 0.15, 0.2, 0.2, 0.2, 0.15),
112
- 7: (0.05, 0.1, 0.1, 0.2, 0.25, 0.2, 0.1),
113
- 8: (0.03, 0.05, 0.1, 0.15, 0.25, 0.2, 0.1, 0.12),
114
- 9: (0.02, 0.1, 0.1, 0.1, 0.15, 0.2, 0.15, 0.1, 0.08),
115
- 10: (0.01, 0.1, 0.1, 0.15, 0.2, 0.15, 0.1, 0.05, 0.05, 0.09)
116
- }
117
-
118
- dist_prob_map_low = {
119
- 1: (1.0,),
120
- 2: (0.8, 0.2),
121
- 3: (0.8, 0.1, 0.1),
122
- 4: (0.7, 0.1, 0.1, 0.1),
123
- 5: (0.7, 0.1, 0.1, 0.05, 0.05),
124
- 6: (0.7, 0.1, 0.05, 0.05, 0.05, 0.05),
125
- }
126
-
127
-
128
- _bpm_range_rights = (
129
- (40, '20-40'),
130
- (60, '40-60'),
131
- (66, '60-66'),
132
- (76, '66-76'),
133
- (108, '76-108'),
134
- (120, '108-120'),
135
- (168, '120-168'),
136
- (176, '168-176'),
137
- (200, '176-200')
138
- )
139
- _bpm_desc_map = {
140
- '20-40': ("glacial pace", "extremely slow tempo", "crawl-like speed", "snail's pace", "almost motionless rhythm", "Larghissimo"),
141
- '40-60': ("broad and slow", "spacious tempo", "unhurried pace", "calm rhythm", "relaxed speed", "Largo"),
142
- '60-66': ("gentle tempo", "leisurely pace", "easy-going rhythm", "unrushed speed", "smooth and slow", 'Larghetto'),
143
- '66-76': ("slow and steady", "deliberate tempo", "unhurried pace", "relaxed rhythm", "easy speed", 'Adagio'),
144
- '76-108': ("walking pace", "moderate tempo", "steady rhythm", "balanced speed", "easy-flowing tempo", "Andante"),
145
- '108-120': ("medium pace", "comfortable tempo", "even rhythm", "measured speed", "controlled tempo", 'Moderato'),
146
- '120-168': ("quick and lively", "brisk pace", "energetic tempo", "upbeat rhythm", "spirited speed", 'Allegro'),
147
- '168-176': ("lively and fast", "bright tempo", "sprightly pace", "vibrant rhythm", "animated speed", 'Vivace'),
148
- '176-200': ("very fast tempo", "rapid pace", "high-speed rhythm", "hurried speed", "accelerated tempo", 'Presto'),
149
- '>200': ("extremely fast", "breakneck speed", "blazing tempo", "lightning-fast rhythm", "supercharged pace", 'Prestissimo')
150
- }
151
- _bpm_desc_map_zh = {
152
- '20-40': ("极度缓慢", "极慢的节奏", "悠长的旋律", "迟缓的节奏", "几乎静止的节奏", "甚缓"),
153
- '40-60': ("宽广而缓慢", "宽敞的节奏", "从容不迫的速度", "平静的节奏", "轻松的速度", "广板"),
154
- '60-66': ("柔和的节奏", "悠闲的速度", "轻松的节奏", "不慌不忙的速度", "平滑而缓慢", '小广板'),
155
- '66-76': ("缓慢而稳定", "沉稳的旋律", "从容不迫的速度", "轻松的节奏", "轻松的速度", '慢板'),
156
- '76-108': ("步行速度", "适中的节奏", "稳定的节奏", "平衡的速度", "流畅的节奏", "行板"),
157
- '108-120': ("中等速度", "舒适的节奏", "均匀的节奏", "有节制的速度", "稳定的氛围", '中板'),
158
- '120-168': ("快速而生动", "轻快的速度", "充满活力的节奏", "欢快的节奏", "富有精神的速度", '快板'),
159
- '168-176': ("生动而快速", "明快的节奏", "活泼的速度", "充满活力的节奏", "生气勃勃的速度", '活泼的'),
160
- '176-200': ("非常快的节奏", "快速的速度", "高速的节奏", "匆忙的速度", "加速的节奏", '急板'),
161
- '>200': ("极快的速度", "极速旋律", "炽热的节奏", "闪电般的节奏", "疾驰的速度", '最急板')
162
- }
163
- def get_bpm_range(bpm):
164
- bpm = int(bpm)
165
- for right, tag in _bpm_range_rights:
166
- if bpm <= right:
167
- return tag
168
- return '>200'
169
-
170
- def gen_bpm_descript(bpm, lang='en'):
171
- bpm_range = get_bpm_range(bpm)
172
- if lang == 'en':
173
- return random.choice(_bpm_desc_map[bpm_range])
174
- elif lang == 'zh':
175
- return random.choice(_bpm_desc_map_zh[bpm_range])
176
- else:
177
- raise ValueError(f"Unknown language {lang}")
178
-
179
- def read_translate(translate: Optional[Dict[str, os.PathLike]]):
180
- if translate is None:
181
- return None
182
- return {k: read_jsonlike(path) for k, path in translate.items()}
183
-
184
-
185
- def tags_to_desc(tag_list, sep=',') -> str:
186
- if not isinstance(tag_list, Sequence):
187
- return str(tag_list)
188
- if isinstance(tag_list, str):
189
- return tag_list
190
- if len(tag_list) <= 0:
191
- return ''
192
- elif len(tag_list) <= 5:
193
- probs = dist_prob_map[len(tag_list)]
194
- tags_num = random.choices(range(1, len(tag_list)+1), probs)[0]
195
- random.shuffle(tag_list)
196
- tag_list = tag_list[:tags_num]
197
- return sep.join(tag_list)
198
- else:
199
- probs = dist_prob_map[5]
200
- tags_num = random.choices(range(1, 6), probs)[0]
201
- random.shuffle(tag_list)
202
- tag_list = tag_list[:tags_num]
203
- return sep.join(tag_list)
204
-
205
-
206
- class PromptTemplate:
207
- def __init__(self, template_text: str, tag_map: Dict[str, str], lang:str ='en'):
208
- self.template_text = template_text
209
- self.tag_map = tag_map
210
- self.lang = lang
211
-
212
- @property
213
- def tags(self):
214
- return tuple(self.tag_map.keys())
215
-
216
- def apply(self, **kwargs):
217
- for tag in list(kwargs.keys()):
218
- if kwargs[tag] == '':
219
- kwargs.pop(tag)
220
- for tag in self.tags:
221
- if tag in kwargs:
222
- kwargs[tag] = self.tag_map[tag].format(**{tag: kwargs[tag]}).strip('[]')
223
- else:
224
- kwargs[tag] = ''
225
- prompt = self.template_text.format(**kwargs)
226
-
227
- return self.beautify(prompt)
228
-
229
- def beautify(self, text):
230
- if self.lang == 'en':
231
- return self._beautify_en(text)
232
- elif self.lang == 'zh':
233
- return self._beautify_zh(text)
234
- else:
235
- raise ValueError(f'Unknown language {self.lang}')
236
-
237
- @staticmethod
238
- def _beautify_en(text):
239
- # no continuous commas without content between them
240
- text = re.sub(r'[,\s]*,[,\s]*', r', ', text)
241
- # no continuous whitespace
242
- text = re.sub(r'\s+', ' ', text)
243
- # the comma is NOT followed by whitespace, and should be followed by ONE whitespace
244
- text = re.sub(r'\s+,', r',', text)
245
- text = re.sub(r',\s+', r', ', text)
246
- # no whitespace before the full stop
247
- text = re.sub(r'\s+\.', r'.', text)
248
- # strip whitespace, comma, and replace ',.'
249
- text = text.strip(' ,')
250
- text = text.replace(',.', '.')
251
- return text
252
-
253
- @staticmethod
254
- def _beautify_zh(text):
255
- # no continuous commas without content between them
256
- text = re.sub(r'[,、\s]*,[,、\s]*', r',', text)
257
- text = re.sub(r'[,、\s]*、[,、\s]*', r'、', text)
258
- # assume there should be NO whitespace in Chinese
259
- text = re.sub(r'\s+', r'', text)
260
- # strip whitespace, comma, and replace ',。'
261
- text = text.strip(', 、')
262
- text = text.replace(',。', '。')
263
- return text
264
-
265
- def __repr__(self):
266
- return f'PromptTemplate({self.template_text!r}, {self.tag_map!r})'
267
-
268
- __str__ = __repr__
269
-
270
- def parse_prompt_template(prompt_template_text, lang='en'):
271
- span_pattern = re.compile(r'\[.*?{.+?}.*?\]', re.DOTALL)
272
- tag_pattern = re.compile(r'{.+?}', re.DOTALL)
273
-
274
- template_text = prompt_template_text.strip()
275
- span_texts = span_pattern.findall(prompt_template_text)
276
- tag_map = {}
277
- for span_text in span_texts:
278
- tag = tag_pattern.findall(span_text)[0].strip('{}')
279
- tag_map[tag] = span_text
280
- template_text = template_text.replace(span_text, '{'+tag+'}')
281
-
282
- return PromptTemplate(template_text=template_text, tag_map=tag_map, lang=lang)
283
-
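- Note: each bracketed span containing a `{tag}` becomes an optional fragment; `apply` fills the fragments it receives and collapses the rest. A worked example with a hypothetical template line (not one of the repo's prompt files):
-
- pt = parse_prompt_template('a [{Genre} piece] [with a {Mood} mood].')
- pt.tags                               # ('Genre', 'Mood')
- pt.apply(Genre='jazz')                # 'a jazz piece.' (the empty Mood span disappears)
- pt.apply(Genre='jazz', Mood='calm')   # 'a jazz piece with a calm mood.'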
284
- def load_prompt_templates(path, num = 5, lang='en') -> List[PromptTemplate]:
285
- with open(path, 'r') as f:
286
- lines = f.readlines()
287
- cnt = 0
288
- pts = []
289
- for line in lines:
290
- pt = parse_prompt_template(line, lang=lang)
291
- cnt += 1
292
- if len(pt.tags) < num:
293
- logger.error(f'Not enough tags in {path} at line {cnt}: {pt.tags}')
294
- pts.append(pt)
295
-
296
- return pts
297
-
298
-
299
- class AudioStockDataset(Dataset):
300
- def __init__(self,
301
- num_examples:int,
302
- metadata_path:str,
303
- duration:float=60,
304
- sr:int = 0,
305
- return_path = False,
306
- return_audio = True,
307
- prompt_template_path: os.PathLike = None,
308
- tag_types = [],
309
- lang = 'en',
310
- translate:Optional[Dict[str, os.PathLike]] = None
311
- ):
312
- self.duration = duration
313
- self.MAX_DURATION = 360
314
- self._load_metadata(metadata_path)
315
- if num_examples > 0:
316
- self.random_choose = True
317
- self.dataset_len = num_examples
318
- else:
319
- self.random_choose = False
320
- self.dataset_len = len(self.data)
321
- self.sr = sr
322
- self.return_path = return_path
323
- self.return_audio = return_audio
324
-
325
- self.use_dynamic_prompt = prompt_template_path is not None
326
- if self.use_dynamic_prompt:
327
- self.prompt_templates = load_prompt_templates(prompt_template_path, num = len(tag_types), lang = lang)
328
- self.tag_types = tag_types
329
-
330
- self.lang = lang
331
- self.translate = read_translate(translate)
332
-
333
- def _load_metadata(self, metadata_path):
334
- total_len = 0; valid_len = 0
335
- with open(metadata_path) as fp:
336
- lines = fp.readlines()
337
- self.data = []
338
- for line in lines:
339
- item = json.loads(line)
340
- total_len += 1
341
- if(item['duration']>self.duration and item['duration']<self.MAX_DURATION):
342
- valid_len += 1
343
- self.data.append(item)
344
- print("Filter data from {} to {}".format(total_len, valid_len))
345
- self.is_info_recorded = bool('Tags' in self.data[0])
346
-
347
- def __len__(self):
348
- return self.dataset_len
349
-
350
- def __getitem__(self, idx):
351
- first_try = True
352
- try_cnt = 0
353
- while True:
354
- try:
355
- if(self.random_choose or not first_try):
356
- index2 = np.random.randint(0,len(self.data))
357
- else:
358
- index2 = idx
359
- first_try = False
360
- return self.getitem_main(index2)
361
- except:
362
- print("Error loading ", self.data[index2]["path"])
363
- try_cnt += 1
364
- if(try_cnt>10):
365
- raise ValueError()
366
-
367
- def getitem_main(self, idx):
368
- path:str = self.data[idx]["path"]
369
- json_path = path[:path.rfind('.')] + ".json"
370
- if self.is_info_recorded:
371
- item = self.data[idx]
372
- else:
373
- with open(json_path) as fp:
374
- item:dict = json.load(fp)
375
- description = self.generate_description(item)
376
- if self.return_audio:
377
- audio, sr = safe_random_seek_read(path, duration=self.duration, sample_rate=self.sr)
378
- else:
379
- audio = None
380
- if self.return_path:
381
- return audio, description, path
382
- return audio, description
383
-
384
-
385
-
386
- def generate_description(self, item):
387
- if self.use_dynamic_prompt:
388
- # dynamically generate prompt from given prompt template
389
- prompt_template = random.choice(self.prompt_templates)
390
- description = self.generate_description_dynamic(item, prompt_template)
391
- else:
392
- # use ordinary static prompt instead
393
- description = self.generate_description_ordinary(item)
394
- return description
395
-
396
- def generate_description_dynamic(self, data, prompt_template: PromptTemplate):
397
- exists_tag = [key for key in data if (key in self.tag_types) and (data[key] is not None) and (len(data[key]) > 0)]
398
-
399
- if len(exists_tag) > 0:
400
- probs = dist_prob_map[len(exists_tag)]
401
- tags_num = random.choices(range(1, len(exists_tag)+1), probs)[0]
402
- random.shuffle(exists_tag)
403
- tags = exists_tag[:tags_num]
404
- tags_args = {tag: self.tags_to_desc(data[tag], tag) for tag in tags}
405
- tags_args = self.handle_BPM_tag(tags_args)
406
- prompt = prompt_template.apply(**tags_args)
407
- else:
408
- # no strong tags, use all weak tags instead
409
- prompt = prompt_template.apply()
410
-
411
- return prompt
412
-
413
- def tags_to_desc(self, tag_list, tag_type) -> str:
414
- if self.lang == 'en':
415
- return tags_to_desc(tag_list)
416
- elif self.lang == 'zh':
417
- if tag_type == 'BPM':
418
- return tags_to_desc(tag_list, sep='、')
419
- translator = self.translate[tag_type]
420
- translated_tag_list = [translator[tag] for tag in tag_list if tag in translator ]
421
- return tags_to_desc(translated_tag_list, sep='、')
422
-
423
- def handle_BPM_tag(self, tags_args):
424
- if "BPM" in tags_args and 'BPMDescript' in self.tag_types:
425
- bpm = tags_args["BPM"]
426
- del tags_args["BPM"]
427
- tag_types_used = random.choice((('BPM',), ('BPMDescript',), ('BPM', 'BPMDescript')))
428
- for tag_type in tag_types_used:
429
- tags_args[tag_type] = bpm if tag_type == 'BPM' else gen_bpm_descript(bpm, lang=self.lang)
430
- return tags_args
431
-
432
- def generate_description_ordinary(self, data, thresh = 0.3):
433
- if self.lang != 'en':
434
- raise ValueError(f'Language {self.lang} is not supported for ordinary description generation')
435
- description = f'a piece of music by {data["Artist"]}'
436
-
437
- # Add genre if available
438
- if data["Genre"] and random.random() > thresh:
439
- genres = ', '.join(data["Genre"])
440
- description += f', belonging to the {genres} genres'
441
-
442
- # Add tags if available
443
- if data["Tags"] and random.random() > thresh:
444
- tags = ', '.join(data["Tags"])
445
- description += f'. This track contains the tags: {tags}'
446
-
447
- # Add moods if available
448
- if data["Mood"] and random.random() > thresh:
449
- moods = ', '.join(data["Mood"])
450
- description += f'. This track conveys a {moods} mood'
451
-
452
- # Add instruments if available
453
- if data["Instrument"] and random.random() > thresh:
454
- instruments = ', '.join(data["Instrument"])
455
- description += f'. It primarily features the following instruments: {instruments}'
456
-
457
- # Add a period to end the description
458
- description += '.'
459
-
460
- return description
461
-
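- Note: a hedged construction sketch for the deleted dataset class above; the metadata/template paths and the tag list are placeholders, not files shipped with the repo:
-
- ds = AudioStockDataset(
-     num_examples=-1,                        # <= 0: iterate the filtered metadata in order
-     metadata_path='data/audiostock.jsonl',  # one JSON object per line with at least 'path' and 'duration'
-     duration=60,
-     sr=48000,
-     return_path=True,
-     prompt_template_path='prompts/en_templates.txt',
-     tag_types=['Genre', 'Mood', 'Instrument', 'BPM'],
-     lang='en',
- )
- audio, description, path = ds[0]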
codeclm/tokenizer/Flow1dVAE/model_1rvq.py CHANGED
@@ -270,8 +270,6 @@ class PromptCondAudioDiffusion(nn.Module):
270
  hubert_layer=None,
271
  ssl_layer=None,
272
  uncondition=True,
273
- out_paint=False,
274
- ssl_path='ckpt/encode-s12k.pt'
275
  ):
276
  super().__init__()
277
 
 
270
  hubert_layer=None,
271
  ssl_layer=None,
272
  uncondition=True,
 
 
273
  ):
274
  super().__init__()
275
 
codeclm/tokenizer/Flow1dVAE/model_2rvq.py DELETED
@@ -1,774 +0,0 @@
1
- import yaml
2
- import random
3
- import inspect
4
- import numpy as np
5
- from tqdm import tqdm
6
- import typing as tp
7
- from abc import ABC
8
-
9
- import torch
10
- import torch.nn as nn
11
- import torch.nn.functional as F
12
- import torchaudio
13
-
14
- from einops import repeat
15
- from tools.torch_tools import wav_to_fbank
16
-
17
- import diffusers
18
- from diffusers.utils.torch_utils import randn_tensor
19
- from diffusers import DDPMScheduler
20
- from models.transformer_2d_flow import Transformer2DModel
21
- from transformers import AutoFeatureExtractor, Wav2Vec2BertModel,HubertModel
22
- # from tools.get_mulan import get_mulan
23
- from third_party.wespeaker.extract_embd import XVECModel
24
- # from libs.rvq2 import RVQEmbedding
25
- from libs.rvq.descript_quantize3_4layer_freezelayer1 import ResidualVectorQuantize
26
-
27
- from models_gpt.models.gpt2_rope2_time_new_correct_mask_noncasual_reflow import GPT2Model
28
- from models_gpt.models.gpt2_config import GPT2Config
29
-
30
- from torch.cuda.amp import autocast
31
-
32
-
33
- from our_MERT_BESTRQ.test import load_model
34
-
35
- class HubertModelWithFinalProj(HubertModel):
36
- def __init__(self, config):
37
- super().__init__(config)
38
-
39
- # The final projection layer is only used for backward compatibility.
40
- # Following https://github.com/auspicious3000/contentvec/issues/6
41
- # Removing this layer is necessary to achieve the desired outcome.
42
- print("hidden_size:",config.hidden_size)
43
- print("classifier_proj_size:",config.classifier_proj_size)
44
- self.final_proj = nn.Linear(config.hidden_size, config.classifier_proj_size)
45
-
46
-
47
- class SampleProcessor(torch.nn.Module):
- def project_sample(self, x: torch.Tensor):
- """Project the original sample to the 'space' where the diffusion will happen."""
- return x
-
- def return_sample(self, z: torch.Tensor):
- """Project back from diffusion space to the actual sample space."""
- return z
52
-
53
- class Feature1DProcessor(SampleProcessor):
54
- def __init__(self, dim: int = 100, power_std = 1., \
55
- num_samples: int = 100_000, cal_num_frames: int = 600):
56
- super().__init__()
57
-
58
- self.num_samples = num_samples
59
- self.dim = dim
60
- self.power_std = power_std
61
- self.cal_num_frames = cal_num_frames
62
- self.register_buffer('counts', torch.zeros(1))
63
- self.register_buffer('sum_x', torch.zeros(dim))
64
- self.register_buffer('sum_x2', torch.zeros(dim))
65
- self.register_buffer('sum_target_x2', torch.zeros(dim))
66
- self.counts: torch.Tensor
67
- self.sum_x: torch.Tensor
68
- self.sum_x2: torch.Tensor
69
-
70
- @property
71
- def mean(self):
72
- mean = self.sum_x / self.counts
73
- if(self.counts < 10):
74
- mean = torch.zeros_like(mean)
75
- return mean
76
-
77
- @property
78
- def std(self):
79
- std = (self.sum_x2 / self.counts - self.mean**2).clamp(min=0).sqrt()
80
- if(self.counts < 10):
81
- std = torch.ones_like(std)
82
- return std
83
-
84
- @property
85
- def target_std(self):
86
- return 1
87
-
88
- def project_sample(self, x: torch.Tensor):
89
- assert x.dim() == 3
90
- if self.counts.item() < self.num_samples:
91
- self.counts += len(x)
92
- self.sum_x += x[:,:,0:self.cal_num_frames].mean(dim=(2,)).sum(dim=0)
93
- self.sum_x2 += x[:,:,0:self.cal_num_frames].pow(2).mean(dim=(2,)).sum(dim=0)
94
- rescale = (self.target_std / self.std.clamp(min=1e-12)) ** self.power_std # same output size
95
- x = (x - self.mean.view(1, -1, 1)) * rescale.view(1, -1, 1)
96
- return x
97
-
98
- def return_sample(self, x: torch.Tensor):
99
- assert x.dim() == 3
100
- rescale = (self.std / self.target_std) ** self.power_std
101
- # print(rescale, self.mean)
102
- x = x * rescale.view(1, -1, 1) + self.mean.view(1, -1, 1)
103
- return x
104
-
105
- def pad_or_tunc_tolen(prior_text_encoder_hidden_states, prior_text_mask, prior_prompt_embeds, len_size=77):
106
- if(prior_text_encoder_hidden_states.shape[1]<len_size):
107
- prior_text_encoder_hidden_states = torch.cat([prior_text_encoder_hidden_states, \
108
- torch.zeros(prior_text_mask.shape[0], len_size-prior_text_mask.shape[1], \
109
- prior_text_encoder_hidden_states.shape[2], device=prior_text_mask.device, \
110
- dtype=prior_text_encoder_hidden_states.dtype)],1)
111
- prior_text_mask = torch.cat([prior_text_mask, torch.zeros(prior_text_mask.shape[0], len_size-prior_text_mask.shape[1], device=prior_text_mask.device, dtype=prior_text_mask.dtype)],1)
112
- else:
113
- prior_text_encoder_hidden_states = prior_text_encoder_hidden_states[:,0:len_size]
114
- prior_text_mask = prior_text_mask[:,0:len_size]
115
- prior_text_encoder_hidden_states = prior_text_encoder_hidden_states.permute(0,2,1).contiguous()
116
- return prior_text_encoder_hidden_states, prior_text_mask, prior_prompt_embeds
117
-
118
- class BASECFM(torch.nn.Module, ABC):
119
- def __init__(
120
- self,
121
- estimator,
122
- mlp,
123
- ssl_layer
124
- ):
125
- super().__init__()
126
- self.sigma_min = 1e-4
127
-
128
- self.estimator = estimator
129
- self.mlp = mlp
130
- self.ssl_layer = ssl_layer
131
-
132
- @torch.inference_mode()
133
- def forward(self, mu, n_timesteps, temperature=1.0):
134
- """Forward diffusion
135
-
136
- Args:
137
- mu (torch.Tensor): output of encoder
138
- shape: (batch_size, n_channels, mel_timesteps, n_feats)
139
- n_timesteps (int): number of diffusion steps
140
- temperature (float, optional): temperature for scaling noise. Defaults to 1.0.
141
-
142
- Returns:
143
- sample: generated mel-spectrogram
144
- shape: (batch_size, n_channels, mel_timesteps, n_feats)
145
- """
146
- z = torch.randn_like(mu) * temperature
147
- t_span = torch.linspace(0, 1, n_timesteps + 1, device=mu.device)
148
- return self.solve_euler(z, t_span=t_span)
149
-
150
- def solve_euler(self, x, latent_mask_input,incontext_x, incontext_length, t_span, mu,attention_mask, guidance_scale):
151
- """
152
- Fixed euler solver for ODEs.
153
- Args:
154
- x (torch.Tensor): random noise
155
- t_span (torch.Tensor): n_timesteps interpolated
156
- shape: (n_timesteps + 1,)
157
- mu (torch.Tensor): output of encoder
158
- shape: (batch_size, n_channels, mel_timesteps, n_feats)
159
- """
160
- t, _, dt = t_span[0], t_span[-1], t_span[1] - t_span[0]
161
- noise = x.clone()
162
-
163
- # I am storing this because I can later plot it by putting a debugger here and saving it to a file
164
- # Or in future might add like a return_all_steps flag
165
- sol = []
166
-
167
- for step in tqdm(range(1, len(t_span))):
168
- # print("incontext_x.shape:",incontext_x.shape)
169
- # print("noise.shape:",noise.shape)
170
- # print("t.shape:",t.shape)
171
- x[:,0:incontext_length,:] = (1 - (1 - self.sigma_min) * t) * noise[:,0:incontext_length,:] + t * incontext_x[:,0:incontext_length,:]
172
- if(guidance_scale > 1.0):
173
-
174
- model_input = torch.cat([ \
175
- torch.cat([latent_mask_input, latent_mask_input], 0), \
176
- torch.cat([incontext_x, incontext_x], 0), \
177
- torch.cat([torch.zeros_like(mu), mu], 0), \
178
- torch.cat([x, x], 0), \
179
- ], 2)
180
- timestep=t.unsqueeze(-1).repeat(2)
181
-
182
- dphi_dt = self.estimator(inputs_embeds=model_input, attention_mask=attention_mask,time_step=timestep).last_hidden_state
183
- dphi_dt_uncond, dphi_dt_cond = dphi_dt.chunk(2,0)
184
- dphi_dt = dphi_dt_uncond + guidance_scale * (dphi_dt_cond - dphi_dt_uncond)
185
- else:
186
- model_input = torch.cat([latent_mask_input, incontext_x, mu, x], 2)
187
- timestep=t.unsqueeze(-1)
188
- dphi_dt = self.estimator(inputs_embeds=model_input, attention_mask=attention_mask,time_step=timestep).last_hidden_state
189
-
190
- dphi_dt = dphi_dt[: ,:, -x.shape[2]:]
191
- # print("dphi_dt.shape:",dphi_dt.shape)
192
- # print("x.shape:",x.shape)
193
-
194
- x = x + dt * dphi_dt
195
- t = t + dt
196
- sol.append(x)
197
- if step < len(t_span) - 1:
198
- dt = t_span[step + 1] - t
199
-
200
- return sol[-1]
201
-
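- Note: the guided branch above is plain classifier-free guidance applied to the predicted velocity before each Euler step; a minimal standalone sketch of that combination (names are illustrative, not the repo's API):
-
- def cfg_velocity(v_uncond, v_cond, guidance_scale):
-     # guidance_scale == 1.0 falls back to the conditional prediction
-     return v_uncond + guidance_scale * (v_cond - v_uncond)
-
- # Euler step with the guided velocity: x = x + dt * cfg_velocity(v_uncond, v_cond, guidance_scale)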
202
- def projection_loss(self,hidden_proj, bestrq_emb):
203
- bsz = hidden_proj.shape[0]
204
-
205
- hidden_proj_normalized = F.normalize(hidden_proj, dim=-1)
206
- bestrq_emb_normalized = F.normalize(bestrq_emb, dim=-1)
207
-
208
- proj_loss = -(hidden_proj_normalized * bestrq_emb_normalized).sum(dim=-1)
209
- proj_loss = 1+proj_loss.mean()
210
-
211
- return proj_loss
212
-
213
- def compute_loss(self, x1, mu, latent_masks,attention_mask,wav2vec_embeds, validation_mode=False):
214
- """Computes diffusion loss
215
-
216
- Args:
217
- x1 (torch.Tensor): Target
218
- shape: (batch_size, n_channels, mel_timesteps, n_feats)
219
- mu (torch.Tensor): output of encoder
220
- shape: (batch_size, n_channels, mel_timesteps, n_feats)
221
-
222
- Returns:
223
- loss: conditional flow matching loss
224
- y: conditional flow
225
- shape: (batch_size, n_channels, mel_timesteps, n_feats)
226
- """
227
- b = mu[0].shape[0]
228
- len_x = x1.shape[2]
229
- # random timestep
230
- if(validation_mode):
231
- t = torch.ones([b, 1, 1], device=mu[0].device, dtype=mu[0].dtype) * 0.5
232
- else:
233
- t = torch.rand([b, 1, 1], device=mu[0].device, dtype=mu[0].dtype)
234
- # sample noise p(x_0)
235
- z = torch.randn_like(x1)
236
-
237
- y = (1 - (1 - self.sigma_min) * t) * z + t * x1
238
- u = x1 - (1 - self.sigma_min) * z
239
- # print("y.shape:",y.shape)
240
- #self.unet(inputs_embeds=model_input, attention_mask=attention_mask,encoder_hidden_states=text_embedding,encoder_attention_mask=txt_attn_mask,time_step=timesteps).last_hidden_state
241
- model_input = torch.cat([*mu,y], 2)
242
- t=t.squeeze(-1).squeeze(-1)
243
- # print("model_input.shape:",model_input.shape)
244
- # print("attention_mask.shape:",attention_mask.shape)
245
- out = self.estimator(inputs_embeds=model_input, attention_mask=attention_mask,time_step=t,output_hidden_states=True)
246
- hidden_layer = out.hidden_states[self.ssl_layer]
247
- hidden_proj = self.mlp(hidden_layer)
248
- # print("hidden_proj.shape:",hidden_proj.shape)
249
- # print("mert_emb.shape:",mert_emb.shape)
250
- # exit()
251
-
252
-
253
- out = out.last_hidden_state
254
-
255
- out=out[:,:,-len_x:]
256
- # out=self.proj_out(out)
257
-
258
- weight = (latent_masks > 1.5).unsqueeze(-1).repeat(1, 1, out.shape[-1]).float() + (latent_masks < 0.5).unsqueeze(-1).repeat(1, 1, out.shape[-1]).float() * 0.01
259
- # print("out.shape",out.shape)
260
- # print("u.shape",u.shape)
261
- loss_re = F.mse_loss(out * weight, u * weight, reduction="sum") / weight.sum()
262
- # print("hidden_proj.shape:",hidden_proj.shape)
263
- # print("wav2vec_embeds.shape:",wav2vec_embeds.shape)
264
- loss_cos = self.projection_loss(hidden_proj, wav2vec_embeds)
265
- loss = loss_re + loss_cos * 0.5
266
- # print("loss_cos:",loss_cos,loss_cos.device)
267
- print("loss:",loss,loss.device)
268
- # exit()
269
- return loss, loss_re, loss_cos
270
-
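- Note: `compute_loss` above is standard conditional flow matching; the interpolant and regression target it builds are, as a self-contained sketch (shapes are illustrative only):
-
- import torch
-
- sigma_min = 1e-4
- x1 = torch.randn(2, 750, 64)   # target latents (illustrative shape: batch, frames, dims)
- z  = torch.randn_like(x1)      # noise sample
- t  = torch.rand(2, 1, 1)       # flow time in [0, 1]
- y  = (1 - (1 - sigma_min) * t) * z + t * x1   # point fed to the GPT-2 estimator
- u  = x1 - (1 - sigma_min) * z                 # velocity the estimator regresses with weighted MSE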
271
- class PromptCondAudioDiffusion(nn.Module):
272
- def __init__(
273
- self,
274
- num_channels,
275
- unet_model_name=None,
276
- unet_model_config_path=None,
277
- snr_gamma=None,
278
- hubert_layer=None,
279
- ssl_layer=None,
280
- uncondition=True,
281
- out_paint=False,
282
- ):
283
- super().__init__()
284
-
285
- assert unet_model_name is not None or unet_model_config_path is not None, "Either UNet pretrain model name or a config file path is required"
286
-
287
- self.unet_model_name = unet_model_name
288
- self.unet_model_config_path = unet_model_config_path
289
- self.snr_gamma = snr_gamma
290
- self.uncondition = uncondition
291
- self.num_channels = num_channels
292
- self.hubert_layer = hubert_layer
293
- self.ssl_layer = ssl_layer
294
-
295
- # https://huggingface.co/docs/diffusers/v0.14.0/en/api/schedulers/overview
296
- self.normfeat = Feature1DProcessor(dim=64)
297
-
298
- self.sample_rate = 48000
299
- self.num_samples_perseg = self.sample_rate * 20 // 1000
300
- self.rsp48toclap = torchaudio.transforms.Resample(48000, 24000)
301
- self.rsq48towav2vec = torchaudio.transforms.Resample(48000, 16000)
302
- # self.wav2vec = Wav2Vec2BertModel.from_pretrained("facebook/w2v-bert-2.0", trust_remote_code=True)
303
- # self.wav2vec_processor = AutoFeatureExtractor.from_pretrained("facebook/w2v-bert-2.0", trust_remote_code=True)
304
- self.bestrq = load_model(
305
- model_dir='path/to/our-MERT/mert_fairseq',
306
- checkpoint_dir='checkpoint-120000.pt',
307
- )
308
- self.rsq48tobestrq = torchaudio.transforms.Resample(48000, 24000)
309
- self.rsq48tohubert = torchaudio.transforms.Resample(48000, 16000)
310
- for v in self.bestrq.parameters():v.requires_grad = False
311
- self.rvq_bestrq_emb = ResidualVectorQuantize(input_dim = 1024, n_codebooks = 2, codebook_size = 16_384, codebook_dim = 32, quantizer_dropout = 0.0, stale_tolerance=200)
312
- # for v in self.rvq_bestrq_emb.parameters():
313
- # print(v)
314
- freeze_parameters='quantizers.0'
315
- for name, param in self.rvq_bestrq_emb.named_parameters():
316
- if freeze_parameters in name:
317
- param.requires_grad = False
318
- print("Freezing RVQ parameters:", name)
319
- self.hubert = HubertModelWithFinalProj.from_pretrained("huggingface_cache/models--lengyue233--content-vec-best/snapshots/c0b9ba13db21beaa4053faae94c102ebe326fd68")
320
- for v in self.hubert.parameters():v.requires_grad = False
321
- self.zero_cond_embedding1 = nn.Parameter(torch.randn(32*32,))
322
- # self.xvecmodel = XVECModel()
323
- config = GPT2Config(n_positions=1000,n_layer=39,n_head=30,n_embd=1200)
324
- unet = GPT2Model(config)
325
- mlp = nn.Sequential(
326
- nn.Linear(1200, 1024),
327
- nn.SiLU(),
328
- nn.Linear(1024, 1024),
329
- nn.SiLU(),
330
- nn.Linear(1024, 768)
331
- )
332
- self.set_from = "random"
333
- self.cfm_wrapper = BASECFM(unet, mlp,self.ssl_layer)
334
- self.mask_emb = torch.nn.Embedding(3, 48)
335
- print("Transformer initialized from pretrain.")
336
- torch.cuda.empty_cache()
337
- # self.unet.set_attn_processor(AttnProcessor2_0())
338
- # self.unet.set_use_memory_efficient_attention_xformers(True)
339
-
340
- # self.start_embedding = nn.Parameter(torch.randn(1,1024))
341
- # self.end_embedding = nn.Parameter(torch.randn(1,1024))
342
-
343
- def compute_snr(self, timesteps):
344
- """
345
- Computes SNR as per https://github.com/TiankaiHang/Min-SNR-Diffusion-Training/blob/521b624bd70c67cee4bdf49225915f5945a872e3/guided_diffusion/gaussian_diffusion.py#L847-L849
346
- """
347
- alphas_cumprod = self.noise_scheduler.alphas_cumprod
348
- sqrt_alphas_cumprod = alphas_cumprod**0.5
349
- sqrt_one_minus_alphas_cumprod = (1.0 - alphas_cumprod) ** 0.5
350
-
351
- # Expand the tensors.
352
- # Adapted from https://github.com/TiankaiHang/Min-SNR-Diffusion-Training/blob/521b624bd70c67cee4bdf49225915f5945a872e3/guided_diffusion/gaussian_diffusion.py#L1026
353
- sqrt_alphas_cumprod = sqrt_alphas_cumprod.to(device=timesteps.device)[timesteps].float()
354
- while len(sqrt_alphas_cumprod.shape) < len(timesteps.shape):
355
- sqrt_alphas_cumprod = sqrt_alphas_cumprod[..., None]
356
- alpha = sqrt_alphas_cumprod.expand(timesteps.shape)
357
-
358
- sqrt_one_minus_alphas_cumprod = sqrt_one_minus_alphas_cumprod.to(device=timesteps.device)[timesteps].float()
359
- while len(sqrt_one_minus_alphas_cumprod.shape) < len(timesteps.shape):
360
- sqrt_one_minus_alphas_cumprod = sqrt_one_minus_alphas_cumprod[..., None]
361
- sigma = sqrt_one_minus_alphas_cumprod.expand(timesteps.shape)
362
-
363
- # Compute SNR.
364
- snr = (alpha / sigma) ** 2
365
- return snr
366
-
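- Note: the SNR computed above reduces to alphas_cumprod / (1 - alphas_cumprod); a one-line equivalent, assuming a diffusers scheduler that exposes an `alphas_cumprod` tensor:
-
- snr = scheduler.alphas_cumprod[timesteps] / (1.0 - scheduler.alphas_cumprod[timesteps])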
367
- def preprocess_audio(self, input_audios, threshold=0.9):
368
- assert len(input_audios.shape) == 2, input_audios.shape
369
- norm_value = torch.ones_like(input_audios[:,0])
370
- max_volume = input_audios.abs().max(dim=-1)[0]
371
- norm_value[max_volume>threshold] = max_volume[max_volume>threshold] / threshold
372
- return input_audios/norm_value.unsqueeze(-1)
373
-
374
- def extract_wav2vec_embeds(self, input_audios,output_len):
375
- wav2vec_stride = 2
376
-
377
- wav2vec_embeds = self.hubert(self.rsq48tohubert(input_audios), output_hidden_states=True).hidden_states # 1, 4096, 1024
378
- # print(wav2vec_embeds)
379
- # print("audio.shape:",input_audios.shape)
380
- wav2vec_embeds_last=wav2vec_embeds[self.hubert_layer]
381
- # print("wav2vec_embeds_last.shape:",wav2vec_embeds_last.shape)
382
- wav2vec_embeds_last=torch.nn.functional.interpolate(wav2vec_embeds_last.permute(0, 2, 1), size=output_len, mode='linear', align_corners=False).permute(0, 2, 1)
383
- return wav2vec_embeds_last
384
-
385
- def extract_mert_embeds(self, input_audios):
386
- prompt_stride = 3
387
- inputs = self.clap_embd_extractor.mulan.audio.processor(self.rsp48toclap(input_audios), sampling_rate=self.clap_embd_extractor.mulan.audio.sr, return_tensors="pt")
388
- input_values = inputs['input_values'].squeeze(0).to(input_audios.device, dtype = input_audios.dtype)
389
- prompt_embeds = self.clap_embd_extractor.mulan.audio.model(input_values, output_hidden_states=True).hidden_states # batch_size, Time steps, 1024
390
- mert_emb= prompt_embeds[-1]
391
- mert_emb = torch.nn.functional.interpolate(mert_emb.permute(0, 2, 1), size=500, mode='linear', align_corners=False).permute(0, 2, 1)
392
-
393
- return mert_emb
394
-
395
- def extract_bestrq_embeds(self, input_audio_0,input_audio_1,layer):
396
- self.bestrq.eval()
397
- # print("audio shape:",input_audio_0.shape)
398
- input_wav_mean = (input_audio_0 + input_audio_1) / 2.0
399
- # print("input_wav_mean.shape:",input_wav_mean.shape)
400
- # input_wav_mean = torch.randn(2,1720320*2).to(input_audio_0.device)
401
- input_wav_mean = self.bestrq(self.rsq48tobestrq(input_wav_mean), features_only = True)
402
- layer_results = input_wav_mean['layer_results']
403
- # print("layer_results.shape:",layer_results[layer].shape)
404
- bestrq_emb = layer_results[layer]
405
- bestrq_emb = bestrq_emb.permute(0,2,1).contiguous()
406
- #[b,t,1024] t=t/960
407
- #35.84s->batch,896,1024
408
- return bestrq_emb
409
-
410
-
411
- def extract_spk_embeds(self, input_audios):
412
- spk_embeds = self.xvecmodel(self.rsq48towav2vec(input_audios))
413
- spk_embeds = self.spk_linear(spk_embeds).reshape(spk_embeds.shape[0], 16, 1, 32)
414
- return spk_embeds
415
-
416
- def extract_lyric_feats(self, lyric):
417
- with torch.no_grad():
418
- try:
419
- text_encoder_hidden_states, text_mask, text_prompt_embeds = self.clap_embd_extractor(texts = lyric, return_one=False)
420
- except:
421
- text_encoder_hidden_states, text_mask, text_prompt_embeds = self.clap_embd_extractor(texts = [""] * len(lyric), return_one=False)
422
- text_encoder_hidden_states = text_encoder_hidden_states.to(self.device)
423
- text_mask = text_mask.to(self.device)
424
- text_encoder_hidden_states, text_mask, text_prompt_embeds = \
425
- pad_or_tunc_tolen(text_encoder_hidden_states, text_mask, text_prompt_embeds)
426
- text_encoder_hidden_states = text_encoder_hidden_states.permute(0,2,1).contiguous()
427
- return text_encoder_hidden_states, text_mask
428
-
429
- def extract_energy_bar(self, input_audios):
430
- if(input_audios.shape[-1] % self.num_samples_perseg > 0):
431
- energy_bar = input_audios[:,:-1 * (input_audios.shape[-1] % self.num_samples_perseg)].reshape(input_audios.shape[0],-1,self.num_samples_perseg)
432
- else:
433
- energy_bar = input_audios.reshape(input_audios.shape[0],-1,self.num_samples_perseg)
434
- energy_bar = (energy_bar.pow(2.0).mean(-1).sqrt() + 1e-6).log10() * 20 # B T
435
- energy_bar = (energy_bar / 2.0 + 16).clamp(0,16).int()
436
- energy_embedding = self.energy_embedding(energy_bar)
437
- energy_embedding = energy_embedding.view(energy_embedding.shape[0], energy_embedding.shape[1] // 2, 2, 32).reshape(energy_embedding.shape[0], energy_embedding.shape[1] // 2, 64).permute(0,2,1) # b 128 t
438
- return energy_embedding
439
-
440
- def forward(self, input_audios, lyric, latents, latent_masks, validation_mode=False, \
441
- additional_feats = ['spk', 'lyric'], \
442
- train_rvq=True, train_ssl=False,layer=5):
443
- if not hasattr(self,"device"):
444
- self.device = input_audios.device
445
- if not hasattr(self,"dtype"):
446
- self.dtype = input_audios.dtype
447
- device = self.device
448
- input_audio_0 = input_audios[:,0,:]
449
- input_audio_1 = input_audios[:,1,:]
450
- input_audio_0 = self.preprocess_audio(input_audio_0)
451
- input_audio_1 = self.preprocess_audio(input_audio_1)
452
- input_audios_wav2vec = (input_audio_0 + input_audio_1) / 2.0
453
- # energy_embedding = self.extract_energy_bar(input_audios)
454
- # print("energy_embedding.shape:",energy_embedding.shape)
455
- # with autocast(enabled=False):
456
- if(train_ssl):
457
- self.wav2vec.train()
458
- wav2vec_embeds = self.extract_wav2vec_embeds(input_audios)
459
- self.clap_embd_extractor.train()
460
- prompt_embeds = self.extract_mert_embeds(input_audios)
461
- if('spk' in additional_feats):
462
- self.xvecmodel.train()
463
- spk_embeds = self.extract_spk_embeds(input_audios).repeat(1,1,prompt_embeds.shape[-1]//2,1)
464
- else:
465
- with torch.no_grad():
466
- with autocast(enabled=False):
467
- bestrq_emb = self.extract_bestrq_embeds(input_audio_0,input_audio_1,layer)
468
- # mert_emb = self.extract_mert_embeds(input_audios_mert)
469
-
470
- wav2vec_embeds = self.extract_wav2vec_embeds(input_audios_wav2vec,bestrq_emb.shape[2])
471
-
472
- bestrq_emb = bestrq_emb.detach()
473
- if('lyric' in additional_feats):
474
- text_encoder_hidden_states, text_mask = self.extract_lyric_feats(lyric)
475
- else:
476
- text_encoder_hidden_states, text_mask = None, None
477
-
478
-
479
- if(train_rvq):
480
- random_num=random.random()
481
- if(random_num<0.6):
482
- rvq_layer = 1
483
- elif(random_num<0.8):
484
- rvq_layer = 2
485
- else:
486
- rvq_layer = 4
487
- quantized_bestrq_emb, _, _, commitment_loss_bestrq_emb, codebook_loss_bestrq_emb,_ = self.rvq_bestrq_emb(bestrq_emb,n_quantizers=rvq_layer) # b,d,t
488
- else:
489
- bestrq_emb = bestrq_emb.float()
490
- self.rvq_bestrq_emb.eval()
491
- # with autocast(enabled=False):
492
- quantized_bestrq_emb, _, _, commitment_loss_bestrq_emb, codebook_loss_bestrq_emb,_ = self.rvq_bestrq_emb(bestrq_emb) # b,d,t
493
- commitment_loss_bestrq_emb = commitment_loss_bestrq_emb.detach()
494
- codebook_loss_bestrq_emb = codebook_loss_bestrq_emb.detach()
495
- quantized_bestrq_emb = quantized_bestrq_emb.detach()
496
-
497
- commitment_loss = commitment_loss_bestrq_emb
498
- codebook_loss = codebook_loss_bestrq_emb
499
-
500
-
501
- alpha=1
502
- quantized_bestrq_emb = quantized_bestrq_emb * alpha + bestrq_emb * (1-alpha)
503
-
504
- # print("quantized_bestrq_emb.shape:",quantized_bestrq_emb.shape)
505
- # print("latent_masks.shape:",latent_masks.shape)
506
- # quantized_bestrq_emb = torch.nn.functional.interpolate(quantized_bestrq_emb, size=(int(quantized_bestrq_emb.shape[-1]/999*937),), mode='linear', align_corners=True)
507
-
508
-
509
-
510
- scenario = np.random.choice(['start_seg', 'other_seg'])
511
- if(scenario == 'other_seg'):
512
- for binx in range(input_audios.shape[0]):
513
- # latent_masks[binx,0:64] = 1
514
- latent_masks[binx,0:random.randint(64,128)] = 1
515
- quantized_bestrq_emb = quantized_bestrq_emb.permute(0,2,1).contiguous()
516
- # print("quantized_bestrq_emb.shape:",quantized_bestrq_emb.shape)
517
- # print("quantized_bestrq_emb1.shape:",quantized_bestrq_emb.shape)
518
- # print("latent_masks.shape:",latent_masks.shape)
519
- quantized_bestrq_emb = (latent_masks > 0.5).unsqueeze(-1) * quantized_bestrq_emb \
520
- + (latent_masks < 0.5).unsqueeze(-1) * self.zero_cond_embedding1.reshape(1,1,1024)
521
-
522
-
523
-
524
-
525
- if self.uncondition:
526
- mask_indices = [k for k in range(quantized_bestrq_emb.shape[0]) if random.random() < 0.1]
527
- if len(mask_indices) > 0:
528
- quantized_bestrq_emb[mask_indices] = 0
529
- # print("latents.shape:",latents.shape)
530
- latents = latents.permute(0,2,1).contiguous()
531
- latents = self.normfeat.project_sample(latents)
532
- latents = latents.permute(0,2,1).contiguous()
533
- incontext_latents = latents * ((latent_masks > 0.5) * (latent_masks < 1.5)).unsqueeze(-1).float()
534
- attention_mask=(latent_masks > 0.5)
535
- B, L = attention_mask.size()
536
- attention_mask = attention_mask.view(B, 1, L)
537
- attention_mask = attention_mask * attention_mask.transpose(-1, -2)
538
- attention_mask = attention_mask.unsqueeze(1)
539
- # print("incontext_latents.shape:",incontext_latents.shape)
540
- # print("quantized_bestrq_emb.shape:",quantized_bestrq_emb.shape)
541
- latent_mask_input = self.mask_emb(latent_masks)
542
- #64+48+64+1024
543
- loss,loss_re, loss_cos = self.cfm_wrapper.compute_loss(latents, [latent_mask_input,incontext_latents, quantized_bestrq_emb], latent_masks,attention_mask,wav2vec_embeds, validation_mode=validation_mode)
544
- return loss,loss_re, loss_cos, commitment_loss.mean(), codebook_loss.mean()
545
-
546
- def init_device_dtype(self, device, dtype):
547
- self.device = device
548
- self.dtype = dtype
549
-
550
- @torch.no_grad()
551
- def fetch_codes(self, input_audios, additional_feats,layer,rvq_num=1):
552
- input_audio_0 = input_audios[[0],:]
553
- input_audio_1 = input_audios[[1],:]
554
- input_audio_0 = self.preprocess_audio(input_audio_0)
555
- input_audio_1 = self.preprocess_audio(input_audio_1)
556
-
557
- self.bestrq.eval()
558
-
559
- # bestrq_middle,bestrq_last = self.extract_bestrq_embeds(input_audios)
560
- # bestrq_middle = bestrq_middle.detach()
561
- # bestrq_last = bestrq_last.detach()
562
- bestrq_emb = self.extract_bestrq_embeds(input_audio_0,input_audio_1,layer)
563
- bestrq_emb = bestrq_emb.detach()
564
-
565
- # self.rvq_bestrq_middle.eval()
566
- # quantized_bestrq_middle, codes_bestrq_middle, *_ = self.rvq_bestrq_middle(bestrq_middle) # b,d,t
567
- # self.rvq_bestrq_last.eval()
568
- # quantized_bestrq_last, codes_bestrq_last, *_ = self.rvq_bestrq_last(bestrq_last) # b,d,t
569
-
570
- self.rvq_bestrq_emb.eval()
571
- quantized_bestrq_emb, codes_bestrq_emb, *_ = self.rvq_bestrq_emb(bestrq_emb)
572
- codes_bestrq_emb = codes_bestrq_emb[:,:rvq_num,:]
573
- # print("codes_bestrq_emb.shape:",codes_bestrq_emb.shape)
574
- # exit()
575
-
576
-
577
- if('spk' in additional_feats):
578
- self.xvecmodel.eval()
579
- spk_embeds = self.extract_spk_embeds(input_audios)
580
- else:
581
- spk_embeds = None
582
-
583
- # return [codes_prompt, codes_wav2vec], [prompt_embeds, wav2vec_embeds], spk_embeds
584
- # return [codes_prompt_7, codes_prompt_13, codes_prompt_20, codes_wav2vec_half, codes_wav2vec_last], [prompt_embeds_7, prompt_embeds_13, prompt_embeds_20, wav2vec_embeds_half, wav2vec_embeds_last], spk_embeds
585
- # return [codes_bestrq_middle, codes_bestrq_last], [bestrq_middle, bestrq_last], spk_embeds
586
- return [codes_bestrq_emb], [bestrq_emb], spk_embeds
587
- # return [codes_prompt_13, codes_wav2vec_last], [prompt_embeds_13, wav2vec_embeds_last], spk_embeds
588
-
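- Note: a shape sketch for the RVQ interface used by `fetch_codes` above, matching the unpacking in this file (not a general API guarantee):
-
- # bestrq_emb:             (B, 1024, T) features from the frozen BEST-RQ/MERT encoder
- # quantized, codes, *_ = self.rvq_bestrq_emb(bestrq_emb)
- # codes:                  (B, n_codebooks, T); codes[:, :rvq_num, :] keeps the first rvq_num layers
- # self.rvq_bestrq_emb.from_codes(codes) re-synthesizes the quantized embedding for inference_codes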
589
- @torch.no_grad()
590
- def fetch_codes_batch(self, input_audios, additional_feats,layer,rvq_num=1):
591
- input_audio_0 = input_audios[:,0,:]
592
- input_audio_1 = input_audios[:,1,:]
593
- input_audio_0 = self.preprocess_audio(input_audio_0)
594
- input_audio_1 = self.preprocess_audio(input_audio_1)
595
-
596
- self.bestrq.eval()
597
-
598
- # bestrq_middle,bestrq_last = self.extract_bestrq_embeds(input_audios)
599
- # bestrq_middle = bestrq_middle.detach()
600
- # bestrq_last = bestrq_last.detach()
601
- bestrq_emb = self.extract_bestrq_embeds(input_audio_0,input_audio_1,layer)
602
- bestrq_emb = bestrq_emb.detach()
603
-
604
- # self.rvq_bestrq_middle.eval()
605
- # quantized_bestrq_middle, codes_bestrq_middle, *_ = self.rvq_bestrq_middle(bestrq_middle) # b,d,t
606
- # self.rvq_bestrq_last.eval()
607
- # quantized_bestrq_last, codes_bestrq_last, *_ = self.rvq_bestrq_last(bestrq_last) # b,d,t
608
-
609
- self.rvq_bestrq_emb.eval()
610
- quantized_bestrq_emb, codes_bestrq_emb, *_ = self.rvq_bestrq_emb(bestrq_emb)
611
- # print("codes_bestrq_emb.shape:",codes_bestrq_emb.shape)
612
- codes_bestrq_emb = codes_bestrq_emb[:,:rvq_num,:]
613
- # print("codes_bestrq_emb.shape:",codes_bestrq_emb.shape)
614
- # exit()
615
-
616
-
617
- if('spk' in additional_feats):
618
- self.xvecmodel.eval()
619
- spk_embeds = self.extract_spk_embeds(input_audios)
620
- else:
621
- spk_embeds = None
622
-
623
- # return [codes_prompt, codes_wav2vec], [prompt_embeds, wav2vec_embeds], spk_embeds
624
- # return [codes_prompt_7, codes_prompt_13, codes_prompt_20, codes_wav2vec_half, codes_wav2vec_last], [prompt_embeds_7, prompt_embeds_13, prompt_embeds_20, wav2vec_embeds_half, wav2vec_embeds_last], spk_embeds
625
- # return [codes_bestrq_middle, codes_bestrq_last], [bestrq_middle, bestrq_last], spk_embeds
626
- return [codes_bestrq_emb], [bestrq_emb], spk_embeds
627
- # return [codes_prompt_13, codes_wav2vec_last], [prompt_embeds_13, wav2vec_embeds_last], spk_embeds
628
-
629
- @torch.no_grad()
630
- def fetch_codes_batch_ds(self, input_audios, additional_feats, layer, rvq_num=1, ds=250):
631
- input_audio_0 = input_audios[:,0,:]
632
- input_audio_1 = input_audios[:,1,:]
633
- input_audio_0 = self.preprocess_audio(input_audio_0)
634
- input_audio_1 = self.preprocess_audio(input_audio_1)
635
-
636
- self.bestrq.eval()
637
-
638
- # bestrq_middle,bestrq_last = self.extract_bestrq_embeds(input_audios)
639
- # bestrq_middle = bestrq_middle.detach()
640
- # bestrq_last = bestrq_last.detach()
641
- bestrq_emb = self.extract_bestrq_embeds(input_audio_0,input_audio_1,layer)
642
- bestrq_emb = bestrq_emb.detach()
643
-
644
- # self.rvq_bestrq_middle.eval()
645
- # quantized_bestrq_middle, codes_bestrq_middle, *_ = self.rvq_bestrq_middle(bestrq_middle) # b,d,t
646
- # self.rvq_bestrq_last.eval()
647
- # quantized_bestrq_last, codes_bestrq_last, *_ = self.rvq_bestrq_last(bestrq_last) # b,d,t
648
-
649
- self.rvq_bestrq_emb.eval()
650
- bestrq_emb = torch.nn.functional.avg_pool1d(bestrq_emb, kernel_size=ds, stride=ds)
651
- quantized_bestrq_emb, codes_bestrq_emb, *_ = self.rvq_bestrq_emb(bestrq_emb)
652
- # print("codes_bestrq_emb.shape:",codes_bestrq_emb.shape)
653
- codes_bestrq_emb = codes_bestrq_emb[:,:rvq_num,:]
654
- # print("codes_bestrq_emb.shape:",codes_bestrq_emb.shape)
655
- # exit()
656
-
657
-
658
- if('spk' in additional_feats):
659
- self.xvecmodel.eval()
660
- spk_embeds = self.extract_spk_embeds(input_audios)
661
- else:
662
- spk_embeds = None
663
-
664
- # return [codes_prompt, codes_wav2vec], [prompt_embeds, wav2vec_embeds], spk_embeds
665
- # return [codes_prompt_7, codes_prompt_13, codes_prompt_20, codes_wav2vec_half, codes_wav2vec_last], [prompt_embeds_7, prompt_embeds_13, prompt_embeds_20, wav2vec_embeds_half, wav2vec_embeds_last], spk_embeds
666
- # return [codes_bestrq_middle, codes_bestrq_last], [bestrq_middle, bestrq_last], spk_embeds
667
- return [codes_bestrq_emb], [bestrq_emb], spk_embeds
668
- # return [codes_prompt_13, codes_wav2vec_last], [prompt_embeds_13, wav2vec_embeds_last], spk_embeds
669
-
670
- @torch.no_grad()
671
- def inference_codes(self, codes, spk_embeds, true_latents, latent_length, additional_feats, incontext_length=127,
672
- guidance_scale=2, num_steps=20,
673
- disable_progress=True, scenario='start_seg'):
674
- classifier_free_guidance = guidance_scale > 1.0
675
- device = self.device
676
- dtype = self.dtype
677
- # codes_bestrq_middle, codes_bestrq_last = codes
678
- codes_bestrq_emb = codes[0]
679
-
680
-
681
- batch_size = codes_bestrq_emb.shape[0]
682
-
683
-
684
- quantized_bestrq_emb,_,_=self.rvq_bestrq_emb.from_codes(codes_bestrq_emb)
685
- # quantized_bestrq_emb = torch.nn.functional.interpolate(quantized_bestrq_emb, size=(int(quantized_bestrq_emb.shape[-1]/999*937),), mode='linear', align_corners=True)
686
- quantized_bestrq_emb = quantized_bestrq_emb.permute(0,2,1).contiguous()
687
- print("quantized_bestrq_emb.shape:",quantized_bestrq_emb.shape)
688
- # quantized_bestrq_emb = torch.nn.functional.interpolate(quantized_bestrq_emb, size=(int(quantized_bestrq_emb.shape[-1]/999*937),), mode='linear', align_corners=True)
689
-
690
-
691
-
692
-
693
- if('spk' in additional_feats):
694
- spk_embeds = spk_embeds.repeat(1,1,quantized_bestrq_emb.shape[-2],1).detach()
695
-
696
- num_frames = quantized_bestrq_emb.shape[1]
697
-
698
- num_channels_latents = self.num_channels
699
- shape = (batch_size, num_frames, 64)
700
- latents = randn_tensor(shape, generator=None, device=device, dtype=dtype)
701
-
702
-
703
-
704
- latent_masks = torch.zeros(latents.shape[0], latents.shape[1], dtype=torch.int64, device=latents.device)
705
- latent_masks[:,0:latent_length] = 2
706
- if(scenario=='other_seg'):
707
- latent_masks[:,0:incontext_length] = 1
708
-
709
-
710
-
711
- quantized_bestrq_emb = (latent_masks > 0.5).unsqueeze(-1) * quantized_bestrq_emb \
712
- + (latent_masks < 0.5).unsqueeze(-1) * self.zero_cond_embedding1.reshape(1,1,1024)
713
- true_latents = true_latents.permute(0,2,1).contiguous()
714
- true_latents = self.normfeat.project_sample(true_latents)
715
- true_latents = true_latents.permute(0,2,1).contiguous()
716
- incontext_latents = true_latents * ((latent_masks > 0.5) * (latent_masks < 1.5)).unsqueeze(-1).float()
717
- incontext_length = ((latent_masks > 0.5) * (latent_masks < 1.5)).sum(-1)[0]
718
-
719
-
720
- attention_mask=(latent_masks > 0.5)
721
- B, L = attention_mask.size()
722
- attention_mask = attention_mask.view(B, 1, L)
723
- attention_mask = attention_mask * attention_mask.transpose(-1, -2)
724
- attention_mask = attention_mask.unsqueeze(1)
725
- latent_mask_input = self.mask_emb(latent_masks)
726
-
727
- if('spk' in additional_feats):
728
- # additional_model_input = torch.cat([quantized_bestrq_middle, quantized_bestrq_last, spk_embeds],1)
729
- additional_model_input = torch.cat([quantized_bestrq_emb, spk_embeds],1)
730
- else:
731
- # additional_model_input = torch.cat([quantized_bestrq_middle, quantized_bestrq_last],1)
732
- additional_model_input = torch.cat([quantized_bestrq_emb],1)
733
-
734
- temperature = 1.0
735
- t_span = torch.linspace(0, 1, num_steps + 1, device=quantized_bestrq_emb.device)
736
- latents = self.cfm_wrapper.solve_euler(latents * temperature, latent_mask_input,incontext_latents, incontext_length, t_span, additional_model_input,attention_mask, guidance_scale)
737
-
738
- latents[:,0:incontext_length,:] = incontext_latents[:,0:incontext_length,:]
739
- latents = latents.permute(0,2,1).contiguous()
740
- latents = self.normfeat.return_sample(latents)
741
- # latents = latents.permute(0,2,1).contiguous()
742
- return latents
743
-
744
- @torch.no_grad()
745
- def inference(self, input_audios, lyric, true_latents, latent_length, additional_feats, guidance_scale=2, num_steps=20,
746
- disable_progress=True,layer=5,scenario='start_seg',rvq_num=1):
747
- codes, embeds, spk_embeds = self.fetch_codes(input_audios, additional_feats,layer,rvq_num)
748
-
749
- latents = self.inference_codes(codes, spk_embeds, true_latents, latent_length, additional_feats, \
750
- guidance_scale=guidance_scale, num_steps=num_steps, \
751
- disable_progress=disable_progress,scenario=scenario)
752
- return latents
753
-
754
- @torch.no_grad()
755
- def inference_rtf(self, input_audios, lyric, true_latents, latent_length, additional_feats, guidance_scale=2, num_steps=20,
756
- disable_progress=True,layer=5,scenario='start_seg'):
757
- codes, embeds, spk_embeds = self.fetch_codes(input_audios, additional_feats,layer)
758
- import time
759
- start = time.time()
760
- latents = self.inference_codes(codes, spk_embeds, true_latents, latent_length, additional_feats, \
761
- guidance_scale=guidance_scale, num_steps=num_steps, \
762
- disable_progress=disable_progress,scenario=scenario)
763
- return latents,time.time()-start
764
-
765
- def prepare_latents(self, batch_size, num_frames, num_channels_latents, dtype, device):
766
- divisor = 4
767
- shape = (batch_size, num_channels_latents, num_frames, 32)
768
- if(num_frames%divisor>0):
769
- num_frames = round(num_frames/float(divisor))*divisor
770
- shape = (batch_size, num_channels_latents, num_frames, 32)
771
- latents = randn_tensor(shape, generator=None, device=device, dtype=dtype)
772
- return latents
773
-
774
-
codeclm/tokenizer/Flow1dVAE/model_4rvq.py DELETED
@@ -1,774 +0,0 @@
1
- import yaml
2
- import random
3
- import inspect
4
- import numpy as np
5
- from tqdm import tqdm
6
- import typing as tp
7
- from abc import ABC
8
-
9
- import torch
10
- import torch.nn as nn
11
- import torch.nn.functional as F
12
- import torchaudio
13
-
14
- from einops import repeat
15
- from tools.torch_tools import wav_to_fbank
16
-
17
- import diffusers
18
- from diffusers.utils.torch_utils import randn_tensor
19
- from diffusers import DDPMScheduler
20
- from models.transformer_2d_flow import Transformer2DModel
21
- from transformers import AutoFeatureExtractor, Wav2Vec2BertModel,HubertModel
22
- # from tools.get_mulan import get_mulan
23
- from third_party.wespeaker.extract_embd import XVECModel
24
- # from libs.rvq2 import RVQEmbedding
25
- from libs.rvq.descript_quantize3_4layer_freezelayer1 import ResidualVectorQuantize
26
-
27
- from models_gpt.models.gpt2_rope2_time_new_correct_mask_noncasual_reflow import GPT2Model
28
- from models_gpt.models.gpt2_config import GPT2Config
29
-
30
- from torch.cuda.amp import autocast
31
-
32
-
33
- from our_MERT_BESTRQ.test import load_model
34
-
35
- class HubertModelWithFinalProj(HubertModel):
36
- def __init__(self, config):
37
- super().__init__(config)
38
-
39
- # The final projection layer is only used for backward compatibility.
40
- # Following https://github.com/auspicious3000/contentvec/issues/6
41
- # Remove this layer is necessary to achieve the desired outcome.
42
- print("hidden_size:",config.hidden_size)
43
- print("classifier_proj_size:",config.classifier_proj_size)
44
- self.final_proj = nn.Linear(config.hidden_size, config.classifier_proj_size)
45
-
46
-
47
- class SampleProcessor(torch.nn.Module):
48
- def project_sample(self, x: torch.Tensor):
49
- """Project the original sample to the 'space' where the diffusion will happen."""
50
- """Project back from diffusion space to the actual sample space."""
51
- return z
52
-
53
- class Feature1DProcessor(SampleProcessor):
54
- def __init__(self, dim: int = 100, power_std = 1., \
55
- num_samples: int = 100_000, cal_num_frames: int = 600):
56
- super().__init__()
57
-
58
- self.num_samples = num_samples
59
- self.dim = dim
60
- self.power_std = power_std
61
- self.cal_num_frames = cal_num_frames
62
- self.register_buffer('counts', torch.zeros(1))
63
- self.register_buffer('sum_x', torch.zeros(dim))
64
- self.register_buffer('sum_x2', torch.zeros(dim))
65
- self.register_buffer('sum_target_x2', torch.zeros(dim))
66
- self.counts: torch.Tensor
67
- self.sum_x: torch.Tensor
68
- self.sum_x2: torch.Tensor
69
-
70
- @property
71
- def mean(self):
72
- mean = self.sum_x / self.counts
73
- if(self.counts < 10):
74
- mean = torch.zeros_like(mean)
75
- return mean
76
-
77
- @property
78
- def std(self):
79
- std = (self.sum_x2 / self.counts - self.mean**2).clamp(min=0).sqrt()
80
- if(self.counts < 10):
81
- std = torch.ones_like(std)
82
- return std
83
-
84
- @property
85
- def target_std(self):
86
- return 1
87
-
88
- def project_sample(self, x: torch.Tensor):
89
- assert x.dim() == 3
90
- if self.counts.item() < self.num_samples:
91
- self.counts += len(x)
92
- self.sum_x += x[:,:,0:self.cal_num_frames].mean(dim=(2,)).sum(dim=0)
93
- self.sum_x2 += x[:,:,0:self.cal_num_frames].pow(2).mean(dim=(2,)).sum(dim=0)
94
- rescale = (self.target_std / self.std.clamp(min=1e-12)) ** self.power_std # same output size
95
- x = (x - self.mean.view(1, -1, 1)) * rescale.view(1, -1, 1)
96
- return x
97
-
98
- def return_sample(self, x: torch.Tensor):
99
- assert x.dim() == 3
100
- rescale = (self.std / self.target_std) ** self.power_std
101
- # print(rescale, self.mean)
102
- x = x * rescale.view(1, -1, 1) + self.mean.view(1, -1, 1)
103
- return x
104
-
105
- def pad_or_tunc_tolen(prior_text_encoder_hidden_states, prior_text_mask, prior_prompt_embeds, len_size=77):
106
- if(prior_text_encoder_hidden_states.shape[1]<len_size):
107
- prior_text_encoder_hidden_states = torch.cat([prior_text_encoder_hidden_states, \
108
- torch.zeros(prior_text_mask.shape[0], len_size-prior_text_mask.shape[1], \
109
- prior_text_encoder_hidden_states.shape[2], device=prior_text_mask.device, \
110
- dtype=prior_text_encoder_hidden_states.dtype)],1)
111
- prior_text_mask = torch.cat([prior_text_mask, torch.zeros(prior_text_mask.shape[0], len_size-prior_text_mask.shape[1], device=prior_text_mask.device, dtype=prior_text_mask.dtype)],1)
112
- else:
113
- prior_text_encoder_hidden_states = prior_text_encoder_hidden_states[:,0:len_size]
114
- prior_text_mask = prior_text_mask[:,0:len_size]
115
- prior_text_encoder_hidden_states = prior_text_encoder_hidden_states.permute(0,2,1).contiguous()
116
- return prior_text_encoder_hidden_states, prior_text_mask, prior_prompt_embeds
117
-
118
- class BASECFM(torch.nn.Module, ABC):
119
- def __init__(
120
- self,
121
- estimator,
122
- mlp,
123
- ssl_layer
124
- ):
125
- super().__init__()
126
- self.sigma_min = 1e-4
127
-
128
- self.estimator = estimator
129
- self.mlp = mlp
130
- self.ssl_layer = ssl_layer
131
-
132
- @torch.inference_mode()
133
- def forward(self, mu, n_timesteps, temperature=1.0):
134
- """Forward diffusion
135
-
136
- Args:
137
- mu (torch.Tensor): output of encoder
138
- shape: (batch_size, n_channels, mel_timesteps, n_feats)
139
- n_timesteps (int): number of diffusion steps
140
- temperature (float, optional): temperature for scaling noise. Defaults to 1.0.
141
-
142
- Returns:
143
- sample: generated mel-spectrogram
144
- shape: (batch_size, n_channels, mel_timesteps, n_feats)
145
- """
146
- z = torch.randn_like(mu) * temperature
147
- t_span = torch.linspace(0, 1, n_timesteps + 1, device=mu.device)
148
- return self.solve_euler(z, t_span=t_span)
149
-
150
- def solve_euler(self, x, latent_mask_input,incontext_x, incontext_length, t_span, mu,attention_mask, guidance_scale):
151
- """
152
- Fixed euler solver for ODEs.
153
- Args:
154
- x (torch.Tensor): random noise
155
- t_span (torch.Tensor): n_timesteps interpolated
156
- shape: (n_timesteps + 1,)
157
- mu (torch.Tensor): output of encoder
158
- shape: (batch_size, n_channels, mel_timesteps, n_feats)
159
- """
160
- t, _, dt = t_span[0], t_span[-1], t_span[1] - t_span[0]
161
- noise = x.clone()
162
-
163
- # I am storing this because I can later plot it by putting a debugger here and saving it to a file
164
- # Or in future might add like a return_all_steps flag
165
- sol = []
166
-
167
- for step in tqdm(range(1, len(t_span))):
168
- print("incontext_x.shape:",incontext_x.shape)
169
- print("noise.shape:",noise.shape)
170
- print("t.shape:",t.shape)
171
- x[:,0:incontext_length,:] = (1 - (1 - self.sigma_min) * t) * noise[:,0:incontext_length,:] + t * incontext_x[:,0:incontext_length,:]
172
- if(guidance_scale > 1.0):
173
-
174
- model_input = torch.cat([ \
175
- torch.cat([latent_mask_input, latent_mask_input], 0), \
176
- torch.cat([incontext_x, incontext_x], 0), \
177
- torch.cat([torch.zeros_like(mu), mu], 0), \
178
- torch.cat([x, x], 0), \
179
- ], 2)
180
- timestep=t.unsqueeze(-1).repeat(2)
181
-
182
- dphi_dt = self.estimator(inputs_embeds=model_input, attention_mask=attention_mask,time_step=timestep).last_hidden_state
183
- dphi_dt_uncond, dphi_dt_cond = dphi_dt.chunk(2,0)
184
- dphi_dt = dphi_dt_uncond + guidance_scale * (dphi_dt_cond - dphi_dt_uncond)
185
- else:
186
- model_input = torch.cat([latent_mask_input, incontext_x, mu, x], 2)
187
- timestep=t.unsqueeze(-1)
188
- dphi_dt = self.estimator(inputs_embeds=model_input, attention_mask=attention_mask,time_step=timestep).last_hidden_state
189
-
190
- dphi_dt = dphi_dt[: ,:, -x.shape[2]:]
191
- print("dphi_dt.shape:",dphi_dt.shape)
192
- print("x.shape:",x.shape)
193
-
194
- x = x + dt * dphi_dt
195
- t = t + dt
196
- sol.append(x)
197
- if step < len(t_span) - 1:
198
- dt = t_span[step + 1] - t
199
-
200
- return sol[-1]
201
-
202
- def projection_loss(self,hidden_proj, bestrq_emb):
203
- bsz = hidden_proj.shape[0]
204
-
205
- hidden_proj_normalized = F.normalize(hidden_proj, dim=-1)
206
- bestrq_emb_normalized = F.normalize(bestrq_emb, dim=-1)
207
-
208
- proj_loss = -(hidden_proj_normalized * bestrq_emb_normalized).sum(dim=-1)
209
- proj_loss = 1+proj_loss.mean()
210
-
211
- return proj_loss
212
-
213
- def compute_loss(self, x1, mu, latent_masks,attention_mask,wav2vec_embeds, validation_mode=False):
214
- """Computes diffusion loss
215
-
216
- Args:
217
- x1 (torch.Tensor): Target
218
- shape: (batch_size, n_channels, mel_timesteps, n_feats)
219
- mu (torch.Tensor): output of encoder
220
- shape: (batch_size, n_channels, mel_timesteps, n_feats)
221
-
222
- Returns:
223
- loss: conditional flow matching loss
224
- y: conditional flow
225
- shape: (batch_size, n_channels, mel_timesteps, n_feats)
226
- """
227
- b = mu[0].shape[0]
228
- len_x = x1.shape[2]
229
- # random timestep
230
- if(validation_mode):
231
- t = torch.ones([b, 1, 1], device=mu[0].device, dtype=mu[0].dtype) * 0.5
232
- else:
233
- t = torch.rand([b, 1, 1], device=mu[0].device, dtype=mu[0].dtype)
234
- # sample noise p(x_0)
235
- z = torch.randn_like(x1)
236
-
237
- y = (1 - (1 - self.sigma_min) * t) * z + t * x1
238
- u = x1 - (1 - self.sigma_min) * z
239
- # print("y.shape:",y.shape)
240
- #self.unet(inputs_embeds=model_input, attention_mask=attention_mask,encoder_hidden_states=text_embedding,encoder_attention_mask=txt_attn_mask,time_step=timesteps).last_hidden_state
241
- model_input = torch.cat([*mu,y], 2)
242
- t=t.squeeze(-1).squeeze(-1)
243
- # print("model_input.shape:",model_input.shape)
244
- # print("attention_mask.shape:",attention_mask.shape)
245
- out = self.estimator(inputs_embeds=model_input, attention_mask=attention_mask,time_step=t,output_hidden_states=True)
246
- hidden_layer = out.hidden_states[self.ssl_layer]
247
- hidden_proj = self.mlp(hidden_layer)
248
- # print("hidden_proj.shape:",hidden_proj.shape)
249
- # print("mert_emb.shape:",mert_emb.shape)
250
- # exit()
251
-
252
-
253
- out = out.last_hidden_state
254
-
255
- out=out[:,:,-len_x:]
256
- # out=self.proj_out(out)
257
-
258
- weight = (latent_masks > 1.5).unsqueeze(-1).repeat(1, 1, out.shape[-1]).float() + (latent_masks < 0.5).unsqueeze(-1).repeat(1, 1, out.shape[-1]).float() * 0.01
259
- # print("out.shape",out.shape)
260
- # print("u.shape",u.shape)
261
- loss_re = F.mse_loss(out * weight, u * weight, reduction="sum") / weight.sum()
262
- # print("hidden_proj.shape:",hidden_proj.shape)
263
- # print("wav2vec_embeds.shape:",wav2vec_embeds.shape)
264
- loss_cos = self.projection_loss(hidden_proj, wav2vec_embeds)
265
- loss = loss_re + loss_cos * 0.5
266
- # print("loss_cos:",loss_cos,loss_cos.device)
267
- print("loss:",loss,loss.device)
268
- # exit()
269
- return loss, loss_re, loss_cos
270
-
271
- class PromptCondAudioDiffusion(nn.Module):
272
- def __init__(
273
- self,
274
- num_channels,
275
- unet_model_name=None,
276
- unet_model_config_path=None,
277
- snr_gamma=None,
278
- hubert_layer=None,
279
- ssl_layer=None,
280
- uncondition=True,
281
- out_paint=False,
282
- ):
283
- super().__init__()
284
-
285
- assert unet_model_name is not None or unet_model_config_path is not None, "Either UNet pretrain model name or a config file path is required"
286
-
287
- self.unet_model_name = unet_model_name
288
- self.unet_model_config_path = unet_model_config_path
289
- self.snr_gamma = snr_gamma
290
- self.uncondition = uncondition
291
- self.num_channels = num_channels
292
- self.hubert_layer = hubert_layer
293
- self.ssl_layer = ssl_layer
294
-
295
- # https://huggingface.co/docs/diffusers/v0.14.0/en/api/schedulers/overview
296
- self.normfeat = Feature1DProcessor(dim=64)
297
-
298
- self.sample_rate = 48000
299
- self.num_samples_perseg = self.sample_rate * 20 // 1000
300
- self.rsp48toclap = torchaudio.transforms.Resample(48000, 24000)
301
- self.rsq48towav2vec = torchaudio.transforms.Resample(48000, 16000)
302
- # self.wav2vec = Wav2Vec2BertModel.from_pretrained("facebook/w2v-bert-2.0", trust_remote_code=True)
303
- # self.wav2vec_processor = AutoFeatureExtractor.from_pretrained("facebook/w2v-bert-2.0", trust_remote_code=True)
304
- self.bestrq = load_model(
305
- model_dir='path/to/our-MERT/mert_fairseq',
306
- checkpoint_dir='checkpoint-120000.pt',
307
- )
308
- self.rsq48tobestrq = torchaudio.transforms.Resample(48000, 24000)
309
- self.rsq48tohubert = torchaudio.transforms.Resample(48000, 16000)
310
- for v in self.bestrq.parameters():v.requires_grad = False
311
- self.rvq_bestrq_emb = ResidualVectorQuantize(input_dim = 1024, n_codebooks = 4, codebook_size = 16_384, codebook_dim = 32, quantizer_dropout = 0.0, stale_tolerance=200)
312
- # for v in self.rvq_bestrq_emb.parameters():
313
- # print(v)
314
- freeze_parameters='quantizers.0'
315
- for name, param in self.rvq_bestrq_emb.named_parameters():
316
- if freeze_parameters in name:
317
- param.requires_grad = False
318
- print("Freezing RVQ parameters:", name)
319
- self.hubert = HubertModelWithFinalProj.from_pretrained("huggingface_cache/models--lengyue233--content-vec-best/snapshots/c0b9ba13db21beaa4053faae94c102ebe326fd68")
320
- for v in self.hubert.parameters():v.requires_grad = False
321
- self.zero_cond_embedding1 = nn.Parameter(torch.randn(32*32,))
322
- # self.xvecmodel = XVECModel()
323
- config = GPT2Config(n_positions=1000,n_layer=39,n_head=30,n_embd=1200)
324
- unet = GPT2Model(config)
325
- mlp = nn.Sequential(
326
- nn.Linear(1200, 1024),
327
- nn.SiLU(),
328
- nn.Linear(1024, 1024),
329
- nn.SiLU(),
330
- nn.Linear(1024, 768)
331
- )
332
- self.set_from = "random"
333
- self.cfm_wrapper = BASECFM(unet, mlp,self.ssl_layer)
334
- self.mask_emb = torch.nn.Embedding(3, 48)
335
- print("Transformer initialized from pretrain.")
336
- torch.cuda.empty_cache()
337
- # self.unet.set_attn_processor(AttnProcessor2_0())
338
- # self.unet.set_use_memory_efficient_attention_xformers(True)
339
-
340
- # self.start_embedding = nn.Parameter(torch.randn(1,1024))
341
- # self.end_embedding = nn.Parameter(torch.randn(1,1024))
342
-
343
- def compute_snr(self, timesteps):
344
- """
345
- Computes SNR as per https://github.com/TiankaiHang/Min-SNR-Diffusion-Training/blob/521b624bd70c67cee4bdf49225915f5945a872e3/guided_diffusion/gaussian_diffusion.py#L847-L849
346
- """
347
- alphas_cumprod = self.noise_scheduler.alphas_cumprod
348
- sqrt_alphas_cumprod = alphas_cumprod**0.5
349
- sqrt_one_minus_alphas_cumprod = (1.0 - alphas_cumprod) ** 0.5
350
-
351
- # Expand the tensors.
352
- # Adapted from https://github.com/TiankaiHang/Min-SNR-Diffusion-Training/blob/521b624bd70c67cee4bdf49225915f5945a872e3/guided_diffusion/gaussian_diffusion.py#L1026
353
- sqrt_alphas_cumprod = sqrt_alphas_cumprod.to(device=timesteps.device)[timesteps].float()
354
- while len(sqrt_alphas_cumprod.shape) < len(timesteps.shape):
355
- sqrt_alphas_cumprod = sqrt_alphas_cumprod[..., None]
356
- alpha = sqrt_alphas_cumprod.expand(timesteps.shape)
357
-
358
- sqrt_one_minus_alphas_cumprod = sqrt_one_minus_alphas_cumprod.to(device=timesteps.device)[timesteps].float()
359
- while len(sqrt_one_minus_alphas_cumprod.shape) < len(timesteps.shape):
360
- sqrt_one_minus_alphas_cumprod = sqrt_one_minus_alphas_cumprod[..., None]
361
- sigma = sqrt_one_minus_alphas_cumprod.expand(timesteps.shape)
362
-
363
- # Compute SNR.
364
- snr = (alpha / sigma) ** 2
365
- return snr
366
-
367
- def preprocess_audio(self, input_audios, threshold=0.9):
368
- assert len(input_audios.shape) == 2, input_audios.shape
369
- norm_value = torch.ones_like(input_audios[:,0])
370
- max_volume = input_audios.abs().max(dim=-1)[0]
371
- norm_value[max_volume>threshold] = max_volume[max_volume>threshold] / threshold
372
- return input_audios/norm_value.unsqueeze(-1)
373
-
374
- def extract_wav2vec_embeds(self, input_audios,output_len):
375
- wav2vec_stride = 2
376
-
377
- wav2vec_embeds = self.hubert(self.rsq48tohubert(input_audios), output_hidden_states=True).hidden_states # 1, 4096, 1024
378
- # print(wav2vec_embeds)
379
- # print("audio.shape:",input_audios.shape)
380
- wav2vec_embeds_last=wav2vec_embeds[self.hubert_layer]
381
- # print("wav2vec_embeds_last.shape:",wav2vec_embeds_last.shape)
382
- wav2vec_embeds_last=torch.nn.functional.interpolate(wav2vec_embeds_last.permute(0, 2, 1), size=output_len, mode='linear', align_corners=False).permute(0, 2, 1)
383
- return wav2vec_embeds_last
384
-
385
- def extract_mert_embeds(self, input_audios):
386
- prompt_stride = 3
387
- inputs = self.clap_embd_extractor.mulan.audio.processor(self.rsp48toclap(input_audios), sampling_rate=self.clap_embd_extractor.mulan.audio.sr, return_tensors="pt")
388
- input_values = inputs['input_values'].squeeze(0).to(input_audios.device, dtype = input_audios.dtype)
389
- prompt_embeds = self.clap_embd_extractor.mulan.audio.model(input_values, output_hidden_states=True).hidden_states # batch_size, Time steps, 1024
390
- mert_emb= prompt_embeds[-1]
391
- mert_emb = torch.nn.functional.interpolate(mert_emb.permute(0, 2, 1), size=500, mode='linear', align_corners=False).permute(0, 2, 1)
392
-
393
- return mert_emb
394
-
395
- def extract_bestrq_embeds(self, input_audio_0,input_audio_1,layer):
396
- self.bestrq.eval()
397
- # print("audio shape:",input_audio_0.shape)
398
- input_wav_mean = (input_audio_0 + input_audio_1) / 2.0
399
- # print("input_wav_mean.shape:",input_wav_mean.shape)
400
- # input_wav_mean = torch.randn(2,1720320*2).to(input_audio_0.device)
401
- input_wav_mean = self.bestrq(self.rsq48tobestrq(input_wav_mean), features_only = True)
402
- layer_results = input_wav_mean['layer_results']
403
- # print("layer_results.shape:",layer_results[layer].shape)
404
- bestrq_emb = layer_results[layer]
405
- bestrq_emb = bestrq_emb.permute(0,2,1).contiguous()
406
- #[b,t,1024] t=t/960
407
- #35.84s->batch,896,1024
408
- return bestrq_emb
409
-
410
-
411
- def extract_spk_embeds(self, input_audios):
412
- spk_embeds = self.xvecmodel(self.rsq48towav2vec(input_audios))
413
- spk_embeds = self.spk_linear(spk_embeds).reshape(spk_embeds.shape[0], 16, 1, 32)
414
- return spk_embeds
415
-
416
- def extract_lyric_feats(self, lyric):
417
- with torch.no_grad():
418
- try:
419
- text_encoder_hidden_states, text_mask, text_prompt_embeds = self.clap_embd_extractor(texts = lyric, return_one=False)
420
- except:
421
- text_encoder_hidden_states, text_mask, text_prompt_embeds = self.clap_embd_extractor(texts = [""] * len(lyric), return_one=False)
422
- text_encoder_hidden_states = text_encoder_hidden_states.to(self.device)
423
- text_mask = text_mask.to(self.device)
424
- text_encoder_hidden_states, text_mask, text_prompt_embeds = \
425
- pad_or_tunc_tolen(text_encoder_hidden_states, text_mask, text_prompt_embeds)
426
- text_encoder_hidden_states = text_encoder_hidden_states.permute(0,2,1).contiguous()
427
- return text_encoder_hidden_states, text_mask
428
-
429
- def extract_energy_bar(self, input_audios):
430
- if(input_audios.shape[-1] % self.num_samples_perseg > 0):
431
- energy_bar = input_audios[:,:-1 * (input_audios.shape[-1] % self.num_samples_perseg)].reshape(input_audios.shape[0],-1,self.num_samples_perseg)
432
- else:
433
- energy_bar = input_audios.reshape(input_audios.shape[0],-1,self.num_samples_perseg)
434
- energy_bar = (energy_bar.pow(2.0).mean(-1).sqrt() + 1e-6).log10() * 20 # B T
435
- energy_bar = (energy_bar / 2.0 + 16).clamp(0,16).int()
436
- energy_embedding = self.energy_embedding(energy_bar)
437
- energy_embedding = energy_embedding.view(energy_embedding.shape[0], energy_embedding.shape[1] // 2, 2, 32).reshape(energy_embedding.shape[0], energy_embedding.shape[1] // 2, 64).permute(0,2,1) # b 128 t
438
- return energy_embedding
439
-
440
- def forward(self, input_audios, lyric, latents, latent_masks, validation_mode=False, \
441
- additional_feats = ['spk', 'lyric'], \
442
- train_rvq=True, train_ssl=False,layer=5):
443
- if not hasattr(self,"device"):
444
- self.device = input_audios.device
445
- if not hasattr(self,"dtype"):
446
- self.dtype = input_audios.dtype
447
- device = self.device
448
- input_audio_0 = input_audios[:,0,:]
449
- input_audio_1 = input_audios[:,1,:]
450
- input_audio_0 = self.preprocess_audio(input_audio_0)
451
- input_audio_1 = self.preprocess_audio(input_audio_1)
452
- input_audios_wav2vec = (input_audio_0 + input_audio_1) / 2.0
453
- # energy_embedding = self.extract_energy_bar(input_audios)
454
- # print("energy_embedding.shape:",energy_embedding.shape)
455
- # with autocast(enabled=False):
456
- if(train_ssl):
457
- self.wav2vec.train()
458
- wav2vec_embeds = self.extract_wav2vec_embeds(input_audios)
459
- self.clap_embd_extractor.train()
460
- prompt_embeds = self.extract_mert_embeds(input_audios)
461
- if('spk' in additional_feats):
462
- self.xvecmodel.train()
463
- spk_embeds = self.extract_spk_embeds(input_audios).repeat(1,1,prompt_embeds.shape[-1]//2,1)
464
- else:
465
- with torch.no_grad():
466
- with autocast(enabled=False):
467
- bestrq_emb = self.extract_bestrq_embeds(input_audio_0,input_audio_1,layer)
468
- # mert_emb = self.extract_mert_embeds(input_audios_mert)
469
-
470
- wav2vec_embeds = self.extract_wav2vec_embeds(input_audios_wav2vec,bestrq_emb.shape[2])
471
-
472
- bestrq_emb = bestrq_emb.detach()
473
- if('lyric' in additional_feats):
474
- text_encoder_hidden_states, text_mask = self.extract_lyric_feats(lyric)
475
- else:
476
- text_encoder_hidden_states, text_mask = None, None
477
-
478
-
479
- if(train_rvq):
480
- random_num=random.random()
481
- if(random_num<0.6):
482
- rvq_layer = 1
483
- elif(random_num<0.8):
484
- rvq_layer = 2
485
- else:
486
- rvq_layer = 4
487
- quantized_bestrq_emb, _, _, commitment_loss_bestrq_emb, codebook_loss_bestrq_emb,_ = self.rvq_bestrq_emb(bestrq_emb,n_quantizers=rvq_layer) # b,d,t
488
- else:
489
- bestrq_emb = bestrq_emb.float()
490
- self.rvq_bestrq_emb.eval()
491
- # with autocast(enabled=False):
492
- quantized_bestrq_emb, _, _, commitment_loss_bestrq_emb, codebook_loss_bestrq_emb,_ = self.rvq_bestrq_emb(bestrq_emb) # b,d,t
493
- commitment_loss_bestrq_emb = commitment_loss_bestrq_emb.detach()
494
- codebook_loss_bestrq_emb = codebook_loss_bestrq_emb.detach()
495
- quantized_bestrq_emb = quantized_bestrq_emb.detach()
496
-
497
- commitment_loss = commitment_loss_bestrq_emb
498
- codebook_loss = codebook_loss_bestrq_emb
499
-
500
-
501
- alpha=1
502
- quantized_bestrq_emb = quantized_bestrq_emb * alpha + bestrq_emb * (1-alpha)
503
-
504
- # print("quantized_bestrq_emb.shape:",quantized_bestrq_emb.shape)
505
- # print("latent_masks.shape:",latent_masks.shape)
506
- # quantized_bestrq_emb = torch.nn.functional.interpolate(quantized_bestrq_emb, size=(int(quantized_bestrq_emb.shape[-1]/999*937),), mode='linear', align_corners=True)
507
-
508
-
509
-
510
- scenario = np.random.choice(['start_seg', 'other_seg'])
511
- if(scenario == 'other_seg'):
512
- for binx in range(input_audios.shape[0]):
513
- # latent_masks[binx,0:64] = 1
514
- latent_masks[binx,0:random.randint(64,128)] = 1
515
- quantized_bestrq_emb = quantized_bestrq_emb.permute(0,2,1).contiguous()
516
- # print("quantized_bestrq_emb.shape:",quantized_bestrq_emb.shape)
517
- # print("quantized_bestrq_emb1.shape:",quantized_bestrq_emb.shape)
518
- # print("latent_masks.shape:",latent_masks.shape)
519
- quantized_bestrq_emb = (latent_masks > 0.5).unsqueeze(-1) * quantized_bestrq_emb \
520
- + (latent_masks < 0.5).unsqueeze(-1) * self.zero_cond_embedding1.reshape(1,1,1024)
521
-
522
-
523
-
524
-
525
- if self.uncondition:
526
- mask_indices = [k for k in range(quantized_bestrq_emb.shape[0]) if random.random() < 0.1]
527
- if len(mask_indices) > 0:
528
- quantized_bestrq_emb[mask_indices] = 0
529
- # print("latents.shape:",latents.shape)
530
- latents = latents.permute(0,2,1).contiguous()
531
- latents = self.normfeat.project_sample(latents)
532
- latents = latents.permute(0,2,1).contiguous()
533
- incontext_latents = latents * ((latent_masks > 0.5) * (latent_masks < 1.5)).unsqueeze(-1).float()
534
- attention_mask=(latent_masks > 0.5)
535
- B, L = attention_mask.size()
536
- attention_mask = attention_mask.view(B, 1, L)
537
- attention_mask = attention_mask * attention_mask.transpose(-1, -2)
538
- attention_mask = attention_mask.unsqueeze(1)
539
- # print("incontext_latents.shape:",incontext_latents.shape)
540
- # print("quantized_bestrq_emb.shape:",quantized_bestrq_emb.shape)
541
- latent_mask_input = self.mask_emb(latent_masks)
542
- #64+48+64+1024
543
- loss,loss_re, loss_cos = self.cfm_wrapper.compute_loss(latents, [latent_mask_input,incontext_latents, quantized_bestrq_emb], latent_masks,attention_mask,wav2vec_embeds, validation_mode=validation_mode)
544
- return loss,loss_re, loss_cos, commitment_loss.mean(), codebook_loss.mean()
545
-
546
- def init_device_dtype(self, device, dtype):
547
- self.device = device
548
- self.dtype = dtype
549
-
550
- @torch.no_grad()
551
- def fetch_codes(self, input_audios, additional_feats,layer,rvq_num=1):
552
- input_audio_0 = input_audios[[0],:]
553
- input_audio_1 = input_audios[[1],:]
554
- input_audio_0 = self.preprocess_audio(input_audio_0)
555
- input_audio_1 = self.preprocess_audio(input_audio_1)
556
-
557
- self.bestrq.eval()
558
-
559
- # bestrq_middle,bestrq_last = self.extract_bestrq_embeds(input_audios)
560
- # bestrq_middle = bestrq_middle.detach()
561
- # bestrq_last = bestrq_last.detach()
562
- bestrq_emb = self.extract_bestrq_embeds(input_audio_0,input_audio_1,layer)
563
- bestrq_emb = bestrq_emb.detach()
564
-
565
- # self.rvq_bestrq_middle.eval()
566
- # quantized_bestrq_middle, codes_bestrq_middle, *_ = self.rvq_bestrq_middle(bestrq_middle) # b,d,t
567
- # self.rvq_bestrq_last.eval()
568
- # quantized_bestrq_last, codes_bestrq_last, *_ = self.rvq_bestrq_last(bestrq_last) # b,d,t
569
-
570
- self.rvq_bestrq_emb.eval()
571
- quantized_bestrq_emb, codes_bestrq_emb, *_ = self.rvq_bestrq_emb(bestrq_emb)
572
- codes_bestrq_emb = codes_bestrq_emb[:,:rvq_num,:]
573
- # print("codes_bestrq_emb.shape:",codes_bestrq_emb.shape)
574
- # exit()
575
-
576
-
577
- if('spk' in additional_feats):
578
- self.xvecmodel.eval()
579
- spk_embeds = self.extract_spk_embeds(input_audios)
580
- else:
581
- spk_embeds = None
582
-
583
- # return [codes_prompt, codes_wav2vec], [prompt_embeds, wav2vec_embeds], spk_embeds
584
- # return [codes_prompt_7, codes_prompt_13, codes_prompt_20, codes_wav2vec_half, codes_wav2vec_last], [prompt_embeds_7, prompt_embeds_13, prompt_embeds_20, wav2vec_embeds_half, wav2vec_embeds_last], spk_embeds
585
- # return [codes_bestrq_middle, codes_bestrq_last], [bestrq_middle, bestrq_last], spk_embeds
586
- return [codes_bestrq_emb], [bestrq_emb], spk_embeds
587
- # return [codes_prompt_13, codes_wav2vec_last], [prompt_embeds_13, wav2vec_embeds_last], spk_embeds
588
-
589
- @torch.no_grad()
590
- def fetch_codes_batch(self, input_audios, additional_feats,layer,rvq_num=1):
591
- input_audio_0 = input_audios[:,0,:]
592
- input_audio_1 = input_audios[:,1,:]
593
- input_audio_0 = self.preprocess_audio(input_audio_0)
594
- input_audio_1 = self.preprocess_audio(input_audio_1)
595
-
596
- self.bestrq.eval()
597
-
598
- # bestrq_middle,bestrq_last = self.extract_bestrq_embeds(input_audios)
599
- # bestrq_middle = bestrq_middle.detach()
600
- # bestrq_last = bestrq_last.detach()
601
- bestrq_emb = self.extract_bestrq_embeds(input_audio_0,input_audio_1,layer)
602
- bestrq_emb = bestrq_emb.detach()
603
-
604
- # self.rvq_bestrq_middle.eval()
605
- # quantized_bestrq_middle, codes_bestrq_middle, *_ = self.rvq_bestrq_middle(bestrq_middle) # b,d,t
606
- # self.rvq_bestrq_last.eval()
607
- # quantized_bestrq_last, codes_bestrq_last, *_ = self.rvq_bestrq_last(bestrq_last) # b,d,t
608
-
609
- self.rvq_bestrq_emb.eval()
610
- quantized_bestrq_emb, codes_bestrq_emb, *_ = self.rvq_bestrq_emb(bestrq_emb)
611
- # print("codes_bestrq_emb.shape:",codes_bestrq_emb.shape)
612
- codes_bestrq_emb = codes_bestrq_emb[:,:rvq_num,:]
613
- # print("codes_bestrq_emb.shape:",codes_bestrq_emb.shape)
614
- # exit()
615
-
616
-
617
- if('spk' in additional_feats):
618
- self.xvecmodel.eval()
619
- spk_embeds = self.extract_spk_embeds(input_audios)
620
- else:
621
- spk_embeds = None
622
-
623
- # return [codes_prompt, codes_wav2vec], [prompt_embeds, wav2vec_embeds], spk_embeds
624
- # return [codes_prompt_7, codes_prompt_13, codes_prompt_20, codes_wav2vec_half, codes_wav2vec_last], [prompt_embeds_7, prompt_embeds_13, prompt_embeds_20, wav2vec_embeds_half, wav2vec_embeds_last], spk_embeds
625
- # return [codes_bestrq_middle, codes_bestrq_last], [bestrq_middle, bestrq_last], spk_embeds
626
- return [codes_bestrq_emb], [bestrq_emb], spk_embeds
627
- # return [codes_prompt_13, codes_wav2vec_last], [prompt_embeds_13, wav2vec_embeds_last], spk_embeds
628
-
629
- @torch.no_grad()
630
- def fetch_codes_batch_ds(self, input_audios, additional_feats, layer, rvq_num=1, ds=250):
631
- input_audio_0 = input_audios[:,0,:]
632
- input_audio_1 = input_audios[:,1,:]
633
- input_audio_0 = self.preprocess_audio(input_audio_0)
634
- input_audio_1 = self.preprocess_audio(input_audio_1)
635
-
636
- self.bestrq.eval()
637
-
638
- # bestrq_middle,bestrq_last = self.extract_bestrq_embeds(input_audios)
639
- # bestrq_middle = bestrq_middle.detach()
640
- # bestrq_last = bestrq_last.detach()
641
- bestrq_emb = self.extract_bestrq_embeds(input_audio_0,input_audio_1,layer)
642
- bestrq_emb = bestrq_emb.detach()
643
-
644
- # self.rvq_bestrq_middle.eval()
645
- # quantized_bestrq_middle, codes_bestrq_middle, *_ = self.rvq_bestrq_middle(bestrq_middle) # b,d,t
646
- # self.rvq_bestrq_last.eval()
647
- # quantized_bestrq_last, codes_bestrq_last, *_ = self.rvq_bestrq_last(bestrq_last) # b,d,t
648
-
649
- self.rvq_bestrq_emb.eval()
650
- bestrq_emb = torch.nn.functional.avg_pool1d(bestrq_emb, kernel_size=ds, stride=ds)
651
- quantized_bestrq_emb, codes_bestrq_emb, *_ = self.rvq_bestrq_emb(bestrq_emb)
652
- # print("codes_bestrq_emb.shape:",codes_bestrq_emb.shape)
653
- codes_bestrq_emb = codes_bestrq_emb[:,:rvq_num,:]
654
- # print("codes_bestrq_emb.shape:",codes_bestrq_emb.shape)
655
- # exit()
656
-
657
-
658
- if('spk' in additional_feats):
659
- self.xvecmodel.eval()
660
- spk_embeds = self.extract_spk_embeds(input_audios)
661
- else:
662
- spk_embeds = None
663
-
664
- # return [codes_prompt, codes_wav2vec], [prompt_embeds, wav2vec_embeds], spk_embeds
665
- # return [codes_prompt_7, codes_prompt_13, codes_prompt_20, codes_wav2vec_half, codes_wav2vec_last], [prompt_embeds_7, prompt_embeds_13, prompt_embeds_20, wav2vec_embeds_half, wav2vec_embeds_last], spk_embeds
666
- # return [codes_bestrq_middle, codes_bestrq_last], [bestrq_middle, bestrq_last], spk_embeds
667
- return [codes_bestrq_emb], [bestrq_emb], spk_embeds
668
- # return [codes_prompt_13, codes_wav2vec_last], [prompt_embeds_13, wav2vec_embeds_last], spk_embeds
669
-
670
- @torch.no_grad()
671
- def inference_codes(self, codes, spk_embeds, true_latents, latent_length, additional_feats, incontext_length=127,
672
- guidance_scale=2, num_steps=20,
673
- disable_progress=True, scenario='start_seg'):
674
- classifier_free_guidance = guidance_scale > 1.0
675
- device = self.device
676
- dtype = self.dtype
677
- # codes_bestrq_middle, codes_bestrq_last = codes
678
- codes_bestrq_emb = codes[0]
679
-
680
-
681
- batch_size = codes_bestrq_emb.shape[0]
682
-
683
-
684
- quantized_bestrq_emb,_,_=self.rvq_bestrq_emb.from_codes(codes_bestrq_emb)
685
- # quantized_bestrq_emb = torch.nn.functional.interpolate(quantized_bestrq_emb, size=(int(quantized_bestrq_emb.shape[-1]/999*937),), mode='linear', align_corners=True)
686
- quantized_bestrq_emb = quantized_bestrq_emb.permute(0,2,1).contiguous()
687
- print("quantized_bestrq_emb.shape:",quantized_bestrq_emb.shape)
688
- # quantized_bestrq_emb = torch.nn.functional.interpolate(quantized_bestrq_emb, size=(int(quantized_bestrq_emb.shape[-1]/999*937),), mode='linear', align_corners=True)
689
-
690
-
691
-
692
-
693
- if('spk' in additional_feats):
694
- spk_embeds = spk_embeds.repeat(1,1,quantized_bestrq_emb.shape[-2],1).detach()
695
-
696
- num_frames = quantized_bestrq_emb.shape[1]
697
-
698
- num_channels_latents = self.num_channels
699
- shape = (batch_size, num_frames, 64)
700
- latents = randn_tensor(shape, generator=None, device=device, dtype=dtype)
701
-
702
-
703
-
704
- latent_masks = torch.zeros(latents.shape[0], latents.shape[1], dtype=torch.int64, device=latents.device)
705
- latent_masks[:,0:latent_length] = 2
706
- if(scenario=='other_seg'):
707
- latent_masks[:,0:incontext_length] = 1
708
-
709
-
710
-
711
- quantized_bestrq_emb = (latent_masks > 0.5).unsqueeze(-1) * quantized_bestrq_emb \
712
- + (latent_masks < 0.5).unsqueeze(-1) * self.zero_cond_embedding1.reshape(1,1,1024)
713
- true_latents = true_latents.permute(0,2,1).contiguous()
714
- true_latents = self.normfeat.project_sample(true_latents)
715
- true_latents = true_latents.permute(0,2,1).contiguous()
716
- incontext_latents = true_latents * ((latent_masks > 0.5) * (latent_masks < 1.5)).unsqueeze(-1).float()
717
- incontext_length = ((latent_masks > 0.5) * (latent_masks < 1.5)).sum(-1)[0]
718
-
719
-
720
- attention_mask=(latent_masks > 0.5)
721
- B, L = attention_mask.size()
722
- attention_mask = attention_mask.view(B, 1, L)
723
- attention_mask = attention_mask * attention_mask.transpose(-1, -2)
724
- attention_mask = attention_mask.unsqueeze(1)
725
- latent_mask_input = self.mask_emb(latent_masks)
726
-
727
- if('spk' in additional_feats):
728
- # additional_model_input = torch.cat([quantized_bestrq_middle, quantized_bestrq_last, spk_embeds],1)
729
- additional_model_input = torch.cat([quantized_bestrq_emb, spk_embeds],1)
730
- else:
731
- # additional_model_input = torch.cat([quantized_bestrq_middle, quantized_bestrq_last],1)
732
- additional_model_input = torch.cat([quantized_bestrq_emb],1)
733
-
734
- temperature = 1.0
735
- t_span = torch.linspace(0, 1, num_steps + 1, device=quantized_bestrq_emb.device)
736
- latents = self.cfm_wrapper.solve_euler(latents * temperature, latent_mask_input,incontext_latents, incontext_length, t_span, additional_model_input,attention_mask, guidance_scale)
737
-
738
- latents[:,0:incontext_length,:] = incontext_latents[:,0:incontext_length,:]
739
- latents = latents.permute(0,2,1).contiguous()
740
- latents = self.normfeat.return_sample(latents)
741
- # latents = latents.permute(0,2,1).contiguous()
742
- return latents
743
-
744
- @torch.no_grad()
745
- def inference(self, input_audios, lyric, true_latents, latent_length, additional_feats, guidance_scale=2, num_steps=20,
746
- disable_progress=True,layer=5,scenario='start_seg',rvq_num=1):
747
- codes, embeds, spk_embeds = self.fetch_codes(input_audios, additional_feats,layer,rvq_num)
748
-
749
- latents = self.inference_codes(codes, spk_embeds, true_latents, latent_length, additional_feats, \
750
- guidance_scale=guidance_scale, num_steps=num_steps, \
751
- disable_progress=disable_progress,scenario=scenario)
752
- return latents
753
-
754
- @torch.no_grad()
755
- def inference_rtf(self, input_audios, lyric, true_latents, latent_length, additional_feats, guidance_scale=2, num_steps=20,
756
- disable_progress=True,layer=5,scenario='start_seg'):
757
- codes, embeds, spk_embeds = self.fetch_codes(input_audios, additional_feats,layer)
758
- import time
759
- start = time.time()
760
- latents = self.inference_codes(codes, spk_embeds, true_latents, latent_length, additional_feats, \
761
- guidance_scale=guidance_scale, num_steps=num_steps, \
762
- disable_progress=disable_progress,scenario=scenario)
763
- return latents,time.time()-start
764
-
765
- def prepare_latents(self, batch_size, num_frames, num_channels_latents, dtype, device):
766
- divisor = 4
767
- shape = (batch_size, num_channels_latents, num_frames, 32)
768
- if(num_frames%divisor>0):
769
- num_frames = round(num_frames/float(divisor))*divisor
770
- shape = (batch_size, num_channels_latents, num_frames, 32)
771
- latents = randn_tensor(shape, generator=None, device=device, dtype=dtype)
772
- return latents
773
-
774
-
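The model file deleted above trains a GPT-2 backbone with a conditional flow matching (CFM) objective: a noise sample z is blended with the target latent x1 along a random timestep t, and the network regresses the velocity u = x1 - (1 - sigma_min) * z; at inference, `solve_euler` integrates that velocity field with fixed Euler steps and classifier-free guidance. Below is a minimal, self-contained sketch of the training objective only; the generic `estimator` callable and the plain MSE (without the latent-mask weighting and SSL projection loss of the original) are simplifications for illustration, not the repository's API.

```python
import torch
import torch.nn.functional as F

def cfm_loss(estimator, x1, cond, sigma_min=1e-4):
    """Minimal conditional flow matching loss (sketch of the deleted compute_loss).

    x1:   target latents, shape (B, T, D)
    cond: conditioning features aligned with x1, shape (B, T, C)
    estimator: any callable mapping (B, T, C + D) features and a (B,) timestep
               to a predicted velocity of shape (B, T, D)
    """
    b = x1.shape[0]
    t = torch.rand(b, 1, 1, device=x1.device, dtype=x1.dtype)  # random timestep per sample
    z = torch.randn_like(x1)                                    # noise sample from p(x_0)

    y = (1 - (1 - sigma_min) * t) * z + t * x1                  # point on the probability path
    u = x1 - (1 - sigma_min) * z                                 # target velocity field

    pred = estimator(torch.cat([cond, y], dim=-1), t.view(b))
    return F.mse_loss(pred, u)
```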
codeclm/tokenizer/Flow1dVAE/model_septoken.py CHANGED
@@ -252,8 +252,6 @@ class PromptCondAudioDiffusion(nn.Module):
252
  unet_model_config_path=None,
253
  snr_gamma=None,
254
  uncondition=True,
255
- out_paint=False,
256
- ssl_path='ckpt/encode-s12k.pt'
257
  ):
258
  super().__init__()
259
 
 
codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/config/pretrain/EAT_pretraining_AS2M.yaml DELETED
@@ -1,122 +0,0 @@
1
- # @package _group_
2
-
3
- common:
4
- fp16: true
5
- log_format: json
6
- log_interval: 200
7
- tensorboard_logdir: tb
8
- min_loss_scale: 1e-6
9
- fp16_no_flatten_grads: true
10
- user_dir: ${env:PWD}
11
- seed: 1
12
-
13
- checkpoint:
14
- save_interval: 1
15
- save_interval_updates: 10000
16
- keep_interval_updates: 1
17
- no_epoch_checkpoints: true
18
-
19
- task:
20
- _name: mae_image_pretraining
21
- data: unbalanced_train
22
- rebuild_batches: true
23
- key: source
24
- precompute_mask_config: {}
25
- downsr_16hz: true
26
- audio_mae: true
27
- h5_format: false
28
- target_length: 1024
29
- flexible_mask: false
30
-
31
- dataset:
32
- num_workers: 10
33
- batch_size: 12
34
- skip_invalid_size_inputs_valid_test: true
35
- required_batch_size_multiple: 1
36
- disable_validation: true
37
-
38
- distributed_training:
39
- distributed_world_size: 4
40
- ddp_backend: c10d
41
-
42
- criterion:
43
- _name: model
44
- log_keys:
45
- - ema_decay
46
- - target_var
47
- - pred_var
48
- - model_norm
49
- - ema_norm
50
- - masked_pct
51
-
52
- optimization:
53
- max_update: 400000
54
- lr: [ 0.0005 ]
55
- debug_param_names: true
56
- clip_norm: 4
57
-
58
- optimizer:
59
- _name: composite
60
- dynamic_groups: true
61
- groups:
62
- default:
63
- lr_float: 0.0005
64
- optimizer:
65
- _name: adam
66
- adam_betas: [0.9,0.95]
67
- weight_decay: 0.05
68
- lr_scheduler:
69
- _name: cosine
70
- warmup_updates: 53333
71
-
72
- lr_scheduler: pass_through
73
-
74
- model:
75
- _name: data2vec_multi
76
-
77
- ema_decay: 0.9998
78
- ema_end_decay: 0.99999
79
- ema_anneal_end_step: 100000
80
- instance_norm_target_layer: true
81
- layer_norm_target_layer: false
82
- layer_norm_targets: true
83
- end_of_block_targets: false
84
-
85
- depth: 12
86
- average_top_k_layers: 12
87
- clone_batch: 16
88
-
89
- norm_eps: 1e-6
90
-
91
- min_target_var: 0
92
- min_pred_var: 0
93
-
94
- encoder_dropout: 0
95
- post_mlp_drop: 0
96
- attention_dropout: 0
97
- activation_dropout: 0
98
-
99
- supported_modality: IMAGE
100
- cls_loss: 1
101
-
102
- ema_encoder_only: false
103
-
104
- modalities:
105
- image:
106
- in_chans: 1
107
- inverse_mask: true
108
- mask_prob: 0.8
109
- mask_prob_adjust: 0.07
110
- mask_length: 5
111
- mask_noise_std: 0.01
112
- prenet_depth: 0
113
- ema_local_encoder: true
114
- num_extra_tokens: 1
115
- init_extra_token_zero: false
116
- use_alibi_encoder: false
117
- decoder:
118
- decoder_dim: 768
119
- decoder_groups: 16
120
- decoder_kernel: 3
121
- decoder_layers: 6
122
- input_dropout: 0
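The EAT/data2vec-style configs in this directory anneal the teacher's EMA decay from `ema_decay` to `ema_end_decay` over `ema_anneal_end_step` updates. A small sketch of that schedule is below; the linear ramp is an assumption inferred from the parameter names rather than a copy of fairseq's implementation.

```python
def ema_decay_at(step: int, start: float = 0.9998, end: float = 0.99999,
                 anneal_end_step: int = 100_000) -> float:
    """Teacher EMA decay: ramp linearly from `start` to `end`, then hold."""
    if step >= anneal_end_step:
        return end
    return start + (end - start) * step / anneal_end_step

# Halfway through annealing the decay is already close to its final value:
# ema_decay_at(50_000) == 0.999895
```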
codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/config/pretrain/EAT_pretraining_music_multinodes.yaml DELETED
@@ -1,125 +0,0 @@
1
- # @package _group_
2
-
3
- common:
4
- fp16: true
5
- log_format: json
6
- log_interval: 200
7
- tensorboard_logdir: tb
8
- min_loss_scale: 1e-6
9
- fp16_no_flatten_grads: true
10
- user_dir: ${env:PWD}
11
- seed: 1
12
-
13
- checkpoint:
14
- save_interval: 1
15
- save_interval_updates: 10000
16
- keep_interval_updates: 1000
17
- no_epoch_checkpoints: true
18
-
19
- task:
20
- _name: mae_image_pretraining
21
- data: music4all_sh/
22
- rebuild_batches: true
23
- key: source
24
- precompute_mask_config: {}
25
- downsr_16hz: false
26
- audio_mae: true
27
- h5_format: false
28
- target_length: 752
29
- flexible_mask: false
30
- sample_rate: 24000
31
- fixed_duration: 30
32
-
33
- dataset:
34
- num_workers: 10
35
- batch_size: 12
36
- skip_invalid_size_inputs_valid_test: true
37
- required_batch_size_multiple: 1
38
- disable_validation: true
39
-
40
- distributed_training:
41
- distributed_world_size: 4
42
- ddp_backend: c10d
43
-
44
- criterion:
45
- _name: model
46
- log_keys:
47
- - ema_decay
48
- - target_var
49
- - pred_var
50
- - model_norm
51
- - ema_norm
52
- - masked_pct
53
-
54
- optimization:
55
- max_update: 400000
56
- lr: [ 0.0001 ]
57
- # debug_param_names: true
58
- clip_norm: 4
59
-
60
- optimizer:
61
- _name: composite
62
- # dynamic_groups: true
63
- groups:
64
- default:
65
- lr_float: 0.0005
66
- optimizer:
67
- _name: adam
68
- adam_betas: [0.9,0.95]
69
- weight_decay: 0.05
70
- lr_scheduler:
71
- _name: cosine
72
- warmup_updates: 10000 # 53333
73
-
74
- lr_scheduler: pass_through
75
-
76
- model:
77
- _name: data2vec_multi
78
-
79
- ema_decay: 0.9998
80
- ema_end_decay: 0.99999
81
- ema_anneal_end_step: 100000
82
- instance_norm_target_layer: true
83
- layer_norm_target_layer: false
84
- layer_norm_targets: true
85
- end_of_block_targets: false
86
-
87
- depth: 12
88
- average_top_k_layers: 12
89
- clone_batch: 16
90
-
91
- norm_eps: 1e-6
92
-
93
- min_target_var: 0
94
- min_pred_var: 0
95
-
96
- encoder_dropout: 0
97
- post_mlp_drop: 0
98
- attention_dropout: 0
99
- activation_dropout: 0
100
-
101
- supported_modality: IMAGE
102
- cls_loss: 1
103
-
104
- ema_encoder_only: false
105
-
106
- modalities:
107
- image:
108
- in_chans: 1
109
- inverse_mask: true
110
- mask_prob: 0.8
111
- mask_prob_adjust: 0.07
112
- mask_length: 5
113
- mask_noise_std: 0.01
114
- prenet_depth: 0
115
- ema_local_encoder: true
116
- num_extra_tokens: 1
117
- init_extra_token_zero: false
118
- use_alibi_encoder: false
119
- decoder:
120
- decoder_dim: 768
121
- decoder_groups: 16
122
- decoder_kernel: 3
123
- decoder_layers: 6
124
- input_dropout: 0
125
- target_length: 752
codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/config/pretrain/MERT_RVQ-VAE_CQT_330M.yaml DELETED
@@ -1,137 +0,0 @@
1
- # @package _group_
2
- common:
3
- fp16: false
4
- log_format: json
5
- log_interval: 100
6
- seed: 1337
7
-
8
- # tensorboard_logdir: tblog_proj_name
9
- # wandb_project: wandb_proj_name
10
-
11
- checkpoint:
12
- save_interval_updates: 5000
13
- keep_interval_updates: -1
14
- no_epoch_checkpoints: true
15
-
16
-
17
- distributed_training:
18
- ddp_backend: no_c10d
19
- distributed_backend: 'nccl'
20
- distributed_world_size: 64
21
- nprocs_per_node: 8
22
- find_unused_parameters: true
23
- # reset-dataloader: true
24
-
25
- task:
26
- _name: mert_pretraining
27
- data: ???
28
- label_dir: ???
29
- labels: ???
30
- label_rate: ${model.label_rate}
31
- sharding_data: -1 # data sharding
32
- load_random_data_shard: false
33
- sample_rate: 24000
34
- # crop to 5s
35
- # max_sample_size: 120000
36
- # crop to 5.12s, which corresponds to 384 tokens per audio and can be divided by 8.
37
- max_sample_size: 122880
38
- min_sample_size: 72000
39
-
40
- pad_audio: false
41
- random_crop: true
42
- # normalize: true # must be consistent with extractor_mode: layer_norm
43
- normalize: false # must be consistent with extractor_mode: default (groupnorm)
44
-
45
-
46
- dataset:
47
- num_workers: 6
48
- max_tokens: 900000
49
- skip_invalid_size_inputs_valid_test: true
50
- validate_interval: 1
51
- validate_interval_updates: 10000
52
-
53
- criterion:
54
- _name: hubert
55
- pred_masked_weight: 1.0
56
- pred_nomask_weight: 0.0
57
- loss_weights: [10, 1]
58
-
59
- optimization:
60
- max_update: 1000000
61
- lr: [0.0015]
62
- clip_norm: 1.0
63
- update_freq: [8]
64
-
65
- optimizer:
66
- _name: adam
67
- adam_betas: (0.9,0.98)
68
- adam_eps: 1e-06
69
- weight_decay: 0.01
70
-
71
- lr_scheduler:
72
- _name: polynomial_decay
73
- warmup_updates: 32000
74
-
75
- model:
76
- _name: mert
77
- label_rate: ???
78
- skip_masked: false
79
- skip_nomask: true
80
- mask_prob: 0.8
81
- mask_length: 5
82
-
83
- logit_temp: 0.1
84
-
85
-
86
- # ----- mixture ------
87
- mixture_prob: 0.5
88
- inbatch_noise_augment_len_range: "[12000, 36000]"
89
- inbatch_noise_augment_number_range: "[1, 3]"
90
- inbatch_noise_augment_volume: 1.0
91
- # ------------------------
92
-
93
- # ---- cqt reconstruction, need to add loss weight ---
94
- audio_cqt_loss_m: true
95
- audio_cqt_bins: 336
96
-
97
- final_dim: 128
98
- encoder_layers: 24
99
- encoder_embed_dim: 1024
100
- encoder_ffn_embed_dim: 4096
101
- encoder_attention_heads: 16
102
- # default refers to group norm
103
- extractor_mode: default
104
- # extractor_mode: layer_norm
105
- conv_feature_layers: '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2'
106
- encoder_layerdrop: 0.0
107
- dropout_input: 0.0
108
- dropout_features: 0.0
109
- dropout: 0.0
110
- attention_dropout: 0.0
111
-
112
- layer_norm_first: true
113
- feature_grad_mult: 1.0
114
-
115
- untie_final_proj: true
116
- activation_dropout: 0.0
117
-
118
- deepnorm: false
119
- attention_relax: 32.0
120
-
121
-
122
-
123
- hydra:
124
- job:
125
- config:
126
- override_dirname:
127
- kv_sep: '-'
128
- item_sep: '__'
129
- exclude_keys:
130
- - run
131
- - task.data
132
- - task.label_dir
133
- run:
134
- dir: ???
135
- sweep:
136
- dir: ???
137
- subdir: ${hydra.job.config_name}__${hydra.job.override_dirname}
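The MERT pretraining configs enable an in-batch mixture augmentation (`mixture_prob`, `inbatch_noise_augment_*`). The sketch below illustrates one plausible reading of those knobs, mixing short excerpts from other items in the batch into each waveform; only the parameter names and ranges come from the config, and the exact semantics in mert_fairseq are an assumption.

```python
import random
import torch

def inbatch_noise_augment(wavs: torch.Tensor, mixture_prob: float = 0.5,
                          len_range=(12_000, 36_000), num_range=(1, 3),
                          volume: float = 1.0) -> torch.Tensor:
    """Hypothetical in-batch mixture: add random excerpts of other batch items."""
    b, n = wavs.shape
    out = wavs.clone()
    if b < 2:
        return out
    for i in range(b):
        if random.random() > mixture_prob:
            continue
        for _ in range(random.randint(*num_range)):
            j = random.choice([k for k in range(b) if k != i])
            seg_len = min(random.randint(*len_range), n)
            src = random.randint(0, n - seg_len)
            dst = random.randint(0, n - seg_len)
            out[i, dst:dst + seg_len] += volume * wavs[j, src:src + seg_len]
    return out
```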
codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/config/pretrain/MERT_RVQ-VAE_CQT_330M_multinodes.yaml DELETED
@@ -1,139 +0,0 @@
1
- # @package _group_
2
- common:
3
- fp16: false
4
- log_format: json
5
- log_interval: 100
6
- seed: 1337
7
- # model_parallel_size: 8
8
- # amp: true
9
-
10
- # tensorboard_logdir: tblog_proj_name
11
- # wandb_project: wandb_proj_name
12
-
13
- checkpoint:
14
- save_interval_updates: 5000
15
- keep_interval_updates: -1
16
- no_epoch_checkpoints: true
17
-
18
-
19
- distributed_training:
20
- ddp_backend: c10d
21
- distributed_backend: 'nccl'
22
- distributed_world_size: 64
23
- nprocs_per_node: 8
24
- find_unused_parameters: true
25
- # reset-dataloader: true
26
-
27
- task:
28
- _name: mert_pretraining
29
- data: ???
30
- label_dir: ???
31
- labels: ???
32
- label_rate: ${model.label_rate}
33
- sharding_data: -1 # data sharding
34
- load_random_data_shard: false
35
- sample_rate: 24000
36
- # crop to 5s
37
- # max_sample_size: 120000
38
- # crop to 5.12s, which corresponds to 384 tokens per audio and can be divided by 8.
39
- max_sample_size: 122880
40
- min_sample_size: 72000
41
-
42
- pad_audio: false
43
- random_crop: true
44
- # normalize: true # must be consistent with extractor_mode: layer_norm
45
- normalize: false # must be consistent with extractor_mode: default (groupnorm)
46
-
47
-
48
- dataset:
49
- num_workers: 6
50
- max_tokens: 900000
51
- skip_invalid_size_inputs_valid_test: true
52
- validate_interval: 1
53
- validate_interval_updates: 10000
54
-
55
- criterion:
56
- _name: hubert
57
- pred_masked_weight: 1.0
58
- pred_nomask_weight: 0.0
59
- loss_weights: [10, 1]
60
-
61
- optimization:
62
- max_update: 1000000
63
- lr: [0.0015]
64
- clip_norm: 1.0
65
- update_freq: [8]
66
-
67
- optimizer:
68
- _name: adam
69
- adam_betas: (0.9,0.98)
70
- adam_eps: 1e-06
71
- weight_decay: 0.01
72
-
73
- lr_scheduler:
74
- _name: polynomial_decay
75
- warmup_updates: 32000
76
-
77
- model:
78
- _name: mert
79
- label_rate: ???
80
- skip_masked: false
81
- skip_nomask: true
82
- mask_prob: 0.8
83
- mask_length: 5
84
-
85
- logit_temp: 0.1
86
-
87
-
88
- # ----- mixture ------
89
- mixture_prob: 0.5
90
- inbatch_noise_augment_len_range: "[12000, 36000]"
91
- inbatch_noise_augment_number_range: "[1, 3]"
92
- inbatch_noise_augment_volume: 1.0
93
- # ------------------------
94
-
95
- # ---- cqt reconstruction, need to add loss weight ---
96
- audio_cqt_loss_m: true
97
- audio_cqt_bins: 336
98
-
99
- final_dim: 128
100
- encoder_layers: 24
101
- encoder_embed_dim: 1024
102
- encoder_ffn_embed_dim: 4096
103
- encoder_attention_heads: 16
104
- # default refers to group norm
105
- extractor_mode: default
106
- # extractor_mode: layer_norm
107
- conv_feature_layers: '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2'
108
- encoder_layerdrop: 0.0
109
- dropout_input: 0.0
110
- dropout_features: 0.0
111
- dropout: 0.0
112
- attention_dropout: 0.0
113
-
114
- layer_norm_first: true
115
- feature_grad_mult: 1.0
116
-
117
- untie_final_proj: true
118
- activation_dropout: 0.0
119
-
120
- deepnorm: false
121
- attention_relax: 32.0
122
-
123
-
124
-
125
- hydra:
126
- job:
127
- config:
128
- override_dirname:
129
- kv_sep: '-'
130
- item_sep: '__'
131
- exclude_keys:
132
- - run
133
- - task.data
134
- - task.label_dir
135
- run:
136
- dir: run
137
- sweep:
138
- dir: sweep
139
- subdir: subdir
codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/config/pretrain/MERT_RVQ-VAE_CQT_330M_multinodes_debug1node.yaml DELETED
@@ -1,138 +0,0 @@
1
- # @package _group_
2
- common:
3
- fp16: false
4
- log_format: json
5
- log_interval: 100
6
- seed: 1337
7
- # amp: true
8
-
9
- # tensorboard_logdir: tblog_proj_name
10
- # wandb_project: wandb_proj_name
11
-
12
- checkpoint:
13
- save_interval_updates: 5000
14
- keep_interval_updates: -1
15
- no_epoch_checkpoints: true
16
-
17
-
18
- distributed_training:
19
- ddp_backend: c10d
20
- distributed_backend: 'nccl'
21
- distributed_world_size: 64
22
- nprocs_per_node: 8
23
- find_unused_parameters: true
24
- # reset-dataloader: true
25
-
26
- task:
27
- _name: mert_pretraining
28
- data: ???
29
- label_dir: ???
30
- labels: ???
31
- label_rate: ${model.label_rate}
32
- sharding_data: -1 # data sharding
33
- load_random_data_shard: false
34
- sample_rate: 24000
35
- # crop to 5s
36
- # max_sample_size: 120000
37
- # crop to 5.12s, which corresponds to 384 tokens per audio and can be divided by 8.
38
- max_sample_size: 122880
39
- min_sample_size: 72000
40
-
41
- pad_audio: false
42
- random_crop: true
43
- # normalize: true # must be consistent with extractor_mode: layer_norm
44
- normalize: false # must be consistent with extractor_mode: default (groupnorm)
45
-
46
-
47
- dataset:
48
- num_workers: 6
49
- max_tokens: 900000
50
- skip_invalid_size_inputs_valid_test: true
51
- validate_interval: 1
52
- validate_interval_updates: 10000
53
-
54
- criterion:
55
- _name: hubert
56
- pred_masked_weight: 1.0
57
- pred_nomask_weight: 0.0
58
- loss_weights: [10, 1]
59
-
60
- optimization:
61
- max_update: 1000000
62
- lr: [0.0015]
63
- clip_norm: 1.0
64
- update_freq: [8]
65
-
66
- optimizer:
67
- _name: adam
68
- adam_betas: (0.9,0.98)
69
- adam_eps: 1e-06
70
- weight_decay: 0.01
71
-
72
- lr_scheduler:
73
- _name: polynomial_decay
74
- warmup_updates: 32000
75
-
76
- model:
77
- _name: mert
78
- label_rate: ???
79
- skip_masked: false
80
- skip_nomask: true
81
- mask_prob: 0.8
82
- mask_length: 5
83
-
84
- logit_temp: 0.1
85
-
86
-
87
- # ----- mixture ------
88
- mixture_prob: 0.5
89
- inbatch_noise_augment_len_range: "[12000, 36000]"
90
- inbatch_noise_augment_number_range: "[1, 3]"
91
- inbatch_noise_augment_volume: 1.0
92
- # ------------------------
93
-
94
- # ---- cqt reconstruction, need to add loss weight ---
95
- audio_cqt_loss_m: true
96
- audio_cqt_bins: 336
97
-
98
- final_dim: 128
99
- encoder_layers: 24
100
- encoder_embed_dim: 1024
101
- encoder_ffn_embed_dim: 4096
102
- encoder_attention_heads: 16
103
- # default refers to group norm
104
- extractor_mode: default
105
- # extractor_mode: layer_norm
106
- conv_feature_layers: '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2'
107
- encoder_layerdrop: 0.0
108
- dropout_input: 0.0
109
- dropout_features: 0.0
110
- dropout: 0.0
111
- attention_dropout: 0.0
112
-
113
- layer_norm_first: true
114
- feature_grad_mult: 1.0
115
-
116
- untie_final_proj: true
117
- activation_dropout: 0.0
118
-
119
- deepnorm: false
120
- attention_relax: 32.0
121
-
122
-
123
-
124
- hydra:
125
- job:
126
- config:
127
- override_dirname:
128
- kv_sep: '-'
129
- item_sep: '__'
130
- exclude_keys:
131
- - run
132
- - task.data
133
- - task.label_dir
134
- run:
135
- dir: run
136
- sweep:
137
- dir: sweep
138
- subdir: subdir
codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/config/pretrain/MERT_RVQ-VAE_CQT_330M_multinodes_debug2node.yaml DELETED
@@ -1,139 +0,0 @@
1
- # @package _group_
2
- common:
3
- fp16: false
4
- log_format: json
5
- log_interval: 100
6
- seed: 1337
7
- model_parallel_size: 8
8
- # amp: true
9
-
10
- # tensorboard_logdir: tblog_proj_name
11
- # wandb_project: wandb_proj_name
12
-
13
- checkpoint:
14
- save_interval_updates: 5000
15
- keep_interval_updates: -1
16
- no_epoch_checkpoints: true
17
-
18
-
19
- distributed_training:
20
- ddp_backend: c10d
21
- distributed_backend: 'nccl'
22
- distributed_world_size: 64
23
- nprocs_per_node: 8
24
- find_unused_parameters: true
25
- # reset-dataloader: true
26
-
27
- task:
28
- _name: mert_pretraining
29
- data: ???
30
- label_dir: ???
31
- labels: ???
32
- label_rate: ${model.label_rate}
33
- sharding_data: -1 #数据分块
34
- load_random_data_shard: false
35
- sample_rate: 24000
36
- # crop to 5s
37
- # max_sample_size: 120000
38
- # crop to 5.12s, refers to 384 token per audio, which can be devided by 8.
39
- max_sample_size: 122880
40
- min_sample_size: 72000
41
-
42
- pad_audio: false
43
- random_crop: true
44
- # normalize: true # must be consistent with extractor_mode: layer_norm
45
- normalize: false # must be consistent with extractor_mode: default (groupnorm)
46
-
47
-
48
- dataset:
49
- num_workers: 6
50
- max_tokens: null
51
- skip_invalid_size_inputs_valid_test: true
52
- validate_interval: 1
53
- validate_interval_updates: 10000
54
-
55
- criterion:
56
- _name: hubert
57
- pred_masked_weight: 1.0
58
- pred_nomask_weight: 0.0
59
- loss_weights: [10, 1]
60
-
61
- optimization:
62
- max_update: 1000000
63
- lr: [0.0015]
64
- clip_norm: 1.0
65
- update_freq: [8]
66
-
67
- optimizer:
68
- _name: adam
69
- adam_betas: (0.9,0.98)
70
- adam_eps: 1e-06
71
- weight_decay: 0.01
72
-
73
- lr_scheduler:
74
- _name: polynomial_decay
75
- warmup_updates: 32000
76
-
77
- model:
78
- _name: mert
79
- label_rate: ???
80
- skip_masked: false
81
- skip_nomask: true
82
- mask_prob: 0.8
83
- mask_length: 5
84
-
85
- logit_temp: 0.1
86
-
87
-
88
- # ----- mixture ------
89
- mixture_prob: 0.5
90
- inbatch_noise_augment_len_range: "[12000, 36000]"
91
- inbatch_noise_augment_number_range: "[1, 3]"
92
- inbatch_noise_augment_volume: 1.0
93
- # ------------------------
94
-
95
- # ---- cqt reconstruction, need to add loss weight ---
96
- audio_cqt_loss_m: true
97
- audio_cqt_bins: 336
98
-
99
- final_dim: 128
100
- encoder_layers: 24
101
- encoder_embed_dim: 1024
102
- encoder_ffn_embed_dim: 4096
103
- encoder_attention_heads: 16
104
- # default refers to group norm
105
- extractor_mode: default
106
- # extractor_mode: layer_norm
107
- conv_feature_layers: '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2'
108
- encoder_layerdrop: 0.0
109
- dropout_input: 0.0
110
- dropout_features: 0.0
111
- dropout: 0.0
112
- attention_dropout: 0.0
113
-
114
- layer_norm_first: true
115
- feature_grad_mult: 1.0
116
-
117
- untie_final_proj: true
118
- activation_dropout: 0.0
119
-
120
- deepnorm: false
121
- attention_relax: 32.0
122
-
123
-
124
-
125
- hydra:
126
- job:
127
- config:
128
- override_dirname:
129
- kv_sep: '-'
130
- item_sep: '__'
131
- exclude_keys:
132
- - run
133
- - task.data
134
- - task.label_dir
135
- run:
136
- dir: run
137
- sweep:
138
- dir: sweep
139
- subdir: subdir
codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/config/pretrain/MERT_RVQ-VAE_CQT_330M_orig.yaml DELETED
@@ -1,135 +0,0 @@
1
- # @package _group_
2
- common:
3
- fp16: true
4
- log_format: json
5
- log_interval: 100
6
- seed: 1337
7
- # tensorboard_logdir: tblog_proj_name
8
- # wandb_project: wandb_proj_name
9
-
10
- checkpoint:
11
- save_interval_updates: 5000
12
- keep_interval_updates: -1
13
- no_epoch_checkpoints: true
14
-
15
-
16
- distributed_training:
17
- ddp_backend: no_c10d
18
- distributed_backend: 'nccl'
19
- distributed_world_size: 64
20
- nprocs_per_node: 8
21
- find_unused_parameters: true
22
-
23
- task:
24
- _name: mert_pretraining
25
- data: ???
26
- label_dir: ???
27
- labels: ???
28
- label_rate: ${model.label_rate}
29
- sharding_data: 6
30
- load_random_data_shard: false
31
- sample_rate: 24000
32
- # crop to 5s
33
- # max_sample_size: 120000
34
- # crop to 5.12s, refers to 384 token per audio, which can be devided by 8.
35
- max_sample_size: 122880
36
- min_sample_size: 72000
37
-
38
- pad_audio: false
39
- random_crop: true
40
- # normalize: true # must be consistent with extractor_mode: layer_norm
41
- normalize: false # must be consistent with extractor_mode: default (groupnorm)
42
-
43
-
44
- dataset:
45
- num_workers: 6
46
- max_tokens: 900000
47
- skip_invalid_size_inputs_valid_test: true
48
- validate_interval: 1
49
- validate_interval_updates: 10000
50
-
51
- criterion:
52
- _name: hubert
53
- pred_masked_weight: 1.0
54
- pred_nomask_weight: 0.0
55
- loss_weights: [10, 1]
56
-
57
- optimization:
58
- max_update: 400000
59
- lr: [0.0015]
60
- clip_norm: 1.0
61
- update_freq: [8]
62
-
63
- optimizer:
64
- _name: adam
65
- adam_betas: (0.9,0.98)
66
- adam_eps: 1e-06
67
- weight_decay: 0.01
68
-
69
- lr_scheduler:
70
- _name: polynomial_decay
71
- warmup_updates: 32000
72
-
73
- model:
74
- _name: mert
75
- label_rate: ???
76
- skip_masked: false
77
- skip_nomask: true
78
- mask_prob: 0.8
79
- mask_length: 5
80
-
81
- logit_temp: 0.1
82
-
83
-
84
- # ----- mixture ------
85
- mixture_prob: 0.5
86
- inbatch_noise_augment_len_range: "[12000, 36000]"
87
- inbatch_noise_augment_number_range: "[1, 3]"
88
- inbatch_noise_augment_volume: 1.0
89
- # ------------------------
90
-
91
- # ---- cqt reconstruction, need to add loss weight ---
92
- audio_cqt_loss_m: true
93
- audio_cqt_bins: 336
94
-
95
- final_dim: 128
96
- encoder_layers: 24
97
- encoder_embed_dim: 1024
98
- encoder_ffn_embed_dim: 4096
99
- encoder_attention_heads: 16
100
- # default refers to group norm
101
- extractor_mode: default
102
- # extractor_mode: layer_norm
103
- conv_feature_layers: '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2'
104
- encoder_layerdrop: 0.0
105
- dropout_input: 0.0
106
- dropout_features: 0.0
107
- dropout: 0.0
108
- attention_dropout: 0.0
109
-
110
- layer_norm_first: true
111
- feature_grad_mult: 1.0
112
-
113
- untie_final_proj: true
114
- activation_dropout: 0.0
115
-
116
- deepnorm: false
117
- attention_relax: 32.0
118
-
119
-
120
-
121
- hydra:
122
- job:
123
- config:
124
- override_dirname:
125
- kv_sep: '-'
126
- item_sep: '__'
127
- exclude_keys:
128
- - run
129
- - task.data
130
- - task.label_dir
131
- run:
132
- dir: ???
133
- sweep:
134
- dir: ???
135
- subdir: ${hydra.job.config_name}__${hydra.job.override_dirname}
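These configs pair the Adam optimizer with fairseq's `polynomial_decay` scheduler (`warmup_updates: 32000`, peak `lr: 0.0015`, `max_update: 400000`). A compact sketch of that schedule follows; the linear warmup, power-1 decay, and end learning rate of 0 are assumed defaults rather than values stated in the config.

```python
def polynomial_decay_lr(step: int, peak_lr: float = 0.0015,
                        warmup_updates: int = 32_000, total_updates: int = 400_000,
                        end_lr: float = 0.0, power: float = 1.0) -> float:
    """Linear warmup to peak_lr, then polynomial decay toward end_lr."""
    if step < warmup_updates:
        return peak_lr * step / warmup_updates
    if step >= total_updates:
        return end_lr
    remaining = (total_updates - step) / (total_updates - warmup_updates)
    return end_lr + (peak_lr - end_lr) * remaining ** power
```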
codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/config/pretrain/MERT_RVQ-VAE_CQT_330M_tune.yaml DELETED
@@ -1,137 +0,0 @@
1
- # @package _group_
2
- common:
3
- fp16: true
4
- log_format: json
5
- log_interval: 100
6
- seed: 1337
7
-
8
- # tensorboard_logdir: tblog_proj_name
9
- # wandb_project: wandb_proj_name
10
-
11
- checkpoint:
12
- save_interval_updates: 5000
13
- keep_interval_updates: -1
14
- no_epoch_checkpoints: true
15
-
16
-
17
- distributed_training:
18
- ddp_backend: no_c10d
19
- distributed_backend: 'nccl'
20
- distributed_world_size: 64
21
- nprocs_per_node: 8
22
- find_unused_parameters: true
23
- # reset-dataloader: true
24
-
25
- task:
26
- _name: mert_pretraining
27
- data: ???
28
- label_dir: ???
29
- labels: ???
30
- label_rate: ${model.label_rate}
31
- sharding_data: -1 #数据分块
32
- load_random_data_shard: false
33
- sample_rate: 24000
34
- # crop to 5s
35
- # max_sample_size: 120000
36
- # crop to 5.12s, refers to 384 token per audio, which can be devided by 8.
37
- max_sample_size: 122880
38
- min_sample_size: 72000
39
-
40
- pad_audio: false
41
- random_crop: true
42
- # normalize: true # must be consistent with extractor_mode: layer_norm
43
- normalize: false # must be consistent with extractor_mode: default (groupnorm)
44
-
45
-
46
- dataset:
47
- num_workers: 6
48
- max_tokens: 900000
49
- skip_invalid_size_inputs_valid_test: true
50
- validate_interval: 1
51
- validate_interval_updates: 10000
52
-
53
- criterion:
54
- _name: hubert
55
- pred_masked_weight: 1.0
56
- pred_nomask_weight: 0.0
57
- loss_weights: [10, 1]
58
-
59
- optimization:
60
- max_update: 400000
61
- lr: [0.0015]
62
- clip_norm: 1.0
63
- update_freq: [8]
64
-
65
- optimizer:
66
- _name: adam
67
- adam_betas: (0.9,0.98)
68
- adam_eps: 1e-06
69
- weight_decay: 0.01
70
-
71
- lr_scheduler:
72
- _name: polynomial_decay
73
- warmup_updates: 32000
74
-
75
- model:
76
- _name: mert
77
- label_rate: ???
78
- skip_masked: false
79
- skip_nomask: true
80
- mask_prob: 0.8
81
- mask_length: 5
82
- # freeze_parameters:true
83
- logit_temp: 0.1
84
-
85
-
86
- # ----- mixture ------
87
- mixture_prob: 0.5
88
- inbatch_noise_augment_len_range: "[12000, 36000]"
89
- inbatch_noise_augment_number_range: "[1, 3]"
90
- inbatch_noise_augment_volume: 1.0
91
- # ------------------------
92
-
93
- # ---- cqt reconstruction, need to add loss weight ---
94
- audio_cqt_loss_m: true
95
- audio_cqt_bins: 336
96
-
97
- final_dim: 128
98
- encoder_layers: 24
99
- encoder_embed_dim: 1024
100
- encoder_ffn_embed_dim: 4096
101
- encoder_attention_heads: 16
102
- # default refers to group norm
103
- extractor_mode: default
104
- # extractor_mode: layer_norm
105
- conv_feature_layers: '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2'
106
- encoder_layerdrop: 0.0
107
- dropout_input: 0.0
108
- dropout_features: 0.0
109
- dropout: 0.0
110
- attention_dropout: 0.0
111
-
112
- layer_norm_first: true
113
- feature_grad_mult: 1.0
114
-
115
- untie_final_proj: true
116
- activation_dropout: 0.0
117
-
118
- deepnorm: false
119
- attention_relax: 32.0
120
-
121
-
122
-
123
- hydra:
124
- job:
125
- config:
126
- override_dirname:
127
- kv_sep: '-'
128
- item_sep: '__'
129
- exclude_keys:
130
- - run
131
- - task.data
132
- - task.label_dir
133
- run:
134
- dir: ???
135
- sweep:
136
- dir: ???
137
- subdir: ${hydra.job.config_name}__${hydra.job.override_dirname}
codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/config/pretrain/MERT_RVQ-VAE_CQT_95M.yaml DELETED
@@ -1,116 +0,0 @@
1
- # @package _group_
2
- common:
3
- fp16: false
4
- log_format: json
5
- log_interval: 200
6
- seed: 1337
7
- # tensorboard_logdir: tblog_proj_name
8
- # wandb_project: wandb_proj_name
9
-
10
- checkpoint:
11
- save_interval_updates: 25000
12
- keep_interval_updates: -1
13
- no_epoch_checkpoints: true
14
-
15
-
16
- distributed_training:
17
- ddp_backend: no_c10d
18
- distributed_backend: 'nccl'
19
- distributed_world_size: 64
20
- nprocs_per_node: 8
21
- find_unused_parameters: true
22
-
23
- task:
24
- _name: mert_pretraining
25
- data: ???
26
- label_dir: ???
27
- labels: ???
28
- label_rate: ${model.label_rate}
29
- sample_rate: 24000
30
- # crop to 5s
31
- max_sample_size: 120000
32
- min_sample_size: 72000
33
-
34
- pad_audio: false
35
- random_crop: true
36
- normalize: false # must be consistent with extractor
37
-
38
-
39
- dataset:
40
- num_workers: 6
41
- max_tokens: 2000000
42
- skip_invalid_size_inputs_valid_test: true
43
- validate_interval: 1
44
- validate_interval_updates: 10000
45
-
46
- criterion:
47
- _name: hubert
48
- pred_masked_weight: 1.0
49
- pred_nomask_weight: 0.0
50
- loss_weights: [10, 1]
51
-
52
- optimization:
53
- max_update: 400000
54
- lr: [0.0005]
55
- clip_norm: 10.0
56
-
57
- optimizer:
58
- _name: adam
59
- adam_betas: (0.9,0.98)
60
- adam_eps: 1e-06
61
- weight_decay: 0.01
62
-
63
- lr_scheduler:
64
- _name: polynomial_decay
65
- warmup_updates: 32000
66
-
67
- model:
68
- _name: mert
69
- label_rate: ???
70
- skip_masked: false
71
- skip_nomask: true
72
- mask_prob: 0.8
73
- mask_length: 5
74
-
75
- logit_temp: 0.1
76
-
77
- # ----- mixture ------
78
- mixture_prob: 0.5
79
- inbatch_noise_augment_len_range: "[12000, 24000]"
80
- inbatch_noise_augment_number_range: "[1, 3]"
81
- inbatch_noise_augment_volume: 1.0
82
- # ------------------------
83
- extractor_mode: default
84
- audio_extract_type: w2v_conv
85
- conv_feature_layers: '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2'
86
-
87
- # ---- cqt reconstruction, need to add loss weight ---
88
- audio_cqt_loss_m: true
89
- audio_cqt_bins: 336
90
- # -----------
91
- final_dim: 64
92
- encoder_layerdrop: 0.05
93
- dropout_input: 0.1
94
- dropout_features: 0.1
95
- dropout: 0.1
96
- attention_dropout: 0.1
97
- feature_grad_mult: 0.1
98
- untie_final_proj: true
99
- activation_dropout: 0.0
100
-
101
-
102
- hydra:
103
- job:
104
- config:
105
- override_dirname:
106
- kv_sep: '-'
107
- item_sep: '__'
108
- exclude_keys:
109
- - run
110
- - task.data
111
- - task.label_dir
112
- run:
113
- dir: ???
114
- sweep:
115
- dir: ???
116
- subdir: ${hydra.job.config_name}__${hydra.job.override_dirname}
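The ??? entries in this file (task.data, task.label_dir, task.labels, model.label_rate, and the hydra run/sweep dirs) are OmegaConf "missing" markers that have to be supplied at launch time. Purely as an illustration, here is one way such a file could be loaded and completed with OmegaConf; every path and value below is a hypothetical placeholder, and actual training goes through fairseq's hydra entry point rather than this snippet.

    # Sketch only: fill the MISSING (???) fields of the config above.
    # All paths/values are placeholders, not real project settings.
    from omegaconf import OmegaConf

    cfg = OmegaConf.load("MERT_RVQ-VAE_CQT_95M.yaml")

    overrides = OmegaConf.create({
        "task": {"data": "/data/manifests", "label_dir": "/data/labels"},
        "model": {"label_rate": 75},   # consistent with the 320-sample conv stride at 24 kHz
        "hydra": {"run": {"dir": "outputs/95M"}, "sweep": {"dir": "outputs/95M_sweep"}},
    })
    cfg = OmegaConf.merge(cfg, overrides)

    assert not OmegaConf.is_missing(cfg.task, "data")
    print(OmegaConf.to_yaml(cfg.optimization))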
 
codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/config/pretrain/MERT_RVQ-VAE_CQT_95M_bestrq.yaml DELETED
@@ -1,125 +0,0 @@
1
- # @package _group_
2
- common:
3
- fp16: false
4
- log_format: json
5
- log_interval: 200
6
- seed: 1337
7
- # tensorboard_logdir: tblog_proj_name
8
- # wandb_project: wandb_proj_name
9
-
10
- checkpoint:
11
- save_interval_updates: 25000
12
- keep_interval_updates: -1
13
- no_epoch_checkpoints: true
14
-
15
-
16
- distributed_training:
17
- ddp_backend: no_c10d
18
- distributed_backend: 'nccl'
19
- distributed_world_size: 8 # 64
20
- nprocs_per_node: 8
21
- find_unused_parameters: true
22
-
23
- task:
24
- _name: mert_pretraining
25
- data: ???
26
- label_dir: ???
27
- labels: ???
28
- label_rate: ${model.label_rate}
29
- sample_rate: 24000
30
- # crop to 5s
31
- max_sample_size: 120000
32
- min_sample_size: 72000
33
-
34
- pad_audio: false
35
- random_crop: true
36
- normalize: false # must be consistent with extractor
37
-
38
-
39
- dataset:
40
- num_workers: 6
41
- max_tokens: 2000000
42
- skip_invalid_size_inputs_valid_test: true
43
- validate_interval: 1
44
- validate_interval_updates: 10000
45
-
46
- criterion:
47
- _name: hubert
48
- pred_masked_weight: 1.0
49
- pred_nomask_weight: 0.0
50
- loss_weights: [10, 1]
51
-
52
- optimization:
53
- max_update: 400000
54
- lr: [0.0005]
55
- clip_norm: 10.0
56
-
57
- optimizer:
58
- _name: adam
59
- adam_betas: (0.9,0.98)
60
- adam_eps: 1e-06
61
- weight_decay: 0.01
62
-
63
- lr_scheduler:
64
- _name: polynomial_decay
65
- warmup_updates: 32000
66
-
67
- model:
68
- _name: mert
69
- label_rate: ???
70
- skip_masked: false
71
- skip_nomask: true
72
- mask_prob: 0.8
73
- mask_length: 5
74
-
75
- logit_temp: 0.1
76
-
77
- # ----- mixture ------
78
- mixture_prob: 0.5
79
- inbatch_noise_augment_len_range: "[12000, 24000]"
80
- inbatch_noise_augment_number_range: "[1, 3]"
81
- inbatch_noise_augment_volume: 1.0
82
- # ------------------------
83
- extractor_mode: default
84
- audio_extract_type: melspec # use melspec (instead of `w2v_conv`)
85
- melspec_n_bins: 120 # for melspec we use 120, means 12 bins per octave
86
- conv_feature_layers: '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2'
87
-
88
- # best-rq loss
89
- audio_rq_loss_m: true
90
- audio_rq_loss_embed_dim: 16
91
- audio_rq_loss_num_codebooks: 1
92
- audio_rq_loss_num_embeds: 8192
93
- audio_rq_loss_seed: 42
94
- audio_rq_loss_use_norm: true
95
-
96
- # ---- cqt reconstruction, need to add loss weight ---
97
- audio_cqt_loss_m: true
98
- audio_cqt_bins: 336
99
- # -----------
100
- final_dim: 64
101
- encoder_layerdrop: 0.05
102
- dropout_input: 0.1
103
- dropout_features: 0.1
104
- dropout: 0.1
105
- attention_dropout: 0.1
106
- feature_grad_mult: 0.1
107
- untie_final_proj: true
108
- activation_dropout: 0.0
109
-
110
-
111
- hydra:
112
- job:
113
- config:
114
- override_dirname:
115
- kv_sep: '-'
116
- item_sep: '__'
117
- exclude_keys:
118
- - run
119
- - task.data
120
- - task.label_dir
121
- run:
122
- dir: ???
123
- sweep:
124
- dir: ???
125
- subdir: ${hydra.job.config_name}__${hydra.job.override_dirname}
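This variant swaps the HuBERT-style labels for a BEST-RQ style target: mel frames are pushed through a frozen random projection and matched against a frozen random codebook, and the nearest-neighbour indices become the masked-prediction targets. The audio_rq_loss_* block carries the dimensions (16-dim codes, one codebook of 8192 entries, L2-normalised, fixed seed). A minimal sketch of that idea follows; it only illustrates the mechanism and is not the repo's implementation, whose details (projection shape, normalisation order) may differ.

    # Minimal BEST-RQ style target sketch (illustrative, not the repo's code).
    # Dimensions follow the config: embed_dim=16, num_embeds=8192, use_norm=true.
    import torch
    import torch.nn.functional as F

    class RandomProjectionQuantizer:
        def __init__(self, feat_dim=120, embed_dim=16, num_embeds=8192, seed=42):
            g = torch.Generator().manual_seed(seed)
            # Frozen random projection and codebook: sampled once, never trained.
            self.proj = torch.randn(feat_dim, embed_dim, generator=g)
            self.codebook = F.normalize(torch.randn(num_embeds, embed_dim, generator=g), dim=-1)

        def __call__(self, mel):                      # mel: (batch, frames, feat_dim)
            z = F.normalize(mel @ self.proj, dim=-1)  # project, then L2-normalise
            sims = z @ self.codebook.t()              # cosine similarity to every code
            return sims.argmax(dim=-1)                # (batch, frames) integer targets

    quantizer = RandomProjectionQuantizer(feat_dim=120)   # melspec_n_bins: 120
    targets = quantizer(torch.randn(2, 375, 120))         # 2 clips x 375 frames
    print(targets.shape)                                  # torch.Size([2, 375])

Because projection and codebook are random and frozen, the targets are cheap to recompute, and the training signal comes entirely from predicting them at the masked positions.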
 
codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/config/pretrain/MERT_RVQ-VAE_CQT_95M_bestrq_chroma_multinodes.yaml DELETED
@@ -1,128 +0,0 @@
1
- # @package _group_
2
- common:
3
- fp16: false
4
- log_format: json
5
- log_interval: 200
6
- seed: 1337
7
- # tensorboard_logdir: tblog_proj_name
8
- # wandb_project: wandb_proj_name
9
-
10
- checkpoint:
11
- save_interval_updates: 12500
12
- keep_interval_updates: -1
13
- no_epoch_checkpoints: true
14
-
15
-
16
- distributed_training:
17
- ddp_backend: no_c10d
18
- distributed_backend: 'nccl'
19
- distributed_world_size: 64
20
- nprocs_per_node: 8
21
- find_unused_parameters: true
22
-
23
- task:
24
- _name: mert_pretraining
25
- data: ???
26
- label_dir: ???
27
- labels: ???
28
- label_rate: ${model.label_rate}
29
- sample_rate: 24000
30
- # crop to 5s
31
- max_sample_size: 120000
32
- min_sample_size: 72000
33
-
34
- pad_audio: false
35
- random_crop: true
36
- normalize: false # must be consistent with extractor
37
-
38
-
39
- dataset:
40
- num_workers: 6
41
- max_tokens: 2000000
42
- skip_invalid_size_inputs_valid_test: true
43
- validate_interval: 1
44
- validate_interval_updates: 10000
45
-
46
- criterion:
47
- _name: hubert
48
- pred_masked_weight: 1.0
49
- pred_nomask_weight: 0.0
50
- loss_weights: [10, 1]
51
-
52
- optimization:
53
- max_update: 400000
54
- lr: [0.0005]
55
- clip_norm: 10.0
56
- update_freq: [4]
57
-
58
- optimizer:
59
- _name: adam
60
- adam_betas: (0.9,0.98)
61
- adam_eps: 1e-06
62
- weight_decay: 0.01
63
-
64
- lr_scheduler:
65
- _name: polynomial_decay
66
- warmup_updates: 32000
67
-
68
- model:
69
- _name: mert
70
- label_rate: ???
71
- skip_masked: false
72
- skip_nomask: true
73
- mask_prob: 0.8
74
- mask_length: 5
75
-
76
- logit_temp: 0.1
77
-
78
- # ----- mixture ------
79
- mixture_prob: 0.5
80
- inbatch_noise_augment_len_range: "[12000, 24000]"
81
- inbatch_noise_augment_number_range: "[1, 3]"
82
- inbatch_noise_augment_volume: 1.0
83
- # ------------------------
84
- extractor_mode: default
85
- audio_extract_type: melspec # use melspec (instead of `w2v_conv`)
86
- melspec_n_bins: 120 # for melspec we use 120, means 12 bins per octave
87
- conv_feature_layers: '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2'
88
-
89
- # best-rq loss
90
- audio_rq_loss_m: true
91
- audio_rq_loss_embed_dim: 16
92
- audio_rq_loss_num_codebooks: 1
93
- audio_rq_loss_num_embeds: 8192
94
- audio_rq_loss_seed: 42
95
- audio_rq_loss_use_norm: true
96
- audio_rq_loss_use_chroma: true
97
- audio_rq_loss_seed_chroma: 123
98
-
99
- # ---- cqt reconstruction, need to add loss weight ---
100
- audio_cqt_loss_m: true
101
- audio_cqt_bins: 336
102
- # -----------
103
- final_dim: 32
104
- encoder_layerdrop: 0.05
105
- dropout_input: 0.1
106
- dropout_features: 0.1
107
- dropout: 0.1
108
- attention_dropout: 0.1
109
- feature_grad_mult: 0.1
110
- untie_final_proj: true
111
- activation_dropout: 0.0
112
-
113
-
114
- hydra:
115
- job:
116
- config:
117
- override_dirname:
118
- kv_sep: '-'
119
- item_sep: '__'
120
- exclude_keys:
121
- - run
122
- - task.data
123
- - task.label_dir
124
- run:
125
- dir: ???
126
- sweep:
127
- dir: ???
128
- subdir: ${hydra.job.config_name}__${hydra.job.override_dirname}
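Compared with the single-node files, the *_multinodes configs add gradient accumulation (update_freq: [4]) on top of 64 workers. Taking dataset.max_tokens at face value as a per-GPU budget of 24 kHz waveform samples (an assumption, but consistent with how the task crops raw audio), the effective batch per optimizer step works out as follows:

    # Effective batch size for the multinode configs (max_tokens assumed to be
    # counted in 24 kHz waveform samples).
    sample_rate = 24000
    max_tokens_per_gpu = 2_000_000            # dataset.max_tokens
    world_size = 64                           # distributed_training.distributed_world_size
    update_freq = 4                           # optimization.update_freq

    seconds_per_gpu = max_tokens_per_gpu / sample_rate
    effective_seconds = seconds_per_gpu * world_size * update_freq
    print(round(seconds_per_gpu, 1))          # ~83.3 s of audio per GPU per forward pass
    print(round(effective_seconds / 3600, 2)) # ~5.93 h of audio per optimizer update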
 
codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/config/pretrain/MERT_RVQ-VAE_CQT_95M_bestrq_multinodes.yaml DELETED
@@ -1,126 +0,0 @@
1
- # @package _group_
2
- common:
3
- fp16: false
4
- log_format: json
5
- log_interval: 200
6
- seed: 1337
7
- # tensorboard_logdir: tblog_proj_name
8
- # wandb_project: wandb_proj_name
9
-
10
- checkpoint:
11
- save_interval_updates: 12500
12
- keep_interval_updates: -1
13
- no_epoch_checkpoints: true
14
-
15
-
16
- distributed_training:
17
- ddp_backend: no_c10d
18
- distributed_backend: 'nccl'
19
- distributed_world_size: 64
20
- nprocs_per_node: 8
21
- find_unused_parameters: true
22
-
23
- task:
24
- _name: mert_pretraining
25
- data: ???
26
- label_dir: ???
27
- labels: ???
28
- label_rate: ${model.label_rate}
29
- sample_rate: 24000
30
- # crop to 5s
31
- max_sample_size: 120000
32
- min_sample_size: 72000
33
-
34
- pad_audio: false
35
- random_crop: true
36
- normalize: false # must be consistent with extractor
37
-
38
-
39
- dataset:
40
- num_workers: 6
41
- max_tokens: 2000000
42
- skip_invalid_size_inputs_valid_test: true
43
- validate_interval: 1
44
- validate_interval_updates: 10000
45
-
46
- criterion:
47
- _name: hubert
48
- pred_masked_weight: 1.0
49
- pred_nomask_weight: 0.0
50
- loss_weights: [10, 1]
51
-
52
- optimization:
53
- max_update: 400000
54
- lr: [0.0005]
55
- clip_norm: 10.0
56
- update_freq: [4]
57
-
58
- optimizer:
59
- _name: adam
60
- adam_betas: (0.9,0.98)
61
- adam_eps: 1e-06
62
- weight_decay: 0.01
63
-
64
- lr_scheduler:
65
- _name: polynomial_decay
66
- warmup_updates: 32000
67
-
68
- model:
69
- _name: mert
70
- label_rate: ???
71
- skip_masked: false
72
- skip_nomask: true
73
- mask_prob: 0.8
74
- mask_length: 5
75
-
76
- logit_temp: 0.1
77
-
78
- # ----- mixture ------
79
- mixture_prob: 0.5
80
- inbatch_noise_augment_len_range: "[12000, 24000]"
81
- inbatch_noise_augment_number_range: "[1, 3]"
82
- inbatch_noise_augment_volume: 1.0
83
- # ------------------------
84
- extractor_mode: default
85
- audio_extract_type: melspec # use melspec (instead of `w2v_conv`)
86
- melspec_n_bins: 120 # for melspec we use 120, means 12 bins per octave
87
- conv_feature_layers: '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2'
88
-
89
- # best-rq loss
90
- audio_rq_loss_m: true
91
- audio_rq_loss_embed_dim: 16
92
- audio_rq_loss_num_codebooks: 1
93
- audio_rq_loss_num_embeds: 8192
94
- audio_rq_loss_seed: 42
95
- audio_rq_loss_use_norm: true
96
-
97
- # ---- cqt reconstruction, need to add loss weight ---
98
- audio_cqt_loss_m: true
99
- audio_cqt_bins: 336
100
- # -----------
101
- final_dim: 64
102
- encoder_layerdrop: 0.05
103
- dropout_input: 0.1
104
- dropout_features: 0.1
105
- dropout: 0.1
106
- attention_dropout: 0.1
107
- feature_grad_mult: 0.1
108
- untie_final_proj: true
109
- activation_dropout: 0.0
110
-
111
-
112
- hydra:
113
- job:
114
- config:
115
- override_dirname:
116
- kv_sep: '-'
117
- item_sep: '__'
118
- exclude_keys:
119
- - run
120
- - task.data
121
- - task.label_dir
122
- run:
123
- dir: ???
124
- sweep:
125
- dir: ???
126
- subdir: ${hydra.job.config_name}__${hydra.job.override_dirname}
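The mixture block shared by these configs (mixture_prob, inbatch_noise_augment_len_range, inbatch_noise_augment_number_range, inbatch_noise_augment_volume) describes mixing short excerpts from other clips of the same batch into a training clip. The sketch below is only a plausible reading of those four parameters; the repo's actual augmentation code may select segments and scale gains differently.

    # Plausible in-batch noise augmentation matching the parameters above
    # (illustrative only; the repo's implementation may differ).
    import random
    import torch

    def inbatch_noise_augment(batch, prob=0.5, len_range=(12000, 24000),
                              num_range=(1, 3), volume=1.0):
        # batch: (bsz, samples); each clip may receive 1-3 excerpts cut from
        # other clips of the same batch, added at the given volume.
        bsz, n = batch.shape
        out = batch.clone()
        for i in range(bsz):
            if random.random() >= prob or bsz < 2:
                continue
            for _ in range(random.randint(*num_range)):
                j = random.choice([k for k in range(bsz) if k != i])
                seg_len = min(random.randint(*len_range), n)
                src = random.randint(0, n - seg_len)
                dst = random.randint(0, n - seg_len)
                out[i, dst:dst + seg_len] += volume * batch[j, src:src + seg_len]
        return out

    mixed = inbatch_noise_augment(torch.randn(4, 120000))    # four 5 s clips at 24 kHz
    print(mixed.shape)                                        # torch.Size([4, 120000])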
 
codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/config/pretrain/MERT_RVQ-VAE_CQT_95M_bestrq_norm_multinodes.yaml DELETED
@@ -1,128 +0,0 @@
1
- # @package _group_
2
- common:
3
- fp16: false
4
- log_format: json
5
- log_interval: 200
6
- seed: 1337
7
- # tensorboard_logdir: tblog_proj_name
8
- # wandb_project: wandb_proj_name
9
-
10
- checkpoint:
11
- save_interval_updates: 12500
12
- keep_interval_updates: -1
13
- no_epoch_checkpoints: true
14
-
15
-
16
- distributed_training:
17
- ddp_backend: no_c10d
18
- distributed_backend: 'nccl'
19
- distributed_world_size: 64
20
- nprocs_per_node: 8
21
- find_unused_parameters: true
22
-
23
- task:
24
- _name: mert_pretraining
25
- data: ???
26
- label_dir: ???
27
- labels: ???
28
- label_rate: ${model.label_rate}
29
- sample_rate: 24000
30
- # crop to 5s
31
- max_sample_size: 120000
32
- min_sample_size: 72000
33
-
34
- pad_audio: false
35
- random_crop: true
36
- normalize: false # must be consistent with extractor
37
-
38
-
39
- dataset:
40
- num_workers: 6
41
- max_tokens: 2000000
42
- skip_invalid_size_inputs_valid_test: true
43
- validate_interval: 1
44
- validate_interval_updates: 10000
45
-
46
- criterion:
47
- _name: hubert
48
- pred_masked_weight: 1.0
49
- pred_nomask_weight: 0.0
50
- loss_weights: [10, 1]
51
-
52
- optimization:
53
- max_update: 400000
54
- lr: [0.0005]
55
- clip_norm: 10.0
56
- update_freq: [4]
57
-
58
- optimizer:
59
- _name: adam
60
- adam_betas: (0.9,0.98)
61
- adam_eps: 1e-06
62
- weight_decay: 0.01
63
-
64
- lr_scheduler:
65
- _name: polynomial_decay
66
- warmup_updates: 32000
67
-
68
- model:
69
- _name: mert
70
- label_rate: ???
71
- skip_masked: false
72
- skip_nomask: true
73
- mask_prob: 0.8
74
- mask_length: 5
75
-
76
- logit_temp: 0.1
77
-
78
- # ----- mixture ------
79
- mixture_prob: 0.5
80
- inbatch_noise_augment_len_range: "[12000, 24000]"
81
- inbatch_noise_augment_number_range: "[1, 3]"
82
- inbatch_noise_augment_volume: 1.0
83
- # ------------------------
84
- extractor_mode: default
85
- audio_extract_type: melspec # use melspec (instead of `w2v_conv`)
86
- melspec_n_bins: 120 # for melspec we use 120, means 12 bins per octave
87
- conv_feature_layers: '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2'
88
-
89
- # best-rq loss
90
- audio_rq_loss_m: true
91
- audio_rq_loss_embed_dim: 16
92
- audio_rq_loss_num_codebooks: 1
93
- audio_rq_loss_num_embeds: 8192
94
- audio_rq_loss_seed: 42
95
- audio_rq_loss_use_norm: true
96
- audio_rq_loss_use_chroma: false
97
- audio_rq_loss_seed_chroma: 123
98
-
99
- # ---- cqt reconstruction, need to add loss weight ---
100
- audio_cqt_loss_m: true
101
- audio_cqt_bins: 336
102
- # -----------
103
- final_dim: 64
104
- encoder_layerdrop: 0.05
105
- dropout_input: 0.1
106
- dropout_features: 0.1
107
- dropout: 0.1
108
- attention_dropout: 0.1
109
- feature_grad_mult: 0.1
110
- untie_final_proj: true
111
- activation_dropout: 0.0
112
-
113
-
114
- hydra:
115
- job:
116
- config:
117
- override_dirname:
118
- kv_sep: '-'
119
- item_sep: '__'
120
- exclude_keys:
121
- - run
122
- - task.data
123
- - task.label_dir
124
- run:
125
- dir: ???
126
- sweep:
127
- dir: ???
128
- subdir: ${hydra.job.config_name}__${hydra.job.override_dirname}
 
codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/config/pretrain/MERT_RVQ-VAE_CQT_95M_bestrq_norm_speech_multinodes.yaml DELETED
@@ -1,128 +0,0 @@
1
- # @package _group_
2
- common:
3
- fp16: false
4
- log_format: json
5
- log_interval: 200
6
- seed: 1337
7
- # tensorboard_logdir: tblog_proj_name
8
- # wandb_project: wandb_proj_name
9
-
10
- checkpoint:
11
- save_interval_updates: 12500
12
- keep_interval_updates: -1
13
- no_epoch_checkpoints: true
14
-
15
-
16
- distributed_training:
17
- ddp_backend: no_c10d
18
- distributed_backend: 'nccl'
19
- distributed_world_size: 64
20
- nprocs_per_node: 8
21
- find_unused_parameters: true
22
-
23
- task:
24
- _name: mert_pretraining
25
- data: ???
26
- label_dir: ???
27
- labels: ???
28
- label_rate: ${model.label_rate}
29
- sample_rate: 24000
30
- # crop to 5s
31
- max_sample_size: 120000
32
- min_sample_size: 72000
33
-
34
- pad_audio: false
35
- random_crop: true
36
- normalize: false # must be consistent with extractor
37
-
38
-
39
- dataset:
40
- num_workers: 6
41
- max_tokens: 2000000
42
- skip_invalid_size_inputs_valid_test: true
43
- validate_interval: 1
44
- validate_interval_updates: 10000
45
-
46
- criterion:
47
- _name: hubert
48
- pred_masked_weight: 1.0
49
- pred_nomask_weight: 0.0
50
- loss_weights: [10, 1]
51
-
52
- optimization:
53
- max_update: 400000
54
- lr: [0.0005]
55
- clip_norm: 10.0
56
- update_freq: [4]
57
-
58
- optimizer:
59
- _name: adam
60
- adam_betas: (0.9,0.98)
61
- adam_eps: 1e-06
62
- weight_decay: 0.01
63
-
64
- lr_scheduler:
65
- _name: polynomial_decay
66
- warmup_updates: 32000
67
-
68
- model:
69
- _name: mert
70
- label_rate: ???
71
- skip_masked: false
72
- skip_nomask: true
73
- mask_prob: 0.8
74
- mask_length: 5
75
-
76
- logit_temp: 0.1
77
-
78
- # ----- mixture ------
79
- mixture_prob: 0 # 0.5
80
- inbatch_noise_augment_len_range: "[12000, 24000]"
81
- inbatch_noise_augment_number_range: "[1, 3]"
82
- inbatch_noise_augment_volume: 1.0
83
- # ------------------------
84
- extractor_mode: default
85
- audio_extract_type: melspec # use melspec (instead of `w2v_conv`)
86
- melspec_n_bins: 80 # 120 # for melspec we use 120, means 12 bins per octave
87
- conv_feature_layers: '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2'
88
-
89
- # best-rq loss
90
- audio_rq_loss_m: true
91
- audio_rq_loss_embed_dim: 16
92
- audio_rq_loss_num_codebooks: 1
93
- audio_rq_loss_num_embeds: 8192
94
- audio_rq_loss_seed: 42
95
- audio_rq_loss_use_norm: true
96
- audio_rq_loss_use_chroma: false
97
- audio_rq_loss_seed_chroma: 123
98
-
99
- # ---- cqt reconstruction, need to add loss weight ---
100
- audio_cqt_loss_m: false
101
- audio_cqt_bins: 336
102
- # -----------
103
- final_dim: 64
104
- encoder_layerdrop: 0.05
105
- dropout_input: 0.1
106
- dropout_features: 0.1
107
- dropout: 0.1
108
- attention_dropout: 0.1
109
- feature_grad_mult: 0.1
110
- untie_final_proj: true
111
- activation_dropout: 0.0
112
-
113
-
114
- hydra:
115
- job:
116
- config:
117
- override_dirname:
118
- kv_sep: '-'
119
- item_sep: '__'
120
- exclude_keys:
121
- - run
122
- - task.data
123
- - task.label_dir
124
- run:
125
- dir: ???
126
- sweep:
127
- dir: ???
128
- subdir: ${hydra.job.config_name}__${hydra.job.override_dirname}
 
codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/config/pretrain/MERT_RVQ-VAE_CQT_95M_bestrvq_multinodes.yaml DELETED
@@ -1,121 +0,0 @@
1
- # @package _group_
2
- common:
3
- fp16: false
4
- log_format: json
5
- log_interval: 200
6
- seed: 1337
7
- # tensorboard_logdir: tblog_proj_name
8
- # wandb_project: wandb_proj_name
9
-
10
- checkpoint:
11
- save_interval_updates: 12500
12
- keep_interval_updates: -1
13
- no_epoch_checkpoints: true
14
-
15
-
16
- distributed_training:
17
- ddp_backend: no_c10d
18
- distributed_backend: 'nccl'
19
- distributed_world_size: 64
20
- nprocs_per_node: 8
21
- find_unused_parameters: true
22
-
23
- task:
24
- _name: mert_pretraining
25
- data: ???
26
- label_dir: ???
27
- labels: ???
28
- label_rate: ${model.label_rate}
29
- sample_rate: 24000
30
- # crop to 5s
31
- max_sample_size: 120000
32
- min_sample_size: 72000
33
-
34
- pad_audio: false
35
- random_crop: true
36
- normalize: false # must be consistent with extractor
37
-
38
-
39
- dataset:
40
- num_workers: 6
41
- max_tokens: 2000000
42
- skip_invalid_size_inputs_valid_test: true
43
- validate_interval: 1
44
- validate_interval_updates: 10000
45
-
46
- criterion:
47
- _name: hubert
48
- pred_masked_weight: 1.0
49
- pred_nomask_weight: 0.0
50
- loss_weights: [10, 1]
51
-
52
- optimization:
53
- max_update: 400000
54
- lr: [0.0005]
55
- clip_norm: 10.0
56
- update_freq: [4]
57
-
58
- optimizer:
59
- _name: adam
60
- adam_betas: (0.9,0.98)
61
- adam_eps: 1e-06
62
- weight_decay: 0.01
63
-
64
- lr_scheduler:
65
- _name: polynomial_decay
66
- warmup_updates: 32000
67
-
68
- model:
69
- _name: mert
70
- label_rate: ???
71
- skip_masked: false
72
- skip_nomask: true
73
- mask_prob: 0.8
74
- mask_length: 5
75
-
76
- logit_temp: 0.1
77
-
78
- # ----- mixture ------
79
- mixture_prob: 0.5
80
- inbatch_noise_augment_len_range: "[12000, 24000]"
81
- inbatch_noise_augment_number_range: "[1, 3]"
82
- inbatch_noise_augment_volume: 1.0
83
- # ------------------------
84
- extractor_mode: default
85
- audio_extract_type: w2v_conv
86
- conv_feature_layers: '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2'
87
-
88
- # ---- codec target
89
- audio_codec_type: rvq
90
- audio_codec_ckpt_path: RVQ_3000.pth
91
-
92
- # ---- cqt reconstruction, need to add loss weight ---
93
- audio_cqt_loss_m: true
94
- audio_cqt_bins: 336
95
- # -----------
96
- final_dim: 64
97
- encoder_layerdrop: 0.05
98
- dropout_input: 0.1
99
- dropout_features: 0.1
100
- dropout: 0.1
101
- attention_dropout: 0.1
102
- feature_grad_mult: 0.1
103
- untie_final_proj: true
104
- activation_dropout: 0.0
105
-
106
-
107
- hydra:
108
- job:
109
- config:
110
- override_dirname:
111
- kv_sep: '-'
112
- item_sep: '__'
113
- exclude_keys:
114
- - run
115
- - task.data
116
- - task.label_dir
117
- run:
118
- dir: ???
119
- sweep:
120
- dir: ???
121
- subdir: ${hydra.job.config_name}__${hydra.job.override_dirname}
 
codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/config/pretrain/MERT_RVQ-VAE_CQT_95M_dac.yaml DELETED
File without changes
codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/config/pretrain/MERT_RVQ-VAE_CQT_95M_dac_multinodes.yaml DELETED
@@ -1,121 +0,0 @@
1
- # @package _group_
2
- common:
3
- fp16: false
4
- log_format: json
5
- log_interval: 200
6
- seed: 1337
7
- # tensorboard_logdir: tblog_proj_name
8
- # wandb_project: wandb_proj_name
9
-
10
- checkpoint:
11
- save_interval_updates: 12500
12
- keep_interval_updates: -1
13
- no_epoch_checkpoints: true
14
-
15
-
16
- distributed_training:
17
- ddp_backend: no_c10d
18
- distributed_backend: 'nccl'
19
- distributed_world_size: 64
20
- nprocs_per_node: 8
21
- find_unused_parameters: true
22
-
23
- task:
24
- _name: mert_pretraining
25
- data: ???
26
- label_dir: ???
27
- labels: ???
28
- label_rate: ${model.label_rate}
29
- sample_rate: 24000
30
- # crop to 5s
31
- max_sample_size: 120000
32
- min_sample_size: 72000
33
-
34
- pad_audio: false
35
- random_crop: true
36
- normalize: false # must be consistent with extractor
37
-
38
-
39
- dataset:
40
- num_workers: 6
41
- max_tokens: 2000000
42
- skip_invalid_size_inputs_valid_test: true
43
- validate_interval: 1
44
- validate_interval_updates: 10000
45
-
46
- criterion:
47
- _name: hubert
48
- pred_masked_weight: 1.0
49
- pred_nomask_weight: 0.0
50
- loss_weights: [10, 1]
51
-
52
- optimization:
53
- max_update: 400000
54
- lr: [0.0005]
55
- clip_norm: 10.0
56
- update_freq: [4]
57
-
58
- optimizer:
59
- _name: adam
60
- adam_betas: (0.9,0.98)
61
- adam_eps: 1e-06
62
- weight_decay: 0.01
63
-
64
- lr_scheduler:
65
- _name: polynomial_decay
66
- warmup_updates: 32000
67
-
68
- model:
69
- _name: mert
70
- label_rate: ???
71
- skip_masked: false
72
- skip_nomask: true
73
- mask_prob: 0.8
74
- mask_length: 5
75
-
76
- logit_temp: 0.1
77
-
78
- # ----- mixture ------
79
- mixture_prob: 0.5
80
- inbatch_noise_augment_len_range: "[12000, 24000]"
81
- inbatch_noise_augment_number_range: "[1, 3]"
82
- inbatch_noise_augment_volume: 1.0
83
- # ------------------------
84
- extractor_mode: default
85
- audio_extract_type: w2v_conv
86
- conv_feature_layers: '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2'
87
-
88
- # ---- codec target
89
- audio_codec_type: dac
90
- audio_codec_dac_model_path: weights_24khz_8kbps_0.0.4.pth #nj
91
-
92
- # ---- cqt reconstruction, need to add loss weight ---
93
- audio_cqt_loss_m: true
94
- audio_cqt_bins: 336
95
- # -----------
96
- final_dim: 64
97
- encoder_layerdrop: 0.05
98
- dropout_input: 0.1
99
- dropout_features: 0.1
100
- dropout: 0.1
101
- attention_dropout: 0.1
102
- feature_grad_mult: 0.1
103
- untie_final_proj: true
104
- activation_dropout: 0.0
105
-
106
-
107
- hydra:
108
- job:
109
- config:
110
- override_dirname:
111
- kv_sep: '-'
112
- item_sep: '__'
113
- exclude_keys:
114
- - run
115
- - task.data
116
- - task.label_dir
117
- run:
118
- dir: ???
119
- sweep:
120
- dir: ???
121
- subdir: ${hydra.job.config_name}__${hydra.job.override_dirname}
 
codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/config/pretrain/MERT_RVQ-VAE_CQT_95M_groupbestrq_multinodes.yaml DELETED
@@ -1,125 +0,0 @@
1
- # @package _group_
2
- common:
3
- fp16: false
4
- log_format: json
5
- log_interval: 200
6
- seed: 1337
7
- # tensorboard_logdir: tblog_proj_name
8
- # wandb_project: wandb_proj_name
9
-
10
- checkpoint:
11
- save_interval_updates: 12500
12
- keep_interval_updates: -1
13
- no_epoch_checkpoints: true
14
-
15
-
16
- distributed_training:
17
- ddp_backend: no_c10d
18
- distributed_backend: 'nccl'
19
- distributed_world_size: 64
20
- nprocs_per_node: 8
21
- find_unused_parameters: true
22
-
23
- task:
24
- _name: mert_pretraining
25
- data: ???
26
- label_dir: ???
27
- labels: ???
28
- label_rate: ${model.label_rate}
29
- sample_rate: 24000
30
- # crop to 5s
31
- max_sample_size: 120000
32
- min_sample_size: 72000
33
-
34
- pad_audio: false
35
- random_crop: true
36
- normalize: false # must be consistent with extractor
37
-
38
-
39
- dataset:
40
- num_workers: 6
41
- max_tokens: 2000000
42
- skip_invalid_size_inputs_valid_test: true
43
- validate_interval: 1
44
- validate_interval_updates: 10000
45
-
46
- criterion:
47
- _name: hubert
48
- pred_masked_weight: 1.0
49
- pred_nomask_weight: 0.0
50
- loss_weights: [10, 1]
51
-
52
- optimization:
53
- max_update: 400000
54
- lr: [0.0005]
55
- clip_norm: 10.0
56
- update_freq: [4]
57
-
58
- optimizer:
59
- _name: adam
60
- adam_betas: (0.9,0.98)
61
- adam_eps: 1e-06
62
- weight_decay: 0.01
63
-
64
- lr_scheduler:
65
- _name: polynomial_decay
66
- warmup_updates: 32000
67
-
68
- model:
69
- _name: mert
70
- label_rate: ???
71
- skip_masked: false
72
- skip_nomask: true
73
- mask_prob: 0.8
74
- mask_length: 5
75
-
76
- logit_temp: 0.1
77
-
78
- # ----- mixture ------
79
- mixture_prob: 0.5
80
- inbatch_noise_augment_len_range: "[12000, 24000]"
81
- inbatch_noise_augment_number_range: "[1, 3]"
82
- inbatch_noise_augment_volume: 1.0
83
- # ------------------------
84
- extractor_mode: default
85
- audio_extract_type: melspec # use melspec (instead of `w2v_conv`)
86
- melspec_n_bins: 120 # for melspec we use 120, means 12 bins per octave
87
- conv_feature_layers: '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2'
88
-
89
- # best-rq loss
90
- audio_rq_loss_m: true
91
- audio_rq_loss_embed_dim: 16
92
- audio_rq_loss_num_codebooks: 64 # 32
93
- audio_rq_loss_num_embeds: 1024
94
- audio_rq_loss_seed: 42
95
-
96
- # ---- cqt reconstruction, need to add loss weight ---
97
- audio_cqt_loss_m: true
98
- audio_cqt_bins: 336
99
- # -----------
100
- final_dim: 16 # 64
101
- encoder_layerdrop: 0.05
102
- dropout_input: 0.1
103
- dropout_features: 0.1
104
- dropout: 0.1
105
- attention_dropout: 0.1
106
- feature_grad_mult: 0.1
107
- untie_final_proj: true
108
- activation_dropout: 0.0
109
-
110
-
111
- hydra:
112
- job:
113
- config:
114
- override_dirname:
115
- kv_sep: '-'
116
- item_sep: '__'
117
- exclude_keys:
118
- - run
119
- - task.data
120
- - task.label_dir
121
- run:
122
- dir: ???
123
- sweep:
124
- dir: ???
125
- subdir: ${hydra.job.config_name}__${hydra.job.override_dirname}
 
codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/config/pretrain/MERT_RVQ-VAE_CQT_95M_mel_multinodes.yaml DELETED
@@ -1,124 +0,0 @@
1
- # @package _group_
2
- common:
3
- fp16: false
4
- log_format: json
5
- log_interval: 200
6
- seed: 1337
7
- # tensorboard_logdir: tblog_proj_name
8
- # wandb_project: wandb_proj_name
9
-
10
- checkpoint:
11
- save_interval_updates: 12500
12
- keep_interval_updates: -1
13
- no_epoch_checkpoints: true
14
-
15
-
16
- distributed_training:
17
- ddp_backend: no_c10d
18
- distributed_backend: 'nccl'
19
- distributed_world_size: 64
20
- nprocs_per_node: 8
21
- find_unused_parameters: true
22
-
23
- task:
24
- _name: mert_pretraining
25
- data: ???
26
- label_dir: ???
27
- labels: ???
28
- label_rate: ${model.label_rate}
29
- sample_rate: 24000
30
- # crop to 5s
31
- max_sample_size: 120000
32
- min_sample_size: 72000
33
-
34
- pad_audio: false
35
- random_crop: true
36
- normalize: false # must be consistent with extractor
37
-
38
-
39
- dataset:
40
- num_workers: 6
41
- max_tokens: 2000000
42
- skip_invalid_size_inputs_valid_test: true
43
- validate_interval: 1
44
- validate_interval_updates: 10000
45
-
46
- criterion:
47
- _name: hubert
48
- pred_masked_weight: 1.0
49
- pred_nomask_weight: 0.0
50
- loss_weights: [10, 1]
51
-
52
- optimization:
53
- max_update: 400000
54
- lr: [0.0005]
55
- clip_norm: 10.0
56
- update_freq: [4]
57
-
58
- optimizer:
59
- _name: adam
60
- adam_betas: (0.9,0.98)
61
- adam_eps: 1e-06
62
- weight_decay: 0.01
63
-
64
- lr_scheduler:
65
- _name: polynomial_decay
66
- warmup_updates: 32000
67
-
68
- model:
69
- _name: mert
70
- label_rate: ???
71
- skip_masked: false
72
- skip_nomask: true
73
- mask_prob: 0.8
74
- mask_length: 5
75
-
76
- logit_temp: 0.1
77
-
78
- # ----- mixture ------
79
- mixture_prob: 0.5
80
- inbatch_noise_augment_len_range: "[12000, 24000]"
81
- inbatch_noise_augment_number_range: "[1, 3]"
82
- inbatch_noise_augment_volume: 1.0
83
- # ------------------------
84
- extractor_mode: default
85
- audio_extract_type: melspec # use melspec (instead of `w2v_conv`)
86
- melspec_n_bins: 120 # for melspec we use 120, means 12 bins per octave
87
- conv_feature_layers: '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2'
88
-
89
- # best-rq loss
90
- audio_rq_loss_m: false
91
- audio_rq_loss_embed_dim: 16
92
- audio_rq_loss_num_codebooks: 1
93
- audio_rq_loss_num_embeds: 8192
94
-
95
- # ---- cqt reconstruction, need to add loss weight ---
96
- audio_cqt_loss_m: true
97
- audio_cqt_bins: 336
98
- # -----------
99
- final_dim: 64
100
- encoder_layerdrop: 0.05
101
- dropout_input: 0.1
102
- dropout_features: 0.1
103
- dropout: 0.1
104
- attention_dropout: 0.1
105
- feature_grad_mult: 0.1
106
- untie_final_proj: true
107
- activation_dropout: 0.0
108
-
109
-
110
- hydra:
111
- job:
112
- config:
113
- override_dirname:
114
- kv_sep: '-'
115
- item_sep: '__'
116
- exclude_keys:
117
- - run
118
- - task.data
119
- - task.label_dir
120
- run:
121
- dir: ???
122
- sweep:
123
- dir: ???
124
- subdir: ${hydra.job.config_name}__${hydra.job.override_dirname}
 
codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/config/pretrain/MusicFM_95M_bestrvq_multinodes.yaml DELETED
@@ -1,108 +0,0 @@
1
- # @package _group_
2
- common:
3
- fp16: false
4
- log_format: json
5
- log_interval: 200
6
- seed: 1337
7
- # tensorboard_logdir: tblog_proj_name
8
- # wandb_project: wandb_proj_name
9
-
10
- checkpoint:
11
- save_interval_updates: 12500
12
- keep_interval_updates: -1
13
- no_epoch_checkpoints: true
14
-
15
-
16
- distributed_training:
17
- ddp_backend: no_c10d
18
- distributed_backend: 'nccl'
19
- distributed_world_size: 64
20
- nprocs_per_node: 8
21
- find_unused_parameters: true
22
-
23
- task:
24
- _name: mert_pretraining
25
- data: ???
26
- label_dir: ???
27
- labels: ???
28
- label_rate: ${model.label_rate}
29
- sample_rate: 24000
30
- # # crop to 5s
31
- # max_sample_size: 120000
32
- # min_sample_size: 72000
33
-
34
- # crop to 30s
35
- max_sample_size: 720000
36
- min_sample_size: 432000
37
- clip_secs: 30
38
-
39
- pad_audio: false
40
- random_crop: true
41
- normalize: false # must be consistent with extractor
42
-
43
-
44
- dataset:
45
- num_workers: 6
46
- max_tokens: 2000000
47
- skip_invalid_size_inputs_valid_test: true
48
- validate_interval: 1
49
- validate_interval_updates: 10000
50
-
51
- criterion:
52
- _name: model
53
- # log_keys:
54
- # - accuracies
55
-
56
- optimization:
57
- max_update: 400000
58
- lr: [0.0005]
59
- clip_norm: 10.0
60
- update_freq: [1]
61
-
62
- optimizer:
63
- _name: adam
64
- adam_betas: (0.9,0.98)
65
- adam_eps: 1e-06
66
- weight_decay: 0.01
67
-
68
- lr_scheduler:
69
- _name: polynomial_decay
70
- warmup_updates: 32000
71
-
72
- model:
73
- _name: musicfm
74
- label_rate: 25
75
- num_codebooks: 1
76
- codebook_dim: 16
77
- codebook_size: 8192 # 4096
78
- features: ["melspec_2048"]
79
- hop_length: 240
80
- n_mels: 128
81
- conv_dim: 512
82
- encoder_dim: 1024
83
- encoder_depth: 12
84
- mask_hop: 0.4
85
- mask_prob: 0.6
86
- is_flash: false
87
-
88
- stat_path: msd_stats.json
89
- model_path: null
90
- w2v2_config_path: our-MERT/data/models--facebook--wav2vec2-conformer-rope-large-960h-ft/snapshots/6b36ef01c6443c67ae7ed0822876d091ab50e4aa
91
- use_rvq_target: true
92
- rvq_ckpt_path: RVQ_4000.pth
93
-
94
- hydra:
95
- job:
96
- config:
97
- override_dirname:
98
- kv_sep: '-'
99
- item_sep: '__'
100
- exclude_keys:
101
- - run
102
- - task.data
103
- - task.label_dir
104
- run:
105
- dir: ???
106
- sweep:
107
- dir: ???
108
- subdir: ${hydra.job.config_name}__${hydra.job.override_dirname}
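The MusicFM configs change the framing: 30-second crops, a 128-bin mel front-end with hop 240, and a 25 Hz token rate with 0.4-second masking hops. A short consistency check of those numbers, again assuming the 24 kHz task sample rate:

    # Consistency check for the MusicFM framing above (24 kHz audio assumed).
    sample_rate = 24000
    crop_samples = 720_000              # task.max_sample_size -> 30 s crops
    hop_length = 240                    # model.hop_length
    label_rate = 25                     # model.label_rate (tokens per second)
    mask_hop = 0.4                      # model.mask_hop, in seconds

    print(crop_samples / sample_rate)   # 30.0 s per clip
    print(sample_rate / hop_length)     # 100.0 mel frames per second
    print(crop_samples // hop_length)   # 3000 mel frames per clip
    print(label_rate * 30)              # 750 target tokens per clip
    print(mask_hop * label_rate)        # 10.0 tokens per masking hop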
 
codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/config/pretrain/MusicFM_95M_multinodes.yaml DELETED
@@ -1,105 +0,0 @@
1
- # @package _group_
2
- common:
3
- fp16: false
4
- log_format: json
5
- log_interval: 200
6
- seed: 1337
7
- # tensorboard_logdir: tblog_proj_name
8
- # wandb_project: wandb_proj_name
9
-
10
- checkpoint:
11
- save_interval_updates: 12500
12
- keep_interval_updates: -1
13
- no_epoch_checkpoints: true
14
-
15
-
16
- distributed_training:
17
- ddp_backend: no_c10d
18
- distributed_backend: 'nccl'
19
- distributed_world_size: 64
20
- nprocs_per_node: 8
21
- find_unused_parameters: true
22
-
23
- task:
24
- _name: mert_pretraining
25
- data: ???
26
- label_dir: ???
27
- labels: ???
28
- label_rate: ${model.label_rate}
29
- sample_rate: 24000
30
- # # crop to 5s
31
- # max_sample_size: 120000
32
- # min_sample_size: 72000
33
-
34
- # crop to 30s
35
- max_sample_size: 720000
36
- min_sample_size: 432000
37
- clip_secs: 30
38
-
39
- pad_audio: false
40
- random_crop: true
41
- normalize: false # must be consistent with extractor
42
-
43
-
44
- dataset:
45
- num_workers: 6
46
- max_tokens: 2000000
47
- skip_invalid_size_inputs_valid_test: true
48
- validate_interval: 1
49
- validate_interval_updates: 10000
50
-
51
- criterion:
52
- _name: model
53
- # log_keys:
54
- # - accuracies
55
-
56
- optimization:
57
- max_update: 400000
58
- lr: [0.0005]
59
- clip_norm: 10.0
60
- update_freq: [1]
61
-
62
- optimizer:
63
- _name: adam
64
- adam_betas: (0.9,0.98)
65
- adam_eps: 1e-06
66
- weight_decay: 0.01
67
-
68
- lr_scheduler:
69
- _name: polynomial_decay
70
- warmup_updates: 32000
71
-
72
- model:
73
- _name: musicfm
74
- label_rate: 25
75
- num_codebooks: 1
76
- codebook_dim: 16
77
- codebook_size: 4096
78
- features: ["melspec_2048"]
79
- hop_length: 240
80
- n_mels: 128
81
- conv_dim: 512
82
- encoder_dim: 1024
83
- encoder_depth: 12
84
- mask_hop: 0.4
85
- mask_prob: 0.6
86
- is_flash: false
87
- stat_path: msd_stats.json
88
- model_path: pretrained_msd.pt
89
- w2v2_config_path: models--facebook--wav2vec2-conformer-rope-large-960h-ft/snapshots/6b36ef01c6443c67ae7ed0822876d091ab50e4aa
90
-
91
- hydra:
92
- job:
93
- config:
94
- override_dirname:
95
- kv_sep: '-'
96
- item_sep: '__'
97
- exclude_keys:
98
- - run
99
- - task.data
100
- - task.label_dir
101
- run:
102
- dir: ???
103
- sweep:
104
- dir: ???
105
- subdir: ${hydra.job.config_name}__${hydra.job.override_dirname}
 
codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/config/pretrain/MusicFM_95M_speech_multinodes.yaml DELETED
@@ -1,106 +0,0 @@
1
- # @package _group_
2
- common:
3
- fp16: false
4
- log_format: json
5
- log_interval: 200
6
- seed: 1337
7
- # tensorboard_logdir: tblog_proj_name
8
- # wandb_project: wandb_proj_name
9
-
10
- checkpoint:
11
- save_interval_updates: 2500
12
- keep_interval_updates: 10000
13
- no_epoch_checkpoints: true
14
-
15
-
16
- distributed_training:
17
- ddp_backend: no_c10d
18
- distributed_backend: 'nccl'
19
- distributed_world_size: 64
20
- nprocs_per_node: 8
21
- find_unused_parameters: true
22
-
23
- task:
24
- _name: mert_pretraining
25
- data: ???
26
- label_dir: ???
27
- labels: ???
28
- label_rate: ${model.label_rate}
29
- sample_rate: 24000
30
- # # crop to 5s
31
- # max_sample_size: 120000
32
- # min_sample_size: 72000
33
-
34
- # crop to 30s
35
- max_sample_size: 720000
36
- min_sample_size: 12000
37
- # clip_secs: 30
38
-
39
- pad_audio: false
40
- random_crop: true
41
- normalize: false # must be consistent with extractor
42
-
43
-
44
- dataset:
45
- num_workers: 6
46
- max_tokens: 2000000
47
- skip_invalid_size_inputs_valid_test: true
48
- validate_interval: 1
49
- validate_interval_updates: 10000
50
- disable_validation: true
51
-
52
- criterion:
53
- _name: model
54
- # log_keys:
55
- # - accuracies
56
-
57
- optimization:
58
- max_update: 400000
59
- lr: [0.0005]
60
- clip_norm: 10.0
61
- update_freq: [1]
62
-
63
- optimizer:
64
- _name: adam
65
- adam_betas: (0.9,0.98)
66
- adam_eps: 1e-06
67
- weight_decay: 0.01
68
-
69
- lr_scheduler:
70
- _name: polynomial_decay
71
- warmup_updates: 32000
72
-
73
- model:
74
- _name: musicfm
75
- label_rate: 25
76
- num_codebooks: 1
77
- codebook_dim: 16
78
- codebook_size: 4096
79
- features: ["melspec_2048"]
80
- hop_length: 240
81
- n_mels: 128
82
- conv_dim: 512
83
- encoder_dim: 1024
84
- encoder_depth: 12
85
- mask_hop: 0.4
86
- mask_prob: 0.6
87
- is_flash: false
88
- stat_path: msd_stats.json
89
- model_path: null
90
- w2v2_config_path: models--facebook--wav2vec2-conformer-rope-large-960h-ft/snapshots/6b36ef01c6443c67ae7ed0822876d091ab50e4aa
91
-
92
- hydra:
93
- job:
94
- config:
95
- override_dirname:
96
- kv_sep: '-'
97
- item_sep: '__'
98
- exclude_keys:
99
- - run
100
- - task.data
101
- - task.label_dir
102
- run:
103
- dir: ???
104
- sweep:
105
- dir: ???
106
- subdir: ${hydra.job.config_name}__${hydra.job.override_dirname}
 
codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/config/pretrain/run/submitit_reg.yaml DELETED
@@ -1,20 +0,0 @@
1
- # @package _global_
2
-
3
- hydra:
4
- launcher:
5
- cpus_per_task: 8
6
- gpus_per_node: 8
7
- tasks_per_node: ${hydra.launcher.gpus_per_node}
8
- nodes: 4
9
- comment: null
10
- mem_gb: 384
11
- timeout_min: 4320
12
- max_num_timeout: 100
13
- constraint: volta32gb
14
- name: ${hydra.job.config_name}/${hydra.job.override_dirname}
15
- submitit_folder: ${hydra.sweep.dir}/submitit/%j
16
-
17
- distributed_training:
18
- distributed_world_size: 32
19
- distributed_port: 29671
20
- nprocs_per_node: 8
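Throughout these configs, hydra.job.config.override_dirname controls how a run's directory name is derived from the CLI overrides: key and value joined with '-', items joined with '__', and run/data/label paths excluded. The helper below reproduces that naming scheme for illustration only; it mirrors the kv_sep/item_sep/exclude_keys settings but is not Hydra's own implementation.

    # Illustration of the override_dirname naming used by these configs
    # (mimics kv_sep='-', item_sep='__', exclude_keys; not Hydra's own code).
    def override_dirname(overrides, kv_sep="-", item_sep="__",
                         exclude_keys=("run", "task.data", "task.label_dir")):
        parts = []
        for key, value in sorted(overrides.items()):
            if key in exclude_keys:
                continue
            parts.append(f"{key}{kv_sep}{value}")
        return item_sep.join(parts)

    print(override_dirname({
        "model.label_rate": 75,
        "task.data": "/data/manifests",   # excluded from the directory name
        "optimization.lr": "[0.0005]",
    }))
    # -> model.label_rate-75__optimization.lr-[0.0005]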
 
codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/data/__init__.py DELETED
@@ -1,2 +0,0 @@
1
- from .mert_dataset import MERTDataset
2
- from .eat_data import *
 
codeclm/tokenizer/Flow1dVAE/our_MERT_BESTRQ/mert_fairseq/data/ark_dataset.py DELETED
@@ -1,115 +0,0 @@
1
- import logging
2
- import torch
3
- import torch.nn.functional as F
4
- from fairseq.data.audio.raw_audio_dataset import RawAudioDataset
5
- from typing import Tuple
6
- try:
7
- import kaldiio
8
- except:
9
- kaldiio = None
10
- import warnings
11
-
12
- logger = logging.getLogger(__name__)
13
-
14
-
15
- class ArkDataset(RawAudioDataset):
16
- def __init__(
17
- self,
18
- wav_scp,
19
- dur_scp,
20
- sr = 24000,
21
- max_dur = 20,
22
- num_buckets=0,
23
- normalize=False,
24
- ):
25
- super().__init__(
26
- sample_rate=sr,
27
- max_sample_size=max_dur*sr,
28
- min_sample_size=1200,
29
- shuffle=True,
30
- pad=True,
31
- normalize=normalize,
32
- compute_mask=False,
33
- )
34
- self.sr = sr
35
- self.max_dur = max_dur
36
- self.normalize = normalize
37
-
38
- logger.info("Loading Kaldi scp files from {}".format(wav_scp))
39
-
40
- self.wav_data = kaldiio.load_scp(wav_scp)
41
- self.keys = list(self.wav_data.keys())
42
- dur_data = {}
43
- keys_set = set(self.keys)
44
-
45
- with open(dur_scp, 'r') as f:
46
- for line in f:
47
- line = line.strip().split()
48
- if line[0] in keys_set:
49
- dur_data[line[0]] = float(line[-1])
50
- self.sizes = [int(dur_data[k]*self.sr/100) for k in self.keys]
51
-
52
- logger.info("Loading Kaldi scp files done")
53
-
54
- self.dataset_len = len(self.keys)
55
- self.set_bucket_info(num_buckets)
56
-
57
- def __len__(self):
58
- return self.dataset_len
59
-
60
- def __getitem__(self, idx):
61
- # print("getitem idx: ", idx)
62
- try_cnt = 0
63
- while True:
64
- idx = idx + try_cnt
65
- try:
66
- with warnings.catch_warnings():
67
- warnings.simplefilter("ignore")
68
- key = self.keys[idx]
69
- # print(self.wav_data[key].keys())
70
- wav = self.wav_data[key]['wav']
71
-
72
- wav = torch.from_numpy(wav).float()
73
- wav = self.postprocess(wav)
74
- # print("success load", idx, " shape =", wav.shape)
75
- return {"id": idx, "source": wav}
76
- except Exception as e:
77
- # from traceback import print_exc
78
- # print_exc()
79
- # print("Error loadding ", idx)
80
- # return {"id": idx, "source": None}
81
- try_cnt += 1
82
- if try_cnt > 50:
83
- return {"id": idx, "source": None}
84
- continue
85
-
86
- def size(self, idx):
87
- return self.sizes[idx]
88
-
89
- def postprocess(self, wav):
90
- if wav.dim() == 2:
91
- wav = wav.mean(-1)
92
- assert wav.dim() == 1, wav.dim()
93
-
94
- if self.normalize:
95
- with torch.no_grad():
96
- wav = F.layer_norm(wav, wav.shape)
97
- return wav
98
-
99
- def collater(self, samples):
100
- # print("collate from:", [s['source'].shape for s in samples if s['source'] is not None])
101
- return super().collater(samples)
102
-
103
- if __name__ == '__main__':
104
- import torch
105
- raw_tensor_str = torch.Tensor.__repr__
106
- torch.Tensor.__str__ = torch.Tensor.__repr__ = lambda self: f'Tensor{{Size({[*self.shape]}) {self.device} {str(self.dtype)[6]}{str(self.dtype)[-2:]}}}' if self.numel() > 10 else raw_tensor_str(self)
107
-
108
- ds = ArkDataset(
109
- wav_scp='data/ark_demo/wav_ark.scp',
110
- dur_scp='data/ark_demo/dur_ark.scp',
111
- sr=24000,
112
- )
113
-
114
- for i in range(len(ds)):
115
- print(ds[i])
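Two practical notes on ArkDataset: it needs kaldiio installed (the import is wrapped in try/except, so a missing package only fails once the scp is actually loaded), and the duration scp it consumes is interpreted in hundredths of a second — the sizes computation multiplies the last column by sr/100. A tiny sketch of that convention, with made-up keys and values:

    # Duration convention inferred from the ArkDataset.sizes computation above:
    # the last column of the dur scp is hundredths of a second.
    sr = 24000
    dur_scp_lines = [
        "utt_0001 500",    # 500 / 100 = 5.0 s
        "utt_0002 1230",   # 12.3 s
    ]
    for line in dur_scp_lines:
        key, dur = line.split()
        n_samples = int(float(dur) * sr / 100)
        print(key, n_samples)   # utt_0001 120000 / utt_0002 295200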