test

Files changed:
 - app.py +318 -0
 - diffrhythm/config/defaults.ini +94 -0
 - diffrhythm/config/diffrhythm-1b.json +13 -0
 - diffrhythm/model/__init__.py +6 -0
 - diffrhythm/model/__pycache__/__init__.cpython-310.pyc +0 -0
 - diffrhythm/model/__pycache__/__init__.cpython-312.pyc +0 -0
 - diffrhythm/model/__pycache__/cfm.cpython-310.pyc +0 -0
 - diffrhythm/model/__pycache__/cfm.cpython-312.pyc +0 -0
 - diffrhythm/model/__pycache__/custom_dataset.cpython-310.pyc +0 -0
 - diffrhythm/model/__pycache__/custom_dataset_lrc_emb.cpython-310.pyc +0 -0
 - diffrhythm/model/__pycache__/dataset.cpython-310.pyc +0 -0
 - diffrhythm/model/__pycache__/dit.cpython-310.pyc +0 -0
 - diffrhythm/model/__pycache__/modules.cpython-310.pyc +0 -0
 - diffrhythm/model/__pycache__/trainer.cpython-310.pyc +0 -0
 - diffrhythm/model/__pycache__/utils.cpython-310.pyc +0 -0
 - diffrhythm/model/cfm.py +315 -0
 - diffrhythm/model/dit.py +221 -0
 - diffrhythm/model/modules.py +611 -0
 - diffrhythm/model/trainer.py +350 -0
 - diffrhythm/model/utils.py +182 -0
 - prompt/gift_of_the_world.wav +0 -0
 - prompt/little_happiness.wav +0 -0
 - prompt/little_talks.wav +0 -0
 - prompt/ltwyl.wav +0 -0
 - prompt/most_beautiful_expectation.wav +0 -0
 
    	
app.py ADDED
@@ -0,0 +1,318 @@
import gradio as gr
from openai import OpenAI
import requests
import json
# from volcenginesdkarkruntime import Ark
import torch
import torchaudio
from einops import rearrange
import argparse
import os
from tqdm import tqdm
import random
import numpy as np
import sys
from diffrhythm.infer.infer_utils import (
    get_reference_latent,
    get_lrc_token,
    get_style_prompt,
    prepare_model,
    get_negative_style_prompt
)
from diffrhythm.infer.infer import inference

device = 'cuda'
cfm, tokenizer, muq, vae = prepare_model(device)
cfm = torch.compile(cfm)

def infer_music(lrc, ref_audio_path, max_frames=2048, device='cuda'):
    lrc_prompt, start_time = get_lrc_token(lrc, tokenizer, device)
    style_prompt = get_style_prompt(muq, ref_audio_path)
    negative_style_prompt = get_negative_style_prompt(device)
    latent_prompt = get_reference_latent(device, max_frames)
    generated_song = inference(cfm_model=cfm,
                               vae_model=vae,
                               cond=latent_prompt,
                               text=lrc_prompt,
                               duration=max_frames,
                               style_prompt=style_prompt,
                               negative_style_prompt=negative_style_prompt,
                               start_time=start_time
                               )
    return generated_song

def R1_infer1(theme, tags_gen, language):
    try:
        client = OpenAI(api_key="XXXX", base_url="https://ark.cn-beijing.volces.com/api/v3")

        llm_prompt = """
        Write a complete set of lyrics on the theme "{theme}" in the "{tags}" style, in the {language} language.
        ### **Song structure requirements**
        1. The lyrics should vary in shape so the emotion builds and the whole stays coherent and layered. **Line lengths must vary naturally**; do not make every line the same length, which reads as formulaic.
        2. **Timestamps must be inferred from the song's tags and the emotion and rhythm of the lyrics**, not assigned mechanically by line length.
        ### **Song content requirements**
        1. **The first line's timestamp must account for the intro length**; do not start the lyrics at `[00:00.00]`.
        2. **Output strictly in LRC format**, one line per entry: `[mm:ss.xx]lyric text`.
        3. The output must contain no blank lines or brackets, and no extra commentary such as "chorus", "bridge", or "outro".
        4. The output must be **pure LRC**.
        """

        response = client.chat.completions.create(
            model="ep-20250215195652-lrff7",
            messages=[
                {"role": "system", "content": "You are a professional musician who has been invited to make music-related comments."},
                {"role": "user", "content": llm_prompt.format(theme=theme, tags=tags_gen, language=language)},
            ],
            stream=False
        )

        info = response.choices[0].message.content

        return info

    except Exception as e:
        print(f'Request failed: {e}')
        return {}


def R1_infer2(tags_lyrics, lyrics_input):
    client = OpenAI(api_key="XXX", base_url="https://ark.cn-beijing.volces.com/api/v3")

    llm_prompt = """
    {lyrics_input} are the lyrics of a song, one lyric per line, and {tags_lyrics} is the style I want for the song. I want to give each lyric line a timestamp so as to produce LRC. Timestamps must be inferred from the song's tags and the emotion and rhythm of the lyrics, not assigned mechanically by line length. The first line's timestamp must account for the intro length; do not start the lyrics at `[00:00.00]`. Output strictly in LRC format, one line per entry: `[mm:ss.xx]lyric text`. Output only the final LRC, with no other explanation.
    """

    response = client.chat.completions.create(
        model="ep-20250215195652-lrff7",
        messages=[
            {"role": "system", "content": "You are a professional musician who has been invited to make music-related comments."},
            {"role": "user", "content": llm_prompt.format(lyrics_input=lyrics_input, tags_lyrics=tags_lyrics)},
        ],
        stream=False
    )

    info = response.choices[0].message.content

    return info

css = """
/* Fix the lyrics textarea height and force a scrollbar */
.lyrics-scroll-box textarea {
    height: 300px !important;      /* fixed height */
    max-height: 500px !important;  /* maximum height */
    overflow-y: auto !important;   /* vertical scrolling */
    white-space: pre-wrap;         /* preserve line breaks */
    line-height: 1.5;              /* readable line height */
}
"""

with gr.Blocks(css=css) as demo:
    gr.Markdown("# DiffRhythm")

    with gr.Tabs() as tabs:

        # page 1
        with gr.Tab("Music Generate", id=0):
            with gr.Row():
                with gr.Column():
                    with gr.Accordion("Best Practices Guide", open=False):
                        gr.Markdown("""
                        1. **Lyrics Format Requirements**
                        - Each line must follow: `[mm:ss.xx]Lyric content`
                        - Example of valid format:
                            ```
                            [00:07.23]Fight me fight me fight me
                            [00:08.73]You made me so unlike me
                            ```

                        2. **Generation Duration Limits**
                        - The current version supports a maximum of **95 seconds** of generated music
                        - Total timestamps should not exceed 01:35.00 (95 seconds)

                        3. **Audio Prompt Requirements**
                        - Reference audio should be ≥10 seconds for optimal results
                        - Shorter clips may lead to incoherent generation
                        """)
                    lrc = gr.Textbox(
                        label="Lrc",
                        placeholder="Input the full lyrics",
                        lines=12,
                        max_lines=50,
                        elem_classes="lyrics-scroll-box"
                    )
                    audio_prompt = gr.Audio(label="Audio Prompt", type="filepath")

                with gr.Column():
                    lyrics_btn = gr.Button("Submit", variant="primary")
                    audio_output = gr.Audio(label="Audio Result", type="filepath", elem_id="audio_output")

            gr.Examples(
                examples=[
                    ["./gift_of_the_world.wav"],
                    ["./most_beautiful_expectation.wav"],
                    ["./ltwyl.wav"]
                ],
                inputs=[audio_prompt],
                label="Audio Examples",
                examples_per_page=3
            )

            gr.Examples(
                examples=[
                    ["""[00:10.00]Moonlight spills through broken blinds
[00:13.20]Your shadow dances on the dashboard shrine
[00:16.85]Neon ghosts in gasoline rain
[00:20.40]I hear your laughter down the midnight train
[00:24.15]Static whispers through frayed wires
[00:27.65]Guitar strings hum our cathedral choirs
[00:31.30]Flicker screens show reruns of June
[00:34.90]I'm drowning in this mercury lagoon
[00:38.55]Electric veins pulse through concrete skies
[00:42.10]Your name echoes in the hollow where my heartbeat lies
[00:45.75]We're satellites trapped in parallel light
[00:49.25]Burning through the atmosphere of endless night
[01:00.00]Dusty vinyl spins reverse
[01:03.45]Our polaroid timeline bleeds through the verse
[01:07.10]Telescope aimed at dead stars
[01:10.65]Still tracing constellations through prison bars
[01:14.30]Electric veins pulse through concrete skies
[01:17.85]Your name echoes in the hollow where my heartbeat lies
[01:21.50]We're satellites trapped in parallel light
[01:25.05]Burning through the atmosphere of endless night
[02:10.00]Clockwork gears grind moonbeams to rust
[02:13.50]Our fingerprint smudged by interstellar dust
[02:17.15]Velvet thunder rolls through my veins
[02:20.70]Chasing phantom trains through solar plane
[02:24.35]Electric veins pulse through concrete skies
[02:27.90]Your name echoes in the hollow where my heartbeat lies"""],
                    ["""[00:05.00]Stardust whispers in your eyes
[00:09.30]Moonlight paints our silhouettes
[00:13.75]Tides bring secrets from the deep
[00:18.20]Where forever's breath is kept
[00:22.90]We dance through constellations' maze
[00:27.15]Footprints melt in cosmic waves
[00:31.65]Horizons hum our silent vow
[00:36.10]Time unravels here and now
[00:40.85]Eternal embers in the night oh oh oh
[00:45.25]Healing scars with liquid light
[00:49.70]Galaxies write our refrain
[00:54.15]Love reborn in endless rain
[01:15.30]Paper boats of memories
[01:19.75]Float through veins of ancient trees
[01:24.20]Your laughter spins aurora threads
[01:28.65]Weaving dawn through featherbed"""]
                ],
                inputs=[lrc],  # bind only to the lyrics input
                label="Lrc Examples",
                examples_per_page=2
            )

        # page 2
        with gr.Tab("LLM Generate LRC", id=1):
            with gr.Row():
                with gr.Column():
                    with gr.Accordion("Notice", open=False):
                        gr.Markdown("**Two Generation Modes:**\n1. Generate from theme & tags\n2. Add timestamps to existing lyrics")

                    with gr.Group():
                        gr.Markdown("### Method 1: Generate from Theme")
                        theme = gr.Textbox(label="theme", placeholder="Enter song theme, e.g. Love and Heartbreak")
                        tags_gen = gr.Textbox(label="tags", placeholder="Example: male pop confidence healing")
                        language = gr.Dropdown(["zh", "en"], label="language", value="en")
                        gen_from_theme_btn = gr.Button("Generate LRC (From Theme)", variant="primary")

                    with gr.Group(visible=True):
                        gr.Markdown("### Method 2: Add Timestamps to Lyrics")
                        tags_lyrics = gr.Textbox(label="tags", placeholder="Example: female ballad piano slow")
                        lyrics_input = gr.Textbox(
                            label="Raw Lyrics (without timestamps)",
                            placeholder="Enter plain lyrics (without timestamps), e.g.:\nYesterday\nAll my troubles...",
                            lines=12,
                            max_lines=50,
                            elem_classes="lyrics-scroll-box"
                        )
                        gen_from_lyrics_btn = gr.Button("Generate LRC (From Lyrics)", variant="primary")

                with gr.Column():
                    lrc_output = gr.Textbox(
                        label="Generated LRC Lyrics",
                        placeholder="Timed lyrics will appear here",
                        lines=50,
                        elem_classes="lrc-output",
                        show_copy_button=True
                    )

            # Examples section
            gr.Examples(
                examples=[
                    [
                        "Love and Heartbreak",
                        "female vocal emotional piano pop",
                        "en"
                    ],
                    [
                        "Heroic Epic",
                        "male choir orchestral powerful",
                        "zh"
                    ]
                ],
                inputs=[theme, tags_gen, language],
                label="Examples: Generate from Theme"
            )

            gr.Examples(
                examples=[
                    [
                        "acoustic folk happy",
                        """I'm sitting here in the boring room
                        It's just another rainy Sunday afternoon"""
                    ],
                    [
                        "electronic dance energetic",
                        """We're living in a material world
                        And I am a material girl"""
                    ]
                ],
                inputs=[tags_lyrics, lyrics_input],
                label="Examples: Generate from Lyrics"
            )

            # Bind functions
            gen_from_theme_btn.click(
                fn=R1_infer1,
                inputs=[theme, tags_gen, language],
                outputs=lrc_output
            )

            gen_from_lyrics_btn.click(
                fn=R1_infer2,
                inputs=[tags_lyrics, lyrics_input],
                outputs=lrc_output
            )

    tabs.select(
        lambda s: None,
        None,
        None
    )

    lyrics_btn.click(
        fn=infer_music,
        inputs=[lrc, audio_prompt],
        outputs=audio_output
    )


if __name__ == "__main__":
    demo.queue().launch(show_api=False, show_error=True)
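The Best Practices Guide in app.py pins down two hard constraints: every lyric line must match `[mm:ss.xx]` and the last timestamp must stay within 95 seconds. A minimal standalone check of those constraints could look like the sketch below; the `validate_lrc` helper is illustrative and not part of this commit:

import re

LRC_LINE = re.compile(r"^\[(\d{2}):(\d{2})\.(\d{2})\](.*)$")
MAX_SECONDS = 95.0  # generation limit of the current version

def validate_lrc(lrc: str) -> list[tuple[float, str]]:
    """Parse LRC text, enforcing the [mm:ss.xx] format and the 95 s cap."""
    entries = []
    for line in lrc.strip().splitlines():
        m = LRC_LINE.match(line)
        if m is None:
            raise ValueError(f"not valid LRC: {line!r}")
        mm, ss, xx, text = m.groups()
        t = int(mm) * 60 + int(ss) + int(xx) / 100
        if t > MAX_SECONDS:
            raise ValueError(f"timestamp {t:.2f}s exceeds the 95s limit")
        entries.append((t, text))
    return entries

print(validate_lrc("[00:07.23]Fight me fight me fight me"))
# [(7.23, 'Fight me fight me fight me')]

Note that the first bundled Lrc Example runs to [02:27.90], well past the stated 95-second cap, so a check like this would reject it.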
    	
diffrhythm/config/defaults.ini ADDED
@@ -0,0 +1,94 @@

[DEFAULTS]

# name of the run
exp_name = F5

# the batch size
batch_size = 8

# the chunk size
max_frames = 3000
min_frames = 10

# number of CPU workers for the DataLoader
num_workers = 4

# the random seed
seed = 42

# batches for gradient accumulation
accum_batches = 1

# number of steps between checkpoints
checkpoint_every = 10000

# trainer checkpoint file to restart training from
ckpt_path = ''

# model checkpoint file to start a new training run from
pretrained_ckpt_path = ''

# checkpoint path for the pretransform model if needed
pretransform_ckpt_path = ''

# model configuration file specifying model hyperparameters
model_config = ''

# configuration for datasets
dataset_config = ''

# directory to save the checkpoints in
save_dir = ''

# gradient norm clipping threshold
max_grad_norm = 1.0

# gradient accumulation steps
grad_accumulation_steps = 1

# learning rate
learning_rate = 7.5e-5

# number of training epochs
epochs = 110

# warmup steps
num_warmup_updates = 2000

# save a checkpoint every this many steps
save_per_updates = 5000

# save the last checkpoint every this many steps
last_per_steps = 5000

prompt_path = "/mnt/sfs/music/lance/style-lance-full|/mnt/sfs/music/lance/style-lance-cnen-music-second"
lrc_path = "/mnt/sfs/music/lance/lrc-lance-emb-full|/mnt/sfs/music/lance/lrc-lance-cnen-second"
latent_path = "/mnt/sfs/music/lance/latent-lance|/mnt/sfs/music/lance/latent-lance-cnen-music-second-1|/mnt/sfs/music/lance/latent-lance-cnen-music-second-2"

audio_drop_prob = 0.3
cond_drop_prob = 0.0
style_drop_prob = 0.1
lrc_drop_prob = 0.1

align_lyrics = 0
lyrics_slice = 0
parse_lyrics = 1
skip_empty_lyrics = 0
lyrics_shift = -1

use_style_prompt = 1

tokenizer_type = gpt2

reset_lr = 0

resumable_with_seed = 666

downsample_rate = 2048

grad_ckpt = 0

dataset_path = "/mnt/sfs/music/hkchen/workspace/F5-TTS-HW/filelists/music123latent_asred_bpmstyle_cnen_pure1"

pure_prob = 0.0
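The commit does not include the training entry point that consumes this file, but assuming it is read with Python's standard configparser (a common pattern for .ini defaults like these), the values can be pulled out as in this sketch:

import configparser

config = configparser.ConfigParser()
config.read("diffrhythm/config/defaults.ini")

defaults = config["DEFAULTS"]
batch_size = defaults.getint("batch_size")          # 8
learning_rate = defaults.getfloat("learning_rate")  # 7.5e-5
# multi-root dataset paths are "|"-separated strings in this file
latent_roots = defaults.get("latent_path").strip('"').split("|")
print(batch_size, learning_rate, len(latent_roots))  # 8 7.5e-05 3

The "|"-separated paths (prompt_path, lrc_path, latent_path) each name several dataset roots, so whatever loads them has to split on "|" as shown.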
    	
diffrhythm/config/diffrhythm-1b.json ADDED
@@ -0,0 +1,13 @@
{
    "model_type": "diffrhythm",
    "model": {
        "dim": 2048,
        "depth": 16,
        "heads": 32,
        "ff_mult": 4,
        "text_dim": 512,
        "conv_layers": 4,
        "mel_dim": 64,
        "text_num_embeds": 363
    }
}
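Every architecture hyperparameter sits under a single "model" key, which suggests splatting that block straight into the network constructor. The sketch below assumes DiT (exported by diffrhythm/model/__init__.py, next file) accepts these keys as keyword arguments; its real signature is not shown in this commit, so treat this as a plausible reading rather than the project's loader:

import json
from diffrhythm.model.dit import DiT

with open("diffrhythm/config/diffrhythm-1b.json") as f:
    cfg = json.load(f)

assert cfg["model_type"] == "diffrhythm"
# assumed: DiT takes dim/depth/heads/ff_mult/text_dim/conv_layers/mel_dim/text_num_embeds
model = DiT(**cfg["model"])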
    	
diffrhythm/model/__init__.py ADDED
@@ -0,0 +1,6 @@
from diffrhythm.model.cfm import CFM

from diffrhythm.model.dit import DiT


__all__ = ["CFM"]
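As written, `__all__` lists only CFM, so a star-import will not pick up DiT even though the module imports it; explicit imports still resolve both names:

from diffrhythm.model import CFM, DiT  # explicit imports: both work

from diffrhythm.model import *  # star-import: binds CFM only, per __all__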
    	
diffrhythm/model/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (290 Bytes).

diffrhythm/model/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (508 Bytes).

diffrhythm/model/__pycache__/cfm.cpython-310.pyc ADDED
Binary file (6.28 kB).

diffrhythm/model/__pycache__/cfm.cpython-312.pyc ADDED
Binary file (10.7 kB).

diffrhythm/model/__pycache__/custom_dataset.cpython-310.pyc ADDED
Binary file (11.5 kB).

diffrhythm/model/__pycache__/custom_dataset_lrc_emb.cpython-310.pyc ADDED
Binary file (10.5 kB).

diffrhythm/model/__pycache__/dataset.cpython-310.pyc ADDED
Binary file (8.04 kB).

diffrhythm/model/__pycache__/dit.cpython-310.pyc ADDED
Binary file (5.61 kB).

diffrhythm/model/__pycache__/modules.cpython-310.pyc ADDED
Binary file (15.9 kB).

diffrhythm/model/__pycache__/trainer.cpython-310.pyc ADDED
Binary file (9.13 kB).

diffrhythm/model/__pycache__/utils.cpython-310.pyc ADDED
Binary file (6.03 kB).
    	
diffrhythm/model/cfm.py ADDED
@@ -0,0 +1,315 @@
"""
ein notation:
b - batch
n - sequence
nt - text sequence
nw - raw wave length
d - dimension
"""

from __future__ import annotations
from typing import Callable
from random import random

import torch
from torch import nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence

from torchdiffeq import odeint

from diffrhythm.model.modules import MelSpec
from diffrhythm.model.utils import (
    default,
    exists,
    list_str_to_idx,
    list_str_to_tensor,
    lens_to_mask,
    mask_from_frac_lengths,
)


def custom_mask_from_start_end_indices(seq_len: int["b"], start: int["b"], end: int["b"], device, max_seq_len):  # noqa: F722 F821
    # boolean [b, max_seq_len] mask that is True inside [start, end) per batch element
    seq = torch.arange(max_seq_len, device=device).long()
    start_mask = seq[None, :] >= start[:, None]
    end_mask = seq[None, :] < end[:, None]
    return start_mask & end_mask

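A quick sanity check of the span-mask helper above (illustrative only, not part of the commit; the `seq_len` argument is unused by the helper, so `None` is passed here):

import torch

start = torch.tensor([0, 2])   # hypothetical per-batch start frames
end = torch.tensor([3, 5])     # hypothetical per-batch end frames (exclusive)
mask = custom_mask_from_start_end_indices(None, start, end, device="cpu", max_seq_len=6)
# mask[0] -> [True, True, True, False, False, False]
# mask[1] -> [False, False, True, True, True, False]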
class CFM(nn.Module):
    def __init__(
        self,
        transformer: nn.Module,
        sigma=0.0,
        odeint_kwargs: dict = dict(
            # atol = 1e-5,
            # rtol = 1e-5,
            method="euler"  # 'midpoint'
            # method="adaptive_heun"  # dopri5
        ),
        odeint_options: dict = dict(
            min_step=0.05
        ),
        audio_drop_prob=0.3,
        cond_drop_prob=0.2,
        style_drop_prob=0.1,
        lrc_drop_prob=0.1,
        num_channels=None,
        frac_lengths_mask: tuple[float, float] = (0.7, 1.0),
        vocab_char_map: dict[str, int] | None = None,
        use_style_prompt: bool = False,
    ):
        super().__init__()

        self.frac_lengths_mask = frac_lengths_mask

        self.num_channels = num_channels

        # classifier-free guidance
        self.audio_drop_prob = audio_drop_prob
        self.cond_drop_prob = cond_drop_prob
        self.style_drop_prob = style_drop_prob
        self.lrc_drop_prob = lrc_drop_prob

        print(f"audio_drop_prob -> {self.audio_drop_prob}; style_drop_prob -> {self.style_drop_prob}; lrc_drop_prob -> {self.lrc_drop_prob}")

        # transformer
        self.transformer = transformer
        dim = transformer.dim
        self.dim = dim

        # conditional flow related
        self.sigma = sigma

        # sampling related
        self.odeint_kwargs = odeint_kwargs
        # print(f"ODE SOLVER: {self.odeint_kwargs['method']}")

        self.odeint_options = odeint_options

        # vocab map for tokenization
        self.vocab_char_map = vocab_char_map

        self.use_style_prompt = use_style_prompt

    @property
    def device(self):
        return next(self.parameters()).device

    @torch.no_grad()
    def sample(
        self,
        cond: float["b n d"] | float["b nw"],  # noqa: F722
        text: int["b nt"] | list[str],  # noqa: F722
        duration: int | int["b"],  # noqa: F821
        *,
        style_prompt=None,
        style_prompt_lens=None,
        negative_style_prompt=None,
        lens: int["b"] | None = None,  # noqa: F821
        steps=32,
        cfg_strength=4.0,
        sway_sampling_coef=None,
        seed: int | None = None,
        max_duration=4096,
        vocoder: Callable[[float["b d n"]], float["b nw"]] | None = None,  # noqa: F722
        no_ref_audio=False,
        duplicate_test=False,
        t_inter=0.1,
        edit_mask=None,
        start_time=None,
        latent_pred_start_frame=0,
        latent_pred_end_frame=2048,
    ):
        self.eval()

        if next(self.parameters()).dtype == torch.float16:
            cond = cond.half()

        # raw wave

        if cond.shape[1] > duration:
            cond = cond[:, :duration, :]

        if cond.ndim == 2:
            # NOTE: this raw-wave path expects a `self.mel_spec` module (e.g. MelSpec); CFM as defined here does not create one
            cond = self.mel_spec(cond)
            cond = cond.permute(0, 2, 1)
            assert cond.shape[-1] == self.num_channels

        batch, cond_seq_len, device = *cond.shape[:2], cond.device
        if not exists(lens):
            lens = torch.full((batch,), cond_seq_len, device=device, dtype=torch.long)

        # text

        if isinstance(text, list):
            if exists(self.vocab_char_map):
                text = list_str_to_idx(text, self.vocab_char_map).to(device)
            else:
                text = list_str_to_tensor(text).to(device)
            assert text.shape[0] == batch

        if exists(text):
            text_lens = (text != -1).sum(dim=-1)
            # lens = torch.maximum(text_lens, lens)  # make sure lengths are at least those of the text characters

        # duration
        # import pdb; pdb.set_trace()
        cond_mask = lens_to_mask(lens)
        if edit_mask is not None:
            cond_mask = cond_mask & edit_mask

        latent_pred_start_frame = torch.tensor([latent_pred_start_frame]).to(cond.device)
        latent_pred_end_frame = duration
        latent_pred_end_frame = torch.tensor([latent_pred_end_frame]).to(cond.device)
        fixed_span_mask = custom_mask_from_start_end_indices(cond_seq_len, latent_pred_start_frame, latent_pred_end_frame, device=cond.device, max_seq_len=duration)

        fixed_span_mask = fixed_span_mask.unsqueeze(-1)
        step_cond = torch.where(fixed_span_mask, torch.zeros_like(cond), cond)

        if isinstance(duration, int):
            duration = torch.full((batch,), duration, device=device, dtype=torch.long)

        # duration = torch.maximum(lens + 1, duration)  # just add one token so something is generated
        duration = duration.clamp(max=max_duration)
        max_duration = duration.amax()

        # duplicate test corner for inner time step observation
        if duplicate_test:
            test_cond = F.pad(cond, (0, 0, cond_seq_len, max_duration - 2 * cond_seq_len), value=0.0)

        # cond = F.pad(cond, (0, 0, 0, max_duration - cond_seq_len), value=0.0)  # [b, t, d]
        # cond_mask = F.pad(cond_mask, (0, max_duration - cond_mask.shape[-1]), value=False)  # [b, max_duration]
        # cond_mask = cond_mask.unsqueeze(-1)  # [b, t, d]
        # step_cond = torch.where(
        #     cond_mask, cond, torch.zeros_like(cond)
        # )  # allow direct control (cut cond audio) with lens passed in

        if batch > 1:
            mask = lens_to_mask(duration)
        else:  # save memory and speed up, as single inference needs no mask currently
            mask = None

        # test for no ref audio
        if no_ref_audio:
            cond = torch.zeros_like(cond)

        def fn(t, x):
            # at each step, conditioning is fixed
            # step_cond = torch.where(cond_mask, cond, torch.zeros_like(cond))

            # predict flow
            pred = self.transformer(
                x=x, cond=step_cond, text=text, time=t, mask=mask, drop_audio_cond=False, drop_text=False, drop_prompt=False,
                style_prompt=style_prompt, style_prompt_lens=style_prompt_lens, start_time=start_time
            )
            if cfg_strength < 1e-5:
                return pred

            null_pred = self.transformer(
                x=x, cond=step_cond, text=text, time=t, mask=mask, drop_audio_cond=True, drop_text=True, drop_prompt=False,
                style_prompt=negative_style_prompt, style_prompt_lens=style_prompt_lens, start_time=start_time
            )
            return pred + (pred - null_pred) * cfg_strength

        # noise input
        # reseed per item so the batch inference result matches across batch sizes and single inference;
        # some difference may remain, likely due to convolutional layers
        y0 = []
        for dur in duration:
            if exists(seed):
                torch.manual_seed(seed)
            y0.append(torch.randn(dur, self.num_channels, device=self.device, dtype=step_cond.dtype))
        y0 = pad_sequence(y0, padding_value=0, batch_first=True)

        t_start = 0

        # duplicate test corner for inner time step observation
        if duplicate_test:
            t_start = t_inter
            y0 = (1 - t_start) * y0 + t_start * test_cond
            steps = int(steps * (1 - t_start))

        t = torch.linspace(t_start, 1, steps, device=self.device, dtype=step_cond.dtype)
        if sway_sampling_coef is not None:
            t = t + sway_sampling_coef * (torch.cos(torch.pi / 2 * t) - 1 + t)

        trajectory = odeint(fn, y0, t, **self.odeint_kwargs)

        sampled = trajectory[-1]
        out = sampled
        # out = torch.where(cond_mask, cond, out)
        out = torch.where(fixed_span_mask, out, cond)

        if exists(vocoder):
            out = out.permute(0, 2, 1)
            out = vocoder(out)

        return out, trajectory

    def forward(
        self,
        inp: float["b n d"] | float["b nw"],  # mel or raw wave  # noqa: F722
        text: int["b nt"] | list[str],  # noqa: F722
        style_prompt=None,
        style_prompt_lens=None,
        lens: int["b"] | None = None,  # noqa: F821
        noise_scheduler: str | None = None,
        grad_ckpt=False,
        start_time=None,
    ):

        batch, seq_len, dtype, device, _σ1 = *inp.shape[:2], inp.dtype, self.device, self.sigma

        # lens and mask
        if not exists(lens):
            lens = torch.full((batch,), seq_len, device=device)

        mask = lens_to_mask(lens, length=seq_len)  # useless here, as collate_fn will pad to max length in batch

        # get a random span to mask out for training conditionally
        frac_lengths = torch.zeros((batch,), device=self.device).float().uniform_(*self.frac_lengths_mask)
        rand_span_mask = mask_from_frac_lengths(lens, frac_lengths)

        if exists(mask):
            rand_span_mask = mask
            # rand_span_mask &= mask

        # mel is x1
        x1 = inp

        # x0 is gaussian noise
        x0 = torch.randn_like(x1)

        # time step
        # time = torch.rand((batch,), dtype=dtype, device=self.device)
        time = torch.normal(mean=0, std=1, size=(batch,), device=self.device)
        time = torch.nn.functional.sigmoid(time)
        # TODO. noise_scheduler

        # sample xt (φ_t(x) in the paper)
        t = time.unsqueeze(-1).unsqueeze(-1)
        φ = (1 - t) * x0 + t * x1
        flow = x1 - x0

        # only predict what is within the random mask span for infilling
        cond = torch.where(rand_span_mask[..., None], torch.zeros_like(x1), x1)

        # transformer and cfg training with a drop rate
        drop_audio_cond = random() < self.audio_drop_prob  # p_drop in voicebox paper
        drop_text = random() < self.lrc_drop_prob
        drop_prompt = random() < self.style_drop_prob

        # to rigorously mask out padding, record lengths in collate_fn in dataset.py and pass them in here;
        # adding the mask uses more memory, so the batch sampler threshold also needs scaling down for long sequences
        pred = self.transformer(
            x=φ, cond=cond, text=text, time=time, drop_audio_cond=drop_audio_cond, drop_text=drop_text, drop_prompt=drop_prompt,
            style_prompt=style_prompt, style_prompt_lens=style_prompt_lens, grad_ckpt=grad_ckpt, start_time=start_time
        )

        # flow matching loss
        loss = F.mse_loss(pred, flow, reduction="none")
        loss = loss[rand_span_mask]

        return loss.mean(), cond, pred
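Two sampling details in `CFM.sample` are worth a note. Classifier-free guidance is the usual extrapolation `pred + (pred - null_pred) * cfg_strength`, and when `sway_sampling_coef` is set, the uniform ODE time grid is warped so that more solver steps land early in generation, where the signal takes shape. A standalone sketch of that warp (illustrative only; the coefficient value is made up):

import torch

steps = 8
t = torch.linspace(0, 1, steps)
coef = -1.0  # hypothetical; negative values push steps toward t = 0
t_sway = t + coef * (torch.cos(torch.pi / 2 * t) - 1 + t)
# endpoints are preserved: t_sway[0] == 0.0 and t_sway[-1] == 1.0,
# but interior points shift earlier, e.g. t ≈ 0.43 maps to ≈ 0.22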
    	
diffrhythm/model/dit.py ADDED
@@ -0,0 +1,221 @@
"""
ein notation:
b - batch
n - sequence
nt - text sequence
nw - raw wave length
d - dimension
"""

from __future__ import annotations

import torch
from torch import nn
import torch.nn.functional as F

from x_transformers.x_transformers import RotaryEmbedding
from transformers.models.llama.modeling_llama import LlamaDecoderLayer
from transformers.models.llama import LlamaConfig
from torch.utils.checkpoint import checkpoint

from diffrhythm.model.modules import (
    TimestepEmbedding,
    ConvNeXtV2Block,
    ConvPositionEmbedding,
    DiTBlock,
    AdaLayerNormZero_Final,
    precompute_freqs_cis,
    get_pos_embed_indices,
)


# Text embedding


class TextEmbedding(nn.Module):
    def __init__(self, text_num_embeds, text_dim, conv_layers=0, conv_mult=2):
        super().__init__()
        self.text_embed = nn.Embedding(text_num_embeds + 1, text_dim)  # use 0 as filler token

        if conv_layers > 0:
            self.extra_modeling = True
            self.precompute_max_pos = 4096  # ~44s of 24khz audio
            self.register_buffer("freqs_cis", precompute_freqs_cis(text_dim, self.precompute_max_pos), persistent=False)
            self.text_blocks = nn.Sequential(
                *[ConvNeXtV2Block(text_dim, text_dim * conv_mult) for _ in range(conv_layers)]
            )
        else:
            self.extra_modeling = False

    def forward(self, text: int["b nt"], seq_len, drop_text=False):  # noqa: F722
        # text = text + 1  # use 0 as filler token. preprocess of batch pad -1, see list_str_to_idx()
        # text = text[:, :seq_len]  # curtail if character tokens are more than the mel spec tokens
        batch, text_len = text.shape[0], text.shape[1]
        # text = F.pad(text, (0, seq_len - text_len), value=0)

        if drop_text:  # cfg for text
            text = torch.zeros_like(text)

        text = self.text_embed(text)  # b n -> b n d

        # possible extra modeling
        if self.extra_modeling:
            # sinus pos emb
            batch_start = torch.zeros((batch,), dtype=torch.long)
            pos_idx = get_pos_embed_indices(batch_start, seq_len, max_pos=self.precompute_max_pos)
            text_pos_embed = self.freqs_cis[pos_idx]
            text = text + text_pos_embed

            # convnextv2 blocks
            text = self.text_blocks(text)

        return text

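A minimal smoke test of `TextEmbedding` (illustrative only; sizes are made up, and it assumes the helpers from diffrhythm.model.modules behave as used above). With `drop_text=True` the token ids are zeroed, which is how the classifier-free-guidance null branch reuses the same module:

import torch

emb = TextEmbedding(text_num_embeds=64, text_dim=32, conv_layers=2)
tokens = torch.randint(0, 64, (2, 100))            # [b, nt]
out = emb(tokens, seq_len=100)                     # [2, 100, 32]
null = emb(tokens, seq_len=100, drop_text=True)    # same shape, ids replaced by filler token 0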
# noised input audio and context mixing embedding


class InputEmbedding(nn.Module):
    def __init__(self, mel_dim, text_dim, out_dim, cond_dim):
        super().__init__()
        self.proj = nn.Linear(mel_dim * 2 + text_dim + cond_dim * 2, out_dim)
        self.conv_pos_embed = ConvPositionEmbedding(dim=out_dim)

    def forward(self, x: float["b n d"], cond: float["b n d"], text_embed: float["b n d"], style_emb, time_emb, drop_audio_cond=False):  # noqa: F722
        if drop_audio_cond:  # cfg for cond audio
            cond = torch.zeros_like(cond)

        style_emb = style_emb.unsqueeze(1).repeat(1, x.shape[1], 1)
        time_emb = time_emb.unsqueeze(1).repeat(1, x.shape[1], 1)
        # print(x.shape, cond.shape, text_embed.shape, style_emb.shape, time_emb.shape)
        x = self.proj(torch.cat((x, cond, text_embed, style_emb, time_emb), dim=-1))
        x = self.conv_pos_embed(x) + x
        return x


# Transformer backbone using DiT blocks


class DiT(nn.Module):
    def __init__(
        self,
        *,
        dim,
        depth=8,
        heads=8,
        dim_head=64,
        dropout=0.1,
        ff_mult=4,
        mel_dim=100,
        text_num_embeds=256,
        text_dim=None,
        conv_layers=0,
        long_skip_connection=False,
        use_style_prompt=False,
    ):
        super().__init__()

        cond_dim = 512
        self.time_embed = TimestepEmbedding(cond_dim)
        self.start_time_embed = TimestepEmbedding(cond_dim)
        if text_dim is None:
            text_dim = mel_dim
        self.text_embed = TextEmbedding(text_num_embeds, text_dim, conv_layers=conv_layers)
        self.input_embed = InputEmbedding(mel_dim, text_dim, dim, cond_dim=cond_dim)

        # self.rotary_embed = RotaryEmbedding(dim_head)

        self.dim = dim
        self.depth = depth

        # self.transformer_blocks = nn.ModuleList(
        #     [DiTBlock(dim=dim, heads=heads, dim_head=dim_head, ff_mult=ff_mult, dropout=dropout, use_style_prompt=use_style_prompt) for _ in range(depth)]
        # )
        llama_config = LlamaConfig(hidden_size=dim, intermediate_size=dim * ff_mult, hidden_act='silu')
        llama_config._attn_implementation = 'sdpa'
        self.transformer_blocks = nn.ModuleList(
            [LlamaDecoderLayer(llama_config, layer_idx=i) for i in range(depth)]
        )
        self.long_skip_connection = nn.Linear(dim * 2, dim, bias=False) if long_skip_connection else None

        # text is fused into the first depth // 2 blocks; zero-initialized so fusion is a no-op at init
        # (the Linear input size assumes text_dim == cond_dim)
        self.text_fusion_linears = nn.ModuleList(
            [
                nn.Sequential(
                    nn.Linear(cond_dim, dim),
                    nn.SiLU()
                ) for _ in range(depth // 2)
            ]
        )
        for layer in self.text_fusion_linears:
            for p in layer.parameters():
                p.detach().zero_()

        self.norm_out = AdaLayerNormZero_Final(dim, cond_dim)  # final modulation
        self.proj_out = nn.Linear(dim, mel_dim)

        # if use_style_prompt:
        #     self.prompt_rnn = nn.LSTM(64, cond_dim, 1, batch_first=True)

    def forward(
        self,
        x: float["b n d"],  # noised input audio  # noqa: F722
        cond: float["b n d"],  # masked cond audio  # noqa: F722
        text: int["b nt"],  # text  # noqa: F722
        time: float["b"] | float[""],  # time step  # noqa: F821 F722
        drop_audio_cond,  # cfg for cond audio
        drop_text,  # cfg for text
        drop_prompt=False,
        style_prompt=None,  # [b d t]
        style_prompt_lens=None,
        mask: bool["b n"] | None = None,  # noqa: F722
        grad_ckpt=False,
        start_time=None,
    ):
        batch, seq_len = x.shape[0], x.shape[1]
        if time.ndim == 0:
            time = time.repeat(batch)

        # t: conditioning time, c: context (text + masked cond audio), x: noised input audio
        t = self.time_embed(time)
        s_t = self.start_time_embed(start_time)
        c = t + s_t
        text_embed = self.text_embed(text, seq_len, drop_text=drop_text)

        # import pdb; pdb.set_trace()
        if drop_prompt:
            style_prompt = torch.zeros_like(style_prompt)
        # if self.training:
        #     packed_style_prompt = torch.nn.utils.rnn.pack_padded_sequence(style_prompt.transpose(1, 2), style_prompt_lens.cpu(), batch_first=True, enforce_sorted=False)
        # else:
        #     packed_style_prompt = style_prompt.transpose(1, 2)
        # print(packed_style_prompt.shape)
        # _, style_emb = self.prompt_rnn.forward(packed_style_prompt)
        # _, (h_n, c_n) = self.prompt_rnn.forward(packed_style_prompt)
        # style_emb = h_n.squeeze(0)  # 1, B, dim -> B, dim

        style_emb = style_prompt  # [b, 512]

        x = self.input_embed(x, cond, text_embed, style_emb, c, drop_audio_cond=drop_audio_cond)

        if self.long_skip_connection is not None:
            residual = x

        pos_ids = torch.arange(x.shape[1], device=x.device)
        pos_ids = pos_ids.unsqueeze(0).repeat(x.shape[0], 1)
        for i, block in enumerate(self.transformer_blocks):
            if not grad_ckpt:
                x, *_ = block(x, position_ids=pos_ids)
            else:
                x, *_ = checkpoint(block, x, position_ids=pos_ids, use_reentrant=False)
            if i < self.depth // 2:
                x = x + self.text_fusion_linears[i](text_embed)

        if self.long_skip_connection is not None:
            x = self.long_skip_connection(torch.cat((x, residual), dim=-1))

        x = self.norm_out(x, c)
        output = self.proj_out(x)

        return output
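To close the loop on dit.py, a rough shape-check sketch for `DiT` (illustrative only; all sizes are made up, `text_dim` is set to 512 to match the hard-coded `cond_dim` as the text-fusion layers require, and `text` is padded to the latent length for simplicity):

import torch

model = DiT(dim=256, depth=4, mel_dim=100, text_num_embeds=256, text_dim=512, conv_layers=2)
b, n = 2, 128
x = torch.randn(b, n, 100)               # noised input latent
cond = torch.randn(b, n, 100)            # masked conditioning audio
text = torch.randint(0, 256, (b, n))     # lyric tokens, length n here
time = torch.rand(b)
start = torch.rand(b)
style = torch.randn(b, 512)              # style_prompt is consumed as a [b, 512] embedding
out = model(x=x, cond=cond, text=text, time=time, drop_audio_cond=False, drop_text=False,
            style_prompt=style, start_time=start)
# out.shape == (b, n, 100), the predicted flow for the flow-matching objective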
    	
diffrhythm/model/modules.py ADDED
@@ -0,0 +1,611 @@
"""
ein notation:
b - batch
n - sequence
nt - text sequence
nw - raw wave length
d - dimension
"""

from __future__ import annotations
from typing import Optional
import math

import torch
from torch import nn
import torch.nn.functional as F
import torchaudio

from x_transformers.x_transformers import apply_rotary_pos_emb

class FiLMLayer(nn.Module):
    """
    Feature-wise Linear Modulation (FiLM) layer
    Reference: https://arxiv.org/abs/1709.07871
    """

    def __init__(self, in_channels, cond_channels):
        super().__init__()
        self.in_channels = in_channels
        self.film = nn.Conv1d(cond_channels, in_channels * 2, 1)

    def forward(self, x, c):
        gamma, beta = torch.chunk(self.film(c.unsqueeze(2)), chunks=2, dim=1)
        gamma = gamma.transpose(1, 2)
        beta = beta.transpose(1, 2)
        return gamma * x + beta

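# --- hedged usage sketch (added for illustration; not part of the original file) ---
# FiLM predicts a per-channel (gamma, beta) pair from a global conditioning
# vector and applies gamma * x + beta across the sequence. Sizes are assumptions.
def _demo_film_layer():
    film = FiLMLayer(in_channels=64, cond_channels=128)
    x = torch.randn(2, 50, 64)   # [b, n, d] features
    c = torch.randn(2, 128)      # [b, cond] global condition
    y = film(x, c)               # broadcasts [b, 1, d] gamma/beta over n
    assert y.shape == (2, 50, 64)
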
# raw wav to mel spec


class MelSpec(nn.Module):
    def __init__(
        self,
        filter_length=1024,
        hop_length=256,
        win_length=1024,
        n_mel_channels=100,
        target_sample_rate=24_000,
        normalize=False,
        power=1,
        norm=None,
        center=True,
    ):
        super().__init__()
        self.n_mel_channels = n_mel_channels

        self.mel_stft = torchaudio.transforms.MelSpectrogram(
            sample_rate=target_sample_rate,
            n_fft=filter_length,
            win_length=win_length,
            hop_length=hop_length,
            n_mels=n_mel_channels,
            power=power,
            center=center,
            normalized=normalize,
            norm=norm,
        )

        self.register_buffer("dummy", torch.tensor(0), persistent=False)

    def forward(self, inp):
        if len(inp.shape) == 3:
            inp = inp.squeeze(1)  # 'b 1 nw -> b nw'

        assert len(inp.shape) == 2

        if self.dummy.device != inp.device:
            self.to(inp.device)

        mel = self.mel_stft(inp)
        mel = mel.clamp(min=1e-5).log()
        return mel

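# --- hedged usage sketch (added for illustration; not part of the original file) ---
# MelSpec maps raw waveforms to log-mel frames; with hop_length=256 and
# center=True, a waveform of nw samples yields roughly nw // 256 + 1 frames.
def _demo_mel_spec():
    mel_fn = MelSpec()                # defaults: 24 kHz, 100 mel bins
    wav = torch.randn(2, 24_000)      # [b, nw]: one second per item
    mel = mel_fn(wav)                 # [b, n_mel, frames], log-compressed
    assert mel.shape[:2] == (2, 100)
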
# sinusoidal position embedding


class SinusPositionEmbedding(nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.dim = dim

    def forward(self, x, scale=1000):
        device = x.device
        half_dim = self.dim // 2
        emb = math.log(10000) / (half_dim - 1)
        emb = torch.exp(torch.arange(half_dim, device=device).float() * -emb)
        emb = scale * x.unsqueeze(1) * emb.unsqueeze(0)
        emb = torch.cat((emb.sin(), emb.cos()), dim=-1)
        return emb

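# --- hedged usage sketch (added for illustration; not part of the original file) ---
# Typically used for continuous conditioning values such as flow timesteps:
# a [b] tensor becomes a [b, dim] embedding (scale=1000 spreads small values
# across the frequency bands). Sizes below are illustrative assumptions.
def _demo_sinus_embedding():
    embed = SinusPositionEmbedding(dim=256)
    t = torch.rand(4)     # [b] values in [0, 1]
    e = embed(t)          # [b, 256]: sin half then cos half
    assert e.shape == (4, 256)
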
# convolutional position embedding


class ConvPositionEmbedding(nn.Module):
    def __init__(self, dim, kernel_size=31, groups=16):
        super().__init__()
        assert kernel_size % 2 != 0
        self.conv1d = nn.Sequential(
            nn.Conv1d(dim, dim, kernel_size, groups=groups, padding=kernel_size // 2),
            nn.Mish(),
            nn.Conv1d(dim, dim, kernel_size, groups=groups, padding=kernel_size // 2),
            nn.Mish(),
        )

    def forward(self, x: float["b n d"], mask: bool["b n"] | None = None):  # noqa: F722
        if mask is not None:
            mask = mask[..., None]
            x = x.masked_fill(~mask, 0.0)

        x = x.permute(0, 2, 1)
        x = self.conv1d(x)
        out = x.permute(0, 2, 1)

        if mask is not None:
            out = out.masked_fill(~mask, 0.0)

        return out

# rotary positional embedding related


def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0, theta_rescale_factor=1.0):
    # proposed by reddit user bloc97, to rescale rotary embeddings to longer sequence length without fine-tuning
    # has some connection to NTK literature
    # https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/
    # https://github.com/lucidrains/rotary-embedding-torch/blob/main/rotary_embedding_torch/rotary_embedding_torch.py
    theta *= theta_rescale_factor ** (dim / (dim - 2))
    freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim))
    t = torch.arange(end, device=freqs.device)  # type: ignore
    freqs = torch.outer(t, freqs).float()  # type: ignore
    freqs_cos = torch.cos(freqs)  # real part
    freqs_sin = torch.sin(freqs)  # imaginary part
    return torch.cat([freqs_cos, freqs_sin], dim=-1)


def get_pos_embed_indices(start, length, max_pos, scale=1.0):
    # length = length if isinstance(length, int) else length.max()
    scale = scale * torch.ones_like(start, dtype=torch.float32)  # in case scale is a scalar
    pos = (
        start.unsqueeze(1)
        + (torch.arange(length, device=start.device, dtype=torch.float32).unsqueeze(0) * scale.unsqueeze(1)).long()
    )
    # clamp indices so extra-long sequences stay within the precomputed table
    pos = torch.where(pos < max_pos, pos, max_pos - 1)
    return pos

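# --- hedged usage sketch (added for illustration; not part of the original file) ---
# precompute_freqs_cis returns a [end, dim] table (cos half, then sin half);
# get_pos_embed_indices picks per-sample rows starting at arbitrary offsets,
# clamped to max_pos. All sizes below are illustrative assumptions.
def _demo_rope_tables():
    max_pos, head_dim = 4096, 64
    freqs = precompute_freqs_cis(head_dim, max_pos)                  # [4096, 64]
    start = torch.tensor([0, 100])                                   # per-sample offsets
    idx = get_pos_embed_indices(start, length=256, max_pos=max_pos)  # [2, 256]
    rope = freqs[idx]                                                # [2, 256, 64]
    assert rope.shape == (2, 256, head_dim)
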
# Global Response Normalization layer (from ConvNeXt-V2)


class GRN(nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.gamma = nn.Parameter(torch.zeros(1, 1, dim))
        self.beta = nn.Parameter(torch.zeros(1, 1, dim))

    def forward(self, x):
        Gx = torch.norm(x, p=2, dim=1, keepdim=True)
        Nx = Gx / (Gx.mean(dim=-1, keepdim=True) + 1e-6)
        return self.gamma * (x * Nx) + self.beta + x

# ConvNeXt-V2 Block https://github.com/facebookresearch/ConvNeXt-V2/blob/main/models/convnextv2.py
# ref: https://github.com/bfs18/e2_tts/blob/main/rfwave/modules.py#L108


class ConvNeXtV2Block(nn.Module):
    def __init__(
        self,
        dim: int,
        intermediate_dim: int,
        dilation: int = 1,
    ):
        super().__init__()
        padding = (dilation * (7 - 1)) // 2
        self.dwconv = nn.Conv1d(
            dim, dim, kernel_size=7, padding=padding, groups=dim, dilation=dilation
        )  # depthwise conv
        self.norm = nn.LayerNorm(dim, eps=1e-6)
        self.pwconv1 = nn.Linear(dim, intermediate_dim)  # pointwise/1x1 convs, implemented with linear layers
        self.act = nn.GELU()
        self.grn = GRN(intermediate_dim)
        self.pwconv2 = nn.Linear(intermediate_dim, dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        residual = x
        x = x.transpose(1, 2)  # b n d -> b d n
        x = self.dwconv(x)
        x = x.transpose(1, 2)  # b d n -> b n d
        x = self.norm(x)
        x = self.pwconv1(x)
        x = self.act(x)
        x = self.grn(x)
        x = self.pwconv2(x)
        return residual + x

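# --- hedged usage sketch (added for illustration; not part of the original file) ---
# The block is shape-preserving: a depthwise conv over time, then an inverted
# bottleneck (dim -> intermediate_dim -> dim) with GRN, plus a residual.
def _demo_convnext_block():
    block = ConvNeXtV2Block(dim=512, intermediate_dim=1024)
    x = torch.randn(2, 100, 512)   # [b, n, d]
    y = block(x)                   # same shape as the input
    assert y.shape == x.shape
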
# AdaLayerNormZero
# return with modulated x for attn input, and params for later mlp modulation


class AdaLayerNormZero(nn.Module):
    def __init__(self, dim):
        super().__init__()

        self.silu = nn.SiLU()
        self.linear = nn.Linear(dim, dim * 6)

        self.norm = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)

    def forward(self, x, emb=None):
        emb = self.linear(self.silu(emb))
        shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = torch.chunk(emb, 6, dim=1)

        x = self.norm(x) * (1 + scale_msa[:, None]) + shift_msa[:, None]
        return x, gate_msa, shift_mlp, scale_mlp, gate_mlp

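# --- hedged usage sketch (added for illustration; not part of the original file) ---
# adaLN-Zero (DiT-style): one SiLU+Linear on the conditioning embedding yields
# six [b, dim] vectors; shift/scale modulate the pre-attention norm here, and
# the gates plus shift_mlp/scale_mlp are consumed later inside the block.
def _demo_ada_layer_norm_zero():
    norm = AdaLayerNormZero(dim=512)
    x = torch.randn(2, 100, 512)   # [b, n, d]
    t = torch.randn(2, 512)        # [b, d] time embedding
    x_mod, gate_msa, shift_mlp, scale_mlp, gate_mlp = norm(x, emb=t)
    assert x_mod.shape == x.shape and gate_msa.shape == (2, 512)
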
# AdaLayerNormZero for final layer
# returns only the modulated x for attn input, since there is no further mlp modulation


class AdaLayerNormZero_Final(nn.Module):
    def __init__(self, dim, cond_dim):
        super().__init__()

        self.silu = nn.SiLU()
        self.linear = nn.Linear(cond_dim, dim * 2)

        self.norm = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)

    def forward(self, x, emb):
        emb = self.linear(self.silu(emb))
        scale, shift = torch.chunk(emb, 2, dim=1)

        x = self.norm(x) * (1 + scale)[:, None, :] + shift[:, None, :]
        return x

# FeedForward


class FeedForward(nn.Module):
    def __init__(self, dim, dim_out=None, mult=4, dropout=0.0, approximate: str = "none"):
        super().__init__()
        inner_dim = int(dim * mult)
        dim_out = dim_out if dim_out is not None else dim

        activation = nn.GELU(approximate=approximate)
        project_in = nn.Sequential(nn.Linear(dim, inner_dim), activation)
        self.ff = nn.Sequential(project_in, nn.Dropout(dropout), nn.Linear(inner_dim, dim_out))

    def forward(self, x):
        return self.ff(x)

# Attention with possible joint part
# modified from diffusers/src/diffusers/models/attention_processor.py


class Attention(nn.Module):
    def __init__(
        self,
        processor: JointAttnProcessor | AttnProcessor,
        dim: int,
        heads: int = 8,
        dim_head: int = 64,
        dropout: float = 0.0,
        context_dim: Optional[int] = None,  # if not None -> joint attention
        context_pre_only=None,
    ):
        super().__init__()

        if not hasattr(F, "scaled_dot_product_attention"):
            raise ImportError("Attention requires PyTorch 2.0; please upgrade PyTorch to use it.")

        self.processor = processor

        self.dim = dim
        self.heads = heads
        self.inner_dim = dim_head * heads
        self.dropout = dropout

        self.context_dim = context_dim
        self.context_pre_only = context_pre_only

        self.to_q = nn.Linear(dim, self.inner_dim)
        self.to_k = nn.Linear(dim, self.inner_dim)
        self.to_v = nn.Linear(dim, self.inner_dim)

        if self.context_dim is not None:
            self.to_k_c = nn.Linear(context_dim, self.inner_dim)
            self.to_v_c = nn.Linear(context_dim, self.inner_dim)
            if self.context_pre_only is not None:
                self.to_q_c = nn.Linear(context_dim, self.inner_dim)

        self.to_out = nn.ModuleList([])
        self.to_out.append(nn.Linear(self.inner_dim, dim))
        self.to_out.append(nn.Dropout(dropout))

        if self.context_pre_only is not None and not self.context_pre_only:
            self.to_out_c = nn.Linear(self.inner_dim, dim)

    def forward(
        self,
        x: float["b n d"],  # noised input x  # noqa: F722
        c: float["b n d"] = None,  # context c  # noqa: F722
        mask: bool["b n"] | None = None,  # noqa: F722
        rope=None,  # rotary position embedding for x
        c_rope=None,  # rotary position embedding for c
    ) -> torch.Tensor:
        if c is not None:
            return self.processor(self, x, c=c, mask=mask, rope=rope, c_rope=c_rope)
        else:
            return self.processor(self, x, mask=mask, rope=rope)

# Attention processor


class AttnProcessor:
    def __init__(self):
        pass

    def __call__(
        self,
        attn: Attention,
        x: float["b n d"],  # noised input x  # noqa: F722
        mask: bool["b n"] | None = None,  # noqa: F722
        rope=None,  # rotary position embedding
    ) -> torch.FloatTensor:
        batch_size = x.shape[0]

        # `sample` projections.
        query = attn.to_q(x)
        key = attn.to_k(x)
        value = attn.to_v(x)

        # apply rotary position embedding
        if rope is not None:
            freqs, xpos_scale = rope
            q_xpos_scale, k_xpos_scale = (xpos_scale, xpos_scale**-1.0) if xpos_scale is not None else (1.0, 1.0)

            query = apply_rotary_pos_emb(query, freqs, q_xpos_scale)
            key = apply_rotary_pos_emb(key, freqs, k_xpos_scale)

        # attention
        inner_dim = key.shape[-1]
        head_dim = inner_dim // attn.heads
        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)

        # mask. e.g. at inference, a batch may hold different target durations; mask out the padding
        if mask is not None:
            attn_mask = mask
            attn_mask = attn_mask.unsqueeze(1).unsqueeze(1)  # 'b n -> b 1 1 n'
            attn_mask = attn_mask.expand(batch_size, attn.heads, query.shape[-2], key.shape[-2])
        else:
            attn_mask = None

        x = F.scaled_dot_product_attention(query, key, value, attn_mask=attn_mask, dropout_p=0.0, is_causal=False)
        x = x.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
        x = x.to(query.dtype)

        # linear proj
        x = attn.to_out[0](x)
        # dropout
        x = attn.to_out[1](x)

        if mask is not None:
            mask = mask.unsqueeze(-1)
            x = x.masked_fill(~mask, 0.0)

        return x

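# --- hedged usage sketch (added for illustration; not part of the original file) ---
# Self-attention path: Attention delegates to its processor, so a plain
# AttnProcessor plus an optional padding mask is all the non-joint path needs.
def _demo_self_attention():
    attn = Attention(processor=AttnProcessor(), dim=512, heads=8, dim_head=64)
    x = torch.randn(2, 100, 512)                 # [b, n, d]
    mask = torch.ones(2, 100, dtype=torch.bool)
    mask[1, 80:] = False                         # second item padded past frame 80
    y = attn(x, mask=mask)                       # padded positions zeroed in y
    assert y.shape == x.shape
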
# Joint Attention processor for MM-DiT
# modified from diffusers/src/diffusers/models/attention_processor.py


class JointAttnProcessor:
    def __init__(self):
        pass

    def __call__(
        self,
        attn: Attention,
        x: float["b n d"],  # noised input x  # noqa: F722
        c: float["b nt d"] = None,  # context c, here text  # noqa: F722
        mask: bool["b n"] | None = None,  # noqa: F722
        rope=None,  # rotary position embedding for x
        c_rope=None,  # rotary position embedding for c
    ) -> torch.FloatTensor:
        residual = x

        batch_size = c.shape[0]

        # `sample` projections.
        query = attn.to_q(x)
        key = attn.to_k(x)
        value = attn.to_v(x)

        # `context` projections.
        c_query = attn.to_q_c(c)
        c_key = attn.to_k_c(c)
        c_value = attn.to_v_c(c)

        # apply rope for context and noised input independently
        if rope is not None:
            freqs, xpos_scale = rope
            q_xpos_scale, k_xpos_scale = (xpos_scale, xpos_scale**-1.0) if xpos_scale is not None else (1.0, 1.0)
            query = apply_rotary_pos_emb(query, freqs, q_xpos_scale)
            key = apply_rotary_pos_emb(key, freqs, k_xpos_scale)
        if c_rope is not None:
            freqs, xpos_scale = c_rope
            q_xpos_scale, k_xpos_scale = (xpos_scale, xpos_scale**-1.0) if xpos_scale is not None else (1.0, 1.0)
            c_query = apply_rotary_pos_emb(c_query, freqs, q_xpos_scale)
            c_key = apply_rotary_pos_emb(c_key, freqs, k_xpos_scale)

        # attention: concatenate noised input and context along the sequence dim
        query = torch.cat([query, c_query], dim=1)
        key = torch.cat([key, c_key], dim=1)
        value = torch.cat([value, c_value], dim=1)

        inner_dim = key.shape[-1]
        head_dim = inner_dim // attn.heads
        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)

        # mask. e.g. at inference, a batch may hold different target durations; mask out the padding
        if mask is not None:
            attn_mask = F.pad(mask, (0, c.shape[1]), value=True)  # no mask for c (text)
            attn_mask = attn_mask.unsqueeze(1).unsqueeze(1)  # 'b n -> b 1 1 n'
            attn_mask = attn_mask.expand(batch_size, attn.heads, query.shape[-2], key.shape[-2])
        else:
            attn_mask = None

        x = F.scaled_dot_product_attention(query, key, value, attn_mask=attn_mask, dropout_p=0.0, is_causal=False)
        x = x.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
        x = x.to(query.dtype)

        # Split the attention outputs back into the x and c parts.
        x, c = (
            x[:, : residual.shape[1]],
            x[:, residual.shape[1] :],
        )

        # linear proj
        x = attn.to_out[0](x)
        # dropout
        x = attn.to_out[1](x)
        if not attn.context_pre_only:
            c = attn.to_out_c(c)

        if mask is not None:
            mask = mask.unsqueeze(-1)
            x = x.masked_fill(~mask, 0.0)
            # c = c.masked_fill(~mask, 0.)  # no mask for c (text)

        return x, c

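# --- hedged usage sketch (added for illustration; not part of the original file) ---
# Joint path: x and c are projected separately, attend as one concatenated
# sequence, and are split back afterwards, so each keeps its own length.
def _demo_joint_attention():
    attn = Attention(
        processor=JointAttnProcessor(), dim=512, heads=8, dim_head=64,
        context_dim=512, context_pre_only=False,
    )
    x = torch.randn(2, 100, 512)   # noised input [b, n, d]
    c = torch.randn(2, 30, 512)    # context/text [b, nt, d]
    x_out, c_out = attn(x, c=c)
    assert x_out.shape == (2, 100, 512) and c_out.shape == (2, 30, 512)
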
# DiT Block


class DiTBlock(nn.Module):
    def __init__(self, dim, heads, dim_head, ff_mult=4, dropout=0.1, use_style_prompt=False):
        super().__init__()

        self.attn_norm = AdaLayerNormZero(dim)
        self.attn = Attention(
            processor=AttnProcessor(),
            dim=dim,
            heads=heads,
            dim_head=dim_head,
            dropout=dropout,
        )

        self.ff_norm = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
        self.ff = FeedForward(dim=dim, mult=ff_mult, dropout=dropout, approximate="tanh")

        self.use_style_prompt = use_style_prompt
        if use_style_prompt:
            # self.film = FiLMLayer(dim, dim)
            # AdaLayerNormZero_Final takes (dim, cond_dim); the original call passed
            # only `dim`, which would raise a TypeError. cond_dim = dim is assumed here.
            self.prompt_norm = AdaLayerNormZero_Final(dim, dim)

    def forward(self, x, t, c=None, mask=None, rope=None):  # x: noised input, t: time embedding
        if c is not None and self.use_style_prompt:
            # x = self.film(x, c)
            x = self.prompt_norm(x, c)

        # pre-norm & modulation for attention input
        norm, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.attn_norm(x, emb=t)

        # attention
        attn_output = self.attn(x=norm, mask=mask, rope=rope)

        # process attention output for input x
        x = x + gate_msa.unsqueeze(1) * attn_output

        norm = self.ff_norm(x) * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
        ff_output = self.ff(norm)
        x = x + gate_mlp.unsqueeze(1) * ff_output

        return x

| 528 | 
         
            +
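# Minimal usage sketch for DiTBlock (illustrative only; the sizes below are
# assumptions, not values fixed by this module):
#     block = DiTBlock(dim=512, heads=8, dim_head=64)
#     x = torch.randn(2, 100, 512)  # (batch, seq_len, dim) noised input
#     t = torch.randn(2, 512)       # (batch, dim) time embedding
#     y = block(x, t)               # same shape as x: (2, 100, 512)
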
# MMDiT Block https://arxiv.org/abs/2403.03206


class MMDiTBlock(nn.Module):
    r"""
    modified from diffusers/src/diffusers/models/attention.py

    notes:
    _c: context related. text, cond, etc. (left part in SD3 fig. 2b)
    _x: noised input related. (right part)
    context_pre_only: the last layer only does pre-norm + modulation, since no FFN follows
    """

    def __init__(self, dim, heads, dim_head, ff_mult=4, dropout=0.1, context_pre_only=False):
        super().__init__()

        self.context_pre_only = context_pre_only

        self.attn_norm_c = AdaLayerNormZero_Final(dim) if context_pre_only else AdaLayerNormZero(dim)
        self.attn_norm_x = AdaLayerNormZero(dim)
        self.attn = Attention(
            processor=JointAttnProcessor(),
            dim=dim,
            heads=heads,
            dim_head=dim_head,
            dropout=dropout,
            context_dim=dim,
            context_pre_only=context_pre_only,
        )

        if not context_pre_only:
            self.ff_norm_c = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
            self.ff_c = FeedForward(dim=dim, mult=ff_mult, dropout=dropout, approximate="tanh")
        else:
            self.ff_norm_c = None
            self.ff_c = None
        self.ff_norm_x = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
        self.ff_x = FeedForward(dim=dim, mult=ff_mult, dropout=dropout, approximate="tanh")

    def forward(self, x, c, t, mask=None, rope=None, c_rope=None):  # x: noised input, c: context, t: time embedding
        # pre-norm & modulation for attention input
        if self.context_pre_only:
            norm_c = self.attn_norm_c(c, t)
        else:
            norm_c, c_gate_msa, c_shift_mlp, c_scale_mlp, c_gate_mlp = self.attn_norm_c(c, emb=t)
        norm_x, x_gate_msa, x_shift_mlp, x_scale_mlp, x_gate_mlp = self.attn_norm_x(x, emb=t)

        # attention
        x_attn_output, c_attn_output = self.attn(x=norm_x, c=norm_c, mask=mask, rope=rope, c_rope=c_rope)

        # process attention output for context c
        if self.context_pre_only:
            c = None
        else:  # if not last layer
            c = c + c_gate_msa.unsqueeze(1) * c_attn_output

            norm_c = self.ff_norm_c(c) * (1 + c_scale_mlp[:, None]) + c_shift_mlp[:, None]
            c_ff_output = self.ff_c(norm_c)
            c = c + c_gate_mlp.unsqueeze(1) * c_ff_output

        # process attention output for input x
        x = x + x_gate_msa.unsqueeze(1) * x_attn_output

        norm_x = self.ff_norm_x(x) * (1 + x_scale_mlp[:, None]) + x_shift_mlp[:, None]
        x_ff_output = self.ff_x(norm_x)
        x = x + x_gate_mlp.unsqueeze(1) * x_ff_output

        return c, x

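# Minimal usage sketch for MMDiTBlock (illustrative; sizes are assumptions):
#     block = MMDiTBlock(dim=512, heads=8, dim_head=64, context_pre_only=False)
#     x = torch.randn(2, 100, 512)   # noised input (right branch in SD3 fig. 2b)
#     c = torch.randn(2, 32, 512)    # context, e.g. text/cond (left branch)
#     t = torch.randn(2, 512)        # time embedding
#     c_out, x_out = block(x, c, t)  # c_out is None when context_pre_only=True
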
# time step conditioning embedding


class TimestepEmbedding(nn.Module):
    def __init__(self, dim, freq_embed_dim=256):
        super().__init__()
        self.time_embed = SinusPositionEmbedding(freq_embed_dim)
        self.time_mlp = nn.Sequential(nn.Linear(freq_embed_dim, dim), nn.SiLU(), nn.Linear(dim, dim))

    def forward(self, timestep: float["b"]):  # noqa: F821
        time_hidden = self.time_embed(timestep)
        time_hidden = time_hidden.to(timestep.dtype)
        time = self.time_mlp(time_hidden)  # b d
        return time
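# Minimal usage sketch for TimestepEmbedding (illustrative; dim=512 is an assumption):
#     temb = TimestepEmbedding(dim=512)
#     t = torch.rand(2)   # one time value per batch element, e.g. in [0, 1]
#     e = temb(t)         # (2, 512), used as `emb` by the AdaLayerNormZero modulation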
    	
diffrhythm/model/trainer.py ADDED
@@ -0,0 +1,350 @@
from __future__ import annotations

import os
import gc
from tqdm import tqdm
import wandb

import torch
from torch.optim import AdamW
from torch.optim.lr_scheduler import LinearLR, SequentialLR, ConstantLR

from accelerate import Accelerator
from accelerate.utils import DistributedDataParallelKwargs
from diffrhythm.dataset.custom_dataset_align2f5 import LanceDiffusionDataset

from torch.utils.data import DataLoader, DistributedSampler

from ema_pytorch import EMA

from diffrhythm.model import CFM
from diffrhythm.model.utils import exists, default

import time

# from apex.optimizers.fused_adam import FusedAdam

# trainer


class Trainer:
    def __init__(
        self,
        model: CFM,
        args,
        epochs,
        learning_rate,
        # dataloader,
        num_warmup_updates=20000,
        save_per_updates=1000,
        checkpoint_path=None,
        batch_size=32,
        batch_size_type: str = "sample",
        max_samples=32,
        grad_accumulation_steps=1,
        max_grad_norm=1.0,
        noise_scheduler: str | None = None,
        duration_predictor: torch.nn.Module | None = None,
        wandb_project="test_e2-tts",
        wandb_run_name="test_run",
        wandb_resume_id: str | None = None,
        last_per_steps=None,
        accelerate_kwargs: dict = dict(),
        ema_kwargs: dict = dict(),
        bnb_optimizer: bool = False,
        reset_lr: bool = False,
        use_style_prompt: bool = False,
        grad_ckpt: bool = False,
    ):
        self.args = args

        ddp_kwargs = DistributedDataParallelKwargs(find_unused_parameters=False)

        logger = "wandb" if wandb.api.api_key else None
        # logger = None
        print(f"Using logger: {logger}")
        import tbe.common

        self.accelerator = Accelerator(
            log_with=logger,
            kwargs_handlers=[ddp_kwargs],
            gradient_accumulation_steps=grad_accumulation_steps,
            **accelerate_kwargs,
        )

        if logger == "wandb":
            if exists(wandb_resume_id):
                init_kwargs = {"wandb": {"resume": "allow", "name": wandb_run_name, "id": wandb_resume_id}}
            else:
                init_kwargs = {"wandb": {"resume": "allow", "name": wandb_run_name}}
            self.accelerator.init_trackers(
                project_name=wandb_project,
                init_kwargs=init_kwargs,
                config={
                    "epochs": epochs,
                    "learning_rate": learning_rate,
                    "num_warmup_updates": num_warmup_updates,
                    "batch_size": batch_size,
                    "batch_size_type": batch_size_type,
                    "max_samples": max_samples,
                    "grad_accumulation_steps": grad_accumulation_steps,
                    "max_grad_norm": max_grad_norm,
                    "gpus": self.accelerator.num_processes,
                    "noise_scheduler": noise_scheduler,
                },
            )

        self.precision = self.accelerator.state.mixed_precision
        self.precision = self.precision.replace("no", "fp32")
        print(f"Using precision: {self.precision}")

        self.model = model
        # self.model = torch.compile(model)

        # self.dataloader = dataloader

        if self.is_main:
            self.ema_model = EMA(model, include_online_model=False, **ema_kwargs)

            self.ema_model.to(self.accelerator.device)
            if self.accelerator.state.distributed_type in ["DEEPSPEED", "FSDP"]:
                self.ema_model.half()

        self.epochs = epochs
        self.num_warmup_updates = num_warmup_updates
        self.save_per_updates = save_per_updates
        self.last_per_steps = default(last_per_steps, save_per_updates * grad_accumulation_steps)
        self.checkpoint_path = default(checkpoint_path, "ckpts/test_e2-tts")

        self.max_samples = max_samples
        self.grad_accumulation_steps = grad_accumulation_steps
        self.max_grad_norm = max_grad_norm

        self.noise_scheduler = noise_scheduler

        self.duration_predictor = duration_predictor

        self.reset_lr = reset_lr

        self.use_style_prompt = use_style_prompt

        self.grad_ckpt = grad_ckpt

        if bnb_optimizer:
            import bitsandbytes as bnb

            self.optimizer = bnb.optim.AdamW8bit(model.parameters(), lr=learning_rate)
        else:
            self.optimizer = AdamW(model.parameters(), lr=learning_rate)
        # self.optimizer = FusedAdam(model.parameters(), lr=learning_rate)

        # self.model = torch.compile(self.model)
        if self.accelerator.state.distributed_type == "DEEPSPEED":
            self.accelerator.state.deepspeed_plugin.deepspeed_config["train_micro_batch_size_per_gpu"] = batch_size

        self.get_dataloader()
        self.get_scheduler()
        # self.get_constant_scheduler()

        self.model, self.optimizer, self.scheduler, self.train_dataloader = self.accelerator.prepare(
            self.model, self.optimizer, self.scheduler, self.train_dataloader
        )

    def get_scheduler(self):
        warmup_steps = (
            self.num_warmup_updates * self.accelerator.num_processes
        )  # consider a fixed warmup steps while using accelerate multi-gpu ddp
        total_steps = len(self.train_dataloader) * self.epochs / self.grad_accumulation_steps
        decay_steps = total_steps - warmup_steps
        warmup_scheduler = LinearLR(self.optimizer, start_factor=1e-8, end_factor=1.0, total_iters=warmup_steps)
        decay_scheduler = LinearLR(self.optimizer, start_factor=1.0, end_factor=1e-8, total_iters=decay_steps)
        # constant_scheduler = ConstantLR(self.optimizer, factor=1, total_iters=decay_steps)
        self.scheduler = SequentialLR(
            self.optimizer, schedulers=[warmup_scheduler, decay_scheduler], milestones=[warmup_steps]
        )

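    # Worked example of the schedule above (numbers are hypothetical): with
    # num_warmup_updates=20000 on 8 processes, warmup_steps = 160000; with
    # len(train_dataloader)=100000, epochs=2, grad_accumulation_steps=1,
    # total_steps = 200000 and decay_steps = 40000. The LR thus ramps linearly
    # from ~0 (start_factor=1e-8) to the base LR over the first 160000 steps,
    # then decays linearly back toward ~0 over the remaining 40000.
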
    def get_constant_scheduler(self):
        total_steps = len(self.train_dataloader) * self.epochs / self.grad_accumulation_steps
        self.scheduler = ConstantLR(self.optimizer, factor=1, total_iters=total_steps)

    def get_dataloader(self):
        prompt_path = self.args.prompt_path.split("|")
        lrc_path = self.args.lrc_path.split("|")
        latent_path = self.args.latent_path.split("|")
        ldd = LanceDiffusionDataset(
            *LanceDiffusionDataset.init_data(self.args.dataset_path),
            max_frames=self.args.max_frames,
            min_frames=self.args.min_frames,
            align_lyrics=self.args.align_lyrics,
            lyrics_slice=self.args.lyrics_slice,
            use_style_prompt=self.args.use_style_prompt,
            parse_lyrics=self.args.parse_lyrics,
            lyrics_shift=self.args.lyrics_shift,
            downsample_rate=self.args.downsample_rate,
            skip_empty_lyrics=self.args.skip_empty_lyrics,
            tokenizer_type=self.args.tokenizer_type,
            precision=self.precision,
            start_time=time.time(),
            pure_prob=self.args.pure_prob,
        )

        # start_time = time.time()
        self.train_dataloader = DataLoader(
            dataset=ldd,
            batch_size=self.args.batch_size,  # samples per batch
            shuffle=True,  # shuffle the data each epoch
            num_workers=4,  # subprocesses used for data loading
            pin_memory=True,  # speeds up host-to-GPU transfer
            collate_fn=ldd.custom_collate_fn,
            persistent_workers=True,
        )

    @property
    def is_main(self):
        return self.accelerator.is_main_process

    def save_checkpoint(self, step, last=False):
        self.accelerator.wait_for_everyone()
        if self.is_main:
            checkpoint = dict(
                model_state_dict=self.accelerator.unwrap_model(self.model).state_dict(),
                optimizer_state_dict=self.accelerator.unwrap_model(self.optimizer).state_dict(),
                ema_model_state_dict=self.ema_model.state_dict(),
                scheduler_state_dict=self.scheduler.state_dict(),
                step=step,
            )
            if not os.path.exists(self.checkpoint_path):
                os.makedirs(self.checkpoint_path)
            if last:
                self.accelerator.save(checkpoint, f"{self.checkpoint_path}/model_last.pt")
                print(f"Saved last checkpoint at step {step}")
            else:
                self.accelerator.save(checkpoint, f"{self.checkpoint_path}/model_{step}.pt")

    def load_checkpoint(self):
        if (
            not exists(self.checkpoint_path)
            or not os.path.exists(self.checkpoint_path)
            or not os.listdir(self.checkpoint_path)
        ):
            return 0

        self.accelerator.wait_for_everyone()
        if "model_last.pt" in os.listdir(self.checkpoint_path):
            latest_checkpoint = "model_last.pt"
        else:
            latest_checkpoint = sorted(
                [f for f in os.listdir(self.checkpoint_path) if f.endswith(".pt")],
                key=lambda x: int("".join(filter(str.isdigit, x))),
            )[-1]

        checkpoint = torch.load(f"{self.checkpoint_path}/{latest_checkpoint}", map_location="cpu")

        ### 1. Filter out mismatched `ema_model` parameters
        if self.is_main:
            ema_dict = self.ema_model.state_dict()
            ema_checkpoint_dict = checkpoint["ema_model_state_dict"]

            filtered_ema_dict = {
                k: v for k, v in ema_checkpoint_dict.items()
                if k in ema_dict and ema_dict[k].shape == v.shape  # only load params whose shapes match
            }

            print(f"Loading {len(filtered_ema_dict)} / {len(ema_checkpoint_dict)} ema_model params")
            self.ema_model.load_state_dict(filtered_ema_dict, strict=False)

        ### 2. Filter out mismatched `model` parameters
        model_dict = self.accelerator.unwrap_model(self.model).state_dict()
        checkpoint_model_dict = checkpoint["model_state_dict"]

        filtered_model_dict = {
            k: v for k, v in checkpoint_model_dict.items()
            if k in model_dict and model_dict[k].shape == v.shape  # only load params whose shapes match
        }

        print(f"Loading {len(filtered_model_dict)} / {len(checkpoint_model_dict)} model params")
        self.accelerator.unwrap_model(self.model).load_state_dict(filtered_model_dict, strict=False)

        ### 3. Load optimizer, scheduler, and step count
        if "step" in checkpoint:
            if self.scheduler and not self.reset_lr:
                self.scheduler.load_state_dict(checkpoint["scheduler_state_dict"])
            step = checkpoint["step"]
        else:
            step = 0

        del checkpoint
        gc.collect()
        print("Checkpoint loaded at step", step)
        return step

    def train(self, resumable_with_seed: int | None = None):
        train_dataloader = self.train_dataloader

        resumable_with_seed = default(resumable_with_seed, 0)  # treat None as "do not resume mid-epoch"

        start_step = self.load_checkpoint()
        global_step = start_step

        if resumable_with_seed > 0:
            orig_epoch_step = len(train_dataloader)
            skipped_epoch = int(start_step // orig_epoch_step)
            skipped_batch = start_step % orig_epoch_step
            skipped_dataloader = self.accelerator.skip_first_batches(train_dataloader, num_batches=skipped_batch)
        else:
            skipped_epoch = 0

        for epoch in range(skipped_epoch, self.epochs):
            self.model.train()
            if resumable_with_seed > 0 and epoch == skipped_epoch:
                progress_bar = tqdm(
                    skipped_dataloader,
                    desc=f"Epoch {epoch+1}/{self.epochs}",
                    unit="step",
                    disable=not self.accelerator.is_local_main_process,
                    initial=skipped_batch,
                    total=orig_epoch_step,
                    smoothing=0.15,
                )
            else:
                progress_bar = tqdm(
                    train_dataloader,
                    desc=f"Epoch {epoch+1}/{self.epochs}",
                    unit="step",
                    disable=not self.accelerator.is_local_main_process,
                    smoothing=0.15,
                )

            for batch in progress_bar:
                with self.accelerator.accumulate(self.model):
                    text_inputs = batch["lrc"]
                    mel_spec = batch["latent"].permute(0, 2, 1)
                    mel_lengths = batch["latent_lengths"]
                    style_prompt = batch["prompt"]
                    style_prompt_lens = batch["prompt_lengths"]
                    start_time = batch["start_time"]

                    loss, cond, pred = self.model(
                        mel_spec, text=text_inputs, lens=mel_lengths, noise_scheduler=self.noise_scheduler,
                        style_prompt=style_prompt if self.use_style_prompt else None,
                        style_prompt_lens=style_prompt_lens if self.use_style_prompt else None,
                        grad_ckpt=self.grad_ckpt, start_time=start_time,
                    )
                    self.accelerator.backward(loss)

                    if self.max_grad_norm > 0 and self.accelerator.sync_gradients:
                        self.accelerator.clip_grad_norm_(self.model.parameters(), self.max_grad_norm)

                    self.optimizer.step()
                    self.scheduler.step()
                    self.optimizer.zero_grad()

                if self.is_main:
                    self.ema_model.update()

                global_step += 1

                if self.accelerator.is_local_main_process:
                    self.accelerator.log({"loss": loss.item(), "lr": self.scheduler.get_last_lr()[0]}, step=global_step)

                progress_bar.set_postfix(step=str(global_step), loss=loss.item())

                if global_step % (self.save_per_updates * self.grad_accumulation_steps) == 0:
                    self.save_checkpoint(global_step)

                if global_step % self.last_per_steps == 0:
                    self.save_checkpoint(global_step, last=True)

        self.save_checkpoint(global_step, last=True)

        self.accelerator.end_training()
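# Minimal usage sketch (illustrative; `args`, its fields, and the CFM instance are
# assumptions, not defined in this file):
#     trainer = Trainer(model=cfm_model, args=args, epochs=10, learning_rate=1e-4,
#                       num_warmup_updates=2000, batch_size=8,
#                       checkpoint_path="ckpts/diffrhythm")
#     trainer.train(resumable_with_seed=666)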
    	
diffrhythm/model/utils.py ADDED
@@ -0,0 +1,182 @@
|
| 1 | 
         
            +
            from __future__ import annotations
         
     | 
| 2 | 
         
            +
             
     | 
| 3 | 
         
            +
            import os
         
     | 
| 4 | 
         
            +
            import random
         
     | 
| 5 | 
         
            +
            from collections import defaultdict
         
     | 
| 6 | 
         
            +
            from importlib.resources import files
         
     | 
| 7 | 
         
            +
             
     | 
| 8 | 
         
            +
            import torch
         
     | 
| 9 | 
         
            +
            from torch.nn.utils.rnn import pad_sequence
         
     | 
| 10 | 
         
            +
             
     | 
| 11 | 
         
            +
             
     | 
| 12 | 
         
            +
            # seed everything
         
     | 
| 13 | 
         
            +
             
     | 
| 14 | 
         
            +
             
     | 
| 15 | 
         
            +
            def seed_everything(seed=0):
         
     | 
| 16 | 
         
            +
                random.seed(seed)
         
     | 
| 17 | 
         
            +
                os.environ["PYTHONHASHSEED"] = str(seed)
         
     | 
| 18 | 
         
            +
                torch.manual_seed(seed)
         
     | 
| 19 | 
         
            +
                torch.cuda.manual_seed(seed)
         
     | 
| 20 | 
         
            +
                torch.cuda.manual_seed_all(seed)
         
     | 
| 21 | 
         
            +
                torch.backends.cudnn.deterministic = True
         
     | 
| 22 | 
         
            +
                torch.backends.cudnn.benchmark = False
         
     | 
| 23 | 
         
            +
             
     | 
| 24 | 
         
            +
             
     | 


# helpers


def exists(v):
    return v is not None


def default(v, d):
    return v if exists(v) else d


# tensor helpers
# note: annotations like int["b"] / bool["b n"] are shape hints for readability,
# not real types -- hence the noqa markers on these lines


def lens_to_mask(t: int["b"], length: int | None = None) -> bool["b n"]:  # noqa: F722 F821
    if not exists(length):
        length = t.amax()

    seq = torch.arange(length, device=t.device)
    return seq[None, :] < t[:, None]
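

# Usage sketch (hypothetical helper, values chosen for illustration):
def _demo_lens_to_mask():
    lens = torch.tensor([2, 3])
    return lens_to_mask(lens)
    # tensor([[ True,  True, False],
    #         [ True,  True,  True]])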


def mask_from_start_end_indices(seq_len: int["b"], start: int["b"], end: int["b"]):  # noqa: F722 F821
    max_seq_len = 2048  # note: masks are built out to a fixed 2048-frame cap; `seq_len` itself is unused here
    seq = torch.arange(max_seq_len, device=start.device).long()
    start_mask = seq[None, :] >= start[:, None]
    end_mask = seq[None, :] < end[:, None]
    return start_mask & end_mask


def mask_from_frac_lengths(seq_len: int["b"], frac_lengths: float["b"]):  # noqa: F722 F821
    lengths = (frac_lengths * seq_len).long()
    max_start = seq_len - lengths

    rand = torch.rand_like(frac_lengths)
    start = (max_start * rand).long().clamp(min=0)
    end = start + lengths

    return mask_from_start_end_indices(seq_len, start, end)
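

# Usage sketch (hypothetical demo): mask a random contiguous 50% span of each
# sequence, e.g. for span-masked / infilling-style training objectives:
def _demo_mask_from_frac_lengths():
    seq_len = torch.tensor([10, 10])
    frac = torch.tensor([0.5, 0.5])
    mask = mask_from_frac_lengths(seq_len, frac)
    # Each row masks exactly 5 positions at a random offset; note the mask
    # width is the fixed 2048-frame cap, not seq_len.
    return mask.sum(dim=-1)  # tensor([5, 5])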


def maybe_masked_mean(t: float["b n d"], mask: bool["b n"] = None) -> float["b d"]:  # noqa: F722
    if not exists(mask):
        return t.mean(dim=1)

    t = torch.where(mask[:, :, None], t, torch.tensor(0.0, device=t.device))
    num = t.sum(dim=1)
    den = mask.float().sum(dim=1)

    return num / den.clamp(min=1.0)
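

# Usage sketch (hypothetical demo): the masked-out third frame is excluded
# from the average.
def _demo_maybe_masked_mean():
    t = torch.tensor([[[1.0], [3.0], [100.0]]])  # (batch=1, n=3, dim=1)
    mask = torch.tensor([[True, True, False]])
    return maybe_masked_mean(t, mask)  # tensor([[2.]])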


# simple utf-8 tokenizer, since the paper went character-based
def list_str_to_tensor(text: list[str], padding_value=-1) -> int["b nt"]:  # noqa: F722
    list_tensors = [torch.tensor([*bytes(t, "UTF-8")]) for t in text]  # ByT5 style
    text = pad_sequence(list_tensors, padding_value=padding_value, batch_first=True)
    return text
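

# Usage sketch (hypothetical demo): each string becomes its raw UTF-8 bytes,
# right-padded with -1 to the longest entry in the batch.
def _demo_list_str_to_tensor():
    return list_str_to_tensor(["Hi", "Hey"])
    # tensor([[ 72, 105,  -1],
    #         [ 72, 101, 121]])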


# char tokenizer, based on the custom dataset's extracted .txt vocab file
def list_str_to_idx(
    text: list[str] | list[list[str]],
    vocab_char_map: dict[str, int],  # {char: idx}
    padding_value=-1,
) -> int["b nt"]:  # noqa: F722
    list_idx_tensors = [torch.tensor([vocab_char_map.get(c, 0) for c in t]) for t in text]  # pinyin or char style
    text = pad_sequence(list_idx_tensors, padding_value=padding_value, batch_first=True)
    return text
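

# Usage sketch (hypothetical vocab): out-of-vocabulary characters fall back
# to index 0, which is why get_tokenizer below asserts that index 0 is " ".
def _demo_list_str_to_idx():
    vocab = {" ": 0, "a": 1, "b": 2}
    return list_str_to_idx(["ab", "a?"], vocab)
    # tensor([[1, 2],
    #         [1, 0]])  # "?" is unknown, so it maps to 0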


# Get tokenizer


def get_tokenizer(dataset_name, tokenizer: str = "pinyin"):
    """
    tokenizer   - "pinyin": g2p applied to Chinese characters only; needs a .txt vocab file
                - "char": character-wise tokenizer; needs a .txt vocab file
                - "byte": utf-8 byte tokenizer
                - "custom": pass the path to the vocab.txt you want to use directly
    vocab_size  - for "pinyin": all available pinyin types, common alphabets (including accented ones) and symbols
                - for "char": derived from the unfiltered character & symbol counts of the custom dataset
                - for "byte": fixed at 256 (the utf-8 byte range)
    """
    if tokenizer in ["pinyin", "char"]:
        tokenizer_path = os.path.join(files("diffrhythm").joinpath("../../data"), f"{dataset_name}_{tokenizer}/vocab.txt")
        with open(tokenizer_path, "r", encoding="utf-8") as f:
            vocab_char_map = {}
            for i, char in enumerate(f):
                vocab_char_map[char[:-1]] = i
        vocab_size = len(vocab_char_map)
        assert vocab_char_map[" "] == 0, "make sure space is of idx 0 in vocab.txt, since 0 is used for unknown chars"

    elif tokenizer == "byte":
        vocab_char_map = None
        vocab_size = 256

    elif tokenizer == "custom":
        with open(dataset_name, "r", encoding="utf-8") as f:
            vocab_char_map = {}
            for i, char in enumerate(f):
                vocab_char_map[char[:-1]] = i
        vocab_size = len(vocab_char_map)

    return vocab_char_map, vocab_size
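

# Usage sketch (hypothetical demo): "byte" mode needs no vocab file on disk,
# which makes it the easiest mode to exercise in isolation.
def _demo_get_tokenizer():
    vocab_char_map, vocab_size = get_tokenizer("unused", tokenizer="byte")
    return vocab_char_map is None and vocab_size == 256  # True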


# convert char to pinyin


def convert_char_to_pinyin(text_list, polyphone=True):
    final_text_list = []
    god_knows_why_en_testset_contains_zh_quote = str.maketrans(
        {"“": '"', "”": '"', "‘": "'", "’": "'"}
    )  # in case librispeech (orig no-pc) test-clean contains Chinese-style quotes
    custom_trans = str.maketrans({";": ","})  # add custom translations here to address OOV symbols
    for text in text_list:
        char_list = []
        text = text.translate(god_knows_why_en_testset_contains_zh_quote)
        text = text.translate(custom_trans)
        for seg in jieba.cut(text):
            seg_byte_len = len(bytes(seg, "UTF-8"))
            if seg_byte_len == len(seg):  # pure alphabets and symbols (1 byte per char in utf-8)
                if char_list and seg_byte_len > 1 and char_list[-1] not in " :'\"":
                    char_list.append(" ")
                char_list.extend(seg)
            elif polyphone and seg_byte_len == 3 * len(seg):  # pure Chinese characters (3 bytes per char in utf-8)
                seg = lazy_pinyin(seg, style=Style.TONE3, tone_sandhi=True)
                for c in seg:
                    if c not in "。,、;:?!《》【】—…":
                        char_list.append(" ")
                    char_list.append(c)
            else:  # mixed Chinese characters, alphabets and symbols
                for c in seg:
                    if ord(c) < 256:
                        char_list.extend(c)
                    else:
                        if c not in "。,、;:?!《》【】—…":
                            char_list.append(" ")
                            char_list.extend(lazy_pinyin(c, style=Style.TONE3, tone_sandhi=True))
                        else:  # Chinese punctuation is kept as-is
                            char_list.append(c)
        final_text_list.append(char_list)

    return final_text_list
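

# Usage sketch (hypothetical demo; the pinyin strings are what pypinyin's
# TONE3 style with tone sandhi should produce for these characters, e.g.
# 你好 becomes ni2 hao3 via third-tone sandhi):
def _demo_convert_char_to_pinyin():
    return convert_char_to_pinyin(["你好 ok"])
    # expected: [[' ', 'ni2', ' ', 'hao3', ' ', 'o', 'k']]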


# filter func for dirty data with many repetitions


def repetition_found(text, length=2, tolerance=10):
    # returns True if any substring of the given length occurs more than
    # `tolerance` times in the text
    pattern_count = defaultdict(int)
    for i in range(len(text) - length + 1):
        pattern = text[i : i + length]
        pattern_count[pattern] += 1
    for pattern, count in pattern_count.items():
        if count > tolerance:
            return True
    return False
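

# Usage sketch (hypothetical demo): "la" occurs 12 times here, which exceeds
# the default tolerance of 10.
def _demo_repetition_found():
    return repetition_found("la" * 12)  # True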
    	
prompt/gift_of_the_world.wav ADDED
    Binary file (960 kB)

prompt/little_happiness.wav ADDED
    Binary file (960 kB)

prompt/little_talks.wav ADDED
    Binary file (960 kB)

prompt/ltwyl.wav ADDED
    Binary file (882 kB)

prompt/most_beautiful_expectation.wav ADDED
    Binary file (960 kB)