Localsong committed on
Commit d0831da · verified · 1 Parent(s): 58dff48

Upload 15 files
LICENSE ADDED
@@ -0,0 +1,201 @@
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright [yyyy] [name of copyright owner]
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
README.md CHANGED
@@ -1,3 +1,43 @@
- ---
- license: apache-2.0
- ---
+ # LocalSong
+
+ LocalSong is an audio generation model focused on melodic instrumental music, using tag-based conditioning to generate audio.
+
+ ## Installation
+
+ ### Prerequisites
+
+ - Python 3.10 or higher
+ - CUDA-capable GPU recommended
+
+ ### Setup
+
+ git clone https://huggingface.co/Localsong/LocalSong
+ cd LocalSong
+ python3 -m venv venv
+ source venv/bin/activate
+ pip install -r requirements.txt
+
+ ### Run
+
+ python gradio_app.py
+
+ The interface will be available at `http://localhost:7860`.
+
+ ### Generation Advice
+
+ Generations should use one of the soundtrack, soundtrack1, or soundtrack2 tags, as well as at least one other tag. Up to 8 tags can be combined; try mixing genres and instruments.
+ The default settings (CFG 3.5, 200 steps) have been tested as optimal.
+ The first generation will be slower due to torch.compile; subsequent generations will be faster.
+ The model was trained on vocals but not lyrics, so vocals will not contain recognizable words.
+
+ ## Credits
+
+ This project builds upon the following open-source projects:
+
+ - **Model Architecture**: Adapted from [DDT](https://github.com/MCG-NJU/DDT)
+ - **Flow Matching**: Adapted from [minRF](https://github.com/cloneofsimo/minRF)
+ - **Audio VAE**: [ACE-Step](https://github.com/ACE-Step/ACE-Step)
+
+ ## License
+
+ This project is licensed under the Apache License 2.0.
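The tag rules in the Generation Advice above (one of the three base soundtrack tags, at least one other tag, at most 8 total) can be expressed as a small validator. This is a hypothetical helper for illustration, not part of the repository:

```python
# Hypothetical sketch of the tag rules described in "Generation Advice":
# one base soundtrack tag required, plus at least one other tag, max 8 total.
BASE_TAGS = {"soundtrack", "soundtrack1", "soundtrack2"}

def valid_tags(tags):
    tag_set = set(tags)
    return (
        len(tags) <= 8                      # up to 8 tags
        and bool(BASE_TAGS & tag_set)       # at least one base soundtrack tag
        and bool(tag_set - BASE_TAGS)       # at least one other tag
    )

print(valid_tags(["soundtrack", "piano", "ambient"]))  # True
print(valid_tags(["piano"]))                           # False: no base tag
```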
acestep/checkpoints/music_dcae_f8c8/config.json ADDED
@@ -0,0 +1,69 @@
+ {
+   "_class_name": "AutoencoderDC",
+   "_diffusers_version": "0.32.2",
+   "_name_or_path": "checkpoints/music_dcae_f8c8",
+   "attention_head_dim": 32,
+   "decoder_act_fns": "silu",
+   "decoder_block_out_channels": [
+     128,
+     256,
+     512,
+     1024
+   ],
+   "decoder_block_types": [
+     "ResBlock",
+     "ResBlock",
+     "ResBlock",
+     "EfficientViTBlock"
+   ],
+   "decoder_layers_per_block": [
+     3,
+     3,
+     3,
+     3
+   ],
+   "decoder_norm_types": "rms_norm",
+   "decoder_qkv_multiscales": [
+     [],
+     [],
+     [
+       5
+     ],
+     [
+       5
+     ]
+   ],
+   "downsample_block_type": "Conv",
+   "encoder_block_out_channels": [
+     128,
+     256,
+     512,
+     1024
+   ],
+   "encoder_block_types": [
+     "ResBlock",
+     "ResBlock",
+     "ResBlock",
+     "EfficientViTBlock"
+   ],
+   "encoder_layers_per_block": [
+     2,
+     2,
+     3,
+     3
+   ],
+   "encoder_qkv_multiscales": [
+     [],
+     [],
+     [
+       5
+     ],
+     [
+       5
+     ]
+   ],
+   "in_channels": 2,
+   "latent_channels": 8,
+   "scaling_factor": 0.41407,
+   "upsample_block_type": "interpolate"
+ }
acestep/checkpoints/music_dcae_f8c8/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2b0cb469307ac50659d1880db2a99bae47d0df335cbb36853964662d4b80e8ee
+ size 313646516
acestep/checkpoints/music_vocoder/config.json ADDED
@@ -0,0 +1,79 @@
+ {
+   "_class_name": "ADaMoSHiFiGANV1",
+   "_diffusers_version": "0.32.2",
+   "depths": [
+     3,
+     3,
+     9,
+     3
+   ],
+   "dims": [
+     128,
+     256,
+     384,
+     512
+   ],
+   "drop_path_rate": 0.0,
+   "f_max": 16000,
+   "f_min": 40,
+   "hop_length": 512,
+   "input_channels": 128,
+   "kernel_sizes": [
+     7
+   ],
+   "n_fft": 2048,
+   "n_mels": 128,
+   "num_mels": 512,
+   "post_conv_kernel_size": 13,
+   "pre_conv_kernel_size": 13,
+   "resblock_dilation_sizes": [
+     [
+       1,
+       3,
+       5
+     ],
+     [
+       1,
+       3,
+       5
+     ],
+     [
+       1,
+       3,
+       5
+     ],
+     [
+       1,
+       3,
+       5
+     ]
+   ],
+   "resblock_kernel_sizes": [
+     3,
+     7,
+     11,
+     13
+   ],
+   "sampling_rate": 44100,
+   "upsample_initial_channel": 1024,
+   "upsample_kernel_sizes": [
+     8,
+     8,
+     4,
+     4,
+     4,
+     4,
+     4
+   ],
+   "upsample_rates": [
+     4,
+     4,
+     2,
+     2,
+     2,
+     2,
+     2
+   ],
+   "use_template": false,
+   "win_length": 2048
+ }
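A quick sanity check on the vocoder config above: the product of `upsample_rates` must equal `hop_length`, since each mel frame is upsampled back into exactly one hop of audio samples:

```python
# Check that the vocoder's upsample stages reconstruct one hop of audio
# per mel frame (values taken from the config above).
from math import prod

upsample_rates = [4, 4, 2, 2, 2, 2, 2]
hop_length = 512

samples_per_mel_frame = prod(upsample_rates)
print(samples_per_mel_frame)                  # 512
print(samples_per_mel_frame == hop_length)    # True
```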
acestep/checkpoints/music_vocoder/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c92c9b46e28ab7b37b777780cf4308ad7ddac869636bb77aa61599358c4bc1c0
+ size 206350988
acestep/music_dcae/__init__.py ADDED
File without changes
acestep/music_dcae/music_dcae_pipeline.py ADDED
@@ -0,0 +1,379 @@
+ """
+ ACE-Step: A Step Towards Music Generation Foundation Model
+
+ https://github.com/ace-step/ACE-Step
+
+ Apache 2.0 License
+ """
+
+ import os
+ import torch
+ from diffusers import AutoencoderDC
+ import torchaudio
+ import torchvision.transforms as transforms
+ from diffusers.models.modeling_utils import ModelMixin
+ from diffusers.loaders import FromOriginalModelMixin
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
+ from tqdm import tqdm
+
+ from acestep.music_dcae.music_vocoder import ADaMoSHiFiGANV1
+
+
+ root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+ DEFAULT_PRETRAINED_PATH = os.path.join(root_dir, "checkpoints", "music_dcae_f8c8")
+ VOCODER_PRETRAINED_PATH = os.path.join(root_dir, "checkpoints", "music_vocoder")
+
+
+ class MusicDCAE(ModelMixin, ConfigMixin, FromOriginalModelMixin):
+     @register_to_config
+     def __init__(
+         self,
+         source_sample_rate=None,
+         dcae_checkpoint_path=DEFAULT_PRETRAINED_PATH,
+         vocoder_checkpoint_path=VOCODER_PRETRAINED_PATH,
+     ):
+         super(MusicDCAE, self).__init__()
+
+         self.dcae = AutoencoderDC.from_pretrained(dcae_checkpoint_path)
+         self.vocoder = ADaMoSHiFiGANV1.from_pretrained(vocoder_checkpoint_path)
+
+         if source_sample_rate is None:
+             source_sample_rate = 48000
+
+         self.resampler = torchaudio.transforms.Resample(source_sample_rate, 44100)
+
+         self.transform = transforms.Compose(
+             [
+                 transforms.Normalize(0.5, 0.5),
+             ]
+         )
+         self.min_mel_value = -11.0
+         self.max_mel_value = 3.0
+         self.audio_chunk_size = int(round((1024 * 512 / 44100 * 48000)))
+         self.mel_chunk_size = 1024
+         self.time_dimention_multiple = 8
+         self.latent_chunk_size = self.mel_chunk_size // self.time_dimention_multiple
+         self.scale_factor = 0.1786
+         self.shift_factor = -1.9091
+
+     def load_audio(self, audio_path):
+         audio, sr = torchaudio.load(audio_path)
+         if audio.shape[0] == 1:
+             audio = audio.repeat(2, 1)
+         return audio, sr
+
+     def forward_mel(self, audios):
+         mels = []
+         for i in range(len(audios)):
+             image = self.vocoder.mel_transform(audios[i])
+             mels.append(image)
+         mels = torch.stack(mels)
+         return mels
+
+     @torch.no_grad()
+     def encode(self, audios, audio_lengths=None, sr=None):
+         if audio_lengths is None:
+             audio_lengths = torch.tensor([audios.shape[2]] * audios.shape[0])
+             audio_lengths = audio_lengths.to(audios.device)
+
+         # audios: N x 2 x T, 48kHz
+         device = audios.device
+         dtype = audios.dtype
+
+         if sr is None:
+             sr = 48000
+             resampler = self.resampler
+         else:
+             resampler = torchaudio.transforms.Resample(sr, 44100).to(device).to(dtype)
+
+         audio = resampler(audios)
+
+         max_audio_len = audio.shape[-1]
+         if max_audio_len % (8 * 512) != 0:
+             audio = torch.nn.functional.pad(
+                 audio, (0, 8 * 512 - max_audio_len % (8 * 512))
+             )
+
+         mels = self.forward_mel(audio)
+         mels = (mels - self.min_mel_value) / (self.max_mel_value - self.min_mel_value)
+         mels = self.transform(mels)
+         latents = []
+         for mel in mels:
+             latent = self.dcae.encoder(mel.unsqueeze(0))
+             latents.append(latent)
+         latents = torch.cat(latents, dim=0)
+         latent_lengths = (
+             audio_lengths / sr * 44100 / 512 / self.time_dimention_multiple
+         ).long()
+         latents = (latents - self.shift_factor) * self.scale_factor
+         return latents, latent_lengths
+
+     @torch.no_grad()
+     def decode(self, latents, audio_lengths=None, sr=None):
+         latents = latents / self.scale_factor + self.shift_factor
+
+         pred_wavs = []
+
+         for latent in latents:
+             mels = self.dcae.decoder(latent.unsqueeze(0))
+             mels = mels * 0.5 + 0.5
+             mels = mels * (self.max_mel_value - self.min_mel_value) + self.min_mel_value
+
+             # wav = self.vocoder.decode(mels[0]).squeeze(1)
+             # Decode the waveform for each channel separately to reduce VRAM footprint
+             wav_ch1 = self.vocoder.decode(mels[:, 0, :, :]).squeeze(1).cpu()
+             wav_ch2 = self.vocoder.decode(mels[:, 1, :, :]).squeeze(1).cpu()
+             wav = torch.cat([wav_ch1, wav_ch2], dim=0)
+
+             if sr is not None:
+                 resampler = torchaudio.transforms.Resample(44100, sr)
+                 wav = resampler(wav.cpu().float())
+             else:
+                 sr = 44100
+             pred_wavs.append(wav)
+
+         if audio_lengths is not None:
+             pred_wavs = [
+                 wav[:, :length].cpu() for wav, length in zip(pred_wavs, audio_lengths)
+             ]
+         return sr, pred_wavs
+
+     @torch.no_grad()
+     def decode_overlap(self, latents, audio_lengths=None, sr=None):
+         """
+         Decodes latents into waveforms using an overlapped DCAE and Vocoder.
+         """
+         print("Using Overlapped DCAE and Vocoder")
+
+         MODEL_INTERNAL_SR = 44100
+         DCAE_LATENT_TO_MEL_STRIDE = 8
+         VOCODER_AUDIO_SAMPLES_PER_MEL_FRAME = 512
+
+         pred_wavs = []
+         final_output_sr = sr if sr is not None else MODEL_INTERNAL_SR
+
+         # --- DCAE Parameters ---
+         # dcae_win_len_latent: Window length in the latent domain for DCAE processing
+         dcae_win_len_latent = 512
+         # dcae_mel_win_len: Expected mel window length from DCAE decoder output (latent_win * stride)
+         dcae_mel_win_len = dcae_win_len_latent * 8
+         # dcae_anchor_offset: Offset from anchor point to actual start of latent window slice
+         dcae_anchor_offset = dcae_win_len_latent // 4
+         # dcae_anchor_hop: Hop size for anchor points in latent domain
+         dcae_anchor_hop = dcae_win_len_latent // 2
+         # dcae_mel_overlap_len: Overlap length in the mel domain to be trimmed/blended
+         dcae_mel_overlap_len = dcae_mel_win_len // 4
+
+         # --- Vocoder Parameters ---
+         # vocoder_win_len_audio: Audio samples per vocoder processing window
+         vocoder_win_len_audio = 512 * 512  # 262144 samples
+         # vocoder_overlap_len_audio: Audio samples for overlap between vocoder windows
+         vocoder_overlap_len_audio = 1024
+         # vocoder_hop_len_audio: Hop size in audio samples for vocoder processing
+         vocoder_hop_len_audio = vocoder_win_len_audio - 2 * vocoder_overlap_len_audio
+         # vocoder_input_mel_frames_per_block: Number of mel frames fed to the vocoder in one go
+         vocoder_input_mel_frames_per_block = vocoder_win_len_audio // VOCODER_AUDIO_SAMPLES_PER_MEL_FRAME
+
+         crossfade_len_audio = 128  # Audio samples for crossfading vocoder outputs
+         cf_win_tail = torch.linspace(1, 0, crossfade_len_audio, device=self.device).unsqueeze(0).unsqueeze(0)
+         cf_win_head = torch.linspace(0, 1, crossfade_len_audio, device=self.device).unsqueeze(0).unsqueeze(0)
+
+         for latent_idx, latent_item in enumerate(latents):
+             latent_item = latent_item.to(self.device)
+             current_latent = (latent_item / self.scale_factor + self.shift_factor).unsqueeze(0)  # (1, C, H, W_latent)
+             latent_len = current_latent.shape[3]
+
+             # 1. DCAE: Latent to Mel Spectrogram (Overlapped)
+             mels_segments = []
+             if latent_len == 0:
+                 pass  # No mel segments to generate
+             else:
+                 # Determine anchor points for DCAE windows.
+                 # An anchor marks a reference point for a window slice:
+                 #   current_latent[..., anchor - offset : anchor - offset + win_len]
+                 # The first anchor ensures the window starts at 0; the last ensures the tail is covered.
+                 dcae_anchors = list(range(dcae_anchor_offset, latent_len - dcae_anchor_offset, dcae_anchor_hop))
+                 if not dcae_anchors:  # If the latent is too short for the range, use one anchor
+                     dcae_anchors = [dcae_anchor_offset]
+
+                 for i, anchor in enumerate(dcae_anchors):
+                     win_start_idx = max(0, anchor - dcae_anchor_offset)
+                     win_end_idx = min(latent_len, win_start_idx + dcae_win_len_latent)
+
+                     dcae_input_segment = current_latent[:, :, :, win_start_idx:win_end_idx]
+                     if dcae_input_segment.shape[3] == 0:
+                         continue
+
+                     mel_output_full = self.dcae.decoder(dcae_input_segment)  # (1, C, H_mel, W_mel_fixed_from_dcae)
+
+                     is_first = (i == 0)
+                     is_last = (i == len(dcae_anchors) - 1)
+
+                     if is_first and is_last:  # Only one segment
+                         # Use the mel region corresponding to the actual input latent length
+                         true_mel_content_len = dcae_input_segment.shape[3] * DCAE_LATENT_TO_MEL_STRIDE
+                         mel_to_keep = mel_output_full[:, :, :, :min(true_mel_content_len, mel_output_full.shape[3])]
+                     elif is_first:  # First segment, trim end overlap
+                         mel_to_keep = mel_output_full[:, :, :, :-dcae_mel_overlap_len]
+                     elif is_last:  # Last segment, trim start overlap
+                         # mel_output_full has fixed length; the useful part starts after the overlap.
+                         # Its length depends on how much of dcae_input_segment was actual content;
+                         # if the segment was shorter than dcae_win_len_latent, the output may include
+                         # padding effects. Standard overlap-add keeps the corresponding tail, so for
+                         # simplicity we trim the fixed overlap.
+                         mel_to_keep = mel_output_full[:, :, :, dcae_mel_overlap_len:]
+                     else:  # Middle segment, trim both overlaps
+                         mel_to_keep = mel_output_full[:, :, :, dcae_mel_overlap_len:-dcae_mel_overlap_len]
+
+                     if mel_to_keep.shape[3] > 0:
+                         mels_segments.append(mel_to_keep)
+
+             if not mels_segments:
+                 num_mel_channels = current_latent.shape[1]
+                 mel_height = self.dcae.decoder_output_mel_height
+                 concatenated_mels = torch.empty(
+                     (1, num_mel_channels, mel_height, 0),
+                     device=current_latent.device, dtype=current_latent.dtype
+                 )
+             else:
+                 concatenated_mels = torch.cat(mels_segments, dim=3)
+
+             # Denormalize mels
+             concatenated_mels = concatenated_mels * 0.5 + 0.5
+             concatenated_mels = concatenated_mels * (self.max_mel_value - self.min_mel_value) + self.min_mel_value
+
+             mel_total_frames = concatenated_mels.shape[3]
+
+             # 2. Vocoder: Mel Spectrogram to Waveform (Overlapped)
+             if mel_total_frames == 0:
+                 # Assume mono output when there is nothing to decode
+                 num_audio_channels = 1  # Or determine from vocoder capabilities / mel channels
+                 final_wav = torch.zeros((num_audio_channels, 0), device=self.device, dtype=torch.float32)
+             else:
+                 # Initial vocoder window; the vocoder expects (C_mel, H_mel, W_mel_block)
+                 mel_block = concatenated_mels[0, :, :, :vocoder_input_mel_frames_per_block].to(self.device)
+
+                 # Pad mel_block if it is shorter than vocoder_input_mel_frames_per_block (e.g. very short audio)
+                 if 0 < mel_block.shape[2] < vocoder_input_mel_frames_per_block:
+                     pad_len = vocoder_input_mel_frames_per_block - mel_block.shape[2]
+                     mel_block = torch.nn.functional.pad(mel_block, (0, pad_len), mode='constant', value=0)  # Pad last dim
+
+                 current_audio_output = self.vocoder.decode(mel_block)  # (C_audio, 1, Samples)
+                 current_audio_output = current_audio_output[:, :, :-vocoder_overlap_len_audio]  # Remove end overlap
+
+                 # p_audio_samples tracks the start of the *next* audio segment to generate
+                 # (in conceptual total audio samples)
+                 p_audio_samples = vocoder_hop_len_audio
+                 conceptual_total_audio_len_native_sr = mel_total_frames * VOCODER_AUDIO_SAMPLES_PER_MEL_FRAME
+
+                 pbar_total = 1 + max(0, (conceptual_total_audio_len_native_sr - (vocoder_win_len_audio - vocoder_overlap_len_audio))) // vocoder_hop_len_audio
+
+                 # Use tqdm if you want a progress bar for the vocoder part:
+                 # with tqdm(total=pbar_total, desc=f"Vocoder {latent_idx+1}/{len(latents)}", leave=False) as pbar:
+                 #     pbar.update(1)  # For the initial window
+                 # The loop for subsequent windows
+                 while p_audio_samples < conceptual_total_audio_len_native_sr:
+                     mel_frame_start = p_audio_samples // VOCODER_AUDIO_SAMPLES_PER_MEL_FRAME
+                     mel_frame_end = mel_frame_start + vocoder_input_mel_frames_per_block
+
+                     if mel_frame_start >= mel_total_frames:
+                         break  # No more mel frames
+
+                     mel_block = concatenated_mels[0, :, :, mel_frame_start:min(mel_frame_end, mel_total_frames)].to(self.device)
+
+                     if mel_block.shape[2] == 0:
+                         break  # Should not happen if mel_frame_start is valid
+
+                     # Pad if the current mel_block is too short (end of sequence)
+                     if mel_block.shape[2] < vocoder_input_mel_frames_per_block:
+                         pad_len = vocoder_input_mel_frames_per_block - mel_block.shape[2]
+                         mel_block = torch.nn.functional.pad(mel_block, (0, pad_len), mode='constant', value=0)
+
+                     new_audio_win = self.vocoder.decode(mel_block)  # (C_audio, 1, Samples)
+
+                     # Crossfade: determine the actual crossfade length based on available audio
+                     actual_cf_len = min(crossfade_len_audio, current_audio_output.shape[2], new_audio_win.shape[2] - (vocoder_overlap_len_audio - crossfade_len_audio))
+                     if actual_cf_len > 0:  # Ensure valid slice lengths for crossfade
+                         tail_part = current_audio_output[:, :, -actual_cf_len:]
+                         head_part = new_audio_win[:, :, vocoder_overlap_len_audio - actual_cf_len : vocoder_overlap_len_audio]
+
+                         crossfaded_segment = tail_part * cf_win_tail[:, :, :actual_cf_len] + \
+                             head_part * cf_win_head[:, :, :actual_cf_len]
+
+                         current_audio_output = torch.cat([current_audio_output[:, :, :-actual_cf_len], crossfaded_segment], dim=2)
+
+                     # Append the non-overlapping part of new_audio_win
+                     is_final_append = (p_audio_samples + vocoder_hop_len_audio >= conceptual_total_audio_len_native_sr)
+                     if is_final_append:
+                         segment_to_append = new_audio_win[:, :, vocoder_overlap_len_audio:]
+                     else:
+                         segment_to_append = new_audio_win[:, :, vocoder_overlap_len_audio:-vocoder_overlap_len_audio]
+
+                     current_audio_output = torch.cat([current_audio_output, segment_to_append], dim=2)
+
+                     p_audio_samples += vocoder_hop_len_audio
+                     # pbar.update(1)  # if using tqdm
+
+                 final_wav = current_audio_output.squeeze(1)  # (C_audio, Samples)
+
+             # 3. Resampling (if necessary)
+             if final_output_sr != MODEL_INTERNAL_SR and final_wav.numel() > 0:
+                 # Resample expects a CPU tensor with some torchaudio versions/backends
+                 resampler = torchaudio.transforms.Resample(
+                     MODEL_INTERNAL_SR, final_output_sr, dtype=final_wav.dtype
+                 )
+                 final_wav = resampler(final_wav.cpu()).to(self.device)  # Move back to device if needed later
+
+             pred_wavs.append(final_wav)
+
+         # 4. Final Truncation
+         processed_pred_wavs = []
+         for i, wav in enumerate(pred_wavs):
+             # Calculate the expected length based on the original latent, at the FINAL output sample rate
+             _num_latent_frames = latents[i].shape[-1]  # Use original latent item for shape
+             _num_mel_frames = _num_latent_frames * DCAE_LATENT_TO_MEL_STRIDE
+             _conceptual_native_audio_len = _num_mel_frames * VOCODER_AUDIO_SAMPLES_PER_MEL_FRAME
+             max_possible_len = int(_conceptual_native_audio_len * final_output_sr / MODEL_INTERNAL_SR)
+
+             current_wav_len = wav.shape[1]
+
+             if audio_lengths is not None:
+                 # User-provided length is the primary target, capped by actual and max possible
+                 target_len = min(audio_lengths[i], current_wav_len, max_possible_len)
+             else:
+                 # No user length; use max possible, capped by actual
+                 target_len = min(max_possible_len, current_wav_len)
+
+             processed_pred_wavs.append(wav[:, :max(0, target_len)].cpu())  # Ensure length is non-negative
+
+         return final_output_sr, processed_pred_wavs
+
+     def forward(self, audios, audio_lengths=None, sr=None):
+         latents, latent_lengths = self.encode(
+             audios=audios, audio_lengths=audio_lengths, sr=sr
+         )
+         sr, pred_wavs = self.decode(latents=latents, audio_lengths=audio_lengths, sr=sr)
+         return sr, pred_wavs, latents, latent_lengths
+
+
+ if __name__ == "__main__":
+
+     audio, sr = torchaudio.load("test.wav")
+     audio_lengths = torch.tensor([audio.shape[1]])
+     audios = audio.unsqueeze(0)
+
+     # test encode only
+     model = MusicDCAE()
+     # latents, latent_lengths = model.encode(audios, audio_lengths)
+     # print("latents shape: ", latents.shape)
+     # print("latent_lengths: ", latent_lengths)
371
+
372
+ # test encode and decode
373
+ sr, pred_wavs, latents, latent_lengths = model(audios, audio_lengths, sr)
374
+ print("reconstructed wavs: ", pred_wavs[0].shape)
375
+ print("latents shape: ", latents.shape)
376
+ print("latent_lengths: ", latent_lengths)
377
+ print("sr: ", sr)
378
+ torchaudio.save("test_reconstructed.wav", pred_wavs[0], sr)
379
+ print("test_reconstructed.wav")
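The decode loop above stitches vocoder windows with an overlap-and-crossfade: the tail of the accumulated audio is blended with the head of the new window using complementary fade windows (`cf_win_tail` / `cf_win_head`, defined earlier in `MusicDCAE`). A minimal pure-Python sketch of that blend, assuming simple linear ramps (the actual window shape may differ):

```python
# Sketch of the crossfade used when stitching adjacent vocoder windows.
# Assumes complementary linear fade-out/fade-in ramps; the real
# cf_win_tail/cf_win_head windows in MusicDCAE may use a different shape.

def crossfade(tail, head):
    """Blend the tail of the previous window with the head of the next one."""
    n = len(tail)
    assert len(head) == n
    out = []
    for i in range(n):
        w_head = i / n          # fade-in ramp: 0 -> 1
        w_tail = 1.0 - w_head   # fade-out ramp: 1 -> 0
        out.append(tail[i] * w_tail + head[i] * w_head)
    return out

# A constant signal crossfaded with itself stays constant, because the two
# ramps sum to 1 at every sample -- which is why the stitch is seam-free.
blended = crossfade([1.0] * 4, [1.0] * 4)
```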
acestep/music_dcae/music_log_mel.py ADDED
@@ -0,0 +1,115 @@
1
+ """
2
+ ACE-Step: A Step Towards Music Generation Foundation Model
3
+
4
+ https://github.com/ace-step/ACE-Step
5
+
6
+ Apache 2.0 License
7
+ """
8
+
9
+ import torch
10
+ import torch.nn as nn
11
+ from torch import Tensor
12
+ from torchaudio.transforms import MelScale
13
+
14
+
15
+ class LinearSpectrogram(nn.Module):
16
+ def __init__(
17
+ self,
18
+ n_fft=2048,
19
+ win_length=2048,
20
+ hop_length=512,
21
+ center=False,
22
+ mode="pow2_sqrt",
23
+ ):
24
+ super().__init__()
25
+
26
+ self.n_fft = n_fft
27
+ self.win_length = win_length
28
+ self.hop_length = hop_length
29
+ self.center = center
30
+ self.mode = mode
31
+
32
+ self.register_buffer("window", torch.hann_window(win_length))
33
+
34
+ def forward(self, y: Tensor) -> Tensor:
35
+ if y.ndim == 3:
36
+ y = y.squeeze(1)
37
+
38
+ y = torch.nn.functional.pad(
39
+ y.unsqueeze(1),
40
+ (
41
+ (self.win_length - self.hop_length) // 2,
42
+ (self.win_length - self.hop_length + 1) // 2,
43
+ ),
44
+ mode="reflect",
45
+ ).squeeze(1)
46
+ dtype = y.dtype
47
+ spec = torch.stft(
48
+ y.float(),
49
+ self.n_fft,
50
+ hop_length=self.hop_length,
51
+ win_length=self.win_length,
52
+ window=self.window,
53
+ center=self.center,
54
+ pad_mode="reflect",
55
+ normalized=False,
56
+ onesided=True,
57
+ return_complex=True,
58
+ )
59
+ spec = torch.view_as_real(spec)
60
+
61
+ if self.mode == "pow2_sqrt":
62
+ spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
63
+ spec = spec.to(dtype)
64
+ return spec
65
+
66
+
67
+ class LogMelSpectrogram(nn.Module):
68
+ def __init__(
69
+ self,
70
+ sample_rate=44100,
71
+ n_fft=2048,
72
+ win_length=2048,
73
+ hop_length=512,
74
+ n_mels=128,
75
+ center=False,
76
+ f_min=0.0,
77
+ f_max=None,
78
+ ):
79
+ super().__init__()
80
+
81
+ self.sample_rate = sample_rate
82
+ self.n_fft = n_fft
83
+ self.win_length = win_length
84
+ self.hop_length = hop_length
85
+ self.center = center
86
+ self.n_mels = n_mels
87
+ self.f_min = f_min
88
+ self.f_max = f_max or sample_rate // 2
89
+
90
+ self.spectrogram = LinearSpectrogram(n_fft, win_length, hop_length, center)
91
+ self.mel_scale = MelScale(
92
+ self.n_mels,
93
+ self.sample_rate,
94
+ self.f_min,
95
+ self.f_max,
96
+ self.n_fft // 2 + 1,
97
+ "slaney",
98
+ "slaney",
99
+ )
100
+
101
+ def compress(self, x: Tensor) -> Tensor:
102
+ return torch.log(torch.clamp(x, min=1e-5))
103
+
104
+ def decompress(self, x: Tensor) -> Tensor:
105
+ return torch.exp(x)
106
+
107
+ def forward(self, x: Tensor, return_linear: bool = False) -> Tensor:
108
+ linear = self.spectrogram(x)
109
+ x = self.mel_scale(linear)
110
+ x = self.compress(x)
111
+ # print(x.shape)
112
+ if return_linear:
113
+ return x, self.compress(linear)
114
+
115
+ return x
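With `center=False`, `LinearSpectrogram` pads `(win_length - hop_length)` samples in total before calling `torch.stft`, so the frame count works out to exactly `num_samples // hop_length`. A quick sanity-check of that arithmetic (pure Python, no torch needed):

```python
# Frame-count arithmetic for LinearSpectrogram above: the module reflect-pads
# (win_length - hop_length) samples in total, so torch.stft sees
# L + win - hop samples and emits (L + win - hop - win) // hop + 1 frames,
# which simplifies to L // hop.

def num_spectrogram_frames(num_samples, win_length=2048, hop_length=512):
    padded = num_samples + (win_length - hop_length)
    return (padded - win_length) // hop_length + 1

# One second at 44.1 kHz with hop 512 -> 86 frames (44100 // 512).
frames = num_spectrogram_frames(44100)
```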
acestep/music_dcae/music_vocoder.py ADDED
@@ -0,0 +1,587 @@
1
+ """
2
+ ACE-Step: A Step Towards Music Generation Foundation Model
3
+
4
+ https://github.com/ace-step/ACE-Step
5
+
6
+ Apache 2.0 License
7
+ """
8
+
9
+ import librosa
10
+ import torch
11
+ from torch import nn
12
+
13
+ from functools import partial
14
+ from math import prod
15
+ from typing import Callable, Tuple, List
16
+
17
+ import numpy as np
18
+ import torch.nn.functional as F
19
+ from torch.nn import Conv1d
20
+ from torch.nn.utils import weight_norm
21
+ from torch.nn.utils.parametrize import remove_parametrizations as remove_weight_norm
22
+ from diffusers.models.modeling_utils import ModelMixin
23
+ from diffusers.loaders import FromOriginalModelMixin
24
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
25
+
26
+
27
+ try:
28
+ from music_log_mel import LogMelSpectrogram
29
+ except ImportError:
30
+ from .music_log_mel import LogMelSpectrogram
31
+
32
+
33
+ def drop_path(
34
+ x, drop_prob: float = 0.0, training: bool = False, scale_by_keep: bool = True
35
+ ):
36
+ """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
37
+
38
+ This is the same as the DropConnect impl I created for EfficientNet, etc networks, however,
39
+ the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
40
+ See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for
41
+ changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use
42
+ 'survival rate' as the argument.
43
+
44
+ """ # noqa: E501
45
+
46
+ if drop_prob == 0.0 or not training:
47
+ return x
48
+ keep_prob = 1 - drop_prob
49
+ shape = (x.shape[0],) + (1,) * (
50
+ x.ndim - 1
51
+ ) # work with diff dim tensors, not just 2D ConvNets
52
+ random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
53
+ if keep_prob > 0.0 and scale_by_keep:
54
+ random_tensor.div_(keep_prob)
55
+ return x * random_tensor
56
+
57
+
58
+ class DropPath(nn.Module):
59
+ """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" # noqa: E501
60
+
61
+ def __init__(self, drop_prob: float = 0.0, scale_by_keep: bool = True):
62
+ super(DropPath, self).__init__()
63
+ self.drop_prob = drop_prob
64
+ self.scale_by_keep = scale_by_keep
65
+
66
+ def forward(self, x):
67
+ return drop_path(x, self.drop_prob, self.training, self.scale_by_keep)
68
+
69
+ def extra_repr(self):
70
+ return f"drop_prob={round(self.drop_prob,3):0.3f}"
71
+
72
+
73
+ class LayerNorm(nn.Module):
74
+ r"""LayerNorm that supports two data formats: channels_last (default) or channels_first.
75
+ The ordering of the dimensions in the inputs. channels_last corresponds to inputs with
76
+ shape (batch_size, height, width, channels) while channels_first corresponds to inputs
77
+ with shape (batch_size, channels, height, width).
78
+ """ # noqa: E501
79
+
80
+ def __init__(self, normalized_shape, eps=1e-6, data_format="channels_last"):
81
+ super().__init__()
82
+ self.weight = nn.Parameter(torch.ones(normalized_shape))
83
+ self.bias = nn.Parameter(torch.zeros(normalized_shape))
84
+ self.eps = eps
85
+ self.data_format = data_format
86
+ if self.data_format not in ["channels_last", "channels_first"]:
87
+ raise NotImplementedError
88
+ self.normalized_shape = (normalized_shape,)
89
+
90
+ def forward(self, x):
91
+ if self.data_format == "channels_last":
92
+ return F.layer_norm(
93
+ x, self.normalized_shape, self.weight, self.bias, self.eps
94
+ )
95
+ elif self.data_format == "channels_first":
96
+ u = x.mean(1, keepdim=True)
97
+ s = (x - u).pow(2).mean(1, keepdim=True)
98
+ x = (x - u) / torch.sqrt(s + self.eps)
99
+ x = self.weight[:, None] * x + self.bias[:, None]
100
+ return x
101
+
102
+
103
+ class ConvNeXtBlock(nn.Module):
104
+ r"""ConvNeXt Block. There are two equivalent implementations:
105
+ (1) DwConv -> LayerNorm (channels_first) -> 1x1 Conv -> GELU -> 1x1 Conv; all in (N, C, H, W)
106
+ (2) DwConv -> Permute to (N, H, W, C); LayerNorm (channels_last) -> Linear -> GELU -> Linear; Permute back
107
+ We use (2) as we find it slightly faster in PyTorch
108
+
109
+ Args:
110
+ dim (int): Number of input channels.
111
+ drop_path (float): Stochastic depth rate. Default: 0.0
112
+ layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6.
113
+ mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.0.
114
+ kernel_size (int): Kernel size for depthwise conv. Default: 7.
115
+ dilation (int): Dilation for depthwise conv. Default: 1.
116
+ """ # noqa: E501
117
+
118
+ def __init__(
119
+ self,
120
+ dim: int,
121
+ drop_path: float = 0.0,
122
+ layer_scale_init_value: float = 1e-6,
123
+ mlp_ratio: float = 4.0,
124
+ kernel_size: int = 7,
125
+ dilation: int = 1,
126
+ ):
127
+ super().__init__()
128
+
129
+ self.dwconv = nn.Conv1d(
130
+ dim,
131
+ dim,
132
+ kernel_size=kernel_size,
133
+ padding=int(dilation * (kernel_size - 1) / 2),
134
+ groups=dim,
135
+ ) # depthwise conv
136
+ self.norm = LayerNorm(dim, eps=1e-6)
137
+ self.pwconv1 = nn.Linear(
138
+ dim, int(mlp_ratio * dim)
139
+ ) # pointwise/1x1 convs, implemented with linear layers
140
+ self.act = nn.GELU()
141
+ self.pwconv2 = nn.Linear(int(mlp_ratio * dim), dim)
142
+ self.gamma = (
143
+ nn.Parameter(layer_scale_init_value * torch.ones((dim)), requires_grad=True)
144
+ if layer_scale_init_value > 0
145
+ else None
146
+ )
147
+ self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
148
+
149
+ def forward(self, x, apply_residual: bool = True):
150
+ input = x
151
+
152
+ x = self.dwconv(x)
153
+ x = x.permute(0, 2, 1) # (N, C, L) -> (N, L, C)
154
+ x = self.norm(x)
155
+ x = self.pwconv1(x)
156
+ x = self.act(x)
157
+ x = self.pwconv2(x)
158
+
159
+ if self.gamma is not None:
160
+ x = self.gamma * x
161
+
162
+ x = x.permute(0, 2, 1) # (N, L, C) -> (N, C, L)
163
+ x = self.drop_path(x)
164
+
165
+ if apply_residual:
166
+ x = input + x
167
+
168
+ return x
169
+
170
+
171
+ class ParallelConvNeXtBlock(nn.Module):
172
+ def __init__(self, kernel_sizes: List[int], *args, **kwargs):
173
+ super().__init__()
174
+ self.blocks = nn.ModuleList(
175
+ [
176
+ ConvNeXtBlock(kernel_size=kernel_size, *args, **kwargs)
177
+ for kernel_size in kernel_sizes
178
+ ]
179
+ )
180
+
181
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
182
+ return torch.stack(
183
+ [block(x, apply_residual=False) for block in self.blocks] + [x],
184
+ dim=1,
185
+ ).sum(dim=1)
186
+
187
+
188
+ class ConvNeXtEncoder(nn.Module):
189
+ def __init__(
190
+ self,
191
+ input_channels=3,
192
+ depths=[3, 3, 9, 3],
193
+ dims=[96, 192, 384, 768],
194
+ drop_path_rate=0.0,
195
+ layer_scale_init_value=1e-6,
196
+ kernel_sizes: Tuple[int] = (7,),
197
+ ):
198
+ super().__init__()
199
+ assert len(depths) == len(dims)
200
+
201
+ self.channel_layers = nn.ModuleList()
202
+ stem = nn.Sequential(
203
+ nn.Conv1d(
204
+ input_channels,
205
+ dims[0],
206
+ kernel_size=7,
207
+ padding=3,
208
+ padding_mode="replicate",
209
+ ),
210
+ LayerNorm(dims[0], eps=1e-6, data_format="channels_first"),
211
+ )
212
+ self.channel_layers.append(stem)
213
+
214
+ for i in range(len(depths) - 1):
215
+ mid_layer = nn.Sequential(
216
+ LayerNorm(dims[i], eps=1e-6, data_format="channels_first"),
217
+ nn.Conv1d(dims[i], dims[i + 1], kernel_size=1),
218
+ )
219
+ self.channel_layers.append(mid_layer)
220
+
221
+ block_fn = (
222
+ partial(ConvNeXtBlock, kernel_size=kernel_sizes[0])
223
+ if len(kernel_sizes) == 1
224
+ else partial(ParallelConvNeXtBlock, kernel_sizes=kernel_sizes)
225
+ )
226
+
227
+ self.stages = nn.ModuleList()
228
+ drop_path_rates = [
229
+ x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))
230
+ ]
231
+
232
+ cur = 0
233
+ for i in range(len(depths)):
234
+ stage = nn.Sequential(
235
+ *[
236
+ block_fn(
237
+ dim=dims[i],
238
+ drop_path=drop_path_rates[cur + j],
239
+ layer_scale_init_value=layer_scale_init_value,
240
+ )
241
+ for j in range(depths[i])
242
+ ]
243
+ )
244
+ self.stages.append(stage)
245
+ cur += depths[i]
246
+
247
+ self.norm = LayerNorm(dims[-1], eps=1e-6, data_format="channels_first")
248
+ self.apply(self._init_weights)
249
+
250
+ def _init_weights(self, m):
251
+ if isinstance(m, (nn.Conv1d, nn.Linear)):
252
+ nn.init.trunc_normal_(m.weight, std=0.02)
253
+ nn.init.constant_(m.bias, 0)
254
+
255
+ def forward(
256
+ self,
257
+ x: torch.Tensor,
258
+ ) -> torch.Tensor:
259
+ for channel_layer, stage in zip(self.channel_layers, self.stages):
260
+ x = channel_layer(x)
261
+ x = stage(x)
262
+
263
+ return self.norm(x)
264
+
265
+
266
+ def init_weights(m, mean=0.0, std=0.01):
267
+ classname = m.__class__.__name__
268
+ if classname.find("Conv") != -1:
269
+ m.weight.data.normal_(mean, std)
270
+
271
+
272
+ def get_padding(kernel_size, dilation=1):
273
+ return (kernel_size * dilation - dilation) // 2
274
+
275
+
276
+ class ResBlock1(torch.nn.Module):
277
+ def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
278
+ super().__init__()
279
+
280
+ self.convs1 = nn.ModuleList(
281
+ [
282
+ weight_norm(
283
+ Conv1d(
284
+ channels,
285
+ channels,
286
+ kernel_size,
287
+ 1,
288
+ dilation=dilation[0],
289
+ padding=get_padding(kernel_size, dilation[0]),
290
+ )
291
+ ),
292
+ weight_norm(
293
+ Conv1d(
294
+ channels,
295
+ channels,
296
+ kernel_size,
297
+ 1,
298
+ dilation=dilation[1],
299
+ padding=get_padding(kernel_size, dilation[1]),
300
+ )
301
+ ),
302
+ weight_norm(
303
+ Conv1d(
304
+ channels,
305
+ channels,
306
+ kernel_size,
307
+ 1,
308
+ dilation=dilation[2],
309
+ padding=get_padding(kernel_size, dilation[2]),
310
+ )
311
+ ),
312
+ ]
313
+ )
314
+ self.convs1.apply(init_weights)
315
+
316
+ self.convs2 = nn.ModuleList(
317
+ [
318
+ weight_norm(
319
+ Conv1d(
320
+ channels,
321
+ channels,
322
+ kernel_size,
323
+ 1,
324
+ dilation=1,
325
+ padding=get_padding(kernel_size, 1),
326
+ )
327
+ ),
328
+ weight_norm(
329
+ Conv1d(
330
+ channels,
331
+ channels,
332
+ kernel_size,
333
+ 1,
334
+ dilation=1,
335
+ padding=get_padding(kernel_size, 1),
336
+ )
337
+ ),
338
+ weight_norm(
339
+ Conv1d(
340
+ channels,
341
+ channels,
342
+ kernel_size,
343
+ 1,
344
+ dilation=1,
345
+ padding=get_padding(kernel_size, 1),
346
+ )
347
+ ),
348
+ ]
349
+ )
350
+ self.convs2.apply(init_weights)
351
+
352
+ def forward(self, x):
353
+ for c1, c2 in zip(self.convs1, self.convs2):
354
+ xt = F.silu(x)
355
+ xt = c1(xt)
356
+ xt = F.silu(xt)
357
+ xt = c2(xt)
358
+ x = xt + x
359
+ return x
360
+
361
+ def remove_weight_norm(self):
362
+ for conv in self.convs1:
363
+ remove_weight_norm(conv)
364
+ for conv in self.convs2:
365
+ remove_weight_norm(conv)
366
+
367
+
368
+ class HiFiGANGenerator(nn.Module):
369
+ def __init__(
370
+ self,
371
+ *,
372
+ hop_length: int = 512,
373
+ upsample_rates: Tuple[int] = (8, 8, 2, 2, 2),
374
+ upsample_kernel_sizes: Tuple[int] = (16, 16, 8, 2, 2),
375
+ resblock_kernel_sizes: Tuple[int] = (3, 7, 11),
376
+ resblock_dilation_sizes: Tuple[Tuple[int]] = ((1, 3, 5), (1, 3, 5), (1, 3, 5)),
377
+ num_mels: int = 128,
378
+ upsample_initial_channel: int = 512,
379
+ use_template: bool = True,
380
+ pre_conv_kernel_size: int = 7,
381
+ post_conv_kernel_size: int = 7,
382
+ post_activation: Callable = partial(nn.SiLU, inplace=True),
383
+ ):
384
+ super().__init__()
385
+
386
+ assert (
387
+ prod(upsample_rates) == hop_length
388
+ ), f"hop_length must be {prod(upsample_rates)}"
389
+
390
+ self.conv_pre = weight_norm(
391
+ nn.Conv1d(
392
+ num_mels,
393
+ upsample_initial_channel,
394
+ pre_conv_kernel_size,
395
+ 1,
396
+ padding=get_padding(pre_conv_kernel_size),
397
+ )
398
+ )
399
+
400
+ self.num_upsamples = len(upsample_rates)
401
+ self.num_kernels = len(resblock_kernel_sizes)
402
+
403
+ self.noise_convs = nn.ModuleList()
404
+ self.use_template = use_template
405
+ self.ups = nn.ModuleList()
406
+
407
+ for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
408
+ c_cur = upsample_initial_channel // (2 ** (i + 1))
409
+ self.ups.append(
410
+ weight_norm(
411
+ nn.ConvTranspose1d(
412
+ upsample_initial_channel // (2**i),
413
+ upsample_initial_channel // (2 ** (i + 1)),
414
+ k,
415
+ u,
416
+ padding=(k - u) // 2,
417
+ )
418
+ )
419
+ )
420
+
421
+ if not use_template:
422
+ continue
423
+
424
+ if i + 1 < len(upsample_rates):
425
+ stride_f0 = np.prod(upsample_rates[i + 1 :])
426
+ self.noise_convs.append(
427
+ Conv1d(
428
+ 1,
429
+ c_cur,
430
+ kernel_size=stride_f0 * 2,
431
+ stride=stride_f0,
432
+ padding=stride_f0 // 2,
433
+ )
434
+ )
435
+ else:
436
+ self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1))
437
+
438
+ self.resblocks = nn.ModuleList()
439
+ for i in range(len(self.ups)):
440
+ ch = upsample_initial_channel // (2 ** (i + 1))
441
+ for k, d in zip(resblock_kernel_sizes, resblock_dilation_sizes):
442
+ self.resblocks.append(ResBlock1(ch, k, d))
443
+
444
+ self.activation_post = post_activation()
445
+ self.conv_post = weight_norm(
446
+ nn.Conv1d(
447
+ ch,
448
+ 1,
449
+ post_conv_kernel_size,
450
+ 1,
451
+ padding=get_padding(post_conv_kernel_size),
452
+ )
453
+ )
454
+ self.ups.apply(init_weights)
455
+ self.conv_post.apply(init_weights)
456
+
457
+ def forward(self, x, template=None):
458
+ x = self.conv_pre(x)
459
+
460
+ for i in range(self.num_upsamples):
461
+ x = F.silu(x, inplace=True)
462
+ x = self.ups[i](x)
463
+
464
+ if self.use_template:
465
+ x = x + self.noise_convs[i](template)
466
+
467
+ xs = None
468
+
469
+ for j in range(self.num_kernels):
470
+ if xs is None:
471
+ xs = self.resblocks[i * self.num_kernels + j](x)
472
+ else:
473
+ xs += self.resblocks[i * self.num_kernels + j](x)
474
+
475
+ x = xs / self.num_kernels
476
+
477
+ x = self.activation_post(x)
478
+ x = self.conv_post(x)
479
+ x = torch.tanh(x)
480
+
481
+ return x
482
+
483
+ def remove_weight_norm(self):
484
+ for up in self.ups:
485
+ remove_weight_norm(up)
486
+ for block in self.resblocks:
487
+ block.remove_weight_norm()
488
+ remove_weight_norm(self.conv_pre)
489
+ remove_weight_norm(self.conv_post)
490
+
491
+
492
+ class ADaMoSHiFiGANV1(ModelMixin, ConfigMixin, FromOriginalModelMixin):
493
+
494
+ @register_to_config
495
+ def __init__(
496
+ self,
497
+ input_channels: int = 128,
498
+ depths: List[int] = [3, 3, 9, 3],
499
+ dims: List[int] = [128, 256, 384, 512],
500
+ drop_path_rate: float = 0.0,
501
+ kernel_sizes: Tuple[int] = (7,),
502
+ upsample_rates: Tuple[int] = (4, 4, 2, 2, 2, 2, 2),
503
+ upsample_kernel_sizes: Tuple[int] = (8, 8, 4, 4, 4, 4, 4),
504
+ resblock_kernel_sizes: Tuple[int] = (3, 7, 11, 13),
505
+ resblock_dilation_sizes: Tuple[Tuple[int]] = (
506
+ (1, 3, 5),
507
+ (1, 3, 5),
508
+ (1, 3, 5),
509
+ (1, 3, 5),
510
+ ),
511
+ num_mels: int = 512,
512
+ upsample_initial_channel: int = 1024,
513
+ use_template: bool = False,
514
+ pre_conv_kernel_size: int = 13,
515
+ post_conv_kernel_size: int = 13,
516
+ sampling_rate: int = 44100,
517
+ n_fft: int = 2048,
518
+ win_length: int = 2048,
519
+ hop_length: int = 512,
520
+ f_min: int = 40,
521
+ f_max: int = 16000,
522
+ n_mels: int = 128,
523
+ ):
524
+ super().__init__()
525
+
526
+ self.backbone = ConvNeXtEncoder(
527
+ input_channels=input_channels,
528
+ depths=depths,
529
+ dims=dims,
530
+ drop_path_rate=drop_path_rate,
531
+ kernel_sizes=kernel_sizes,
532
+ )
533
+
534
+ self.head = HiFiGANGenerator(
535
+ hop_length=hop_length,
536
+ upsample_rates=upsample_rates,
537
+ upsample_kernel_sizes=upsample_kernel_sizes,
538
+ resblock_kernel_sizes=resblock_kernel_sizes,
539
+ resblock_dilation_sizes=resblock_dilation_sizes,
540
+ num_mels=num_mels,
541
+ upsample_initial_channel=upsample_initial_channel,
542
+ use_template=use_template,
543
+ pre_conv_kernel_size=pre_conv_kernel_size,
544
+ post_conv_kernel_size=post_conv_kernel_size,
545
+ )
546
+ self.sampling_rate = sampling_rate
547
+ self.mel_transform = LogMelSpectrogram(
548
+ sample_rate=sampling_rate,
549
+ n_fft=n_fft,
550
+ win_length=win_length,
551
+ hop_length=hop_length,
552
+ f_min=f_min,
553
+ f_max=f_max,
554
+ n_mels=n_mels,
555
+ )
556
+ self.eval()
557
+
558
+ @torch.no_grad()
559
+ def decode(self, mel):
560
+ y = self.backbone(mel)
561
+ y = self.head(y)
562
+ return y
563
+
564
+ @torch.no_grad()
565
+ def encode(self, x):
566
+ return self.mel_transform(x)
567
+
568
+ def forward(self, mel):
569
+ y = self.backbone(mel)
570
+ y = self.head(y)
571
+ return y
572
+
573
+
574
+ if __name__ == "__main__":
575
+ import soundfile as sf
576
+
577
+ x = "test_audio.wav"
578
+ model = ADaMoSHiFiGANV1.from_pretrained(
579
+ "./checkpoints/music_vocoder", local_files_only=True
580
+ )
581
+
582
+ wav, sr = librosa.load(x, sr=44100, mono=True)
583
+ wav = torch.from_numpy(wav).float()[None]
584
+ mel = model.encode(wav)
585
+
586
+ wav = model.decode(mel)[0].mT
587
+ sf.write("test_audio_vocoder_rec.wav", wav.cpu().numpy(), 44100)
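`HiFiGANGenerator` asserts `prod(upsample_rates) == hop_length`: the stack of transposed convolutions must upsample each mel frame to exactly one hop of audio samples. A quick check for the default rates passed in by `ADaMoSHiFiGANV1`:

```python
# Each upsample stage multiplies the time resolution by its rate; the product
# of all rates must equal hop_length so one mel frame maps to one audio hop.
from math import prod

upsample_rates = (4, 4, 2, 2, 2, 2, 2)  # ADaMoSHiFiGANV1 defaults
hop_length = 512

samples_per_mel_frame = prod(upsample_rates)
assert samples_per_mel_frame == hop_length
```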
checkpoints/checkpoint_461260.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:796a66a9a098ec75554897e830868c8eb4a9a90c35bb4f972ce317420bb1bbb5
3
+ size 2920814816
checkpoints/tag_mapping.json ADDED
@@ -0,0 +1,858 @@
1
+ {
2
+ "rock": 1697,
3
+ "male vocalist": 1698,
4
+ "pop": 1699,
5
+ "energetic": 1700,
6
+ "instrumental": 1701,
7
+ "electronic": 1702,
8
+ "rhythmic": 1703,
9
+ "female vocalist": 1704,
10
+ "passionate": 1705,
11
+ "atmospheric": 1706,
12
+ "rap": 1707,
13
+ "hip hop": 1708,
14
+ "uplifting": 1709,
15
+ "metal": 1710,
16
+ "alternative rock": 1711,
17
+ "pop rock": 1712,
18
+ "dark": 1713,
19
+ "anthemic": 1714,
20
+ "male vocals": 1715,
21
+ "melancholic": 1716,
22
+ "epic": 1717,
23
+ "bittersweet": 1718,
24
+ "love": 1719,
25
+ "dance": 1720,
26
+ "warm": 1721,
27
+ "electronic dance music": 1722,
28
+ "female vocals": 1723,
29
+ "lush": 1724,
30
+ "trap": 1725,
31
+ "introspective": 1726,
32
+ "aggressive": 1727,
33
+ "r&b": 1728,
34
+ "playful": 1729,
35
+ "regional music": 1730,
36
+ "dance-pop": 1731,
37
+ "hard rock": 1732,
38
+ "ambient": 1733,
39
+ "ethereal": 1734,
40
+ "emotional": 1735,
41
+ "heavy": 1736,
42
+ "piano": 1737,
43
+ "mellow": 1738,
44
+ "jazz": 1739,
45
+ "folk": 1740,
46
+ "country": 1741,
47
+ "house": 1742,
48
+ "party": 1743,
49
+ "romantic": 1744,
50
+ "orchestral": 1745,
51
+ "pop rap": 1746,
52
+ "acoustic": 1747,
53
+ "electropop": 1748,
54
+ "electro": 1749,
55
+ "nocturnal": 1750,
56
+ "bass": 1751,
57
+ "guitar": 1752,
58
+ "urban": 1753,
59
+ "soul": 1754,
60
+ "psychedelic": 1755,
61
+ "edm": 1756,
62
+ "experimental": 1757,
63
+ "funk": 1758,
64
+ "futuristic": 1759,
65
+ "boastful": 1760,
66
+ "hypnotic": 1761,
67
+ "heavy metal": 1762,
68
+ "contemporary r&b": 1763,
69
+ "techno": 1764,
70
+ "eclectic": 1765,
71
+ "longing": 1766,
72
+ "violin": 1767,
73
+ "sentimental": 1768,
74
+ "synthpop": 1769,
75
+ "cinematic": 1770,
76
+ "happy": 1771,
77
+ "repetitive": 1772,
78
+ "progressive": 1773,
79
+ "catchy": 1774,
80
+ "sad": 1775,
81
+ "indie pop": 1776,
82
+ "indie rock": 1777,
83
+ "singer-songwriter": 1778,
84
+ "classical music": 1779,
85
+ "slow": 1780,
86
+ "northern american music": 1781,
87
+ "sampling": 1782,
88
+ "trance": 1783,
89
+ "western classical music": 1784,
90
+ "upbeat": 1785,
91
+ "blues": 1786,
92
+ "hip-hop": 1787,
93
+ "ballad": 1788,
94
+ "soothing": 1789,
95
+ "synthwave": 1790,
96
+ "electric guitar": 1791,
97
+ "calm": 1792,
98
+ "raw": 1793,
99
+ "downtempo": 1794,
100
+ "hardcore hip hop": 1795,
101
+ "soft": 1796,
102
+ "dubstep": 1797,
103
+ "classical": 1798,
104
+ "film score": 1799,
105
+ "synth": 1800,
106
+ "triumphant": 1801,
107
+ "drums": 1802,
108
+ "punk": 1803,
109
+ "female voice": 1804,
110
+ "angry": 1805,
111
+ "alternative metal": 1806,
112
+ "acoustic guitar": 1807,
113
+ "lo-fi": 1808,
114
+ "male voice": 1809,
115
+ "dense": 1810,
116
+ "progressive rock": 1811,
117
+ "optimistic": 1812,
118
+ "ominous": 1813,
119
+ "reggae": 1814,
120
+ "sombre": 1815,
121
+ "mysterious": 1816,
122
+ "complex": 1817,
123
+ "contemporary folk": 1818,
124
+ "disco": 1819,
125
+ "drum and bass": 1820,
126
+ "new wave": 1821,
127
+ "nu metal": 1822,
128
+ "summer": 1823,
129
+ "sensual": 1824,
130
+ "powerful": 1825,
131
+ "folk rock": 1826,
132
+ "glitch": 1827,
133
+ "symphonic metal": 1828,
134
+ "emo": 1829,
135
+ "power metal": 1830,
136
+ "conscious": 1831,
137
+ "technical": 1832,
138
+ "suspenseful": 1833,
139
+ "dramatic": 1834,
140
+ "electro house": 1835,
141
+ "deep": 1836,
142
+ "swing": 1837,
143
+ "punk rock": 1838,
144
+ "gangsta rap": 1839,
145
+ "soulful": 1840,
146
+ "intense": 1841,
147
+ "industrial": 1842,
148
+ "cinematic classical": 1843,
149
+ "k-pop": 1844,
150
+ "new age": 1845,
151
+ "hedonistic": 1846,
152
+ "synth-pop": 1847,
153
+ "meditative": 1848,
154
+ "cello": 1849,
155
+ "pop punk": 1850,
156
+ "chillout": 1851,
157
+ "metalcore": 1852,
158
+ "dreamy": 1853,
159
+ "rebellious": 1854,
160
+ "east coast hip hop": 1855,
161
+ "progressive metal": 1856,
162
+ "lonely": 1857,
163
+ "conscious hip hop": 1858,
164
+ "flute": 1859,
165
+ "chill": 1860,
166
+ "phonk": 1861,
167
+ "blues rock": 1862,
168
+ "drum": 1863,
169
+ "quirky": 1864,
170
+ "pop soul": 1865,
171
+ "j-pop": 1866,
172
+ "groovy": 1867,
173
+ "trip hop": 1868,
174
+ "fantasy": 1869,
175
+ "dream pop": 1870,
176
+ "psychedelic rock": 1871,
177
+ "beat": 1872,
178
+ "country rock": 1873,
179
+ "surreal": 1874,
180
+ "gospel": 1875,
181
+ "fast": 1876,
182
+ "soft rock": 1877,
183
+ "smooth": 1878,
184
+ "peaceful": 1879,
185
+ "poetic": 1880,
186
+ "opera": 1881,
+ "power pop": 1882,
+ "indie folk": 1883,
+ "indie": 1884,
+ "mechanical": 1885,
+ "breakbeat": 1886,
+ "anxious": 1887,
+ "female vocal": 1888,
+ "deep bass": 1889,
+ "post-punk": 1890,
+ "grunge": 1891,
+ "breakup": 1892,
+ "choir": 1893,
+ "orchestra": 1894,
+ "avant-garde": 1895,
+ "deep house": 1896,
+ "boom bap": 1897,
+ "folk pop": 1898,
+ "pastoral": 1899,
+ "jazz fusion": 1900,
+ "progressive house": 1901,
+ "synthesizer": 1902,
+ "nostalgic": 1903,
+ "funky": 1904,
+ "country pop": 1905,
+ "death": 1906,
+ "spiritual": 1907,
+ "soundtrack": 1908,
+ "2000s": 1909,
+ "choral": 1910,
+ "strings": 1911,
+ "fun": 1912,
+ "electric": 1913,
+ "post-grunge": 1914,
+ "female singer": 1915,
+ "male vocal": 1916,
+ "modern classical": 1917,
+ "death metal": 1918,
+ "post-hardcore": 1919,
+ "humorous": 1920,
+ "heartfelt": 1921,
+ "psychedelia": 1922,
+ "haunting": 1923,
+ "afrobeat": 1924,
+ "medieval": 1925,
+ "progressive electronic": 1926,
+ "adult contemporary": 1927,
+ "reggaeton": 1928,
+ "dynamic": 1929,
+ "contemporary country": 1930,
+ "beats": 1931,
+ "idm": 1932,
+ "southern hip hop": 1933,
+ "80s": 1934,
+ "cold": 1935,
+ "big band": 1936,
+ "saxophone": 1937,
+ "future bass": 1938,
+ "noisy": 1939,
+ "gritty": 1940,
+ "dark ambient": 1941,
+ "trumpet": 1942,
+ "art rock": 1943,
+ "chaotic": 1944,
+ "smooth soul": 1945,
+ "post-industrial": 1946,
+ "bluegrass": 1947,
+ "industrial & noise": 1948,
+ "anime": 1949,
+ "drill": 1950,
+ "electro swing": 1951,
+ "dancehall": 1952,
+ "epic music": 1953,
+ "witch house": 1954,
+ "minimalistic": 1955,
+ "hispanic american music": 1956,
+ "electronica": 1957,
+ "americana": 1958,
+ "political": 1959,
+ "latin": 1960,
+ "tech house": 1961,
+ "neo-soul": 1962,
+ "hispanic music": 1963,
+ "heavy bass": 1964,
+ "knee surgery": 1965,
+ "horror": 1966,
+ "psychedelic pop": 1967,
+ "industrial metal": 1968,
+ "space": 1969,
+ "dub": 1970,
+ "art pop": 1971,
+ "spoken word": 1972,
+ "reverb": 1973,
+ "caribbean music": 1974,
+ "alternative": 1975,
+ "symphonic": 1976,
+ "cloud rap": 1977,
+ "neo-psychedelia": 1978,
+ "gothic metal": 1979,
+ "classic rock": 1980,
+ "female": 1981,
+ "bossa nova": 1982,
+ "thrash metal": 1983,
+ "djent": 1984,
+ "teen pop": 1985,
+ "cyberpunk": 1986,
+ "hardcore": 1987,
+ "glam rock": 1988,
+ "slow tempo": 1989,
+ "jazz rap": 1990,
+ "sexy": 1991,
+ "harp": 1992,
+ "outlaw country": 1993,
+ "progressive trance": 1994,
+ "european music": 1995,
+ "west coast hip hop": 1996,
+ "vocal": 1997,
+ "alternative dance": 1998,
+ "accordion": 1999,
+ "minimal": 2000,
+ "tribal": 2001,
+ "sarcastic": 2002,
+ "vocal jazz": 2003,
+ "jamaican music": 2004,
+ "alternative r&b": 2005,
+ "smooth jazz": 2006,
+ "gothic": 2007,
+ "ska": 2008,
+ "manic": 2009,
+ "bass guitar": 2010,
+ "chillwave": 2011,
+ "improvisation": 2012,
+ "melancholy": 2013,
+ "shoegaze": 2014,
+ "big beat": 2015,
+ "keyboard": 2016,
+ "groove metal": 2017,
+ "90s": 2018,
+ "latin pop": 2019,
+ "hardcore [punk]": 2020,
+ "darkwave": 2021,
+ "modern": 2022,
+ "glam metal": 2023,
+ "reflective": 2024,
+ "eerie": 2025,
+ "chamber pop": 2026,
+ "martial": 2027,
+ "flamenco": 2028,
+ "male singer": 2029,
+ "indietronica": 2030,
+ "beautiful": 2031,
+ "gothic rock": 2032,
+ "vocaloid": 2033,
+ "world": 2034,
+ "math rock": 2035,
+ "dark pop": 2036,
+ "jazz-funk": 2037,
+ "symphonic rock": 2038,
+ "club": 2039,
+ "bouncy": 2040,
+ "easy listening": 2041,
+ "j-rock": 2042,
+ "baroque": 2043,
+ "percussion": 2044,
+ "acid jazz": 2045,
+ "hardstyle": 2046,
+ "rock & roll": 2047,
+ "hymn": 2048,
+ "dissonant": 2049,
+ "ambient pop": 2050,
+ "eurodance": 2051,
+ "danceable": 2052,
+ "turntablism": 2053,
+ "dolby atmos": 2054,
+ "depressive": 2055,
+ "doom metal": 2056,
+ "hyperpop": 2057,
+ "existential": 2058,
+ "melodic metalcore": 2059,
+ "male": 2060,
+ "chanson": 2061,
+ "vaporwave": 2062,
+ "salsa": 2063,
+ "war": 2064,
+ "melodic": 972,
+ "fiddle": 2065,
+ "film soundtrack": 2066,
+ "inspirational": 2067,
+ "nu jazz": 2068,
+ "vulgar": 2069,
+ "abstract": 2070,
+ "brass": 2071,
+ "confident": 2072,
+ "black metal": 2073,
+ "video game music": 2074,
+ "creepy": 2075,
+ "uncommon time signatures": 2076,
+ "intimate": 2077,
+ "relaxing": 2078,
+ "post-rock": 2079,
+ "lofi": 2080,
+ "roots reggae": 2081,
+ "industrial rock": 2082,
+ "remix": 2083,
+ "storytelling": 2084,
+ "funny": 2085,
+ "ambient techno": 2086,
+ "high-energy": 2087,
+ "experimental rock": 2088,
+ "southern rock": 2089,
+ "celtic": 2090,
+ "banjo": 2091,
+ "rockabilly": 2092,
+ "tabla": 2093,
+ "melodic death metal": 2094,
+ "minor key": 2095,
+ "rap rock": 2096,
+ "synth funk": 2097,
+ "harmonies": 2098,
+ "fast tempo": 2099,
+ "garage rock": 2100,
+ "breakcore": 2101,
+ "harmony": 2102,
+ "uptempo": 2103,
+ "harmonica": 2104,
+ "duet": 2105,
+ "alt-pop": 2106,
+ "bounce": 2107,
+ "hiphop": 2108,
+ "funk rock": 2109,
+ "jungle": 2110,
+ "acoustic rock": 2111,
+ "tropical house": 2112,
+ "piano rock": 2113,
+ "sound effects": 2114,
+ "glitch hop": 2115,
+ "dance pop": 2116,
+ "aquatic": 2117,
+ "organ": 2118,
+ "baroque pop": 2119,
+ "comedy": 2120,
+ "theatrical": 2121,
+ "sparse": 2122,
+ "bassline": 2123,
+ "scary": 2124,
+ "cute": 2125,
+ "drone": 2126,
+ "horrorcore": 2127,
+ "bass house": 2128,
+ "emo rap": 2129,
+ "moody": 2130,
+ "drums (drum set)": 2131,
+ "fast-paced": 2132,
+ "double bass": 2133,
+ "progressive pop": 2134,
+ "apocalyptic": 2135,
+ "hardcore punk": 2136,
+ "anthem": 2137,
+ "europop": 2138,
+ "upright bass": 2139,
+ "groove": 2140,
+ "psytrance": 2141,
+ "dark wave": 2142,
+ "kpop": 2143,
+ "minimal techno": 2144,
+ "rock and roll": 2145,
+ "grime": 2146,
+ "lively": 2147,
+ "rave": 2148,
+ "syncopated": 2149,
+ "show tunes": 2150,
+ "autotune": 2151,
+ "sitar": 2152,
+ "nu-disco": 2153,
+ "folk metal": 2154,
+ "traditional pop": 2155,
+ "surf rock": 2156,
+ "noise": 2157,
+ "brostep": 2158,
+ "serious": 2159,
+ "traditional": 2160,
+ "pessimistic": 2161,
+ "ebm": 2162,
+ "female vocalists": 2163,
+ "speed metal": 2164,
+ "classic": 2165,
+ "post-punk revival": 2166,
+ "lounge": 2167,
+ "electric blues": 2168,
+ "winter": 2169,
+ "clear vocals": 2170,
+ "retro": 2171,
+ "raspy": 2172,
+ "progressive country": 2173,
+ "vibrant": 2174,
+ "mystical": 2175,
+ "deathcore": 2176,
+ "alt-country": 2177,
+ "theme": 2178,
+ "8-bit": 2179,
+ "jangle pop": 2180,
+ "aor": 2181,
+ "delta blues": 2182,
+ "light": 2183,
+ "lyrical": 2184,
+ "distorted guitars": 2185,
+ "jazz-rock": 2186,
+ "classical crossover": 2187,
+ "fusion": 2188,
+ "doo-wop": 2189,
+ "television music": 2190,
+ "clean": 2191,
+ "symphony": 2192,
+ "whimsical": 2193,
+ "honky tonk": 2194,
+ "chamber music": 2195,
+ "breathy": 2196,
+ "echo": 2197,
+ "uk garage": 2198,
+ "acid techno": 2199,
+ "ritualistic": 2200,
+ "scratch": 2201,
+ "darksynth": 2202,
+ "edgy": 2203,
+ "layered harmonies": 2204,
+ "rhythm & blues": 2205,
+ "80's": 2206,
+ "experimental hip hop": 2207,
+ "808": 2208,
+ "expressive": 2209,
+ "1960s": 2210,
+ "cryptic": 2211,
+ "g-funk": 2212,
+ "oud": 2213,
+ "male vocalists": 2214,
+ "uk drill": 2215,
+ "gentle": 2216,
+ "musical": 2217,
+ "sultry": 2218,
+ "samba": 2219,
+ "violins": 2220,
+ "soul jazz": 2221,
+ "alienation": 2222,
+ "deep voice": 2223,
+ "layered": 2224,
+ "screamo": 2225,
+ "drift phonk": 2226,
+ "shamisen": 2227,
+ "rap metal": 2228,
+ "strong": 2229,
+ "062 final fantasy ii": 3,
+ "063 final fantasy iii": 4,
+ "064 final fantasy iii remake": 5,
+ "066 final fantasy iv": 7,
+ "067 final fantasy iv remake": 8,
+ "068 final fantasy v": 9,
+ "069 final fantasy vi": 10,
+ "070 final fantasy vii": 11,
+ "071 final fantasy vii remake": 12,
+ "072 final fantasy viii": 13,
+ "073 final fantasy ix": 14,
+ "075 final fantasy x": 15,
+ "076 final fantasy xi": 16,
+ "077 final fantasy xii": 17,
+ "078 final fantasy xiii": 18,
+ "079 final fantasy xiv": 19,
+ "081 final fantasy xv": 20,
+ "082 final fantasy 0": 21,
+ "089 final fantasy x2": 26,
+ "093 final fantasy xiii2": 29,
+ "094 final fantasy xiii3": 30,
+ "097 dissidia final fantasy": 33,
+ "13 sentinels aegis rim": 40,
+ "ace combat 7": 143,
+ "advance wars": 144,
+ "advance wars days of ruin": 145,
+ "advance wars dual strike": 146,
+ "advance wars 2 black hole rising": 148,
+ "advance wars dual strike": 149,
+ "animal crossing wild world": 166,
+ "animal crossing new horizons": 167,
+ "ar tonelico": 171,
+ "armored core": 173,
+ "atelier escher and logy": 182,
+ "atelier iris": 183,
+ "atelier iris 2": 184,
+ "atelier iris 3": 185,
+ "atelier marie": 186,
+ "atelier resleriana": 187,
+ "atelier rorona": 188,
+ "atelier ryza": 189,
+ "atelier ryza 2": 190,
+ "atelier ryza 3": 191,
+ "atelier totori": 192,
+ "atlantis kitsune": 193,
+ "attack on titan": 194,
+ "azur lane": 198,
+ "baldurs gate 3": 255,
+ "banjo kazooie": 261,
+ "banjo tooie": 262,
+ "black clover": 272,
+ "black myth wukong": 273,
+ "blazblue": 277,
+ "bleach": 278,
+ "blue reflection": 285,
+ "bocchi the rock": 289,
+ "castlevania": 329,
+ "castlevania dawn of sorrow": 330,
+ "castlevania order of ecclesia": 331,
+ "castlevania portrait of ruin": 332,
+ "castlevania symphony of the night": 333,
+ "castlevania aria of sorrow": 334,
+ "cave story": 337,
+ "celeste": 339,
+ "chiptune": 350,
+ "chrono cross": 359,
+ "chrono trigger": 360,
+ "clair obscur": 367,
+ "clannad": 368,
+ "contra": 374,
+ "crosscode": 382,
+ "cuphead": 384,
+ "dmc4": 397,
+ "dmcv": 398,
+ "danganronpa": 414,
+ "danganronpa 2": 415,
+ "deltarune 2": 423,
+ "deltarune34": 424,
+ "diddy kong racing": 428,
+ "gb sounds": 431,
+ "disgaea 5": 432,
+ "doki doki literature club": 435,
+ "donkey kong 64": 436,
+ "donkey kong country": 437,
+ "donkey kong country 2": 438,
+ "donkey kong country 3": 439,
+ "doom": 442,
+ "dragalia lost": 445,
+ "dragon quest ix": 448,
+ "drakengard 3": 449,
+ "elder scrolls 3 morrowind": 474,
+ "etrian odyssey ii": 479,
+ "etrian odyssey iii": 480,
+ "fzero": 495,
+ "fzero maximum velocity": 496,
+ "fzero gx": 497,
+ "fzero x": 498,
+ "fairy tail": 499,
+ "far cry 6": 500,
+ "fate grand order": 501,
+ "fate stay night": 505,
+ "fire emblem": 512,
+ "fire emblem awakening": 513,
+ "fire emblem three houses": 515,
+ "fruits basket": 521,
+ "fuga melodies of steel": 523,
+ "fullmetal alchemist": 524,
+ "fullmetal alchemist brotherhood": 525,
+ "genshin impact": 554,
+ "ghost in the shell": 555,
+ "goldeneye 007": 565,
+ "granblue fantasy": 566,
+ "granblue fantasy versus": 567,
+ "gurren lagann": 574,
+ "gust": 575,
+ "hades": 616,
+ "haikyuu": 617,
+ "harvest moon": 621,
+ "hearthstone": 625,
+ "hollow knight": 638,
+ "hololive": 639,
+ "homestuck": 640,
+ "homestuck alternia": 645,
+ "homestuck alterniabound": 646,
+ "homestuck cherubim": 647,
+ "honkai impact 3rd": 658,
+ "honkai star rail": 661,
+ "jojos bizarre adventure": 755,
+ "journey": 756,
+ "kid icarus uprising": 800,
+ "kill la kill": 803,
+ "kingdom hearts 3582 days": 816,
+ "kingdom hearts 3d dream drop distance": 817,
+ "kingdom hearts recoded": 818,
+ "kirby": 819,
+ "kirby 64 the crystal shards": 820,
+ "kirby ds": 821,
+ "kirbys dream land 3": 822,
+ "konosuba": 827,
+ "lamulana": 859,
+ "legend of zelda the": 878,
+ "legend of zelda the a link to the past": 879,
+ "legend of zelda the majoras mask": 880,
+ "legend of zelda the ocarina of time": 881,
+ "legend of zelda the phantom hourglass": 882,
+ "legend of zelda the spirit tracks": 883,
+ "legend of zelda the twilight princess": 884,
+ "mana khemia": 936,
+ "maple story": 937,
+ "mario luigi bowsers inside story": 938,
+ "mario luigi dream team": 939,
+ "mario luigi partners in time": 940,
+ "mario luigi superstar saga": 941,
+ "mario 3d land": 942,
+ "mario golf": 943,
+ "mario kart super circuit": 944,
+ "mario kart 64": 945,
+ "mario kart 7": 946,
+ "mario kart ds": 947,
+ "mario kart wii": 948,
+ "mario kart 8": 949,
+ "mario party 3": 952,
+ "mario party 4": 953,
+ "mario party 5": 954,
+ "mario tennis": 955,
+ "mega man": 961,
+ "mega man 3": 962,
+ "mega man 4": 963,
+ "mega man 7": 964,
+ "mega man battle network": 965,
+ "mega man x": 966,
+ "mega man x2": 967,
+ "mega man x3": 968,
+ "mega man x4": 969,
+ "mega man zero zx": 970,
+ "metal gear solid 2": 978,
+ "metroid": 979,
+ "metroid zero mission": 980,
+ "metroid prime 2 echoes": 981,
+ "metroid prime 3": 982,
+ "metroid prime": 983,
+ "minecraft": 989,
+ "monogatari": 1411,
+ "my hero academia": 1006,
+ "nausicaa valley of the wind": 1034,
+ "neon genesis evangelion": 1039,
+ "neon white": 1040,
+ "new super mario bros": 1045,
+ "new super mario bros wii": 1046,
+ "ni no kuni": 1050,
+ "ni no kuni 2": 1051,
+ "nier automata": 1053,
+ "night in the woods": 1054,
+ "ninja gaiden 1": 1055,
+ "ninja gaiden 2": 1056,
+ "omori": 1080,
+ "one piece": 1081,
+ "outer wilds": 1084,
+ "parasite eve": 1114,
+ "perfect dark": 1122,
+ "phoenix wright ace attorney": 1124,
+ "phoenix wright ace attorney 2": 1125,
+ "pokemon anime": 1131,
+ "pokemon black and white": 1133,
+ "pokemon crystal": 1134,
+ "pokemon diamond": 1135,
+ "pokemon fire red and leaf green": 1137,
+ "pokemon heart gold soul silver": 1138,
+ "pokemon mystery dungeon blue rescue team": 1140,
+ "pokemon mystery dungeon explorers of sky": 1141,
+ "pokemon mystery dungeon gates to infinity": 1142,
+ "pokemon omega ruby": 1143,
+ "pokemon red": 1144,
+ "pokemon ruby": 1145,
+ "pokemon scarlet": 1147,
+ "pokemon sun and moon": 1148,
+ "pokemon super mystery dungeon": 1149,
+ "pokemon x and y": 1150,
+ "pokemon xd gale of darkness": 1151,
+ "professor layton and the curious village": 1173,
+ "resident evil": 1208,
+ "scottpilgrim": 1277,
+ "secret of mana": 1284,
+ "shin megami tensei iv": 1298,
+ "shovelknight": 1300,
+ "skyrim": 1320,
+ "sonic advance 3": 1335,
+ "sonic adventure 2": 1337,
+ "sonic mania": 1338,
+ "sonic the hedgehog": 1339,
+ "sonic the hedgehog 2": 1340,
+ "sonic the hedgehog 3": 1341,
+ "spirited away": 1347,
+ "star fox": 1348,
+ "star ocean": 1349,
+ "starcraft 2": 1350,
+ "stardew valley": 1351,
+ "stellaris": 1354,
+ "street fighter ii": 1359,
+ "super mario 64": 1370,
+ "super mario bros 3": 1374,
+ "super mario galaxy": 1375,
+ "super mario rpg": 1377,
+ "super mario sunshine": 1378,
+ "super monkey ball 2": 1379,
+ "super smash bros brawl": 1381,
+ "tales of symphonia": 1391,
+ "the sims 2": 1395,
+ "totalwar": 1396,
+ "touhou 10": 1397,
+ "touhou 11": 1398,
+ "touhou 12": 1399,
+ "touhou 14": 1400,
+ "touhou 15": 1401,
+ "touhou 6": 1402,
+ "touhou 7": 1403,
+ "touhou 8": 1405,
+ "touhou 9": 1406,
+ "tunic": 1407,
+ "undertale": 1410,
+ "violet evergarden": 1413,
+ "wild arms 2": 1417,
+ "witcher 3": 1418,
+ "wow": 1419,
+ "wuthering waves": 1421,
+ "xenoblade chronicles": 1423,
+ "xenoblade chronicles 2": 1424,
+ "xenoblade chronicles 2 torna": 1425,
+ "xenoblade chronicles 3": 1426,
+ "xenogears": 1427,
+ "ys": 1429,
+ "zenless zone zero": 1434,
+ "beatmania": 1459,
+ "berserk": 1460,
+ "castle crashers": 1465,
+ "everquest": 1470,
+ "mortal kombat": 1482,
+ "nier": 1483,
+ "persona": 1484,
+ "sayonara wild hearts": 1487,
+ "touhou remixes": 1550,
+ "yakuza": 1659,
+ "sea shanty": 2230,
+ "emo-pop": 2231,
+ "skate punk": 2232,
+ "bright": 2233,
+ "cumbia": 2234,
+ "world music": 2235,
+ "synth pop": 2236,
+ "chorus": 2237,
+ "japanese": 2238,
+ "schlager": 2239,
+ "asian music": 2240,
+ "glam pop": 2241,
+ "lute": 2242,
+ "misanthropic": 2243,
+ "christian": 2244,
+ "bubblegum pop": 2245,
+ "808s": 2246,
+ "remastered": 2247,
+ "christmas music": 2248,
+ "wave": 2249,
+ "tango": 2250,
+ "hateful": 2251,
+ "high energy": 2252,
+ "neoclassical darkwave": 2253,
+ "electroclash": 2254,
+ "seductive": 2255,
+ "dungeon synth": 2256,
+ "city pop": 2257,
+ "heroic": 2258,
+ "freestyle": 2259,
+ "space ambient": 2260,
+ "bounce drop": 2261,
+ "afrobeats": 2262,
+ "power ballad": 2263,
+ "trombone": 2264,
+ "guitar solo": 2265,
+ "battle": 2266,
+ "ending": 2267,
+ "soundtrack1": 2268,
+ "soundtrack2": 2269
+ }
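The mapping above pairs each lowercase tag string with an integer class id (ids need not be contiguous or unique; note "melodic" reuses id 972). A minimal sketch of the lookup the demo app performs, using a tiny stand-in dict rather than the real `checkpoints/tag_mapping.json`:

```python
import json

# Toy stand-in for checkpoints/tag_mapping.json (the real file has ~2300 entries).
toy_mapping = {"opera": 1881, "power pop": 1882, "melodic": 972}

def tags_to_indices(tags, mapping):
    """Lower-case and strip each tag; silently skip tags absent from the mapping."""
    indices = []
    for tag in tags:
        key = tag.lower().strip()
        if key in mapping:
            indices.append(mapping[key])
    return indices

# Round-trip through JSON to mirror how the app loads the mapping from disk.
mapping = json.loads(json.dumps(toy_mapping))
print(tags_to_indices(["Opera", " melodic ", "unknown tag"], mapping))  # [1881, 972]
```

Unknown tags are dropped rather than raising, which matches `_tags_to_indices` in gradio_app.py below.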
gradio_app.py ADDED
@@ -0,0 +1,265 @@
+ import os
+ from pathlib import Path
+ from typing import List, Tuple
+ import uuid
+ import json
+ import gradio as gr
+ import torch
+ import torchaudio
+ from safetensors.torch import load_file
+
+ from model import LocalSongModel
+ from acestep.music_dcae.music_dcae_pipeline import MusicDCAE
+
+ class TagEmbedder:
+     def __init__(self, mapping_file: str = "checkpoints/tag_mapping.json"):
+
+         with open(mapping_file, 'r', encoding='utf-8') as f:
+             self.tag_mapping = json.load(f)
+
+         print(f"Loaded {len(self.tag_mapping)} tags from {mapping_file}")
+         self.num_classes = 2304
+
+ class AudioVAE:
+     def __init__(self, device: torch.device):
+         self.model = MusicDCAE().to(device)
+         self.model.eval()
+         self.device = device
+         self.latent_mean = torch.tensor(
+             [0.1207, -0.0186, -0.0947, -0.3779, 0.5956, 0.3422, 0.1796, -0.0526],
+             device=device,
+         ).view(1, -1, 1, 1)
+         self.latent_std = torch.tensor(
+             [0.4638, 0.3154, 0.6244, 1.5078, 0.4696, 0.4633, 0.5614, 0.2707],
+             device=device,
+         ).view(1, -1, 1, 1)
+
+     def decode(self, latents: torch.Tensor) -> torch.Tensor:
+         with torch.no_grad():
+             latents = latents * self.latent_std + self.latent_mean
+             sr, audio_list = self.model.decode(latents, sr=48000)
+             audio_batch = torch.stack(audio_list).to(self.device)
+         return audio_batch
+
+ class RF:
+     def __init__(self, model: torch.nn.Module):
+         self.model = model
+
+     def sample(
+         self,
+         z: torch.Tensor,
+         cond: List[List[int]],
+         null_cond: List[List[int]] | None = None,
+         sample_steps: int = 100,
+         cfg: float = 3.0,
+     ) -> List[torch.Tensor]:
+         batch = z.size(0)
+         dt = 1.0 / sample_steps
+         dt = torch.tensor([dt] * batch, device=z.device).view([batch, *([1] * len(z.shape[1:]))])
+         images = [z]
+         for i in range(sample_steps, 0, -1):
+             t = torch.tensor([i / sample_steps] * batch, device=z.device)
+
+             if null_cond is not None:
+
+                 z_batched = torch.cat([z, z], dim=0)
+                 t_batched = torch.cat([t, t], dim=0)
+                 cond_batched = cond + null_cond
+                 v_batched = self.model(z_batched, t_batched, cond_batched)
+                 vc, vu = v_batched.chunk(2, dim=0)
+                 vc = vu + cfg * (vc - vu)
+
+             else:
+                 vc = self.model(z, t, cond)
+
+             z = z - dt * vc
+             images.append(z)
+         return images
+
+ model: torch.nn.Module | None = None
+ vae: AudioVAE | None = None
+ tag_embedder: TagEmbedder | None = None
+ rf_sampler: RF | None = None
+ device: torch.device | None = None
+ _available_tags: List[str] | None = None
+
+ def load_resources() -> List[str]:
+
+     torch.set_float32_matmul_precision('high')
+
+     global model, vae, tag_embedder, rf_sampler, device, _available_tags
+
+     if _available_tags is not None:
+         return _available_tags
+
+     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+     tag_embedder = TagEmbedder()
+
+     model = LocalSongModel(
+         in_channels=8,
+         num_groups=16,
+         hidden_size=1024,
+         decoder_hidden_size=2048,
+         num_blocks=36,
+         patch_size=(16, 1),
+         num_classes=tag_embedder.num_classes,
+         max_tags=8,
+     ).to(device)
+
+     checkpoint_path = "checkpoints/checkpoint_461260.safetensors"
+     print(f"Loading checkpoint: {checkpoint_path}")
+
+     state_dict = load_file(checkpoint_path, device=str(device))
+     model.load_state_dict(state_dict, strict=True)
+     model.eval()
+
+     vae = AudioVAE(device)
+     rf_sampler = RF(model)
+
+     _available_tags = sorted(tag_embedder.tag_mapping.keys())
+     return _available_tags
+
+
+ def _tags_to_indices(tags: List[str]) -> List[int]:
+     assert tag_embedder is not None
+     indices = []
+
+     for tag in tags:
+         tag_lower = tag.lower().strip()
+         if tag_lower in tag_embedder.tag_mapping:
+             indices.append(tag_embedder.tag_mapping[tag_lower])
+
+     return indices
+
+
+ def generate_audio(
+     tags: List[str],
+     cfg: float,
+     sample_steps: int,
+ ) -> Tuple[Tuple[int, object], str]:
+
+     load_resources()
+     assert model is not None and vae is not None and rf_sampler is not None and device is not None
+
+     if not tags:
+         tags = []
+     if len(tags) > 8:
+         raise gr.Error("A maximum of 8 tags is supported.")
+
+     tag_indices = _tags_to_indices(tags)
+
+     batch = 1
+     channels = 8
+     height = 16
+     width = 512
+
+     z = torch.randn(batch, channels, height, width, device=device)
+     cond = [tag_indices]
+     null_cond = [[]]
+
+     with torch.no_grad():
+         sampled_latents = rf_sampler.sample(
+             z=z,
+             cond=cond,
+             null_cond=null_cond,
+             sample_steps=sample_steps,
+             cfg=cfg,
+         )[-1]
+         audio = vae.decode(sampled_latents)
+
+     audio_tensor = audio[0].cpu()
+     sr = 48000
+     audio_numpy = audio_tensor.transpose(0, 1).numpy()
+
+     os.makedirs("generated", exist_ok=True)
+     output_path = f"generated/generated_{uuid.uuid4().hex}.wav"
+     torchaudio.save(str(output_path), audio_tensor, sr)
+
+     return (sr, audio_numpy), str(output_path)
+
+ def build_interface() -> gr.Blocks:
+     available_tags = load_resources()
+
+     # Define preset tag combinations
+     presets = [
+         ["soundtrack1", "female vocalist", "rock", "melodic"],
+         ["soundtrack", "chrono trigger", "emotional", "piano", "strings"],
+         ["soundtrack", "touhou 10", "trumpet"],
+         ["soundtrack", "christmas music", "winter", "melodic"],
+         ["soundtrack2", "male vocalist", "pop", "melodic", "acoustic guitar", "ballad"],
+     ]
+
+     with gr.Blocks(title="LocalSong") as demo:
+         gr.Markdown("# LocalSong")
+
+         with gr.Row():
+             tag_input = gr.Dropdown(
+                 label="Tags (select up to 8)",
+                 choices=available_tags,
+                 multiselect=True,
+                 max_choices=8,
+                 value=presets[0],
+             )
+
+         gr.Markdown("**Presets:**")
+         with gr.Row():
+             for preset in presets:
+                 btn = gr.Button(f"{' + '.join(preset)}", size="sm")
+                 def make_preset_fn(p):
+                     return lambda: p
+                 btn.click(fn=make_preset_fn(preset), inputs=None, outputs=tag_input)
+
+         with gr.Row():
+             cfg_slider = gr.Slider(
+                 label="CFG Scale",
+                 minimum=1.0,
+                 maximum=7.0,
+                 step=0.5,
+                 value=3.5,
+             )
+             sample_steps_slider = gr.Slider(
+                 label="Sample Steps",
+                 minimum=50,
+                 maximum=200,
+                 step=10,
+                 value=200,
+             )
+
+         with gr.Row():
+             seed_input = gr.Number(
+                 label="Seed",
+                 value=45,
+                 precision=0,
+             )
+
+         generate_button = gr.Button("Generate Audio", variant="primary")
+         audio_output = gr.Audio(label="Generated Audio", type="numpy")
+         download_output = gr.File(label="Download WAV")
+
+         def generate_wrapper(tags, cfg, steps, seed):
+             torch.manual_seed(seed)
+             if torch.cuda.is_available():
+                 torch.cuda.manual_seed(seed)
+             return generate_audio(tags, cfg, steps)
+
+         generate_button.click(
+             fn=generate_wrapper,
+             inputs=[
+                 tag_input,
+                 cfg_slider,
+                 sample_steps_slider,
+                 seed_input,
+             ],
+             outputs=[
+                 audio_output,
+                 download_output,
+             ],
+         )
+
+     return demo
+
+ demo = build_interface()
+
+ if __name__ == "__main__":
+     demo.launch()
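`RF.sample` above is an Euler solver for a rectified-flow ODE with classifier-free guidance: it walks t from 1 down to 0, forms the guided velocity v = v_u + cfg * (v_c - v_u), and steps z ← z − dt·v (the batched `cond + null_cond` trick just evaluates both branches in one forward pass). A scalar, pure-Python sketch of the same update rule, with a toy velocity field standing in for the model:

```python
def euler_cfg_sample(velocity, z, cond, null_cond, sample_steps=4, cfg=3.0):
    """Integrate dz/dt = v from t=1 down to t=0 with Euler steps,
    guiding with v = v_u + cfg * (v_c - v_u), as in RF.sample."""
    dt = 1.0 / sample_steps
    for i in range(sample_steps, 0, -1):
        t = i / sample_steps
        v_c = velocity(z, t, cond)       # conditional velocity
        v_u = velocity(z, t, null_cond)  # unconditional velocity
        v = v_u + cfg * (v_c - v_u)      # classifier-free guidance
        z = z - dt * v
    return z

# With a toy velocity field v(z) = z, both branches agree, so guidance is a
# no-op and each step multiplies z by (1 - dt): 4 steps give 0.75**4.
toy_velocity = lambda z, t, c: z
print(euler_cfg_sample(toy_velocity, 1.0, cond=[1], null_cond=[], sample_steps=4, cfg=3.0))  # 0.31640625
```

The real sampler does the same thing per tensor element, with `cond` a list of tag-index lists and `null_cond=[[]]` as the unconditional branch.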
model.py ADDED
@@ -0,0 +1,490 @@
+ from typing import Tuple
2
+ import torch
3
+ import torch.nn as nn
4
+ import math
5
+ from einops import rearrange
6
+ from torch.nn.functional import scaled_dot_product_attention
7
+
8
+ def modulate(x, shift, scale):
9
+ return x * (1 + scale) + shift
10
+
11
+ class Embed(nn.Module):
12
+ def __init__(
13
+ self,
14
+ in_chans: int = 3,
15
+ embed_dim: int = 768,
16
+ norm_layer = None,
17
+ bias: bool = True,
18
+ ):
19
+ super().__init__()
20
+ self.in_chans = in_chans
21
+ self.embed_dim = embed_dim
22
+ self.proj = nn.Linear(in_chans, embed_dim, bias=bias)
23
+ self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()
24
+ def forward(self, x):
25
+ x = self.proj(x)
26
+ x = self.norm(x)
27
+ return x
28
+
29
+ class PatchEmbed(nn.Module):
30
+ def __init__(
31
+ self,
32
+ in_channels=8,
33
+ embed_dim=1152,
34
+ bias=True,
35
+ patch_size=1,
36
+ ):
37
+ super().__init__()
38
+
39
+ self.patch_h, self.patch_w = patch_size
40
+
41
+ self.patch_size = patch_size
42
+ self.proj = nn.Linear(in_channels * self.patch_h * self.patch_w, embed_dim, bias=bias)
43
+ self.in_channels = in_channels
44
+ self.embed_dim = embed_dim
45
+
46
+ def forward(self, latent):
47
+ x = rearrange(latent, 'b c (h p1) (w p2) -> b (h w) (c p1 p2)', p1=self.patch_h, p2=self.patch_w)
48
+ x = self.proj(x)
49
+ return x
50
+
51
+ class FinalLayer(nn.Module):
52
+ """Final layer with configurable patch_size support"""
53
+
54
+ def __init__(self, hidden_size, out_channels=8, patch_size=1):
55
+ super().__init__()
56
+ self.patch_h, self.patch_w = patch_size
57
+
58
+ self.linear = nn.Linear(hidden_size, out_channels * self.patch_h * self.patch_w, bias=True)
59
+ self.out_channels = out_channels
60
+ self.patch_size = patch_size
61
+
62
+ def forward(self, x, target_height, target_width):
63
+
64
+ x = self.linear(x)
65
+
66
+ x = rearrange(x, 'b (h w) (c p1 p2) -> b c (h p1) (w p2)',
67
+ h=target_height, w=target_width,
68
+ p1=self.patch_h, p2=self.patch_w, c=self.out_channels)
69
+ return x
70
+
71
+ class TimestepEmbedder(nn.Module):
72
+
73
+ def __init__(self, hidden_size, frequency_embedding_size=256):
74
+ super().__init__()
75
+ self.mlp = nn.Sequential(
76
+ nn.Linear(frequency_embedding_size, hidden_size, bias=True),
77
+ nn.SiLU(),
78
+ nn.Linear(hidden_size, hidden_size, bias=True),
79
+ )
80
+ self.frequency_embedding_size = frequency_embedding_size
81
+
82
+ @staticmethod
83
+ def timestep_embedding(t, dim, max_period=10):
84
+ half = dim // 2
85
+ freqs = torch.exp(
86
+ -math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32, device=t.device) / half
87
+ )
88
+ args = t[..., None].float() * freqs[None, ...]
89
+ embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
90
+ if dim % 2:
91
+ embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
92
+ return embedding
93
+
94
+ def forward(self, t):
95
+ t_freq = self.timestep_embedding(t, self.frequency_embedding_size)
96
+ t_emb = self.mlp(t_freq)
97
+ return t_emb
98
+
99
+ class RMSNorm(nn.Module):
100
+ def __init__(self, hidden_size, eps=1e-6):
101
+ super().__init__()
102
+ self.weight = nn.Parameter(torch.ones(hidden_size))
103
+ self.variance_epsilon = eps
104
+
105
+ def forward(self, hidden_states):
106
+ input_dtype = hidden_states.dtype
107
+ hidden_states = hidden_states.to(torch.float32)
108
+ variance = hidden_states.pow(2).mean(-1, keepdim=True)
109
+ hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
110
+ return self.weight * hidden_states.to(input_dtype)
111
+
112
+ class FeedForward(nn.Module):
113
+ def __init__(
114
+ self,
115
+ dim: int,
116
+ hidden_dim: int,
117
+ ):
118
+ super().__init__()
119
+ hidden_dim = int(2 * hidden_dim / 3)
120
+ self.w1 = nn.Linear(dim, hidden_dim, bias=False)
121
+ self.w3 = nn.Linear(dim, hidden_dim, bias=False)
122
+ self.w2 = nn.Linear(hidden_dim, dim, bias=False)
123
+ def forward(self, x):
124
+ x = self.w2(torch.nn.functional.silu(self.w1(x)) * self.w3(x))
125
+ return x
126
+
127
+ def precompute_freqs_cis_2d(dim: int, height: int, width: int, theta: float = 10000.0, scale=1.0):
128
+
129
+ if isinstance(scale, float):
130
+ scale = (scale, scale)
131
+ x_pos = torch.linspace(0, width * scale[0], width)
132
+ y_pos = torch.linspace(0, height * scale[1], height)
133
+ y_pos, x_pos = torch.meshgrid(y_pos, x_pos, indexing="ij")
134
+ y_pos = y_pos.reshape(-1)
135
+ x_pos = x_pos.reshape(-1)
136
+ freqs = 1.0 / (theta ** (torch.arange(0, dim, 4)[: (dim // 4)].float() / dim))
137
+ x_freqs = torch.outer(x_pos, freqs).float()
138
+ y_freqs = torch.outer(y_pos, freqs).float()
139
+ x_cis = torch.polar(torch.ones_like(x_freqs), x_freqs)
140
+ y_cis = torch.polar(torch.ones_like(y_freqs), y_freqs)
141
+ freqs_cis = torch.cat([x_cis.unsqueeze(dim=-1), y_cis.unsqueeze(dim=-1)], dim=-1)
142
+ freqs_cis = freqs_cis.reshape(height * width, -1)
143
+ return freqs_cis
144
+
145
+ @torch.compiler.disable
146
+ def apply_rotary_emb_2d(
147
+ xq: torch.Tensor,
148
+ xk: torch.Tensor,
149
+ freqs_cis: torch.Tensor,
150
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
151
+
152
+ freqs_cis = freqs_cis[None, None, :, :]
153
+
154
+ xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))
155
+ xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))
156
+ xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)
157
+ xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)
158
+ return xq_out.type_as(xq), xk_out.type_as(xk)
159
+
160
+ class RAttention(nn.Module):
+     def __init__(
+         self,
+         dim: int,
+         num_heads: int = 8,
+         qkv_bias: bool = False,
+         qk_norm: bool = True,
+         attn_drop: float = 0.,
+         proj_drop: float = 0.,
+         norm_layer: nn.Module = RMSNorm,
+     ) -> None:
+         super().__init__()
+         assert dim % num_heads == 0, 'dim should be divisible by num_heads'
+
+         self.dim = dim
+         self.num_heads = num_heads
+         self.head_dim = dim // num_heads
+         self.scale = self.head_dim ** -0.5
+
+         self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
+         self.q_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
+         self.k_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
+         self.attn_drop = nn.Dropout(attn_drop)
+         self.proj = nn.Linear(dim, dim)
+         self.proj_drop = nn.Dropout(proj_drop)
+
+     def forward(self, x: torch.Tensor, pos, mask) -> torch.Tensor:
+         B, N, C = x.shape
+
+         qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
+         q, k, v = qkv[0], qkv[1], qkv[2]
+         q = self.q_norm(q.contiguous())
+         k = self.k_norm(k.contiguous())
+         q, k = apply_rotary_emb_2d(q, k, freqs_cis=pos)
+
+         q = q.view(B, self.num_heads, -1, C // self.num_heads)
+         k = k.view(B, self.num_heads, -1, C // self.num_heads).contiguous()
+         v = v.view(B, self.num_heads, -1, C // self.num_heads).contiguous()
+
+         x = scaled_dot_product_attention(q, k, v, attn_mask=mask, dropout_p=self.attn_drop.p if self.training else 0.0)
+
+         x = x.transpose(1, 2).reshape(B, N, C)
+         x = self.proj(x)
+         x = self.proj_drop(x)
+         return x
+
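`scaled_dot_product_attention` above is PyTorch's fused kernel; mathematically it computes softmax(q·kᵀ/√d)·v per head. A minimal single-head reference on plain Python lists (a sketch for reading, not a replacement for the fused op; the name `sdpa` is illustrative):

```python
import math

def sdpa(q, k, v):
    # q, k, v: lists of vectors (lists of floats), one head, no batching
    d = len(q[0])
    out = []
    for qi in q:
        # scaled dot-product scores against every key
        scores = [sum(a * b for a, b in zip(qi, kj)) / math.sqrt(d) for kj in k]
        # numerically stable softmax
        m = max(scores)
        w = [math.exp(s - m) for s in scores]
        z = sum(w)
        # weighted average of the values
        out.append([sum(wi * vj[t] for wi, vj in zip(w, v)) / z
                    for t in range(len(v[0]))])
    return out
```

With a zero query all keys score equally, so the output is the plain mean of the values: `sdpa([[0.0, 0.0]], [[1.0, 0.0], [0.0, 1.0]], [[1.0, 2.0], [3.0, 4.0]])` returns `[[2.0, 3.0]]`.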
+ class CrossAttention(nn.Module):
+     def __init__(
+         self,
+         dim: int,
+         context_dim: int,
+         num_heads: int,
+         qkv_bias: bool = False,
+         proj_drop: float = 0.0,
+     ):
+         super().__init__()
+         self.num_heads = num_heads
+         self.head_dim = dim // num_heads
+         self.scale = self.head_dim ** -0.5
+
+         self.q_proj = nn.Linear(dim, dim, bias=qkv_bias)
+         self.kv_proj = nn.Linear(context_dim, dim * 2, bias=qkv_bias)
+         self.proj = nn.Linear(dim, dim)
+         self.proj_drop = nn.Dropout(proj_drop)
+
+     def forward(self, x: torch.Tensor, context: torch.Tensor, context_mask: torch.Tensor = None) -> torch.Tensor:
+         B, N, C = x.shape
+         B_ctx, M, C_ctx = context.shape
+
+         q = self.q_proj(x).reshape(B, N, self.num_heads, self.head_dim).permute(0, 2, 1, 3)
+         kv = self.kv_proj(context).reshape(B_ctx, M, 2, self.num_heads, self.head_dim).permute(2, 0, 3, 1, 4)
+         k, v = kv[0], kv[1]
+
+         attn_mask = None
+         if context_mask is not None:
+             attn_mask = torch.zeros(B, 1, 1, M, dtype=q.dtype, device=q.device)
+             attn_mask.masked_fill_(~context_mask.unsqueeze(1).unsqueeze(2), float('-inf'))
+
+         attn = scaled_dot_product_attention(q, k, v, attn_mask=attn_mask, dropout_p=self.proj_drop.p if self.training else 0.0)
+
+         x = attn.permute(0, 2, 1, 3).reshape(B, N, C)
+         x = self.proj(x)
+         x = self.proj_drop(x)
+         return x
+
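The `masked_fill_` step above converts a boolean padding mask into the additive form `scaled_dot_product_attention` expects: 0 where a key may be attended to, -inf where it is padding, so softmax assigns padded keys zero weight. The conversion in isolation (the helper `additive_mask` is illustrative, not from this file):

```python
def additive_mask(keep):
    # boolean keep-mask -> additive attention mask:
    # 0.0 where attention is allowed, -inf where the key is padding
    return [0.0 if k else float("-inf") for k in keep]
```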
+ class DDTBlock(nn.Module):
+     def __init__(self, hidden_size, groups, mlp_ratio=4.0, context_dim=None, is_encoder_block=False):
+         super().__init__()
+         self.hidden_size = hidden_size
+         self.norm1 = RMSNorm(hidden_size, eps=1e-6)
+         self.attn = RAttention(hidden_size, num_heads=groups, qkv_bias=False)
+
+         self.norm_cross = RMSNorm(hidden_size, eps=1e-6) if context_dim else nn.Identity()
+         self.cross_attn = CrossAttention(hidden_size, context_dim, groups) if context_dim else None
+
+         self.norm2 = RMSNorm(hidden_size, eps=1e-6)
+         mlp_hidden_dim = int(hidden_size * mlp_ratio)
+         self.mlp = FeedForward(hidden_size, mlp_hidden_dim)
+
+         self.is_encoder_block = is_encoder_block
+         if not is_encoder_block:
+             self.adaLN_modulation = nn.Sequential(
+                 nn.Linear(hidden_size, 6 * hidden_size, bias=True)
+             )
+
+     def forward(self, x, c, pos, mask=None, context=None, context_mask=None, shared_adaLN=None):
+         if self.is_encoder_block:
+             adaLN_output = shared_adaLN(c)
+         else:
+             adaLN_output = self.adaLN_modulation(c)
+
+         shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = adaLN_output.chunk(6, dim=-1)
+
+         x = x + gate_msa * self.attn(modulate(self.norm1(x), shift_msa, scale_msa), pos, mask=mask)
+
+         if self.cross_attn is not None and context is not None:
+             x = x + self.cross_attn(self.norm_cross(x), context=context, context_mask=context_mask)
+
+         x = x + gate_mlp * self.mlp(modulate(self.norm2(x), shift_mlp, scale_mlp))
+         return x
+
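`DDTBlock` relies on a `modulate` helper defined elsewhere in this file; in DiT-family models it is conventionally `x * (1 + scale) + shift`. A per-channel sketch of that assumed convention (verify against the actual helper before relying on it):

```python
def modulate(x, shift, scale):
    # adaLN modulation: the conditioning network predicts a per-channel
    # shift and scale; scale is applied as (1 + scale) so a zero-initialized
    # conditioning head leaves the activations unchanged
    return [xi * (1.0 + sc) + sh for xi, sh, sc in zip(x, shift, scale)]
```

The `(1 + scale)` form is also why `shared_encoder_adaLN` / `shared_decoder_adaLN` are zero-initialized below: at init, every block starts as (roughly) an identity-plus-residual.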
+ class LocalSongModel(nn.Module):
+     def __init__(
+         self,
+         in_channels=8,
+         num_groups=16,
+         hidden_size=1024,
+         decoder_hidden_size=2048,
+         num_blocks=36,
+         patch_size=(16, 1),
+         num_classes=2304,
+         max_tags=8,
+     ):
+         super().__init__()
+         self.in_channels = in_channels
+         self.out_channels = in_channels
+         self.hidden_size = hidden_size
+         self.decoder_hidden_size = decoder_hidden_size
+         self.num_groups = num_groups
+         self.num_blocks = num_blocks
+         self.patch_size = patch_size
+         self.num_classes = num_classes
+         self.max_tags = max_tags
+
+         self.patch_h, self.patch_w = patch_size
+
+         self.x_embedder = PatchEmbed(
+             in_channels=in_channels,
+             embed_dim=decoder_hidden_size,
+             bias=True,
+             patch_size=patch_size
+         )
+
+         self.s_embedder = PatchEmbed(
+             in_channels=in_channels,
+             embed_dim=decoder_hidden_size,
+             bias=True,
+             patch_size=patch_size
+         )
+
+         self.encoder_to_decoder = nn.Linear(hidden_size, decoder_hidden_size, bias=False)
+
+         self.a_to_b_proj = nn.Linear(decoder_hidden_size, hidden_size, bias=False)
+
+         self.t_embedder = TimestepEmbedder(hidden_size)
+
+         self.y_embedder = nn.Embedding(num_classes + 1, hidden_size, padding_idx=0)
+
+         self.final_layer = FinalLayer(
+             decoder_hidden_size,
+             out_channels=in_channels,
+             patch_size=patch_size
+         )
+
+         self.shared_encoder_adaLN = nn.Sequential(
+             nn.Linear(hidden_size, 6 * hidden_size, bias=True)
+         )
+
+         self.shared_decoder_adaLN = nn.Sequential(
+             nn.Linear(hidden_size, 6 * decoder_hidden_size, bias=True)
+         )
+
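With `patch_size=(16, 1)`, each token produced by `PatchEmbed` covers 16 steps along the height axis and 1 along the width, so the sequence length is `(H // 16) * W`. A quick sketch of the token-count arithmetic (the helper name is illustrative):

```python
def token_count(h, w, patch=(16, 1)):
    # number of tokens after patch embedding; both dims must divide evenly
    ph, pw = patch
    assert h % ph == 0 and w % pw == 0
    return (h // ph) * (w // pw)
```

For example, a 64x128 latent yields `token_count(64, 128) == 512` tokens.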
+         self.blocks = nn.ModuleList()
+         for i in range(self.num_blocks):
+             # every block is built with is_encoder_block=True (the original
+             # condition `i < self.num_blocks` is always true), so all blocks
+             # take the shared-adaLN path in DDTBlock.forward
+             is_encoder = True
+
+             # the first block and the last three run at the decoder width;
+             # the middle blocks run at the (narrower) encoder width
+             if i < 1 or i >= self.num_blocks - 3:
+                 block_hidden_size = decoder_hidden_size
+             else:
+                 block_hidden_size = hidden_size
+             num_heads = self.num_groups
+
+             # cross-attention to the tag embeddings on every other block
+             context_dim = hidden_size if i % 2 == 0 else None
+
+             self.blocks.append(
+                 DDTBlock(
+                     block_hidden_size,
+                     num_heads,
+                     context_dim=context_dim,
+                     is_encoder_block=is_encoder
+                 )
+             )
+
+         self.bc_projection = nn.Linear(decoder_hidden_size + hidden_size, decoder_hidden_size, bias=False)
+
+         self.initialize_weights()
+         self.precompute_encoder_pos = dict()
+         self.precompute_decoder_pos = dict()
+
+     def fetch_encoder_pos(self, height, width, device):
+         # memoized by hand via self.precompute_encoder_pos; the original
+         # additionally stacked functools.lru_cache on top, which is redundant
+         # here and pins `self` (and the device argument) in a hidden cache
+         key = (height, width)
+         if key in self.precompute_encoder_pos:
+             return self.precompute_encoder_pos[key].to(device)
+         pos = precompute_freqs_cis_2d(self.hidden_size // self.num_groups, height, width).to(device)
+         self.precompute_encoder_pos[key] = pos
+         return pos
+
+     def fetch_decoder_pos(self, height, width, device):
+         key = (height, width)
+         if key in self.precompute_decoder_pos:
+             return self.precompute_decoder_pos[key].to(device)
+         pos = precompute_freqs_cis_2d(self.decoder_hidden_size // self.num_groups, height, width).to(device)
+         self.precompute_decoder_pos[key] = pos
+         return pos
+
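The `fetch_*_pos` methods are plain dictionary memoization keyed on the grid size: build once per `(height, width)`, then reuse. The pattern in isolation (names here are hypothetical):

```python
def make_fetcher(build):
    cache = {}
    def fetch(height, width):
        key = (height, width)
        if key not in cache:
            cache[key] = build(height, width)  # computed once per grid size
        return cache[key]
    return fetch

calls = []
fetch = make_fetcher(lambda h, w: calls.append((h, w)) or h * w)
```

Repeated calls with the same grid size hit the cache, so the expensive `precompute_freqs_cis_2d` runs only on the first forward pass for each resolution.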
+     def initialize_weights(self):
+         for embedder in [self.x_embedder, self.s_embedder]:
+             nn.init.xavier_uniform_(embedder.proj.weight)
+             if embedder.proj.bias is not None:
+                 nn.init.constant_(embedder.proj.bias, 0)
+
+         nn.init.xavier_uniform_(self.encoder_to_decoder.weight)
+         nn.init.xavier_uniform_(self.a_to_b_proj.weight)
+
+         nn.init.normal_(self.y_embedder.weight, std=0.02)
+
+         with torch.no_grad():
+             self.y_embedder.weight[0].fill_(0)
+
+         nn.init.normal_(self.t_embedder.mlp[0].weight, std=0.02)
+         nn.init.normal_(self.t_embedder.mlp[2].weight, std=0.02)
+
+         nn.init.constant_(self.shared_encoder_adaLN[-1].weight, 0)
+         nn.init.constant_(self.shared_encoder_adaLN[-1].bias, 0)
+         nn.init.constant_(self.shared_decoder_adaLN[-1].weight, 0)
+         nn.init.constant_(self.shared_decoder_adaLN[-1].bias, 0)
+
+         nn.init.constant_(self.final_layer.linear.weight, 0)
+         nn.init.constant_(self.final_layer.linear.bias, 0)
+
+         nn.init.xavier_uniform_(self.bc_projection.weight)
+
+     def embed_condition(self, cond):
+         device = self.y_embedder.weight.device
+
+         max_len = self.max_tags
+         batch_size = len(cond)
+
+         # index 0 is the padding id (padding_idx of y_embedder)
+         padded_tags = torch.zeros(batch_size, max_len, dtype=torch.long, device=device)
+
+         for i, tags in enumerate(cond):
+             truncated_tags = tags[:max_len]
+             padded_tags[i, :len(truncated_tags)] = torch.tensor(truncated_tags, dtype=torch.long, device=device)
+
+         padding_mask = (padded_tags != 0)
+
+         embedded = self.y_embedder(padded_tags)
+
+         return embedded, padding_mask
+
+     def forward(self, x, t, y):
+         y_emb, padding_mask = self.embed_condition(y)
+         return self.forward_emb(x, t, y_emb, padding_mask)
+
+     @torch.compile()
+     def forward_emb(self, x, t, y_emb, padding_mask=None):
+         B, _, H, W = x.shape
+
+         h_patches = H // self.patch_h
+         w_patches = W // self.patch_w
+         encoder_pos = self.fetch_encoder_pos(h_patches, w_patches, x.device)
+         decoder_pos = self.fetch_decoder_pos(h_patches, w_patches, x.device)
+
+         t_emb = self.t_embedder(t.view(-1)).view(B, 1, self.hidden_size)
+         t_cond = nn.functional.silu(t_emb)
+
+         s = self.s_embedder(x)
+
+         # section A: the first block runs at decoder width
+         s_section_a = s
+         for i in range(min(1, self.num_blocks)):
+             block_context = y_emb if i % 2 == 0 else None
+             s_section_a = self.blocks[i](s_section_a, t_cond, decoder_pos, None, context=block_context, context_mask=padding_mask, shared_adaLN=self.shared_decoder_adaLN)
+
+         # section B: project down and run the middle blocks at encoder width
+         s_section_a_projected = self.a_to_b_proj(s_section_a)
+         s_section_b = s_section_a_projected
+         for i in range(1, self.num_blocks - 3):
+             block_context = y_emb if i % 2 == 0 else None
+             s_section_b = self.blocks[i](s_section_b, t_cond, encoder_pos, None, context=block_context, context_mask=padding_mask, shared_adaLN=self.shared_encoder_adaLN)
+
+         # section C: concatenate A and B, project back to decoder width,
+         # and run the last three decoder-width blocks
+         s_concat = torch.cat([s_section_a, s_section_b], dim=-1)
+         s = self.bc_projection(s_concat)
+
+         for i in range(max(1, self.num_blocks - 3), self.num_blocks):
+             block_context = y_emb if i % 2 == 0 else None
+             s = self.blocks[i](s, t_cond, decoder_pos, None, context=block_context, context_mask=padding_mask, shared_adaLN=self.shared_decoder_adaLN)
+
+         s = self.final_layer(s, H // self.patch_h, W // self.patch_w)
+
+         return s
requirements.txt ADDED
@@ -0,0 +1,10 @@
+ torch>=2.8.0
+ torchaudio>=2.8.0
+ torchvision>=0.23.0
+ torchcodec>=0.8.0
+ accelerate>=1.9.0
+ diffusers>=0.34.0
+ einops>=0.8.1
+ librosa>=0.11.0
+ safetensors>=0.4.0
+ gradio>=5.45.0
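These pins install in the usual way; a fresh virtual environment is assumed here (exact resolved versions will vary by platform):

```shell
# from the repository root
python -m venv .venv && source .venv/bin/activate
pip install -r requirements.txt
```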