mrfakename committed on
Commit ef96930 · 0 Parent(s)
.gitattributes ADDED
@@ -0,0 +1,9 @@
1
+ *.bin filter=lfs diff=lfs merge=lfs -text
2
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
3
+ *.pt filter=lfs diff=lfs merge=lfs -text
4
+ *.pth filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
7
+ *.ot filter=lfs diff=lfs merge=lfs -text
8
+ *.pkl filter=lfs diff=lfs merge=lfs -text
9
+ *.h5 filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,10 @@
1
+ build/
2
+ dist/
3
+ checkpoints/
4
+ *.egg-info/
5
+ *.egg
6
+ *.pyc
7
+ *.pyo
8
+ *.pyd
9
+ *.pyw
10
+ *.pyz
LICENSE ADDED
@@ -0,0 +1,201 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright 2025 Xiaomi Corporation.
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
README.md ADDED
@@ -0,0 +1,33 @@
1
+ ---
2
+ title: MiMo-Audio TTS
3
+ emoji: 🎵
4
+ colorFrom: blue
5
+ colorTo: purple
6
+ sdk: gradio
7
+ sdk_version: 5.46.1
8
+ app_file: app.py
9
+ pinned: false
10
+ license: mit
11
+ python_version: 3.12
12
+ ---
13
+
14
+ # MiMo-Audio Text-to-Speech
15
+
16
+ A simple text-to-speech interface powered by Xiaomi's MiMo-Audio model.
17
+
18
+ ## Features
19
+
20
+ - Convert text to natural-sounding speech
21
+ - Optional style descriptions to control voice characteristics
22
+ - Powered by MiMo-Audio-7B-Instruct model
23
+
24
+ ## Usage
25
+
26
+ 1. Enter your text in the input box
27
+ 2. Optionally add a style description (e.g., "a calm, gentle voice")
28
+ 3. Click "Generate Speech"
29
+ 4. Listen to or download the generated audio
30
+
31
+ ## Model
32
+
33
+ This Space uses the [MiMo-Audio-7B-Instruct](https://huggingface.co/XiaomiMiMo/MiMo-Audio-7B-Instruct) model, a 7B parameter audio language model developed by Xiaomi.
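+ 
+ ## Programmatic Usage
+ 
+ A minimal sketch of calling the model outside the Gradio UI, mirroring `app.py` and `inference_example_sft.py`. It assumes a GPU environment with this repository's `src` package importable and the dependencies from `requirements.txt` installed; the output filenames are placeholders.
+ 
+ ```python
+ from huggingface_hub import snapshot_download
+ from src.mimo_audio.mimo_audio import MimoAudio
+ 
+ # Download the instruct model and the audio tokenizer, as app.py does
+ model_path = snapshot_download(repo_id="XiaomiMiMo/MiMo-Audio-7B-Instruct")
+ tokenizer_path = snapshot_download(repo_id="XiaomiMiMo/MiMo-Audio-Tokenizer")
+ 
+ model = MimoAudio(model_path, tokenizer_path)
+ 
+ # Plain TTS, then TTS with an optional style instruction
+ model.tts_sft("Hello! This is MiMo-Audio text-to-speech.", "tts.wav")
+ model.tts_sft("Hello again!", "tts_styled.wav", instruct="a calm, gentle voice")
+ ```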
app.py ADDED
@@ -0,0 +1,98 @@
1
+ import gradio as gr
2
+ import spaces
3
+ import torch
4
+ from huggingface_hub import snapshot_download
5
+ from src.mimo_audio.mimo_audio import MimoAudio
6
+ import tempfile
7
+ import os
8
+
9
+ # Download models from Hugging Face
10
+ print("Downloading MiMo-Audio models from Hugging Face...")
11
+ model_path = snapshot_download(repo_id="XiaomiMiMo/MiMo-Audio-7B-Instruct")
12
+ tokenizer_path = snapshot_download(repo_id="XiaomiMiMo/MiMo-Audio-Tokenizer")
13
+ print(f"Models downloaded to: {model_path} and {tokenizer_path}")
14
+
15
+ # Initialize model
16
+ print("Loading MiMo-Audio model...")
17
+ model = MimoAudio(
18
+ model_path=model_path,
19
+ mimo_audio_tokenizer_path=tokenizer_path  # keyword must match MimoAudio.__init__
20
+ )
21
+ print("Model loaded successfully!")
22
+
23
+ @spaces.GPU
24
+ def generate_speech(text, style_description=""):
25
+ """Generate speech from text using MiMo-Audio TTS"""
26
+ if not text or not text.strip():
27
+ return None, "Please enter some text to convert to speech."
28
+
29
+ try:
30
+ # Create temporary file for output
31
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
32
+ output_path = tmp_file.name
33
+
34
+ # Generate TTS
35
+ instruct = style_description if style_description.strip() else None
36
+ model.tts_sft(
37
+ text=text.strip(),
38
+ output_audio_path=output_path,
39
+ instruct=instruct
40
+ )
41
+
42
+ return output_path, "✅ Speech generated successfully!"
43
+
44
+ except Exception as e:
45
+ return None, f"❌ Error: {str(e)}"
46
+
47
+ # Create Gradio interface
48
+ with gr.Blocks(title="MiMo-Audio TTS") as demo:
49
+ gr.Markdown("""
50
+ # 🎵 MiMo-Audio Text-to-Speech
51
+
52
+ Convert text to natural-sounding speech using Xiaomi's MiMo-Audio model.
53
+ Optionally add a style description to control the voice characteristics.
54
+ """)
55
+
56
+ with gr.Row():
57
+ with gr.Column():
58
+ text_input = gr.Textbox(
59
+ label="Input Text",
60
+ placeholder="Enter text to convert to speech...",
61
+ lines=5
62
+ )
63
+ style_input = gr.Textbox(
64
+ label="Style Description (Optional)",
65
+ placeholder="e.g., 'a calm, gentle female voice' or 'an energetic male speaker'",
66
+ lines=2
67
+ )
68
+ generate_btn = gr.Button("Generate Speech", variant="primary")
69
+
70
+ with gr.Column():
71
+ audio_output = gr.Audio(
72
+ label="Generated Speech",
73
+ type="filepath"
74
+ )
75
+ status_output = gr.Textbox(
76
+ label="Status",
77
+ interactive=False
78
+ )
79
+
80
+ # Examples
81
+ gr.Examples(
82
+ examples=[
83
+ ["Hello! This is MiMo-Audio text-to-speech synthesis.", ""],
84
+ ["The quick brown fox jumps over the lazy dog.", "a clear, professional voice"],
85
+ ["Welcome to the world of artificial intelligence and natural language processing.", "an enthusiastic, friendly tone"]
86
+ ],
87
+ inputs=[text_input, style_input]
88
+ )
89
+
90
+ # Event handler
91
+ generate_btn.click(
92
+ fn=generate_speech,
93
+ inputs=[text_input, style_input],
94
+ outputs=[audio_output, status_output]
95
+ )
96
+
97
+ if __name__ == "__main__":
98
+ demo.launch()
inference_example_pretrain.py ADDED
@@ -0,0 +1,44 @@
1
+ # Copyright 2025 Xiaomi Corporation.
2
+ from src.mimo_audio.mimo_audio import MimoAudio
3
+
4
+ model_path = "models/MiMo-Audio-7B-Base"
5
+ tokenizer_path = "models/MiMo-Audio-Tokenizer"
6
+
7
+
8
+ model = MimoAudio(model_path, tokenizer_path)
9
+
10
+
11
+ # in context learning: speech-to-speech generation
12
+ instruction = "Convert the timbre of the input speech to target timbre."
13
+
14
+ input_audio = "examples/ESD/0013_000200.wav"
15
+ prompt_examples = [
16
+ {
17
+ "input_audio": "examples/ESD/0013_000139.wav",
18
+ "output_audio": "examples/ESD/0019_000139.wav",
19
+ "output_transcription": "Cuckoos is downheaded and crying.",
20
+ },
21
+ {
22
+ "input_audio": "examples/ESD/0013_000963.wav",
23
+ "output_audio": "examples/ESD/0019_000963.wav",
24
+ "output_transcription": "She said in subdued voice.",
25
+ },
26
+ {
27
+ "input_audio": "examples/ESD/0013_000559.wav",
28
+ "output_audio": "examples/ESD/0019_000559.wav",
29
+ "output_transcription": "A raging fire was-in his eyes.",
30
+ },
31
+ {
32
+ "input_audio": "examples/ESD/0013_001142.wav",
33
+ "output_audio": "examples/ESD/0019_001142.wav",
34
+ "output_transcription": "Does the one that wins get the crowned?",
35
+ },
36
+ {
37
+ "input_audio": "examples/ESD/0013_000769.wav",
38
+ "output_audio": "examples/ESD/0019_000769.wav",
39
+ "output_transcription": "Not much use is it, sam?",
40
+ },
41
+ ]
42
+
43
+ output_audio_path = "examples/in_context_learning_s2s.wav"
44
+ text_channel_output = model.in_context_learning_s2s(instruction, prompt_examples, input_audio, max_new_tokens=8192, output_audio_path=output_audio_path)
inference_example_sft.py ADDED
@@ -0,0 +1,71 @@
1
+ # Copyright 2025 Xiaomi Corporation.
2
+ from src.mimo_audio.mimo_audio import MimoAudio
3
+
4
+ model_path = "models/MiMo-Audio-7B-Instruct"
5
+ tokenizer_path = "models/MiMo-Audio-Tokenizer"
6
+
7
+
8
+ model = MimoAudio(model_path, tokenizer_path)
9
+
10
+
11
+ # tts
12
+ text = "今天天气真好"
13
+ output_audio_path = "examples/tts.wav"
14
+ text_channel_output = model.tts_sft(text, output_audio_path)
15
+
16
+
17
+ # instruct tts
18
+ text = "今天天气真好"
19
+ output_audio_path = "examples/instruct_tts.wav"
20
+ instruct = "用小孩子的声音开心的说"
21
+ text_channel_output = model.tts_sft(text, output_audio_path, instruct=instruct)
22
+
23
+
24
+ # natural instruction tts
25
+ text = "用气喘吁吁的年轻男性声音说:我跑不动了,你等等我!"
26
+ output_audio_path = "examples/natural_instruction_tts.wav"
27
+ text_channel_output = model.tts_sft(text, output_audio_path, read_text_only=False)
28
+
29
+
30
+ # audio understanding
31
+ audio_path = "examples/spoken_dialogue_assistant_turn_1.wav"
32
+ text = "Summarize the audio."
33
+ text_channel_output = model.audio_understanding_sft(audio_path, text)
34
+
35
+
36
+ # audio understanding with thinking
37
+ audio_path = "examples/spoken_dialogue_assistant_turn_1.wav"
38
+ text = "Summarize the audio."
39
+ text_channel_output = model.audio_understanding_sft(audio_path, text, thinking=True)
40
+
41
+
42
+ # spoken dialogue
43
+ first_turn_text_response = "我没办法获取实时的天气信息。不过呢,你可以试试几个方法来查看今天的天气。首先,你可以用手机自带的天气功能,比如苹果手机的天气应用,或者直接在系统设置里查看。其次,你也可以用一些专业的天气服务,像是国外的AccuWeather、Weather.com,或者国内的中国天气网、墨迹天气等等。再有就是,你还可以在谷歌或者百度里直接搜索你所在的城市加上天气这两个字。如果你能告诉我你所在的城市,我也可以帮你分析一下历史天气趋势,不过最新的数据还是需要你通过官方渠道去获取哦。"
44
+ message_list = [
45
+ {"role": "user", "content": "examples/今天天气如何.mp3"},
46
+ {"role": "assistant", "content": {"text": first_turn_text_response, "audio": "examples/spoken_dialogue_assistant_turn_1.wav"}},
47
+ {"role": "user", "content": "examples/北京.mp3"},
48
+ ]
49
+ output_audio_path = "examples/spoken_dialogue_assistant_turn_2.wav"
50
+ text_channel_output = model.spoken_dialogue_sft_multiturn(message_list, output_audio_path=output_audio_path, system_prompt=None, prompt_speech="examples/prompt_speech_zh_m.wav")
51
+ text_channel_output = text_channel_output.split("<|eot|>")[0].replace(".....", "")
52
+ print(text_channel_output)
53
+
54
+
55
+ # speech-to-text dialogue
56
+ message_list = [
57
+ {"role": "user", "content": "./examples/今天天气如何.mp3"},
58
+ {"role": "assistant", "content": "你好,我没办法获取实时的天气信息。如果你能告诉我你所在的城市,我也可以帮你分析一下历史天气趋势,不过最新的数据还是需要你通过官方渠道去获取哦。"},
59
+ {"role": "user", "content": "./examples/北京.mp3"},
60
+ ]
61
+ text_channel_output = model.speech2text_dialogue_sft_multiturn(message_list, thinking=True)
62
+
63
+
64
+ # text dialogue
65
+
66
+ message_list = [
67
+ {"role": "user", "content": "可以给我介绍一些中国的旅游景点吗?"},
68
+ {"role": "assistant", "content": "你好,您想去哪个城市旅游呢?"},
69
+ {"role": "user", "content": "北京"},
70
+ ]
71
+ text_channel_output = model.text_dialogue_sft_multiturn(message_list, thinking=True)
requirements.txt ADDED
@@ -0,0 +1,9 @@
1
+ accelerate>=1.9.0
2
+ torch==2.6.0
3
+ torchaudio==2.6.0
4
+ transformers==4.49.0
5
+ librosa>=0.11.0
6
+ scipy>=1.14.0
7
+ gradio==5.46.1
8
+ flash-attn==2.7.4.post1
9
+ spaces
requirements_space.txt ADDED
@@ -0,0 +1,9 @@
1
+ accelerate>=1.9.0
2
+ torch==2.6.0
3
+ torchaudio==2.6.0
4
+ transformers==4.49.0
5
+ librosa>=0.11.0
6
+ scipy>=1.14.0
7
+ gradio==5.46.1
8
+ flash-attn==2.7.4.post1
9
+ spaces
run_mimo_audio.py ADDED
@@ -0,0 +1,764 @@
1
+ # Copyright 2025 Xiaomi Corporation.
2
+ import gradio as gr
3
+ import torch
4
+ import os
5
+ import tempfile
6
+ import argparse
7
+ from pathlib import Path
8
+ from src.mimo_audio.mimo_audio import MimoAudio
9
+
10
+
11
+ class TTSGenerator:
12
+ def __init__(self, model, device=None):
13
+ self.model = model
14
+ self.device = device
15
+
16
+ def generate(self, text, instruct, output_audio_path):
17
+ path = Path(output_audio_path)
18
+ path.parent.mkdir(parents=True, exist_ok=True)
19
+ text_output = self.model.tts_sft(text, output_audio_path, instruct)
20
+ return text_output
21
+
22
+ class AudioUnderstandingGenerator:
23
+ def __init__(self, model, device=None):
24
+ self.model = model
25
+ self.device = device
26
+
27
+ def generate(self, input_speech, input_text, thinking=False):
28
+ text = self.model.audio_understanding_sft(input_speech, input_text, thinking=thinking)
29
+ return text
30
+
31
+ class SpokenDialogueGenerator:
32
+ def __init__(self, model, device=None):
33
+ self.model = model
34
+ self.device = device
35
+
36
+ def generate(self, input_speech, output_audio_path, system_prompt="You are MiMo-Audio, a friendly AI assistant and your response needs to be concise.", prompt_speech=None, add_history=False):
37
+
38
+ path = Path(output_audio_path)
39
+ path.parent.mkdir(parents=True, exist_ok=True)
40
+ text_response = self.model.spoken_dialogue_sft(input_speech, output_audio_path, system_prompt=system_prompt, prompt_speech=prompt_speech, add_history=add_history)
41
+ return text_response
42
+
43
+ def clear_history(self):
44
+ self.model.clear_history()
45
+
46
+ class Speech2TextDialogueGenerator:
47
+ def __init__(self, model, device=None):
48
+ self.model = model
49
+ self.device = device
50
+
51
+ def generate(self, input_speech, thinking=False, add_history=False):
52
+ text = self.model.speech2text_dialogue_sft(input_speech, thinking=thinking, add_history=add_history)
53
+ return text
54
+
55
+ def clear_history(self):
56
+ self.model.clear_history()
57
+
58
+
59
+ class TextDialogueGenerator:
60
+ def __init__(self, model, device=None):
61
+ self.model = model
62
+ self.device = device
63
+
64
+ def generate(self, input_text, thinking=False, add_history=False):
65
+ text = self.model.text_dialogue_sft(input_text, thinking=thinking, add_history=add_history)
66
+ return text
67
+
68
+ def clear_history(self):
69
+ self.model.clear_history()
70
+
71
+
72
+ class MultiModalSpeechInterface:
73
+ def __init__(self):
74
+ self.model = None
75
+ self.tts_generator = None
76
+ self.audio_understanding_generator = None
77
+ self.spoken_dialogue_generator = None
78
+ self.speech2text_dialogue_generator = None
79
+ self.text_dialogue_generator = None
80
+
81
+ self.device = None
82
+ self.model_initialized = False
83
+
84
+ def initialize_model(self, model_path=None, tokenizer_path=None):
85
+
86
+ try:
87
+ self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
88
+
89
+ if model_path is None:
90
+ model_path = "./models/MiMo-Audio-7B-Instruct"
91
+ if tokenizer_path is None:
92
+ tokenizer_path = "./models/MiMo-Audio-Tokenizer"
93
+
94
+
95
+ print(f"Model path: {model_path}")
96
+ print(f"Tokenizer path: {tokenizer_path}")
97
+
98
+ self.model = MimoAudio(model_path, tokenizer_path)
99
+ self.tts_generator = TTSGenerator(self.model, self.device)
100
+ self.audio_understanding_generator = AudioUnderstandingGenerator(self.model, self.device)
101
+ self.spoken_dialogue_generator = SpokenDialogueGenerator(self.model, self.device)
102
+ self.speech2text_dialogue_generator = Speech2TextDialogueGenerator(self.model, self.device)
103
+ self.text_dialogue_generator = TextDialogueGenerator(self.model, self.device)
104
+
105
+
106
+ self.model_initialized = True
107
+ print("Model loaded successfully!")
108
+ return "✅ Model loaded successfully!"
109
+
110
+ except Exception as e:
111
+ error_msg = f"❌ Model loading failed: {str(e)}"
112
+ print(error_msg)
113
+ return error_msg
114
+
115
+ def generate_tts_audio(self, input_text, instruct="", use_instruct=False):
116
+ if not self.model_initialized:
117
+ return None, "❌ Error: Model not initialized, please load the model first"
118
+
119
+ if not input_text.strip():
120
+ return None, "❌ Error: Please input text content"
121
+
122
+
123
+ try:
124
+ with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
125
+ output_path = tmp_file.name
126
+
127
+
128
+ if not (use_instruct and instruct.strip()):
129
+ instruct = None
130
+
131
+ print(f"Generating TTS audio: {input_text}")
132
+
133
+ text_channel = self.tts_generator.generate(input_text, instruct, output_path)
134
+ status_msg = f"✅ TTS audio generated successfully!\n📝 Input text: {input_text}"
135
+ if use_instruct and instruct is not None and instruct.strip():
136
+ status_msg += f"\n🎭 Style description: {instruct}"
137
+ status_msg += f"\n🎵 Output text channel: {text_channel}"
138
+
139
+ return output_path, status_msg, gr.update(value=output_path, visible=True)
140
+
141
+ except Exception as e:
142
+ error_msg = f"❌ Error generating TTS audio: {str(e)}"
143
+ print(error_msg)
144
+ return None, error_msg, gr.update(visible=False)
145
+
146
+
147
+ def generate_audio_understanding_response(self, input_audio, input_text, thinking=False):
148
+ if not self.model_initialized:
149
+ return "", "❌ Error: Model not initialized, please load the model first"
150
+
151
+ if input_audio is None and not input_text.strip():
152
+ return "", "❌ Error: Please provide either audio input or text question"
153
+
154
+ if input_audio is None:
155
+ return "", "❌ Error: Please upload an audio file for Audio Understanding task"
156
+
157
+ if not input_text.strip():
158
+ return "", "❌ Error: Please input your question"
159
+
160
+ try:
161
+ print(f"Performing Audio Understanding task:")
162
+ print(f"Audio input: {input_audio}")
163
+ print(f"Text question: {input_text}")
164
+
165
+
166
+ audio_understanding_response = self.audio_understanding_generator.generate(input_audio, input_text.strip(), thinking=thinking)
167
+
168
+ status_msg = f"✅ Audio Understanding task completed successfully!\n🎵 Audio input: {os.path.basename(input_audio)}\n❓ Question: {input_text}\n💬 Answer: {audio_understanding_response}"
169
+
170
+ return audio_understanding_response, status_msg
171
+
172
+ except Exception as e:
173
+ error_msg = f"❌ Error performing Audio Understanding task: {str(e)}"
174
+ print(error_msg)
175
+ return "", error_msg
176
+
177
+ def generate_spoken_dialogue_response(self, input_audio, system_prompt=None, prompt_speech=None, add_history=False):
178
+ if not self.model_initialized:
179
+ return "", "❌ Error: Model not initialized, please load the model first"
180
+
181
+ if input_audio is None:
182
+ return "", "❌ Error: Please upload an audio file"
183
+
184
+ try:
185
+
186
+ with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
187
+ output_audio_path = tmp_file.name
188
+
189
+ print(f"Performing spoken dialogue task:")
190
+ print(f"Audio input: {input_audio}")
191
+ print(f"Output path: {output_audio_path}")
192
+
193
+
194
+ dialogue_response = self.spoken_dialogue_generator.generate(input_audio, output_audio_path, system_prompt=system_prompt, prompt_speech=prompt_speech, add_history=add_history)
195
+
196
+ status_msg = f"✅ Spoken dialogue task completed successfully!\n🎵 Audio input: {os.path.basename(input_audio)}\n💬 Response: {dialogue_response[:300]}..."
197
+
198
+ return output_audio_path, dialogue_response, status_msg
199
+
200
+ except Exception as e:
201
+ error_msg = f"❌ Error performing spoken dialogue task: {str(e)}"
202
+ print(error_msg)
203
+ return None, None, error_msg
204
+
205
+
206
+ def generate_speech2text_dialogue_response(self, input_audio, thinking=False, add_history=False):
207
+ if not self.model_initialized:
208
+ return "", "❌ Error: Model not initialized, please load the model first"
209
+
210
+ if input_audio is None:
211
+ return "", "❌ Error: Please upload an audio file for S2T Dialogue task"
212
+
213
+
214
+ try:
215
+ print(f"Performing S2T Dialogue task:")
216
+ print(f"Audio input: {input_audio}")
217
+
218
+
219
+ s2t_response = self.speech2text_dialogue_generator.generate(input_audio, thinking=thinking, add_history=add_history)
220
+
221
+ status_msg = f"✅ S2T dialogue task completed successfully!\n🎵 Audio input: {os.path.basename(input_audio)}\n❓💬 Answer: {s2t_response}"
222
+
223
+ return s2t_response, status_msg
224
+
225
+ except Exception as e:
226
+ error_msg = f"❌ Error performing QA task: {str(e)}"
227
+ print(error_msg)
228
+ return "", error_msg
229
+
230
+ def generate_text_dialogue_response(self, input_text, thinking=False, add_history=False):
231
+ if not self.model_initialized:
232
+ return "", "❌ Error: Model not initialized, please load the model first"
233
+
234
+ if not input_text or not input_text.strip():
235
+ return "", "❌ Error: Please input your text"
236
+
237
+ try:
238
+ print(f"Performing Text Dialogue task:")
239
+ print(f"Text input: {input_text}")
240
+ print(f"Thinking mode: {thinking}")
241
+ print(f"Add history: {add_history}")
242
+
243
+
244
+ t2t_response = self.text_dialogue_generator.generate(input_text.strip(), thinking=thinking, add_history=add_history)
245
+
246
+ status_msg = f"✅ T2T dialogue task completed successfully!\n💬 Input: {input_text}"
247
+ if thinking:
248
+ status_msg += f"\n🧠 Thinking mode: Enabled"
249
+ status_msg += f"\n💬 Answer: {t2t_response}"
250
+
251
+ return t2t_response, status_msg
252
+
253
+ except Exception as e:
254
+ error_msg = f"❌ Error performing T2T dialogue task: {str(e)}"
255
+ print(error_msg)
256
+ return "", error_msg
257
+
258
+ def clear_spoken_dialogue_history(self):
259
+ if not self.model_initialized:
260
+ return None, "", "❌ Error: Model not initialized, please load the model first"
261
+
262
+ try:
263
+ self.spoken_dialogue_generator.clear_history()
264
+ return None, "", "✅ Spoken dialogue history cleared successfully!"
265
+ except Exception as e:
266
+ error_msg = f"❌ Error clearing spoken dialogue history: {str(e)}"
267
+ print(error_msg)
268
+ return None, "", error_msg
269
+
270
+ def clear_speech2text_dialogue_history(self):
271
+ if not self.model_initialized:
272
+ return "", "❌ Error: Model not initialized, please load the model first"
273
+
274
+ try:
275
+ self.speech2text_dialogue_generator.clear_history()
276
+ return "", "✅ Speech-to-text dialogue history cleared successfully!"
277
+ except Exception as e:
278
+ error_msg = f"❌ Error clearing S2T dialogue history: {str(e)}"
279
+ print(error_msg)
280
+ return "", error_msg
281
+
282
+ def clear_text_dialogue_history(self):
283
+ if not self.model_initialized:
284
+ return "", "❌ Error: Model not initialized, please load the model first"
285
+
286
+ try:
287
+ self.text_dialogue_generator.clear_history()
288
+ return "", "✅ Text dialogue history cleared successfully!"
289
+ except Exception as e:
290
+ error_msg = f"❌ Error clearing T2T dialogue history: {str(e)}"
291
+ print(error_msg)
292
+ return "", error_msg
293
+
294
+
295
+
296
+ def create_interface(self):
297
+
298
+ with gr.Blocks(title="MiMo-Audio Multimodal Speech Processing System", theme=gr.themes.Soft()) as iface:
299
+ gr.Markdown("# 🎵 MiMo-Audio Multimodal Speech Processing System")
300
+ gr.Markdown("Supports audio understanding, text-to-speech, spoken dialogue, speech-to-text dialogue and text-to-text dialogue")
301
+
302
+ with gr.Tabs():
303
+
304
+ with gr.TabItem("⚙️ Model Configuration"):
305
+ gr.Markdown("### 📋 Model initialization configuration")
306
+
307
+ with gr.Row():
308
+ with gr.Column():
309
+
310
+ model_path = gr.Textbox(
311
+ label="Model path",
312
+ placeholder="Leave blank to use default path: ./models/MiMo-Audio-7B-Instruct",
313
+ lines=3
314
+ )
315
+
316
+ tokenizer_path = gr.Textbox(
317
+ label="Tokenizer path",
318
+ placeholder="Leave blank to use default path: ./models/MiMo-Audio-Tokenizer",
319
+ lines=3
320
+ )
321
+
322
+ init_btn = gr.Button("🔄 Initialize model", variant="primary", size="lg")
323
+
324
+ with gr.Column():
325
+ init_status = gr.Textbox(
326
+ label="Initialization status",
327
+ interactive=False,
328
+ lines=6,
329
+ placeholder="Click the initialize model button to start..."
330
+ )
331
+
332
+
333
+ gr.Markdown("### 💻 System information")
334
+ device_info = gr.Textbox(
335
+ label="Device information",
336
+ value=f"GPU available: {'Yes' if torch.cuda.is_available() else 'No'}",
337
+ interactive=False
338
+ )
339
+
340
+
341
+ with gr.TabItem("🔊 Audio Understanding"):
342
+ gr.Markdown("### 🎯 Audio Understanding")
343
+
344
+ with gr.Row():
345
+ with gr.Column():
346
+ audio_understanding_input_audio = gr.Audio(
347
+ label="Upload Audio File",
348
+ type="filepath",
349
+ interactive=True,
350
+ )
351
+
352
+ audio_understanding_input_text = gr.Textbox(
353
+ label="Input Question",
354
+ placeholder="Please input your question...",
355
+ lines=3,
356
+ )
357
+
358
+ audio_understanding_thinking = gr.Checkbox(
359
+ label="Enable Thinking Mode",
360
+ value=False,
361
+ info="Enable thinking mode, AI will perform a deeper analysis and thinking"
362
+ )
363
+
364
+ audio_understanding_generate_btn = gr.Button("🤖 Start Audio Understanding", variant="primary", size="lg")
365
+
366
+
367
+
368
+ with gr.Column():
369
+ audio_understanding_output_text = gr.Textbox(
370
+ label="Answer Result",
371
+ lines=8,
372
+ interactive=False,
373
+ placeholder="AI's answer will be displayed here...",
374
+ elem_id="audio_understanding_output_text"
375
+ )
376
+
377
+ audio_understanding_status = gr.Textbox(
378
+ label="Processing Status",
379
+ lines=6,
380
+ interactive=False,
381
+ placeholder="Processing status information will be displayed here..."
382
+ )
383
+
384
+ with gr.Row():
385
+ audio_understanding_copy_btn = gr.Button("📋 Copy Answer", size="sm")
386
+ audio_understanding_clear_btn = gr.Button("🗑️ Clear Result", size="sm")
387
+
388
+ gr.Markdown("### 🌟 Audio Understanding Examples")
389
+ audio_understanding_examples = gr.Examples(
390
+ examples=[
391
+ [None, "这段音频的主要内容是什么?"],
392
+ [None, "说话者的情感状态如何?"],
393
+ [None, "音频中提到了哪些关键信息?"],
394
+ [None, "Please summarize the main points of this conversation."],
395
+ [None, "What viewpoint does the speaker want to express?"]
396
+ ],
397
+ inputs=[audio_understanding_input_audio, audio_understanding_input_text],
398
+ label="Click the example to automatically fill the question"
399
+ )
400
+
401
+
402
+
403
+
404
+ with gr.TabItem("🎵 Text-to-Speech"):
405
+ gr.Markdown("### 🎵 Text-to-Speech")
406
+
407
+ with gr.Row():
408
+ with gr.Column():
409
+
410
+ tts_input_text = gr.Textbox(
411
+ label="Input Text",
412
+ placeholder="Please input the text you want to convert to speech...",
413
+ lines=4,
414
+ max_lines=8
415
+ )
416
+
417
+ tts_instruct = gr.Textbox(
418
+ label="Style Description (Optional)",
419
+ placeholder="Please input the style description (optional)...",
420
+ lines=3,
421
+ max_lines=5
422
+ )
423
+
424
+ tts_use_instruct = gr.Checkbox(
425
+ label="Use Style Description",
426
+ value=True,
427
+ info="Enable to use InstructTTS for style-controlled speech generation"
428
+ )
429
+
430
+ tts_generate_btn = gr.Button("🎵 Generate Speech", variant="primary", size="lg")
431
+
432
+ with gr.Column():
433
+
434
+ tts_output_audio = gr.Audio(
435
+ label="Generated Speech",
436
+ type="filepath"
437
+ )
438
+
439
+ tts_status = gr.Textbox(
440
+ label="Generation Status",
441
+ lines=6,
442
+ interactive=False
443
+ )
444
+
445
+
446
+ tts_download_btn = gr.DownloadButton(
447
+ label="Download Generated Audio",
448
+ visible=False
449
+ )
450
+
451
+
452
+
453
+
454
+ with gr.TabItem("🎤 Spoken Dialogue"):
455
+ gr.Markdown("### 🎯 Spoken Dialogue")
456
+
457
+ with gr.Row():
458
+ with gr.Column():
459
+
460
+ dialogue_input_audio = gr.Audio(
461
+ label="Upload User Speech",
462
+ type="filepath",
463
+ interactive=True
464
+ )
465
+ system_prompt = gr.Textbox(
466
+ label="System Prompt (Optional)",
467
+ placeholder="e.g.: You are MiMo-Audio, a friendly AI assistant and your response needs to be concise.",
468
+ lines=1
469
+ )
470
+ prompt_speech = gr.Audio(
471
+ label="Prompt Speech (Optional, MiMo-Audio speaks with the same timbre as your prompt.)",
472
+ type="filepath",
473
+ interactive=True
474
+ )
475
+ spoken_dialogue_add_history = gr.Checkbox(
476
+ label="Enable History Record",
477
+ value=True,
478
+ info="Enable to remember the previous dialogue context"
479
+ )
480
+
481
+ with gr.Row():
482
+ dialogue_generate_btn = gr.Button("💬 Start Dialogue", variant="primary", size="lg")
483
+
484
+ with gr.Row():
485
+ dialogue_clear_history_btn = gr.Button("🗑️ Clear Dialogue History", size="sm", variant="secondary")
486
+
487
+
488
+
489
+
490
+
491
+ with gr.Column():
492
+
493
+ dialogue_output_audio = gr.Audio(
494
+ label="Output Audio",
495
+ type="filepath"
496
+ )
497
+ dialogue_output_text = gr.Textbox(
498
+ label="Dialogue Response",
499
+ lines=5,
500
+ interactive=False,
501
+ )
502
+ dialogue_status = gr.Textbox(
503
+ label="Dialogue Status",
504
+ lines=5,
505
+ interactive=False,
506
+ )
507
+
508
+
509
+
510
+
511
+
512
+ with gr.TabItem("💬 S2T Dialogue"):
513
+ gr.Markdown("### 🎯 S2T Dialogue")
514
+
515
+ with gr.Row():
516
+ with gr.Column():
517
+
518
+ s2t_dialogue_input_audio = gr.Audio(
519
+ label="Upload User Speech",
520
+ type="filepath",
521
+ interactive=True
522
+ )
523
+
524
+
525
+ s2t_dialogue_add_history = gr.Checkbox(
526
+ label="Enable History Record",
527
+ value=True,
528
+ info="Enable to remember the previous dialogue context"
529
+ )
530
+
531
+ s2t_dialogue_thinking = gr.Checkbox(
532
+ label="Enable Thinking Mode (think mode)",
533
+ value=False,
534
+ info="Enable to perform a deeper analysis and reasoning"
535
+ )
536
+
537
+ with gr.Row():
538
+ s2t_dialogue_generate_btn = gr.Button("🎧 Start S2T Dialogue", variant="primary", size="lg")
539
+
540
+ with gr.Row():
541
+ s2t_dialogue_clear_history_btn = gr.Button("🗑️ Clear Dialogue History", size="sm", variant="secondary")
542
+
543
+
544
+ with gr.Column():
545
+
546
+ s2t_dialogue_output_text = gr.Textbox(
547
+ label="Dialogue Response",
548
+ lines=8,
549
+ interactive=False,
550
+ placeholder="AI's dialogue response will be displayed here..."
551
+ )
552
+
553
+ s2t_dialogue_status = gr.Textbox(
554
+ label="Dialogue Status",
555
+ lines=5,
556
+ interactive=False,
557
+ placeholder="Dialogue status information will be displayed here..."
558
+ )
559
+
560
+
561
+
562
+ with gr.TabItem("📝 T2T Dialogue"):
563
+ gr.Markdown("### 🎯 T2T Dialogue")
564
+
565
+ with gr.Row():
566
+ with gr.Column():
567
+
568
+ t2t_dialogue_input_text = gr.Textbox(
569
+ label="Input Dialogue Content",
570
+ placeholder="Please input the text content you want to dialogue...",
571
+ lines=4,
572
+ max_lines=8
573
+ )
574
+
575
+ t2t_dialogue_add_history = gr.Checkbox(
576
+ label="Enable History Record",
577
+ value=True,
578
+ info="Enable to remember the previous dialogue context"
579
+ )
580
+
581
+ t2t_dialogue_thinking = gr.Checkbox(
582
+ label="Enable Thinking Mode (Thinking)",
583
+ value=False,
584
+ info="Enable thinking mode, AI will perform a deeper analysis and thinking"
585
+ )
586
+
587
+ with gr.Row():
588
+ t2t_dialogue_generate_btn = gr.Button("💬 Start T2T Dialogue", variant="primary", size="lg")
589
+
590
+ with gr.Row():
591
+ t2t_dialogue_clear_history_btn = gr.Button("🗑️ Clear Dialogue History", size="sm", variant="secondary")
592
+
593
+
594
+
595
+ with gr.Column():
596
+ t2t_dialogue_output_text = gr.Textbox(
597
+ label="Dialogue Response",
598
+ lines=8,
599
+ interactive=False,
600
+ placeholder="AI's dialogue response will be displayed here..."
601
+ )
602
+
603
+ t2t_dialogue_status = gr.Textbox(
604
+ label="Dialogue Status",
605
+ lines=5,
606
+ interactive=False,
607
+ placeholder="Dialogue status information will be displayed here..."
608
+ )
609
+
610
+ gr.Markdown("### 🌟 T2T Dialogue Examples")
611
+ t2t_dialogue_examples = gr.Examples(
612
+ examples=[
613
+ ["Hello, how are you?"],
614
+ ["I want to know the history of the development of artificial intelligence"],
615
+ ["Please recommend some good movies"],
616
+ ["Can you help me explain the basic concepts of quantum physics?"],
617
+ ["I'm learning programming recently, any suggestions?"]
618
+ ],
619
+ inputs=[t2t_dialogue_input_text],
620
+ label="Click the example to automatically fill the dialogue content"
621
+ )
622
+
623
+
624
+
625
+ def copy_text_to_clipboard(text):
626
+ return text
627
+
628
+ def clear_audio_understanding_results():
629
+ return "", "🗑️ Audio Understanding Result Cleared"
630
+
631
+
632
+ init_btn.click(
633
+ fn=lambda path, tok_path: self.initialize_model(path or None, tok_path or None),
634
+ inputs=[model_path, tokenizer_path],
635
+ outputs=[init_status]
636
+ )
637
+
638
+
639
+ audio_understanding_generate_btn.click(
640
+ fn=self.generate_audio_understanding_response,
641
+ inputs=[audio_understanding_input_audio, audio_understanding_input_text, audio_understanding_thinking],
642
+ outputs=[audio_understanding_output_text, audio_understanding_status]
643
+ )
644
+
645
+ audio_understanding_copy_btn.click(
646
+ fn=None,
647
+ inputs=[audio_understanding_output_text],
648
+ js="(text) => {navigator.clipboard.writeText(text); alert('Copied to clipboard!')}"
649
+ )
650
+
651
+ tts_generate_btn.click(
652
+ fn=self.generate_tts_audio,
653
+ inputs=[tts_input_text, tts_instruct, tts_use_instruct],
654
+ outputs=[tts_output_audio, tts_status, tts_download_btn]
655
+ )
656
+
657
+ dialogue_generate_btn.click(
658
+ fn=self.generate_spoken_dialogue_response,
659
+ inputs=[dialogue_input_audio, system_prompt, prompt_speech, spoken_dialogue_add_history],
660
+ outputs=[dialogue_output_audio, dialogue_output_text, dialogue_status]
661
+ )
662
+
663
+
664
+
665
+ dialogue_clear_history_btn.click(
666
+ fn=self.clear_spoken_dialogue_history,
667
+ outputs=[dialogue_output_audio, dialogue_output_text, dialogue_status]
668
+ )
669
+
670
+
671
+ s2t_dialogue_generate_btn.click(
672
+ fn=self.generate_speech2text_dialogue_response,
673
+ inputs=[s2t_dialogue_input_audio, s2t_dialogue_thinking, s2t_dialogue_add_history],
674
+ outputs=[s2t_dialogue_output_text, s2t_dialogue_status]
675
+ )
676
+
677
+
678
+
679
+ s2t_dialogue_clear_history_btn.click(
680
+ fn=self.clear_speech2text_dialogue_history,
681
+ outputs=[s2t_dialogue_output_text, s2t_dialogue_status]
682
+ )
683
+
684
+
685
+ t2t_dialogue_generate_btn.click(
686
+ fn=self.generate_text_dialogue_response,
687
+ inputs=[t2t_dialogue_input_text, t2t_dialogue_thinking, t2t_dialogue_add_history],
688
+ outputs=[t2t_dialogue_output_text, t2t_dialogue_status]
689
+ )
690
+
691
+
692
+ t2t_dialogue_clear_history_btn.click(
693
+ fn=self.clear_text_dialogue_history,
694
+ outputs=[t2t_dialogue_output_text, t2t_dialogue_status]
695
+ )
696
+
697
+
698
+
699
+
700
+ audio_understanding_clear_btn.click(
701
+ fn=clear_audio_understanding_results,
702
+ outputs=[audio_understanding_output_text, audio_understanding_status]
703
+ )
704
+
705
+
706
+
707
+
708
+
709
+
710
+ tts_input_text.submit(
711
+ fn=self.generate_tts_audio,
712
+ inputs=[tts_input_text, tts_instruct, tts_use_instruct],
713
+ outputs=[tts_output_audio, tts_status, tts_download_btn]
714
+ )
715
+
716
+
717
+ audio_understanding_input_text.submit(
718
+ fn=self.generate_audio_understanding_response,
719
+ inputs=[audio_understanding_input_audio, audio_understanding_input_text, audio_understanding_thinking],
720
+ outputs=[audio_understanding_output_text, audio_understanding_status]
721
+ )
722
+
723
+ t2t_dialogue_input_text.submit(
724
+ fn=self.generate_text_dialogue_response,
725
+ inputs=[t2t_dialogue_input_text, t2t_dialogue_thinking, t2t_dialogue_add_history],
726
+ outputs=[t2t_dialogue_output_text, t2t_dialogue_status]
727
+ )
728
+
729
+
730
+ return iface
731
+
732
+ def main():
733
+ parser = argparse.ArgumentParser(description="MiMo-Audio")
734
+ parser.add_argument("--host", default="0.0.0.0", help="Server Address")
735
+ parser.add_argument("--port", type=int, default=7897, help="Port")
736
+ parser.add_argument("--share", action="store_true", help="Create Public Link")
737
+ parser.add_argument("--debug", action="store_true", help="Debug Mode")
738
+
739
+ args = parser.parse_args()
740
+
741
+
742
+
743
+ print("🚀 Launch MiMo-Audio...")
744
+
745
+
746
+ speech_interface = MultiModalSpeechInterface()
747
+
748
+
749
+
750
+ print("🎨 Create Gradio Interface...")
751
+ iface = speech_interface.create_interface()
752
+
753
+
754
+ print(f"🌐 Launch Service - Address: {args.host}:{args.port}")
755
+
756
+ iface.launch(
757
+ server_name=args.host,
758
+ server_port=args.port,
759
+ share=args.share,
760
+ debug=args.debug
761
+ )
762
+
763
+ if __name__ == "__main__":
764
+ main()
src/mimo_audio/mimo_audio.py ADDED
@@ -0,0 +1,1292 @@
1
+ # Copyright 2025 Xiaomi Corporation.
2
+ import os
3
+ import re
4
+ import time
5
+ import random
6
+ import torch
7
+ import torchaudio
8
+ import soundfile as sf
9
+
10
+ from typing import Union
11
+ from torchaudio.transforms import MelSpectrogram
12
+ from transformers import (
13
+ AutoTokenizer,
14
+ GenerationConfig
15
+ )
16
+ from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
17
+
18
+ from .process_speechdata import InputSegment, StreamingInputSegment
19
+ from ..mimo_audio_tokenizer import MiMoAudioTokenizer
20
+ from .templates import asr_en_templates, asr_zh_templates, tts_en_templates, tts_zh_templates
21
+ from .modeling_mimo_audio import (
22
+ MiMoAudioArguments,
23
+ MiMoAudioForCausalLM,
24
+ MiMoSampler,
25
+ MiMoStopper,
26
+ )
27
+
28
+
29
+ def detect_language(text):
30
+ if re.search(r'[\u4e00-\u9fff]', text):
31
+ return 'zh'
32
+ else:
33
+ return 'en'
34
+
35
+
36
+ class MimoAudio:
37
+
38
+ def __init__(
39
+ self,
40
+ model_path: str,
41
+ mimo_audio_tokenizer_path: str,
42
+ device: str | None = None,
43
+ ) -> None:
44
+ self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
45
+
46
+ self.path = model_path
47
+ self.mimo_audio_tokenizer_path = mimo_audio_tokenizer_path
48
+
49
+ self.tokenizer: PreTrainedTokenizerFast = AutoTokenizer.from_pretrained(
50
+ self.path
51
+ )
52
+ self.padding_idx = int(self.tokenizer.pad_token_id)
53
+
54
+ special_tokens = [
55
+ "<|sosp|>",
56
+ "<|eosp|>",
57
+ "<|empty|>",
58
+ "<|Human|>",
59
+ "<|SpeechLM|>",
60
+ "<|sostm|>",
61
+ "<|eostm|>",
62
+ "<|eot|>",
63
+ ]
64
+ for token in special_tokens:
65
+ if token not in self.tokenizer.get_vocab():
66
+ print(f"Add special tokens {token} to tokenizer.vocab")
67
+ self.tokenizer.add_tokens([token], special_tokens=True)
68
+
69
+ self.sosp_idx = self.tokenizer.convert_tokens_to_ids("<|sosp|>")
70
+ self.eosp_idx = self.tokenizer.convert_tokens_to_ids("<|eosp|>")
71
+ self.empty_token = self.tokenizer.convert_tokens_to_ids("<|empty|>")
72
+ self.sostm_idx = self.tokenizer.convert_tokens_to_ids("<|sostm|>")
73
+ self.eostm_idx = self.tokenizer.convert_tokens_to_ids("<|eostm|>")
74
+ self.eot_idx = self.tokenizer.convert_tokens_to_ids("<|eot|>")
75
+ self.im_start_idx = self.tokenizer.convert_tokens_to_ids("<|im_start|>")
76
+ self.im_end_idx = self.tokenizer.convert_tokens_to_ids("<|im_end|>")
77
+
78
+ model_args = MiMoAudioArguments(
79
+ model_name_or_path=self.path,
80
+ sosp_idx=self.sosp_idx,
81
+ eosp_idx=self.eosp_idx,
82
+ empty_idx=self.empty_token,
83
+ sostm_idx=self.sostm_idx,
84
+ eostm_idx=self.eostm_idx,
85
+ eot_idx=self.eot_idx,
86
+ )
87
+
88
+ start_loading_time = time.monotonic()
89
+ self.model = MiMoAudioForCausalLM.from_pretrained(
90
+ self.path,
91
+ args=model_args,
92
+ torch_dtype=torch.bfloat16,
93
+ device_map={"": self.device},
94
+ )
95
+
96
+ self.group_size=self.model.config.group_size
97
+ self.audio_channels=self.model.config.audio_channels
98
+ self.delay_pattern = self.model.config.delay_pattern
99
+ self.vocab_size = self.model.config.vocab_size
100
+
101
+ self.speech_zeroemb_idx = self.model.speech_empty_ids
102
+
103
+ self.model.eval()
104
+ print(
105
+ f"Model loaded in {time.monotonic() - start_loading_time:.2f} seconds, device: {self.device}"
106
+ )
107
+
108
+ self.generate_kwargs = {
109
+ "max_length": 8192,
110
+ "eos_token_id": self.tokenizer.eos_token_id,
111
+ "pad_token_id": self.tokenizer.pad_token_id,
112
+ }
113
+ self.default_global_sampler = MiMoSampler(
114
+ do_sample=True, temperature=0.6, top_k=50, top_p=0.95
115
+ )
116
+ self.default_local_sampler = MiMoSampler(
117
+ do_sample=True, temperature=0.9, top_k=50, top_p=0.95
118
+ )
119
+
120
+ self.task_sampler_configs = {
121
+ "asr": {
122
+ "global": MiMoSampler(do_sample=False, temperature=1.0, top_p=1.0),
123
+ "local": MiMoSampler(do_sample=True, temperature=0.9, top_p=0.95)
124
+ },
125
+ "tts": {
126
+ "global": MiMoSampler(do_sample=True, temperature=0.6, top_p=1.0),
127
+ "local": MiMoSampler(do_sample=True, temperature=0.9, top_p=0.95)
128
+ },
129
+ "spoken_dialogue": {
130
+ "global": MiMoSampler(do_sample=True, temperature=0.6, top_p=0.95),
131
+ "local": MiMoSampler(do_sample=True, temperature=0.9, top_p=0.95)
132
+ },
133
+ "audio_understanding": {
134
+ "global": MiMoSampler(do_sample=True, temperature=0.3, top_p=0.95),
135
+ "local": MiMoSampler(do_sample=True, temperature=0.9, top_p=0.95)
136
+ },
137
+ "text_chat": {
138
+ "global": MiMoSampler(do_sample=True, temperature=0.4, top_p=0.95),
139
+ "local": MiMoSampler(do_sample=True, temperature=0.9, top_p=0.95)
140
+ },
141
+ "in_context_learning_s2s": {
142
+ "global": MiMoSampler(do_sample=False, temperature=1.0, top_p=1.0),
143
+ "local": MiMoSampler(do_sample=True, temperature=0.9, top_p=0.95)
144
+ },
145
+ }
146
+
147
+ start_loading_mimo_audio_tokenizer_time = time.monotonic()
148
+ self.mimo_audio_tokenizer = MiMoAudioTokenizer.from_pretrained(self.mimo_audio_tokenizer_path)
149
+
150
+ self.mimo_audio_tokenizer.eval().bfloat16().to(self.device)
151
+ print(
152
+ f"MiMo-Audio Tokenizer loaded in {time.monotonic() - start_loading_mimo_audio_tokenizer_time:.2f} seconds, device: {self.device}"
153
+ )
154
+
155
+ # Initialize mel spectrogram transform for consistent processing
156
+ self.mel_transform = MelSpectrogram(
157
+ sample_rate=self.mimo_audio_tokenizer.config.sampling_rate,
158
+ n_fft=self.mimo_audio_tokenizer.config.nfft,
159
+ hop_length=self.mimo_audio_tokenizer.config.hop_length,
160
+ win_length=self.mimo_audio_tokenizer.config.window_size,
161
+ f_min=self.mimo_audio_tokenizer.config.fmin,
162
+ f_max=self.mimo_audio_tokenizer.config.fmax,
163
+ n_mels=self.mimo_audio_tokenizer.config.n_mels,
164
+ power=1.0,
165
+ center=True,
166
+ ).to(self.device)
167
+
168
+ self.history = None
169
+
170
+ def get_task_sampler(self, task_name):
171
+ if task_name not in self.task_sampler_configs:
172
+ return {
173
+ "global": self.default_global_sampler,
174
+ "local": self.default_local_sampler
175
+ }
176
+ return self.task_sampler_configs[task_name]
177
+
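+ # Waveforms are written at 24 kHz, the assumed output rate of the audio
+ # tokenizer's decoder.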
178
+ def save_wav(self, path, wav):
179
+ sf.write(
180
+ path,
181
+ wav.reshape(-1).detach().cpu().numpy(),
182
+ 24000,
183
+ )
184
+
185
+ def wav2mel(self, wav):
186
+ spec = self.mel_transform(wav[None, :])
187
+ return torch.log(torch.clip(spec, min=1e-7)).squeeze()
188
+
189
+ def resample_audio_if_needed(self, wav_tensor: torch.Tensor, original_sr: int):
190
+ target_sr = self.mimo_audio_tokenizer.config.sampling_rate
191
+ if original_sr != target_sr:
192
+ wav_tensor = torchaudio.functional.resample(
193
+ wav_tensor, original_sr, target_sr
194
+ )
195
+ return wav_tensor
196
+
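+ # Pack consecutive mel segments into batches whose total frame count stays
+ # under max_length, so long inputs can be encoded in several bounded passes.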
197
+ def group_by_length(self, features: torch.Tensor, lengths: torch.Tensor, max_length: int):
198
+ if features.size(0) != lengths.sum().item():
199
+ raise ValueError(f"Feature size mismatch: {features.size(0)} vs {lengths.sum().item()}")
200
+
201
+ split_points = []
202
+ current_sum = 0
203
+
204
+ for i, seq_len in enumerate(lengths):
205
+ if current_sum + seq_len > max_length and current_sum > 0:
206
+ split_points.append(i)
207
+ current_sum = seq_len.item()
208
+ else:
209
+ current_sum += seq_len.item()
210
+
211
+ # Convert split points to group sizes
212
+ group_sizes = []
213
+ prev = 0
214
+ for point in split_points:
215
+ group_sizes.append(point - prev)
216
+ prev = point
217
+ if prev < len(lengths):
218
+ group_sizes.append(len(lengths) - prev)
219
+
220
+ len_groups = torch.split(lengths, group_sizes)
221
+ feature_sizes = [group.sum().item() for group in len_groups]
222
+ feature_groups = torch.split(features, feature_sizes)
223
+
224
+ return feature_groups, len_groups
225
+
226
+ def encode_batch(self, input_features: torch.Tensor, input_lens: torch.Tensor, max_length: int = 256000):
227
+ feature_groups, len_groups = self.group_by_length(input_features, input_lens, max_length)
228
+
229
+ encoded_parts = []
230
+ for features, lengths in zip(feature_groups, len_groups):
231
+ with torch.no_grad():
232
+ codes, _ = self.mimo_audio_tokenizer.encoder.encode(
233
+ input_features=features.to(self.device),
234
+ input_lens=lengths.to(self.device),
235
+ return_codes_only=True
236
+ )
237
+ encoded_parts.append(codes)
238
+
239
+ return torch.cat(encoded_parts, dim=-1)
240
+
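+ # Audio inputs (file path or waveform tensor) are converted to log-mel frames,
+ # encoded in <=6000-frame segments by the audio tokenizer, and padded to a
+ # multiple of group_size; plain text is passed through (capitalized if it is
+ # written entirely in one case).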
241
+ def preprocess_input(
242
+ self,
243
+ input: Union[None, str, torch.Tensor] = None,
244
+ ):
245
+ if isinstance(input, torch.Tensor) or (isinstance(input, str) and os.path.isfile(input)):
246
+ if isinstance(input, torch.Tensor):
247
+ wav = input
248
+ else:
249
+ wav, sr = torchaudio.load(input)
250
+ if wav.ndim == 2:
251
+ wav = wav.mean(dim=0)
252
+ wav = self.resample_audio_if_needed(wav, sr)
253
+ wav = wav.to(self.device)
254
+
255
+ mel = self.wav2mel(wav).transpose(0, 1) # (seq_len, n_mels)
256
+
257
+ input_len = mel.size(0)
258
+ segment_size = 6000
259
+ input_len_seg = [segment_size] * (input_len // segment_size)
260
+ if input_len % segment_size > 0:
261
+ input_len_seg.append(input_len % segment_size)
262
+
263
+ codes_packed = self.encode_batch(
264
+ input_features=mel,
265
+ input_lens=torch.tensor(input_len_seg),
266
+ )
267
+
268
+ codes = codes_packed.transpose(0, 1).detach().cpu()
269
+ audio_codes = codes[:, :self.audio_channels]
270
+
271
+ # Pad the sequence to be a multiple of group_size by repeating the last frame
272
+ num_timesteps = audio_codes.shape[0]
273
+ if num_timesteps % self.group_size != 0:
274
+ padding_needed = self.group_size - (num_timesteps % self.group_size)
275
+ last_tokens = audio_codes[-1:, :] # Keep dim for repeat
276
+ padding_tokens = last_tokens.repeat(padding_needed, 1)
277
+ audio_codes = torch.cat([audio_codes, padding_tokens], dim=0)
278
+
279
+ audio_tokenized = audio_codes.reshape(-1)
280
+
281
+ return audio_tokenized
282
+ else:
283
+ text = input
284
+ if (
285
+ text.isupper() or text.islower()
286
+ ): # If the text only contains upper-case or lower-case letters, capitalize it.
287
+ text = text.capitalize()
288
+ return text
289
+
290
+ def get_input_ids(self, prompt):
291
+ input_ids = [
292
+ seg.to_input_id(
293
+ self.tokenizer,
294
+ self.group_size,
295
+ self.audio_channels,
296
+ )
297
+ for seg in prompt
298
+ ]
299
+ input_ids = torch.cat(input_ids, dim=1)
300
+ return input_ids.to(self.device)
301
+
302
+
303
+ def get_asr_sft_prompt(
304
+ self,
305
+ input: Union[None, str] = None,
306
+ ):
307
+ audio_tokenized = self.preprocess_input(input)
308
+
309
+ template = random.choice(asr_zh_templates + asr_en_templates)
310
+
311
+ lm_prompt = [
312
+ InputSegment(
313
+ text=f"<|im_start|>user\n",
314
+ speech_zeroemb_idx=self.speech_zeroemb_idx,
315
+ text_zeroemb_idx=self.empty_token,
316
+ ),
317
+ InputSegment(
318
+ audio=audio_tokenized,
319
+ speech_zeroemb_idx=self.speech_zeroemb_idx,
320
+ text_zeroemb_idx=self.empty_token,
321
+ ),
322
+ InputSegment(
323
+ text=template,
324
+ speech_zeroemb_idx=self.speech_zeroemb_idx,
325
+ text_zeroemb_idx=self.empty_token,
326
+ ),
327
+ InputSegment(
328
+ text=f"<|im_end|>\n",
329
+ speech_zeroemb_idx=self.speech_zeroemb_idx,
330
+ text_zeroemb_idx=self.empty_token,
331
+ ),
332
+ InputSegment(
333
+ text=f"<|im_start|>assistant\n",
334
+ speech_zeroemb_idx=self.speech_zeroemb_idx,
335
+ text_zeroemb_idx=self.empty_token,
336
+ ),
337
+ InputSegment(
338
+ text="<think>\n\n</think>\n",
339
+ speech_zeroemb_idx=self.speech_zeroemb_idx,
340
+ text_zeroemb_idx=self.empty_token,
341
+ )
342
+ ]
343
+ input_ids = self.get_input_ids(lm_prompt)
344
+ return input_ids
345
+
346
+ def get_tts_sft_prompt(
347
+ self,
348
+ input: Union[None, str] = None,
349
+ instruct=None,
350
+ read_text_only=True,
351
+ prompt_speech=None,
352
+ ):
353
+ if prompt_speech is not None:
354
+ assistant_prompt_audio_token = self.preprocess_input(prompt_speech)
355
+ else:
356
+ assistant_prompt_audio_token = None
357
+ if not read_text_only:
358
+ text = self.preprocess_input(input)
359
+ if assistant_prompt_audio_token is None:
360
+ lm_prompt = [
361
+ InputSegment(
362
+ text="<|im_start|>system\n",
363
+ speech_zeroemb_idx=self.speech_zeroemb_idx,
364
+ text_zeroemb_idx=self.empty_token,
365
+ ),
366
+ InputSegment(
367
+ text=f"你需要根据指定的风格指令和文本内容来生成语音。",
368
+ speech_zeroemb_idx=self.speech_zeroemb_idx,
369
+ text_zeroemb_idx=self.empty_token,
370
+ ),
371
+ InputSegment(
372
+ text="<|im_end|>\n",
373
+ speech_zeroemb_idx=self.speech_zeroemb_idx,
374
+ text_zeroemb_idx=self.empty_token,
375
+ ),
376
+ InputSegment(
377
+ text=f"<|im_start|>user\n{text}<|im_end|>\n",
378
+ speech_zeroemb_idx=self.speech_zeroemb_idx,
379
+ text_zeroemb_idx=self.empty_token,
380
+ ),
381
+ InputSegment(
382
+ text=f"<|im_start|>assistant\n<think>\n",
383
+ speech_zeroemb_idx=self.speech_zeroemb_idx,
384
+ text_zeroemb_idx=self.empty_token,
385
+ ),
386
+ ]
387
+ else:
388
+ lm_prompt = [
389
+ InputSegment(
390
+ text="<|im_start|>system\n",
391
+ speech_zeroemb_idx=self.speech_zeroemb_idx,
392
+ text_zeroemb_idx=self.empty_token,
393
+ ),
394
+ InputSegment(
395
+ text=f"你需要根据指定的风格指令和文本内容来生成和语音prompt具有相同音色的语音。你的音色应该是:",
396
+ speech_zeroemb_idx=self.speech_zeroemb_idx,
397
+ text_zeroemb_idx=self.empty_token,
398
+ ),
399
+ InputSegment(
400
+ text="",
401
+ audio=assistant_prompt_audio_token,
402
+ speech_zeroemb_idx=self.speech_zeroemb_idx,
403
+ text_zeroemb_idx=self.empty_token,
404
+ ),
405
+ InputSegment(
406
+ text="<|im_end|>\n",
407
+ speech_zeroemb_idx=self.speech_zeroemb_idx,
408
+ text_zeroemb_idx=self.empty_token,
409
+ ),
410
+ InputSegment(
411
+ text=f"<|im_start|>user\n{text}<|im_end|>\n",
412
+ speech_zeroemb_idx=self.speech_zeroemb_idx,
413
+ text_zeroemb_idx=self.empty_token,
414
+ ),
415
+ InputSegment(
416
+ text=f"<|im_start|>assistant\n<think>\n",
417
+ speech_zeroemb_idx=self.speech_zeroemb_idx,
418
+ text_zeroemb_idx=self.empty_token,
419
+ ),
420
+ ]
421
+ else:
422
+ language = detect_language(input)
423
+ if language == "zh":
424
+ template = random.choice(tts_zh_templates)
425
+ else:
426
+ template = random.choice(tts_en_templates)
427
+
428
+ text = self.preprocess_input(input)
429
+ if instruct is None:
430
+ lm_prompt = [
431
+ InputSegment(
432
+ text=f"<|im_start|>user\n{template}: {text}<|im_end|>\n",
433
+ speech_zeroemb_idx=self.speech_zeroemb_idx,
434
+ text_zeroemb_idx=self.empty_token,
435
+ ),
436
+ InputSegment(
437
+ text=f"<|im_start|>assistant\n<|sostm|>",
438
+ speech_zeroemb_idx=self.speech_zeroemb_idx,
439
+ text_zeroemb_idx=self.empty_token,
440
+ ),
441
+ ]
442
+ else:
443
+ if assistant_prompt_audio_token is None:
444
+ lm_prompt = [
445
+ InputSegment(
446
+ text="<|im_start|>system\n",
447
+ speech_zeroemb_idx=self.speech_zeroemb_idx,
448
+ text_zeroemb_idx=self.empty_token,
449
+ ),
450
+ InputSegment(
451
+ text=f"你需要根据指定的风格指令和文本内容来生成语音。",
452
+ speech_zeroemb_idx=self.speech_zeroemb_idx,
453
+ text_zeroemb_idx=self.empty_token,
454
+ ),
455
+ InputSegment(
456
+ text="<|im_end|>\n",
457
+ speech_zeroemb_idx=self.speech_zeroemb_idx,
458
+ text_zeroemb_idx=self.empty_token,
459
+ ),
460
+ InputSegment(
461
+ text=f"<|im_start|>user\n{template}: {text}({instruct})<|im_end|>\n",
462
+ speech_zeroemb_idx=self.speech_zeroemb_idx,
463
+ text_zeroemb_idx=self.empty_token,
464
+ ),
465
+ InputSegment(
466
+ text=f"<|im_start|>assistant\n<think>\n",
467
+ speech_zeroemb_idx=self.speech_zeroemb_idx,
468
+ text_zeroemb_idx=self.empty_token,
469
+ ),
470
+ ]
471
+ else:
472
+ lm_prompt = [
473
+ InputSegment(
474
+ text="<|im_start|>system\n",
475
+ speech_zeroemb_idx=self.speech_zeroemb_idx,
476
+ text_zeroemb_idx=self.empty_token,
477
+ ),
478
+ InputSegment(
479
+ text=f"你需要根据指定的风格指令和文本内容来生成和语音prompt具有相同音色的语音。你的音色应该是:",
480
+ speech_zeroemb_idx=self.speech_zeroemb_idx,
481
+ text_zeroemb_idx=self.empty_token,
482
+ ),
483
+ InputSegment(
484
+ text="",
485
+ audio=assistant_prompt_audio_token,
486
+ speech_zeroemb_idx=self.speech_zeroemb_idx,
487
+ text_zeroemb_idx=self.empty_token,
488
+ ),
489
+ InputSegment(
490
+ text="<|im_end|>\n",
491
+ speech_zeroemb_idx=self.speech_zeroemb_idx,
492
+ text_zeroemb_idx=self.empty_token,
493
+ ),
494
+ InputSegment(
495
+ text=f"<|im_start|>user\n{template}: {text}({instruct})<|im_end|>\n",
496
+ speech_zeroemb_idx=self.speech_zeroemb_idx,
497
+ text_zeroemb_idx=self.empty_token,
498
+ ),
499
+ InputSegment(
500
+ text=f"<|im_start|>assistant\n<think>\n",
501
+ speech_zeroemb_idx=self.speech_zeroemb_idx,
502
+ text_zeroemb_idx=self.empty_token,
503
+ ),
504
+ ]
505
+
506
+ input_ids = self.get_input_ids(lm_prompt)
507
+ return input_ids
508
+
509
+
510
+ def get_audio_understanding_sft_prompt(
511
+ self,
512
+ input_speech,
513
+ input_text,
514
+ thinking=False,
515
+ ):
516
+ audio_tokenized = self.preprocess_input(input_speech)
517
+
518
+ lm_prompt = [
519
+ InputSegment(
520
+ text=f"<|im_start|>user\n",
521
+ speech_zeroemb_idx=self.speech_zeroemb_idx,
522
+ text_zeroemb_idx=self.empty_token,
523
+ ),
524
+ InputSegment(
525
+ audio=audio_tokenized,
526
+ speech_zeroemb_idx=self.speech_zeroemb_idx,
527
+ text_zeroemb_idx=self.empty_token,
528
+ ),
529
+ InputSegment(
530
+ text=input_text,
531
+ speech_zeroemb_idx=self.speech_zeroemb_idx,
532
+ text_zeroemb_idx=self.empty_token,
533
+ ),
534
+ InputSegment(
535
+ text=f"<|im_end|>\n",
536
+ speech_zeroemb_idx=self.speech_zeroemb_idx,
537
+ text_zeroemb_idx=self.empty_token,
538
+ ),
539
+ InputSegment(
540
+ text=f"<|im_start|>assistant\n",
541
+ speech_zeroemb_idx=self.speech_zeroemb_idx,
542
+ text_zeroemb_idx=self.empty_token,
543
+ ),
544
+ ]
545
+ if not thinking:
546
+ lm_prompt.append(
547
+ InputSegment(
548
+ text="<think>\n\n</think>\n",
549
+ speech_zeroemb_idx=self.speech_zeroemb_idx,
550
+ text_zeroemb_idx=self.empty_token,
551
+ )
552
+ )
553
+ else:
554
+ lm_prompt.append(
555
+ InputSegment(
556
+ text="<think>\n",
557
+ speech_zeroemb_idx=self.speech_zeroemb_idx,
558
+ text_zeroemb_idx=self.empty_token,
559
+ )
560
+ )
561
+
562
+ input_ids = self.get_input_ids(lm_prompt)
563
+ return input_ids
564
+
565
+ def get_spoken_dialogue_sft_prompt(
566
+ self,
567
+ input_speech,
568
+ system_prompt=None,
569
+ prompt_speech=None,
570
+ add_history=False,
571
+ ):
572
+ audio_tokenized = self.preprocess_input(input_speech)
573
+
574
+ lm_prompt = []
575
+
576
+ if add_history and self.history is not None:
577
+ lm_prompt += [
578
+ InputSegment(
579
+ text=f"<|im_start|>user\n",
580
+ speech_zeroemb_idx=self.speech_zeroemb_idx,
581
+ text_zeroemb_idx=self.empty_token,
582
+ ),
583
+ InputSegment(
584
+ audio=audio_tokenized,
585
+ speech_zeroemb_idx=self.speech_zeroemb_idx,
586
+ text_zeroemb_idx=self.empty_token,
587
+ ),
588
+ InputSegment(
589
+ text=f"<|im_end|>\n",
590
+ speech_zeroemb_idx=self.speech_zeroemb_idx,
591
+ text_zeroemb_idx=self.empty_token,
592
+ ),
593
+ InputSegment(
594
+ text=f"<|im_start|>assistant\n<|sostm|>",
595
+ speech_zeroemb_idx=self.speech_zeroemb_idx,
596
+ text_zeroemb_idx=self.empty_token,
597
+ ),
598
+ ]
599
+ else:
600
+ if prompt_speech:
601
+ lm_prompt += [
602
+ InputSegment(
603
+ text="<|im_start|>system\n",
604
+ speech_zeroemb_idx=self.speech_zeroemb_idx,
605
+ text_zeroemb_idx=self.empty_token,
606
+ ),
607
+ InputSegment(
608
+ text=f"Your voice should be:",
609
+ speech_zeroemb_idx=self.speech_zeroemb_idx,
610
+ text_zeroemb_idx=self.empty_token,
611
+ ),
612
+ InputSegment(
613
+ audio=self.preprocess_input(prompt_speech),
614
+ speech_zeroemb_idx=self.speech_zeroemb_idx,
615
+ text_zeroemb_idx=self.empty_token,
616
+ ),
617
+ InputSegment(
618
+ text="<|im_end|>\n",
619
+ speech_zeroemb_idx=self.speech_zeroemb_idx,
620
+ text_zeroemb_idx=self.empty_token,
621
+ ),
622
+ ]
623
+
624
+ lm_prompt += [
625
+ InputSegment(
626
+ text=f"<|im_start|>user\n",
627
+ speech_zeroemb_idx=self.speech_zeroemb_idx,
628
+ text_zeroemb_idx=self.empty_token,
629
+ )
630
+ ]
631
+
632
+ if system_prompt:
633
+ lm_prompt += [
634
+ InputSegment(
635
+ text=system_prompt,
636
+ speech_zeroemb_idx=self.speech_zeroemb_idx,
637
+ text_zeroemb_idx=self.empty_token,
638
+ ),
639
+ InputSegment(
640
+ text="\n\n",
641
+ speech_zeroemb_idx=self.speech_zeroemb_idx,
642
+ text_zeroemb_idx=self.empty_token,
643
+ )
644
+ ]
645
+ lm_prompt += [
646
+ InputSegment(
647
+ audio=audio_tokenized,
648
+ speech_zeroemb_idx=self.speech_zeroemb_idx,
649
+ text_zeroemb_idx=self.empty_token,
650
+ ),
651
+ InputSegment(
652
+ text=f"<|im_end|>\n",
653
+ speech_zeroemb_idx=self.speech_zeroemb_idx,
654
+ text_zeroemb_idx=self.empty_token,
655
+ ),
656
+ InputSegment(
657
+ text=f"<|im_start|>assistant\n<|sostm|>",
658
+ speech_zeroemb_idx=self.speech_zeroemb_idx,
659
+ text_zeroemb_idx=self.empty_token,
660
+ ),
661
+ ]
662
+
663
+ input_ids = self.get_input_ids(lm_prompt)
664
+ return input_ids
665
+
666
+
667
+ def get_spoken_dialogue_sft_multiturn_prompt(
668
+ self,
669
+ message_list,
670
+ system_prompt=None,
671
+ prompt_speech=None,
672
+ ):
673
+ lm_prompt = []
674
+
675
+ if prompt_speech:
676
+ lm_prompt += [
677
+ InputSegment(
678
+ text="<|im_start|>system\n",
679
+ speech_zeroemb_idx=self.speech_zeroemb_idx,
680
+ text_zeroemb_idx=self.empty_token,
681
+ ),
682
+ InputSegment(
683
+ text=f"Your voice should be:",
684
+ speech_zeroemb_idx=self.speech_zeroemb_idx,
685
+ text_zeroemb_idx=self.empty_token,
686
+ ),
687
+ InputSegment(
688
+ audio=self.preprocess_input(prompt_speech),
689
+ speech_zeroemb_idx=self.speech_zeroemb_idx,
690
+ text_zeroemb_idx=self.empty_token,
691
+ ),
692
+ InputSegment(
693
+ text="<|im_end|>\n",
694
+ speech_zeroemb_idx=self.speech_zeroemb_idx,
695
+ text_zeroemb_idx=self.empty_token,
696
+ )
697
+ ]
698
+
699
+ for i in range(len(message_list)):
700
+ if message_list[i]['role'] == 'user':
701
+ lm_prompt += [
702
+ InputSegment(
703
+ text=f"<|im_start|>user\n",
704
+ speech_zeroemb_idx=self.speech_zeroemb_idx,
705
+ text_zeroemb_idx=self.empty_token,
706
+ )
707
+ ]
708
+ if system_prompt and i == 0:
709
+ lm_prompt += [
710
+ InputSegment(
711
+ text=system_prompt,
712
+ speech_zeroemb_idx=self.speech_zeroemb_idx,
713
+ text_zeroemb_idx=self.empty_token,
714
+ ),
715
+ InputSegment(
716
+ text="\n\n",
717
+ speech_zeroemb_idx=self.speech_zeroemb_idx,
718
+ text_zeroemb_idx=self.empty_token,
719
+ )
720
+ ]
721
+ lm_prompt += [
722
+ InputSegment(
723
+ audio=self.preprocess_input(message_list[i]['content']),
724
+ speech_zeroemb_idx=self.speech_zeroemb_idx,
725
+ text_zeroemb_idx=self.empty_token,
726
+ ),
727
+ InputSegment(
728
+ text=f"<|im_end|>\n",
729
+ speech_zeroemb_idx=self.speech_zeroemb_idx,
730
+ text_zeroemb_idx=self.empty_token,
731
+ )
732
+ ]
733
+ elif message_list[i]['role'] == 'assistant':
734
+ lm_prompt += [
735
+ InputSegment(
736
+ text=f"<|im_start|>assistant\n",
737
+ speech_zeroemb_idx=self.speech_zeroemb_idx,
738
+ text_zeroemb_idx=self.empty_token,
739
+ ),
740
+ StreamingInputSegment(
741
+ text=message_list[i]['content']["text"],
742
+ audio=self.preprocess_input(message_list[i]['content']["audio"]),
743
+ speech_zeroemb_idx=self.speech_zeroemb_idx,
744
+ text_zeroemb_idx=self.empty_token,
745
+ tokenizer=self.tokenizer,
746
+ group_size=self.group_size,
747
+ audio_channels=self.audio_channels,
748
+ ),
749
+ InputSegment(
750
+ text=f"<|im_end|>\n",
751
+ speech_zeroemb_idx=self.speech_zeroemb_idx,
752
+ text_zeroemb_idx=self.empty_token,
753
+ )
754
+ ]
755
+ else:
756
+ raise ValueError(f"Invalid role: {message_list[i]['role']}")
757
+
758
+ lm_prompt += [
759
+ InputSegment(
760
+ text=f"<|im_start|>assistant\n<|sostm|>",
761
+ speech_zeroemb_idx=self.speech_zeroemb_idx,
762
+ text_zeroemb_idx=self.empty_token,
763
+ ),
764
+ ]
765
+
766
+ input_ids = self.get_input_ids(lm_prompt)
767
+ return input_ids
768
+
769
+
770
+ def get_s2t_dialogue_sft_prompt(
771
+ self,
772
+ input_speech,
773
+ thinking=False,
774
+ ):
775
+ audio_tokenized = self.preprocess_input(input_speech)
776
+ lm_prompt = [
777
+ InputSegment(
778
+ text=f"<|im_start|>user\n",
779
+ speech_zeroemb_idx=self.speech_zeroemb_idx,
780
+ text_zeroemb_idx=self.empty_token,
781
+ ),
782
+ InputSegment(
783
+ audio=audio_tokenized,
784
+ speech_zeroemb_idx=self.speech_zeroemb_idx,
785
+ text_zeroemb_idx=self.empty_token,
786
+ ),
787
+ InputSegment(
788
+ text=f"<|im_end|>\n",
789
+ speech_zeroemb_idx=self.speech_zeroemb_idx,
790
+ text_zeroemb_idx=self.empty_token,
791
+ ),
792
+ InputSegment(
793
+ text=f"<|im_start|>assistant\n",
794
+ speech_zeroemb_idx=self.speech_zeroemb_idx,
795
+ text_zeroemb_idx=self.empty_token,
796
+ )
797
+ ]
798
+ if not thinking:
799
+ lm_prompt.append(
800
+ InputSegment(
801
+ text="<think>\n\n</think>\n",
802
+ speech_zeroemb_idx=self.speech_zeroemb_idx,
803
+ text_zeroemb_idx=self.empty_token,
804
+ )
805
+ )
806
+ else:
807
+ lm_prompt.append(
808
+ InputSegment(
809
+ text="<think>\n",
810
+ speech_zeroemb_idx=self.speech_zeroemb_idx,
811
+ text_zeroemb_idx=self.empty_token,
812
+ )
813
+ )
814
+ input_ids = self.get_input_ids(lm_prompt)
815
+ return input_ids
816
+
817
+
818
+ def get_s2t_dialogue_sft_multiturn_prompt(self, message_list, thinking=False):
819
+ lm_prompt = []
820
+ for i in range(len(message_list)):
821
+ if message_list[i]['role'] == 'user':
822
+ lm_prompt += [
823
+ InputSegment(
824
+ text=f"<|im_start|>user\n",
825
+ speech_zeroemb_idx=self.speech_zeroemb_idx,
826
+ text_zeroemb_idx=self.empty_token,
827
+ ),
828
+ InputSegment(
829
+ audio=self.preprocess_input(message_list[i]['content']),
830
+ speech_zeroemb_idx=self.speech_zeroemb_idx,
831
+ text_zeroemb_idx=self.empty_token,
832
+ ),
833
+ InputSegment(
834
+ text=f"<|im_end|>\n",
835
+ speech_zeroemb_idx=self.speech_zeroemb_idx,
836
+ text_zeroemb_idx=self.empty_token,
837
+ )
838
+ ]
839
+ elif message_list[i]['role'] == 'assistant':
840
+ lm_prompt += [
841
+ InputSegment(
842
+ text=f"<|im_start|>assistant\n",
843
+ speech_zeroemb_idx=self.speech_zeroemb_idx,
844
+ text_zeroemb_idx=self.empty_token,
845
+ ),
846
+ InputSegment(
847
+ text=message_list[i]['content'],
848
+ speech_zeroemb_idx=self.speech_zeroemb_idx,
849
+ text_zeroemb_idx=self.empty_token,
850
+ ),
851
+ InputSegment(
852
+ text=f"<|im_end|>\n",
853
+ speech_zeroemb_idx=self.speech_zeroemb_idx,
854
+ text_zeroemb_idx=self.empty_token,
855
+ )
856
+ ]
857
+ else:
858
+ raise ValueError(f"Invalid role: {message_list[i]['role']}")
859
+
860
+ lm_prompt.append(
861
+ InputSegment(
862
+ text=f"<|im_start|>assistant\n",
863
+ speech_zeroemb_idx=self.speech_zeroemb_idx,
864
+ text_zeroemb_idx=self.empty_token,
865
+ )
866
+ )
867
+ if not thinking:
868
+ lm_prompt.append(
869
+ InputSegment(
870
+ text="<think>\n\n</think>\n",
871
+ speech_zeroemb_idx=self.speech_zeroemb_idx,
872
+ text_zeroemb_idx=self.empty_token,
873
+ )
874
+ )
875
+ else:
876
+ lm_prompt.append(
877
+ InputSegment(
878
+ text="<think>\n",
879
+ speech_zeroemb_idx=self.speech_zeroemb_idx,
880
+ text_zeroemb_idx=self.empty_token,
881
+ )
882
+ )
883
+ input_ids = self.get_input_ids(lm_prompt)
884
+ return input_ids
885
+
886
+
887
+ def get_text_dialogue_sft_prompt(
888
+ self,
889
+ input_text,
890
+ thinking=False,
891
+ ):
892
+ lm_prompt = [
893
+ InputSegment(
894
+ text=f"<|im_start|>user\n",
895
+ speech_zeroemb_idx=self.speech_zeroemb_idx,
896
+ text_zeroemb_idx=self.empty_token,
897
+ ),
898
+ InputSegment(
899
+ text=input_text,
900
+ speech_zeroemb_idx=self.speech_zeroemb_idx,
901
+ text_zeroemb_idx=self.empty_token,
902
+ ),
903
+ InputSegment(
904
+ text=f"<|im_end|>\n",
905
+ speech_zeroemb_idx=self.speech_zeroemb_idx,
906
+ text_zeroemb_idx=self.empty_token,
907
+ ),
908
+ InputSegment(
909
+ text=f"<|im_start|>assistant\n",
910
+ speech_zeroemb_idx=self.speech_zeroemb_idx,
911
+ text_zeroemb_idx=self.empty_token,
912
+ ),
913
+ ]
914
+ if not thinking:
915
+ lm_prompt.append(
916
+ InputSegment(
917
+ text="<think>\n\n</think>\n",
918
+ speech_zeroemb_idx=self.speech_zeroemb_idx,
919
+ text_zeroemb_idx=self.empty_token,
920
+ )
921
+ )
922
+ else:
923
+ lm_prompt.append(
924
+ InputSegment(
925
+ text="<think>\n",
926
+ speech_zeroemb_idx=self.speech_zeroemb_idx,
927
+ text_zeroemb_idx=self.empty_token,
928
+ )
929
+ )
930
+ input_ids = self.get_input_ids(lm_prompt)
931
+ return input_ids
932
+
933
+ def get_text_dialogue_sft_multiturn_prompt(
934
+ self,
935
+ message_list,
936
+ thinking=False,
937
+ ):
938
+ lm_prompt = []
939
+ for i in range(len(message_list)):
940
+ if message_list[i]['role'] == 'user':
941
+ lm_prompt += [
942
+ InputSegment(
943
+ text=f"<|im_start|>user\n",
944
+ speech_zeroemb_idx=self.speech_zeroemb_idx,
945
+ text_zeroemb_idx=self.empty_token,
946
+ ),
947
+ InputSegment(
948
+ text=message_list[i]['content'],
949
+ speech_zeroemb_idx=self.speech_zeroemb_idx,
950
+ text_zeroemb_idx=self.empty_token,
951
+ ),
952
+ InputSegment(
953
+ text=f"<|im_end|>\n",
954
+ speech_zeroemb_idx=self.speech_zeroemb_idx,
955
+ text_zeroemb_idx=self.empty_token,
956
+ )
957
+ ]
958
+ elif message_list[i]['role'] == 'assistant':
959
+ lm_prompt += [
960
+ InputSegment(
961
+ text=f"<|im_start|>assistant\n",
962
+ speech_zeroemb_idx=self.speech_zeroemb_idx,
963
+ text_zeroemb_idx=self.empty_token,
964
+ ),
965
+ InputSegment(
966
+ text=message_list[i]['content'],
967
+ speech_zeroemb_idx=self.speech_zeroemb_idx,
968
+ text_zeroemb_idx=self.empty_token,
969
+ ),
970
+ InputSegment(
971
+ text=f"<|im_end|>\n",
972
+ speech_zeroemb_idx=self.speech_zeroemb_idx,
973
+ text_zeroemb_idx=self.empty_token,
974
+ )
975
+ ]
976
+ else:
977
+ raise ValueError(f"Invalid role: {message_list[i]['role']}")
978
+
979
+ lm_prompt.append(
980
+ InputSegment(
981
+ text=f"<|im_start|>assistant\n",
982
+ speech_zeroemb_idx=self.speech_zeroemb_idx,
983
+ text_zeroemb_idx=self.empty_token,
984
+ )
985
+ )
986
+ if not thinking:
987
+ lm_prompt.append(
988
+ InputSegment(
989
+ text="<think>\n\n</think>\n",
990
+ speech_zeroemb_idx=self.speech_zeroemb_idx,
991
+ text_zeroemb_idx=self.empty_token,
992
+ )
993
+ )
994
+ else:
995
+ lm_prompt.append(
996
+ InputSegment(
997
+ text="<think>\n",
998
+ speech_zeroemb_idx=self.speech_zeroemb_idx,
999
+ text_zeroemb_idx=self.empty_token,
1000
+ )
1001
+ )
1002
+ input_ids = self.get_input_ids(lm_prompt)
1003
+ return input_ids
1004
+
1005
+
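+ # Few-shot speech-to-speech prompt: an "[Int]:" instruction line, then each
+ # example as input audio followed by its streaming (text + audio) output, and
+ # finally the query audio with an opening <|sostm|> for the answer.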
1006
+ def get_in_context_learning_s2s_prompt(self, instruction, prompt_examples, audio):
1007
+ prompt = [
1008
+ InputSegment(
1009
+ text=f"[Int]:{instruction}\n",
1010
+ speech_zeroemb_idx=self.speech_zeroemb_idx,
1011
+ text_zeroemb_idx=self.empty_token,
1012
+ )
1013
+ ]
1014
+
1015
+ for i in range(len(prompt_examples)):
1016
+ prompt += [
1017
+ InputSegment(
1018
+ audio=self.preprocess_input(prompt_examples[i]["input_audio"]),
1019
+ speech_zeroemb_idx=self.speech_zeroemb_idx,
1020
+ text_zeroemb_idx=self.empty_token,
1021
+ ),
1022
+ InputSegment(
1023
+ text="\n",
1024
+ speech_zeroemb_idx=self.speech_zeroemb_idx,
1025
+ text_zeroemb_idx=self.empty_token,
1026
+ ),
1027
+ StreamingInputSegment(
1028
+ text=prompt_examples[i]["output_transcription"],
1029
+ audio=self.preprocess_input(prompt_examples[i]["output_audio"]),
1030
+ speech_zeroemb_idx=self.speech_zeroemb_idx,
1031
+ text_zeroemb_idx=self.empty_token,
1032
+ tokenizer=self.tokenizer,
1033
+ group_size=self.group_size,
1034
+ audio_channels=self.audio_channels,
1035
+ ),
1036
+ InputSegment(
1037
+ text=" \n\n",
1038
+ speech_zeroemb_idx=self.speech_zeroemb_idx,
1039
+ text_zeroemb_idx=self.empty_token,
1040
+ ),
1041
+ ]
1042
+
1043
+ prompt += [
1044
+ InputSegment(
1045
+ audio=self.preprocess_input(audio),
1046
+ speech_zeroemb_idx=self.speech_zeroemb_idx,
1047
+ text_zeroemb_idx=self.empty_token,
1048
+ ),
1049
+ InputSegment(
1050
+ text="\n",
1051
+ speech_zeroemb_idx=self.speech_zeroemb_idx,
1052
+ text_zeroemb_idx=self.empty_token,
1053
+ ),
1054
+ InputSegment(
1055
+ text="<|sostm|>",
1056
+ speech_zeroemb_idx=self.speech_zeroemb_idx,
1057
+ text_zeroemb_idx=self.empty_token,
1058
+ ),
1059
+ ]
1060
+ input_ids = self.get_input_ids(prompt)
1061
+ return input_ids
1062
+
1063
+ @torch.no_grad()
1064
+ def forward(
1065
+ self,
1066
+ input_ids,
1067
+ return_audio=False,
1068
+ output_audio_path=None,
1069
+ stopping_criteria=None,
1070
+ min_new_tokens=0,
1071
+ max_new_tokens=8192,
1072
+ add_history=False,
1073
+ task_name=None,
1074
+ ):
1075
+
1076
+ task_sampler = self.get_task_sampler(task_name)
1077
+
1078
+ generation_kwargs = self.generate_kwargs.copy()
1079
+ generation_config = GenerationConfig(**generation_kwargs)
1080
+
1081
+ input_ids = input_ids.T.reshape(1, -1) # [B, flattened(T, audio_channels + 1)]
1082
+ if add_history and self.history is not None:
1083
+ input_ids = torch.cat([self.history, input_ids], dim=1)
1084
+
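+ # The flattened sequence interleaves (audio_channels + 1) channels per
+ # timestep, and the backbone advances one group of group_size timesteps at a
+ # time, so lengths below are counted first in timesteps and then in groups.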
1085
+ prompt_length = input_ids.shape[1] // (self.audio_channels+1)
1086
+
1087
+ max_length = prompt_length // self.group_size + max_new_tokens
1088
+ min_length = prompt_length // self.group_size + min_new_tokens
1089
+
1090
+ if stopping_criteria is not None:
1091
+ for criterion in stopping_criteria:
1092
+ if isinstance(criterion, MiMoStopper):
1093
+ criterion.max_length = max_length
1094
+ criterion.min_length = min_length
1095
+
1096
+ generated_ids = self.model.generate(
1097
+ input_ids,
1098
+ generation_config,
1099
+ stopping_criteria=stopping_criteria,
1100
+ global_sampler=task_sampler["global"],
1101
+ local_sampler=task_sampler["local"],
1102
+ )
1103
+
1104
+ self.history = generated_ids
1105
+ generated_ids = generated_ids.int().cpu().reshape(-1, self.audio_channels+1).T[:, prompt_length:]
1106
+
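+ # Channel 0 is the text channel; one text token is produced per group, so read
+ # every group_size-th position and drop the trailing stop token.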
1107
+ text = generated_ids[0, ::self.group_size][:-1]
1108
+ detokenized_text = self.tokenizer.decode(text, skip_special_tokens=False).strip().replace("<|empty|>", "").replace("<|eot|>", "").replace("<|eostm|>", "")
1109
+ print("Text channel:\t", detokenized_text)
1110
+
1111
+ if output_audio_path:
1112
+ return_audio = True
1113
+
1114
+ if not return_audio:
1115
+ return detokenized_text
1116
+
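+ # The speech span is delimited on the text channel by <|sostm|> / <|eostm|>;
+ # convert those group indices back to flattened timestep positions.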
1117
+ sosp_idx_locations = (text == self.sostm_idx).nonzero(as_tuple=True)[0]
1118
+ eosp_idx_locations = (text == self.eostm_idx).nonzero(as_tuple=True)[0]
1119
+ if len(sosp_idx_locations) == 0:
1120
+ start_location = 0
1121
+ else:
1122
+ start_location = sosp_idx_locations[0] * self.group_size + self.group_size
1123
+ if len(eosp_idx_locations) == 0:
1124
+ end_location = text.shape[0] * self.group_size
1125
+ else:
1126
+ end_location = eosp_idx_locations[0] * self.group_size
1127
+ audio_sequence = generated_ids[:, start_location:end_location] #[audio_channels+1, audio_length]
1128
+ speech_sequence = audio_sequence[1:]
1129
+
1130
+ mask = speech_sequence[0] != (self.speech_zeroemb_idx[0] if isinstance(self.speech_zeroemb_idx, list) else self.speech_zeroemb_idx)
1131
+ speech_sequence = speech_sequence[:, mask]
1132
+
1133
+ assert (speech_sequence < torch.tensor(self.speech_zeroemb_idx).unsqueeze(1)).all()
1134
+
1135
+ speech_sequence = speech_sequence.T.flatten()
1136
+
1137
+ speech_str = "".join([f"<{i}>" for i in speech_sequence])
1138
+ tokens = torch.tensor(
1139
+ [int(num) for num in re.findall(r"(\d+)>", speech_str)]
1140
+ )
1141
+
1142
+ if tokens.numel() == 0:
1143
+ wav = torch.zeros(24000)
1144
+ self.save_wav(output_audio_path, wav)
1145
+ return detokenized_text
1146
+
1147
+ codes = tokens.reshape(-1, self.audio_channels).T
1148
+ codes = codes.type(torch.LongTensor).to(self.device)
1149
+
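+ # Decode the audio codes in 1500-frame chunks to bound the tokenizer decoder's
+ # memory use, then concatenate the resulting waveform segments.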
1150
+ segment_len = 1500
1151
+ wav_list = []
1152
+ for start in range(0, codes.shape[-1], segment_len):
1153
+ wav = self.mimo_audio_tokenizer.decode(codes[:,start:start+segment_len]).float()
1154
+ wav_list.append(wav)
1155
+ wav_concat = torch.cat(wav_list, dim=-1)
1156
+
1157
+ #wav = self.mimo_audio_tokenizer.decode(codes).float()
1158
+ if output_audio_path is not None:
1159
+ self.save_wav(output_audio_path, wav_concat)
1160
+ return detokenized_text
1161
+ else:
1162
+ return wav_concat
1163
+
1164
+ def asr_sft(self, audio):
1165
+ stopping_criteria = [
1166
+ MiMoStopper(
1167
+ stop_tokens=[self.tokenizer.eos_token_id, self.im_end_idx],
1168
+ group_size=self.group_size,
1169
+ audio_channels=self.audio_channels,
1170
+ )
1171
+ ]
1172
+ input_ids = self.get_asr_sft_prompt(audio)
1173
+ result = self.forward(input_ids, stopping_criteria=stopping_criteria, task_name="asr")
1174
+ return result
1175
+
1176
+ def tts_sft(self, text, output_path, instruct=None, read_text_only=True, prompt_speech=None):
1177
+ stopping_criteria = [
1178
+ MiMoStopper(
1179
+ stop_tokens=[self.tokenizer.eos_token_id, self.eostm_idx, self.im_end_idx],
1180
+ group_size=self.group_size,
1181
+ audio_channels=self.audio_channels,
1182
+ )
1183
+ ]
1184
+ input_ids = self.get_tts_sft_prompt(text, instruct=instruct, read_text_only=read_text_only, prompt_speech=prompt_speech)
1185
+ text_output = self.forward(input_ids, output_audio_path=output_path, stopping_criteria=stopping_criteria, task_name="tts")
1186
+ return text_output
1187
+
1188
+ def audio_understanding_sft(self, input_speech, input_text, thinking=False):
1189
+ stopping_criteria = [
1190
+ MiMoStopper(
1191
+ stop_tokens=[self.tokenizer.eos_token_id, self.im_end_idx],
1192
+ group_size=self.group_size,
1193
+ audio_channels=self.audio_channels,
1194
+ )
1195
+ ]
1196
+ input_ids = self.get_audio_understanding_sft_prompt(input_speech, input_text, thinking=thinking)
1197
+ result = self.forward(input_ids, stopping_criteria=stopping_criteria, task_name="audio_understanding")
1198
+ return result
1199
+
1200
+ def spoken_dialogue_sft(self, input_speech, output_audio_path=None, system_prompt=None, prompt_speech=None, add_history=False):
1201
+ stopping_criteria = [
1202
+ MiMoStopper(
1203
+ stop_tokens=[self.tokenizer.eos_token_id, self.eostm_idx, self.im_end_idx],
1204
+ group_size=self.group_size,
1205
+ audio_channels=self.audio_channels,
1206
+ )
1207
+ ]
1208
+ input_ids = self.get_spoken_dialogue_sft_prompt(input_speech, system_prompt=system_prompt, prompt_speech=prompt_speech, add_history=add_history)
1209
+ text = self.forward(input_ids, output_audio_path=output_audio_path, stopping_criteria=stopping_criteria, task_name="spoken_dialogue", add_history=add_history)
1210
+ return text
1211
+
1212
+ # interface for message list interaction
1213
+ def spoken_dialogue_sft_multiturn(self, message_list, output_audio_path=None, system_prompt=None, prompt_speech=None):
1214
+ stopping_criteria = [
1215
+ MiMoStopper(
1216
+ stop_tokens=[self.tokenizer.eos_token_id, self.eostm_idx, self.im_end_idx],
1217
+ group_size=self.group_size,
1218
+ audio_channels=self.audio_channels,
1219
+ )
1220
+ ]
1221
+ input_ids = self.get_spoken_dialogue_sft_multiturn_prompt(message_list, system_prompt=system_prompt, prompt_speech=prompt_speech)
1222
+ text = self.forward(input_ids, output_audio_path=output_audio_path, stopping_criteria=stopping_criteria, task_name="spoken_dialogue", add_history=False)
1223
+ return text
1224
+
1225
+
1226
+ def speech2text_dialogue_sft(self, input_speech, thinking=False, add_history=False):
1227
+ stopping_criteria = [
1228
+ MiMoStopper(
1229
+ stop_tokens=[self.tokenizer.eos_token_id, self.im_end_idx],
1230
+ group_size=self.group_size,
1231
+ audio_channels=self.audio_channels,
1232
+ )
1233
+ ]
1234
+ input_ids = self.get_s2t_dialogue_sft_prompt(input_speech, thinking=thinking)
1235
+ text = self.forward(input_ids, stopping_criteria=stopping_criteria, task_name="spoken_dialogue", add_history=add_history)
1236
+ return text
1237
+
1238
+
1239
+ # interface for message list interaction
1240
+ def speech2text_dialogue_sft_multiturn(self, message_list, thinking=False):
1241
+ stopping_criteria = [
1242
+ MiMoStopper(
1243
+ stop_tokens=[self.tokenizer.eos_token_id, self.im_end_idx],
1244
+ group_size=self.group_size,
1245
+ audio_channels=self.audio_channels,
1246
+ )
1247
+ ]
1248
+ input_ids = self.get_s2t_dialogue_sft_multiturn_prompt(message_list, thinking=thinking)
1249
+ text = self.forward(input_ids, stopping_criteria=stopping_criteria, task_name="spoken_dialogue", add_history=False)
1250
+ return text
1251
+
1252
+ def text_dialogue_sft(self, input_text, thinking=False, add_history=False):
1253
+ stopping_criteria = [
1254
+ MiMoStopper(
1255
+ stop_tokens=[self.tokenizer.eos_token_id, self.im_end_idx],
1256
+ group_size=self.group_size,
1257
+ audio_channels=self.audio_channels,
1258
+ )
1259
+ ]
1260
+ input_ids = self.get_text_dialogue_sft_prompt(input_text, thinking=thinking)
1261
+ text = self.forward(input_ids, stopping_criteria=stopping_criteria, task_name="text_chat", add_history=add_history)
1262
+ return text
1263
+
1264
+ # interface for message list interaction
1265
+ def text_dialogue_sft_multiturn(self, message_list, thinking=False):
1266
+ stopping_criteria = [
1267
+ MiMoStopper(
1268
+ stop_tokens=[self.tokenizer.eos_token_id, self.im_end_idx],
1269
+ group_size=self.group_size,
1270
+ audio_channels=self.audio_channels,
1271
+ )
1272
+ ]
1273
+ input_ids = self.get_text_dialogue_sft_multiturn_prompt(message_list, thinking=thinking)
1274
+ text = self.forward(input_ids, stopping_criteria=stopping_criteria, task_name="text_chat", add_history=False)
1275
+ return text
1276
+
1277
+ def clear_history(self):
1278
+ self.history = None
1279
+ print("History cleared")
1280
+
1281
+ def in_context_learning_s2s(self, instruction, prompt_examples, audio, max_new_tokens=None, output_audio_path=None):
1282
+ stopping_criteria = [
1283
+ MiMoStopper(
1284
+ stop_tokens=[self.tokenizer.eos_token_id, self.eostm_idx],
1285
+ group_size=self.group_size,
1286
+ audio_channels=self.audio_channels,
1287
+ )
1288
+ ]
1289
+ input_ids = self.get_in_context_learning_s2s_prompt(instruction, prompt_examples, audio)
1290
+ return self.forward(input_ids, output_audio_path=output_audio_path, stopping_criteria=stopping_criteria, max_new_tokens=max_new_tokens if max_new_tokens is not None else 8192, task_name="in_context_learning_s2s")
1291
+
1292
+
src/mimo_audio/modeling_mimo_audio.py ADDED
@@ -0,0 +1,835 @@
1
+ # Copyright 2025 Xiaomi Corporation.
2
+ import copy
3
+ import logging
4
+ from dataclasses import dataclass
5
+ from typing import List, Optional, Union, cast
6
+
7
+ import torch
8
+ import torch.distributed as dist
9
+ from torch import nn
10
+ from transformers import StoppingCriteria
11
+ from transformers.cache_utils import Cache, DynamicCache
12
+ from transformers.generation.streamers import BaseStreamer
13
+ from transformers.generation.utils import (
14
+ GenerateOutput,
15
+ GenerationConfig,
16
+ StoppingCriteriaList,
17
+ is_deepspeed_zero3_enabled,
18
+ )
19
+ from transformers.modeling_outputs import BaseModelOutputWithPast, ModelOutput
20
+ from transformers.models.qwen2.configuration_qwen2 import Qwen2Config
21
+ from transformers.models.qwen2.modeling_qwen2 import (
22
+ Qwen2Model,
23
+ Qwen2PreTrainedModel,
24
+ )
25
+ from transformers.utils import is_torchdynamo_compiling
26
+
27
+
28
+ logger = logging.getLogger(__name__)
29
+
30
+
31
+ class MiMoStopper(StoppingCriteria):
32
+ def __init__(
33
+ self,
34
+ group_size: int,
35
+ audio_channels: int,
36
+ stop_tokens: list[int] | None = None,
37
+ max_length: int | None = None,
38
+ min_length: int | None = None,
39
+ ) -> None:
40
+ super().__init__()
41
+ self.group_size = group_size
42
+ self.audio_channels = audio_channels
43
+ self.step = (audio_channels + 1) * group_size
44
+
45
+ self.stop_token_ids = set(stop_tokens or [])
46
+
47
+ self.max_length = max_length
48
+ self.min_length = min_length or 0
49
+
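+ # `step` is the number of flattened tokens per generated group; stopping is
+ # decided from the text-channel token at the start of the most recent group,
+ # subject to the configured min/max lengths (measured in groups).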
50
+ def __call__(self, input_ids: torch.LongTensor, _scores: torch.FloatTensor):
51
+ is_done = False
52
+ cur_len = input_ids.shape[-1] // self.step
53
+
54
+ if self.max_length:
55
+ is_done |= cur_len >= self.max_length
56
+
57
+ if (self.stop_token_ids and
58
+ input_ids.shape[1] >= self.step and
59
+ cur_len >= self.min_length):
60
+ last_token = input_ids[0, -self.step].item()
61
+ is_done |= last_token in self.stop_token_ids
62
+
63
+ return torch.full(
64
+ (input_ids.shape[0],), is_done, device=input_ids.device, dtype=torch.bool
65
+ )
66
+
67
+
68
+ @dataclass
69
+ class MiMoSampler:
70
+ do_sample: bool | None = None
71
+ temperature: float | None = None
72
+ top_k: int | None = None
73
+ top_p: float | None = None
74
+
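+ # Standard logits warping: temperature scaling, then top-k filtering, then
+ # nucleus (top-p) filtering; filtered positions are set to -inf before sampling.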
75
+ def process(self, scores: torch.Tensor):
76
+ if self.temperature is not None:
77
+ scores = scores / self.temperature
78
+
79
+ if self.top_k is not None and self.top_k > 0:
80
+ top_k = min(self.top_k, scores.shape[-1])
81
+ indices_to_remove = scores < torch.topk(scores, top_k)[0][:, -1]
82
+ scores = scores.masked_fill(indices_to_remove, float("-inf"))
83
+
84
+ if self.top_p is not None and 0.0 < self.top_p <= 1.0:
85
+ top_p = self.top_p
86
+ sorted_logits, sorted_indices = torch.sort(scores)
87
+ cumulative_probs = sorted_logits.softmax(dim=-1).cumsum(dim=-1)
88
+ sorted_indices_to_remove = cumulative_probs <= (1 - top_p)
89
+ sorted_indices_to_remove[:, -1] = 0
90
+ indices_to_remove = sorted_indices_to_remove.scatter(
91
+ 1, sorted_indices, sorted_indices_to_remove
92
+ )
93
+ scores = scores.masked_fill(indices_to_remove, float("-inf"))
94
+
95
+ return scores
96
+
97
+ def sample(self, scores: torch.Tensor, removed_tokens: list[int] | None = None):
98
+ scores = self.process(scores)
99
+ for t in removed_tokens or []:
100
+ scores[:, t] = float("-inf")
101
+
102
+ if self.do_sample:
103
+ probs = scores.softmax(dim=-1)
104
+ return torch.multinomial(probs, num_samples=1).squeeze(-1)
105
+
106
+ return torch.argmax(scores, dim=-1)
107
+
108
+
109
+ @dataclass
110
+ class MiMoAudioOutput(ModelOutput):
111
+ text_logits: torch.FloatTensor | None = None
112
+ local_hidden_states: torch.FloatTensor | None = None
113
+ """Downcast hidden states used to seed local transformer generation."""
114
+ past_key_values: Cache | None = None
115
+
116
+
117
+ @dataclass
118
+ class MiMoAudioConfig(Qwen2Config):
119
+ def __init__(
120
+ self,
121
+ *,
122
+ speech_vocab_size: str | int = "1025-1025-129-129-129-129-129-129",
123
+ speech_zeroemb_idx: str | int = "1024-1024-128-128-128-128-128-128",
124
+ delay_pattern: str = "0-1-2-3-4-5-6-7",
125
+ head_dim: int = 128,
126
+ group_size: int = 4,
127
+ audio_channels: int = 8,
128
+ local_dim: int = 1024,
129
+ local_layers: int = 16,
130
+ local_attn_heads: int = 64,
131
+ local_ffn_dim: int = 4096,
132
+ local_attn_dropout: float = 0.1,
133
+ input_local_layers: int = 6,
134
+ input_local_dim: int | None = None,
135
+ input_full_attention: bool | None = None,
136
+ **kwargs,
137
+ ):
138
+ super().__init__(
139
+ **kwargs,
140
+ )
141
+ self.speech_vocab_size = speech_vocab_size
142
+ self.speech_zeroemb_idx = speech_zeroemb_idx
143
+ self.delay_pattern = delay_pattern
144
+
145
+ self.head_dim = head_dim
146
+
147
+ self.group_size = group_size
148
+ self.audio_channels = audio_channels
149
+
150
+ self.local_dim = local_dim
151
+ self.local_layers = local_layers
152
+ self.local_attn_heads = local_attn_heads
153
+ self.local_ffn_dim = local_ffn_dim
154
+ self.local_attn_dropout = local_attn_dropout
155
+
156
+ self.input_local_layers = input_local_layers
157
+ self.input_local_dim = input_local_dim or local_dim
158
+
159
+ self.input_full_attention = input_full_attention
160
+
161
+ def _parse_maybe_list(self, value: str | int, length: int) -> List[int]:
162
+ if isinstance(value, str) and "-" in value:
163
+ return [int(s) for s in value.split("-")]
164
+ return [int(value)] * length
165
+
166
+ def parsed_speech_empty_ids(self):
167
+ return self._parse_maybe_list(self.speech_zeroemb_idx, self.audio_channels)
168
+
169
+ def parsed_speech_vocab_sizes(self):
170
+ return self._parse_maybe_list(self.speech_vocab_size, self.audio_channels)
171
+
172
+ def parsed_delay_pattern(self):
173
+ return self._parse_maybe_list(self.delay_pattern, self.audio_channels)
174
+
175
+ def local_config(self):
176
+ config = copy.deepcopy(self)
177
+
178
+ config.hidden_size = self.local_dim
179
+ config.num_hidden_layers = self.local_layers
180
+ config.num_attention_heads = self.local_attn_heads
181
+ config.num_key_value_heads = self.local_attn_heads
182
+ config.head_dim = config.hidden_size // self.local_attn_heads
183
+ config.intermediate_size = self.local_ffn_dim
184
+ config.attention_dropout = self.local_attn_dropout
185
+
186
+ return config
187
+
188
+ def input_local_config(self):
189
+ config = copy.deepcopy(self)
190
+
191
+ config.hidden_size = self.input_local_dim
192
+ config.num_hidden_layers = self.input_local_layers
193
+ config.num_attention_heads = self.local_attn_heads
194
+ config.num_key_value_heads = self.local_attn_heads
195
+ config.head_dim = config.hidden_size // self.local_attn_heads
196
+ config.intermediate_size = config.hidden_size * 4
197
+ config.attention_dropout = self.local_attn_dropout
198
+
199
+ return config
200
+
201
+
202
+ @dataclass
203
+ class MiMoAudioArguments:
204
+ model_name_or_path: str
205
+ sosp_idx: int
206
+ eosp_idx: int
207
+ sostm_idx: int
208
+ eostm_idx: int
209
+ eot_idx: int
210
+ empty_idx: int
211
+
212
+ def to_dict(self):
213
+ return {
214
+ "model_name_or_path": self.model_name_or_path,
215
+ "sosp_idx": self.sosp_idx,
216
+ "eosp_idx": self.eosp_idx,
217
+ "sostm_idx": self.sostm_idx,
218
+ "eostm_idx": self.eostm_idx,
219
+ "eot_idx": self.eot_idx,
220
+ "empty_idx": self.empty_idx,
221
+ }
222
+
223
+
224
+ class MiMoAudioForCausalLM(Qwen2PreTrainedModel):
225
+ def __init__(
226
+ self,
227
+ config: MiMoAudioConfig | Qwen2Config,
228
+ args: MiMoAudioArguments | dict,
229
+ ):
230
+ super().__init__(config)
231
+ config = (
232
+ MiMoAudioConfig(**vars(config))
233
+ if isinstance(config, Qwen2Config)
234
+ else config
235
+ )
236
+ args = MiMoAudioArguments(**args) if isinstance(args, dict) else args
237
+ self.config = config
238
+ self.args = args
239
+
240
+ self.model = Qwen2Model(config)
241
+
242
+ self.speech_vocab_sizes = config.parsed_speech_vocab_sizes()
243
+ self.speech_empty_ids = config.parsed_speech_empty_ids()
244
+ self.delay_pattern = config.parsed_delay_pattern()
245
+
246
+ self.group_size = config.group_size
247
+ self.audio_channels = config.audio_channels
248
+
249
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
250
+
251
+ # Construct local transformer
252
+ self.local_config = config.local_config()
253
+ self.local_transformer = Qwen2Model(self.local_config)
254
+ self.local_transformer.embed_tokens = None
255
+
256
+ # Add input local transformer if configured
257
+ self.input_local_config = config.input_local_config()
258
+ self.input_local_transformer = Qwen2Model(self.input_local_config)
259
+ self.input_local_transformer.embed_tokens = None
260
+
261
+ self.local_transformer_lm_heads = nn.ModuleList(
262
+ [
263
+ nn.Linear(
264
+ self.local_config.hidden_size,
265
+ self.speech_vocab_sizes[i],
266
+ bias=False,
267
+ )
268
+ for i in range(self.audio_channels)
269
+ ]
270
+ )
271
+
272
+ self.speech_embeddings = nn.ModuleList(
273
+ [
274
+ nn.Embedding(
275
+ self.speech_vocab_sizes[i],
276
+ self.input_local_config.hidden_size,
277
+ padding_idx=self.speech_empty_ids[i],
278
+ )
279
+ for i in range(self.audio_channels)
280
+ ]
281
+ )
282
+
283
+ if self.input_local_config.hidden_size != self.local_config.hidden_size:
284
+ self.speech_embeddings_to_local = nn.Linear(
285
+ self.input_local_config.hidden_size,
286
+ self.local_config.hidden_size,
287
+ bias=False,
288
+ )
289
+ else:
290
+ self.speech_embeddings_to_local = None
291
+
292
+ # Project each group of concatenated speech-frame embeddings down to the backbone hidden size
293
+ self.speech_group_downcast = nn.Linear(
294
+ self.input_local_config.hidden_size * config.group_size,
295
+ config.hidden_size,
296
+ bias=False,
297
+ )
298
+
299
+ self.hidden_states_downcast = nn.Linear(
300
+ config.hidden_size,
301
+ self.local_config.hidden_size,
302
+ bias=False,
303
+ )
304
+
305
+ # Initialize weights and apply final processing
306
+ self.post_init()
307
+
308
+ def apply_input_local_transformer(self, speech_embeddings: torch.Tensor):
309
+ B, T_groups, group_size, hidden_size = speech_embeddings.shape
310
+
311
+ # Process each group independently: [B*T//group_size, group_size, hidden_size]
312
+ input_embeddings = speech_embeddings.reshape(
313
+ B * T_groups, group_size, hidden_size
314
+ )
315
+
316
+ output: BaseModelOutputWithPast = self.input_local_transformer(
317
+ inputs_embeds=input_embeddings,
318
+ return_dict=True,
319
+ is_causal=not self.config.input_full_attention, # for SDPA
320
+ )
321
+ encoded_embeddings = output.last_hidden_state
322
+
323
+ # Reshape back to original format
324
+ # [B*T//group_size, group_size, hidden_size] -> [B, T//group_size, group_size, hidden_size]
325
+ encoded_embeddings = encoded_embeddings.reshape(
326
+ B, T_groups, group_size, hidden_size
327
+ )
328
+
329
+ return encoded_embeddings
330
+
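+ # Build backbone inputs: the text channel is read once per group, each speech
+ # channel is embedded with its own codebook embedding and summed, the per-frame
+ # sums pass through the input local transformer, and every group of group_size
+ # frames is concatenated and projected to the backbone width before being added
+ # to the text embedding.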
331
+ def _prepare_input_embeds(
332
+ self,
333
+ input_ids: torch.LongTensor, # [B, audio_channels + 1, new_T]
334
+ ):
335
+ B = input_ids.shape[0]
336
+
337
+ input_ids = input_ids.int()
338
+ group_size = self.config.group_size
339
+
340
+ text_input_ids = input_ids[:, 0, ::group_size]
341
+ speech_input_ids = (
342
+ input_ids[:, 1:, :]
343
+ .view(B, self.audio_channels, -1, group_size)
344
+ .transpose(1, 2)
345
+ ) # [B, T//group_size, audio_channels, group_size]
346
+
347
+ is_speech = text_input_ids == self.args.empty_idx # [B, T//group_size]
348
+
349
+ speech_embeds = torch.zeros(
350
+ (
351
+ B,
352
+ is_speech.shape[1],
353
+ group_size,
354
+ self.input_local_config.hidden_size,
355
+ ),
356
+ device=input_ids.device,
357
+ dtype=torch.bfloat16,
358
+ )
359
+
360
+ for idx in range(self.audio_channels):
361
+ cur_empty = self.speech_empty_ids[idx]
362
+ cur_embed = self.speech_embeddings[idx]
363
+ cur_speech_ids = speech_input_ids[:, :, idx, :]
364
+ cur_speech_embeds: torch.Tensor = cur_embed(cur_speech_ids)
365
+ # [B, T_groups, group_size, hidden_size]
366
+
367
+ cur_mask = cur_speech_ids == cur_empty
368
+ cur_speech_embeds.masked_fill_(cur_mask.unsqueeze(-1), 0.0)
369
+
370
+ speech_embeds += cur_speech_embeds
371
+
372
+ speech_embeds = speech_embeds * is_speech.unsqueeze(-1).unsqueeze(-1)
373
+
374
+ # Apply input local transformer if configured
375
+ speech_embeds = self.apply_input_local_transformer(speech_embeds)
376
+ speech_embeds = speech_embeds * is_speech.unsqueeze(-1).unsqueeze(-1)
377
+
378
+ T_groups = speech_embeds.shape[1]
379
+ speech_grouped_embeds: torch.Tensor = self.speech_group_downcast(
380
+ speech_embeds.view(B, T_groups, -1)
381
+ ) # [B, T_groups, hidden_size]
382
+
383
+ text_embeds: torch.Tensor = self.model.embed_tokens(text_input_ids)
384
+ text_zero_mask = text_input_ids == self.args.empty_idx
385
+ text_embeds.masked_fill_(text_zero_mask.unsqueeze(-1), 0.0)
386
+
387
+ return text_embeds + speech_grouped_embeds
388
+
389
+ def forward(
390
+ self,
391
+ input_ids: torch.LongTensor, # [B, audio_channels + 1, new_T]
392
+ attention_mask: torch.Tensor, # [B, T_group]
393
+ position_ids: torch.LongTensor, # [B, new_T_group]
394
+ past_key_values: Cache | None = None,
395
+ cache_position: torch.LongTensor | None = None, # [new_T_group]
396
+ **_kwargs,
397
+ ):
398
+ inputs_embeds = self._prepare_input_embeds(input_ids)
399
+
400
+ outputs: BaseModelOutputWithPast = self.model(
401
+ attention_mask=attention_mask,
402
+ position_ids=position_ids,
403
+ past_key_values=past_key_values,
404
+ inputs_embeds=inputs_embeds,
405
+ use_cache=True,
406
+ return_dict=True,
407
+ cache_position=cache_position,
408
+ )
409
+ hidden_states = outputs.last_hidden_state # [B, new_T_group, hidden_size]
410
+
411
+ text_logits: torch.Tensor = self.lm_head(
412
+ hidden_states[:, -1:, :]
413
+ ) # [B, 1, vocab_size]
414
+ shift_hidden_states: torch.Tensor = self.hidden_states_downcast(
415
+ hidden_states[:, -1:, :]
416
+ ) # [B, 1, hidden_size]
417
+
418
+ return MiMoAudioOutput(
419
+ text_logits=text_logits,
420
+ local_hidden_states=shift_hidden_states,
421
+ past_key_values=outputs.past_key_values,
422
+ )
423
+
424
+ def local_forward(
425
+ self,
426
+ local_embeds: torch.FloatTensor, # [B, 1, hidden_size]
427
+ tokens_dtype: torch.dtype,
428
+ tokens_device: torch.device,
429
+ local_sampler: MiMoSampler | None = None,
430
+ ):
431
+ B = local_embeds.shape[0]
432
+ delay_iters = self.group_size + max(self.delay_pattern)
433
+ past_key_values = DynamicCache()
434
+ local_tokens = torch.zeros(
435
+ (B, self.group_size, self.audio_channels),
436
+ dtype=tokens_dtype,
437
+ device=tokens_device,
438
+ )
439
+ if local_sampler is None:
440
+ local_sampler = MiMoSampler()
441
+
442
+ for t in range(delay_iters):
443
+ output: BaseModelOutputWithPast = self.local_transformer(
444
+ inputs_embeds=local_embeds,
445
+ past_key_values=past_key_values,
446
+ return_dict=True,
447
+ use_cache=True,
448
+ )
449
+ hidden_state = output.last_hidden_state
450
+ past_key_values = output.past_key_values
451
+
452
+ local_embeds = torch.zeros_like(local_embeds)
453
+ for idx in range(self.audio_channels):
454
+ cur_start = self.delay_pattern[idx]
455
+ cur_end = cur_start + self.group_size
456
+ cur_empty = self.speech_empty_ids[idx]
457
+ if cur_start <= t < cur_end:
458
+ cur_lm_head = self.local_transformer_lm_heads[idx]
459
+ cur_scores: torch.Tensor = cur_lm_head(hidden_state)[:, -1, :]
460
+ # [B, vocab_size]
461
+ cur_token = local_sampler.sample(
462
+ cur_scores,
463
+ [cur_empty],
464
+ )
465
+
466
+ local_tokens[:, t - cur_start, idx] = cur_token
467
+ cur_input_embed = self.speech_embeddings[idx](
468
+ cur_token.unsqueeze(1)
469
+ )
470
+ if self.speech_embeddings_to_local is not None:
471
+ cur_input_embed = self.speech_embeddings_to_local(
472
+ cur_input_embed
473
+ )
474
+ local_embeds += cur_input_embed
475
+
476
+ return local_tokens # [B, group_size, audio_channels]
477
+
478
+ def _prepare_attention_mask(
479
+ self, inputs: torch.Tensor, input_ids_length: int
480
+ ) -> torch.Tensor:
481
+ # No information for attention mask inference -> return default attention mask
482
+ return torch.ones(
483
+ (inputs.shape[0], input_ids_length),
484
+ dtype=torch.bool,
485
+ device=inputs.device,
486
+ )
487
+
488
+ def prepare_inputs_for_generation(
489
+ self,
490
+ input_ids: torch.LongTensor,
491
+ past_key_values: Optional[Cache] = None,
492
+ attention_mask: Optional[torch.LongTensor] = None,
493
+ inputs_embeds: Optional[torch.FloatTensor] = None,
494
+ cache_position: Optional[torch.LongTensor] = None,
495
+ **kwargs,
496
+ ):
497
+ """
498
+ Prepare the model inputs for generation. It includes operations like computing the 4D attention mask or
499
+ slicing inputs given the existing cache.
500
+
501
+ See the forward pass in the model documentation for expected arguments (different models might have different
502
+ requirements for e.g. `past_key_values`). This function should work as is for most LLMs.
503
+ """
504
+
505
+ # 1. Handle BC:
506
+ model_inputs = {}
507
+ input_ids = input_ids.reshape(
508
+ input_ids.shape[0], -1, (self.audio_channels + 1) * self.config.group_size
509
+ ).transpose(1, 2) # [B, (audio_channels + 1) * group_size, T_groups]
510
+ # - some models don't have `Cache` support (which implies they don't expect `cache_position` in `forward`)
511
+ if self._supports_cache_class:
512
+ model_inputs["cache_position"] = cache_position
513
+ # - `cache_position` was not a mandatory input in `prepare_inputs_for_generation` for those models, and this
514
+ # function may be called outside of `generate`. Handle most use cases by creating `cache_position` on the fly
515
+ # (this alternative is not as robust as calling `generate` and letting it create `cache_position`)
516
+ elif cache_position is None:
517
+ past_length = (
518
+ past_key_values[0][0].shape[2] if past_key_values is not None else 0
519
+ )
520
+ cache_position = torch.arange(
521
+ past_length,
522
+ input_ids.shape[2],
523
+ dtype=torch.long,
524
+ device=input_ids.device,
525
+ )
526
+
527
+ # 2. Generic cache-dependent input preparation
528
+ # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens
529
+ # Exception 1: when passing input_embeds, input_ids may be missing entries
530
+ # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here
531
+ # Exception 3: with synced GPUs cache_position may go out of bounds, but we only want dummy token in that case
532
+ if past_key_values is not None:
533
+ model_inputs["past_key_values"] = past_key_values
534
+ if (
535
+ inputs_embeds is not None or cache_position[-1] >= input_ids.shape[2]
536
+ ): # Exception 1 or Exception 3
537
+ input_ids = input_ids[:, :, -cache_position.shape[0] :]
538
+ elif (
539
+ input_ids.shape[2] != cache_position.shape[0]
540
+ ): # Default case (the "else", a no op, is Exception 2)
541
+ input_ids = input_ids[:, :, cache_position]
542
+
543
+ # 3. Prepare base model inputs
544
+ input_ids_key = (
545
+ "decoder_input_ids" if self.config.is_encoder_decoder else "input_ids"
546
+ )
547
+ # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
548
+ if not self.config.is_encoder_decoder:
549
+ if inputs_embeds is not None and cache_position[0] == 0:
550
+ model_inputs[input_ids_key] = None
551
+ model_inputs["inputs_embeds"] = inputs_embeds
552
+ else:
553
+ # `clone` calls in this function ensure a consistent stride. See #32227
554
+ model_inputs[input_ids_key] = input_ids.clone(
555
+ memory_format=torch.contiguous_format
556
+ )
557
+ model_inputs["inputs_embeds"] = None
558
+ else:
559
+ model_inputs[input_ids_key] = input_ids.clone(
560
+ memory_format=torch.contiguous_format
561
+ )
562
+
563
+ # 4. Create missing `position_ids` on the fly
564
+ if attention_mask is not None and kwargs.get("position_ids") is None:
565
+ position_ids = attention_mask.long().cumsum(-1) - 1
566
+ position_ids.masked_fill_(attention_mask == 0, 1)
567
+ kwargs["position_ids"] = (
568
+ position_ids # placed in kwargs for further processing (see below)
569
+ )
570
+
571
+ # 5. Slice model inputs that should have the same length as `input_ids`
572
+ for model_input_name in ["position_ids", "token_type_ids"]:
573
+ model_input: torch.Tensor = kwargs.get(model_input_name)
574
+ if model_input is not None:
575
+ if past_key_values:
576
+ model_input = model_input[:, -input_ids.shape[2] :]
577
+ model_input = model_input.clone(
578
+ memory_format=torch.contiguous_format
579
+ )
580
+ model_inputs[model_input_name] = model_input
581
+
582
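+ # 6. Forward the attention mask unchanged when it is provided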
+ if attention_mask is not None:
583
+ model_inputs["attention_mask"] = attention_mask
584
+
585
+ # 7. Forward ALL kwargs that are uninitialized (e.g. `use_cache`).
586
+ for key, value in kwargs.items():
587
+ if key not in model_inputs:
588
+ model_inputs[key] = value
589
+
590
+ if model_inputs[input_ids_key] is not None:
591
+ model_inputs[input_ids_key] = (
592
+ cast(torch.Tensor, model_inputs[input_ids_key])
593
+ .transpose(1, 2)
594
+ .reshape(input_ids.shape[0], -1, (self.audio_channels + 1))
595
+ .transpose(1, 2)
596
+ ) # [B, audio_channels, T*group_size]
597
+
598
+ # 8. Remove unexpected `generate` inputs (TODO @joao: fix trainer and examples)
599
+ model_inputs.pop("labels", None)
600
+ return model_inputs
601
+
602
+ def _get_initial_cache_position(self, input_ids: torch.Tensor, model_kwargs: dict):
603
+ """Calculates `cache_position` for the pre-fill stage based on `input_ids` and optionally past length"""
604
+ # `torch.compile`-friendly `torch.arange` from a shape -- the lines below are equivalent to `torch.arange`
605
+ if "inputs_embeds" in model_kwargs:
606
+ cache_position = (
607
+ torch.ones_like(
608
+ model_kwargs["inputs_embeds"][0, :, 0], dtype=torch.int64
609
+ ).cumsum(0)
610
+ - 1
611
+ )
612
+ else:
613
+ cache_position = (
614
+ torch.ones(
615
+ (
616
+ input_ids.shape[1]
617
+ // (self.audio_channels + 1)
618
+ // self.config.group_size,
619
+ ),
620
+ dtype=torch.int64,
621
+ device=input_ids.device,
622
+ ).cumsum(0)
623
+ - 1
624
+ )
625
+
626
+ past_length = 0
627
+ if model_kwargs.get("past_key_values") is not None:
628
+ cache = model_kwargs["past_key_values"]
629
+ past_length = 0
630
+ if not isinstance(cache, Cache):
631
+ past_length = cache[0][0].shape[2]
632
+ elif (
633
+ hasattr(cache, "get_seq_length") and cache.get_seq_length() is not None
634
+ ):
635
+ past_length = cache.get_seq_length()
636
+
637
+ # TODO(joao): this is not torch.compile-friendly, find a work-around. If the cache is not empty,
638
+ # end-to-end compilation will yield bad results because `cache_position` will be incorrect.
639
+ if not is_torchdynamo_compiling():
640
+ cache_position = cache_position[past_length:]
641
+
642
+ model_kwargs["cache_position"] = cache_position
643
+
644
+ return model_kwargs
645
+
646
+ @torch.inference_mode()
647
+ def generate(
648
+ self,
649
+ inputs: torch.Tensor | None = None,
650
+ generation_config: GenerationConfig | None = None,
651
+ stopping_criteria: StoppingCriteriaList | list | None = None,
652
+ streamer: BaseStreamer | None = None,
653
+ synced_gpus: bool | None = None,
654
+ global_sampler: MiMoSampler | None = None,
655
+ local_sampler: MiMoSampler | None = None,
656
+ warmup_run: bool | None = None,
657
+ **kwargs,
658
+ ) -> Union[GenerateOutput, torch.LongTensor]:
659
+ generation_config, model_kwargs = self._prepare_generation_config(
660
+ generation_config, **kwargs
661
+ )
662
+
663
+ self._validate_model_kwargs(model_kwargs.copy())
664
+
665
+ # 2. Set generation parameters if not already defined
666
+ if synced_gpus is None:
667
+ if is_deepspeed_zero3_enabled() and dist.get_world_size() > 1:
668
+ synced_gpus = True
669
+ else:
670
+ synced_gpus = False
671
+
672
+ # 3. Define model inputs
673
+ input_ids, _model_input_name, model_kwargs = self._prepare_model_inputs(
674
+ inputs, generation_config.bos_token_id, model_kwargs
675
+ )
676
+ input_ids_length = input_ids.shape[-1]
677
+ input_ids_length //= self.group_size * (self.audio_channels + 1)
678
+
679
+ if streamer is not None:
680
+ streamer.put(input_ids.cpu())
681
+
682
+ if "attention_mask" not in model_kwargs:
683
+ model_kwargs["attention_mask"] = self._prepare_attention_mask(
684
+ inputs, input_ids_length
685
+ )
686
+
687
+ device = input_ids.device
688
+ self._prepare_special_tokens(generation_config, True, device=device)
689
+
690
+ model_kwargs["use_cache"] = True
691
+ model_kwargs["past_key_values"] = DynamicCache()
692
+
693
+ prepared_stopping_criteria = StoppingCriteriaList(
694
+ stopping_criteria if stopping_criteria is not None else []
695
+ )
696
+ prepared_stopping_criteria.append(
697
+ MiMoStopper(
698
+ self.group_size,
699
+ self.audio_channels,
700
+ max_length=generation_config.max_length,
701
+ )
702
+ )
703
+ stance = "default" if warmup_run else "eager_on_recompile"
704
+ with torch.compiler.set_stance(stance):
705
+ return self.slm_sample(
706
+ input_ids,
707
+ stopping_criteria=prepared_stopping_criteria,
708
+ generation_config=generation_config,
709
+ synced_gpus=synced_gpus,
710
+ streamer=streamer,
711
+ global_sampler=global_sampler,
712
+ local_sampler=local_sampler,
713
+ **model_kwargs,
714
+ )
715
+
716
+ def slm_sample(
717
+ self,
718
+ input_ids: torch.LongTensor,
719
+ stopping_criteria: StoppingCriteriaList,
720
+ generation_config: GenerationConfig,
721
+ synced_gpus: bool,
722
+ streamer: BaseStreamer | None,
723
+ global_sampler: MiMoSampler | None = None,
724
+ local_sampler: MiMoSampler | None = None,
725
+ **model_kwargs,
726
+ ) -> torch.LongTensor:
727
+ max_length = generation_config.max_length
728
+
729
+ B, cur_len = input_ids.shape
730
+ cur_len //= self.group_size * (self.audio_channels + 1)
731
+ initial_len = cur_len
732
+ this_peer_finished = False
733
+ unfinished_sequences = torch.ones(B, dtype=torch.long, device=input_ids.device)
734
+
735
+ min_length = 0
736
+ stop_token_ids = set()
737
+ for criterion in stopping_criteria:
738
+ if isinstance(criterion, MiMoStopper):
739
+ if criterion.min_length is not None:
740
+ min_length = max(min_length, criterion.min_length)
741
+ stop_token_ids.update(criterion.stop_token_ids)
742
+
743
+ model_kwargs = self._get_initial_cache_position(input_ids, model_kwargs)
744
+
745
+ while self._has_unfinished_sequences(
746
+ this_peer_finished,
747
+ synced_gpus,
748
+ device=input_ids.device,
749
+ cur_len=cur_len,
750
+ max_length=max_length,
751
+ ):
752
+ # prepare model inputs
753
+ model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
754
+
755
+ # forward pass to get next token
756
+ if (
757
+ cast(torch.Tensor, model_inputs["input_ids"]).shape[2]
758
+ != self.group_size
759
+ ):
760
+ # prefill run
761
+ with torch.compiler.set_stance("force_eager"):
762
+ outputs: MiMoAudioOutput = self(**model_inputs)
763
+ else:
764
+ outputs: MiMoAudioOutput = self(**model_inputs)
765
+
766
+ if synced_gpus and this_peer_finished:
767
+ continue # don't waste resources running the code we don't need
768
+
769
+ text_logits: torch.Tensor = outputs.text_logits[:, -1, :].clone()
770
+ # [B, vocab_size]
771
+
772
+ removed_tokens = None
773
+ if cur_len < min_length:
774
+ removed_tokens = list(stop_token_ids)
775
+
776
+ next_text_tokens = global_sampler.sample(text_logits, removed_tokens=removed_tokens)
777
+ # [B]
778
+
779
+ local_hidden_states = outputs.local_hidden_states
780
+
781
+ # Only supports batch_size=1 here
782
+ if next_text_tokens[0] != self.args.empty_idx:
783
+ zero_embed_tensor = torch.tensor(
784
+ self.speech_empty_ids,
785
+ device=next_text_tokens.device,
786
+ dtype=input_ids.dtype,
787
+ )
788
+ next_speech_tokens = zero_embed_tensor.view(
789
+ 1, 1, self.audio_channels
790
+ ).expand(B, self.config.group_size, -1)
791
+ else:
792
+ next_speech_tokens = self.local_forward(
793
+ local_embeds=local_hidden_states,
794
+ tokens_dtype=next_text_tokens.dtype,
795
+ tokens_device=next_text_tokens.device,
796
+ local_sampler=local_sampler,
797
+ )
798
+
799
+ next_text_tokens = next_text_tokens.reshape(B, 1, 1).expand(
800
+ -1, self.group_size, -1
801
+ ) # [B, group_size, 1]
802
+
803
+ # generate speech tokens
804
+ next_tokens = torch.cat(
805
+ (next_text_tokens, next_speech_tokens), dim=-1
806
+ ).reshape(B, -1) # [B, group_size * (audio_channels + 1)]
807
+
808
+ input_ids = torch.cat(
809
+ [input_ids, next_tokens], dim=-1
810
+ ) # [B, T*group_size*vq]
811
+
812
+ if streamer is not None:
813
+ streamer.put(next_tokens.cpu())
814
+ model_kwargs = self._update_model_kwargs_for_generation(
815
+ outputs,
816
+ model_kwargs,
817
+ is_encoder_decoder=self.config.is_encoder_decoder,
818
+ )
819
+
820
+ unfinished_sequences = unfinished_sequences & ~stopping_criteria(
821
+ input_ids, None
822
+ )
823
+ this_peer_finished = unfinished_sequences.max() == 0
824
+ cur_len += 1
825
+
826
+ # This is needed to properly delete outputs.logits, which may be very large for the first iteration.
827
+ # Otherwise a reference to outputs is kept, keeping the logits alive into the next iteration.
828
+ del outputs
829
+
830
+ if streamer is not None:
831
+ streamer.end()
832
+
833
+ input_ids = input_ids[:B]
834
+
835
+ return input_ids
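The sampling loop above emits, at every step, one global text token (replicated across the group) plus group_size x audio_channels local speech tokens, all flattened into `input_ids`. A minimal usage sketch follows; it assumes an already-loaded `model` instance of the class above, a prepared `input_ids` tensor in the flattened [B, T * group_size * (audio_channels + 1)] layout, and `MiMoSampler` objects whose constructor arguments are not shown in this commit and are therefore left as hypothetical defaults.

global_sampler = MiMoSampler()   # hypothetical constructor call; real arguments are not shown in this file
local_sampler = MiMoSampler()    # hypothetical constructor call

output_ids = model.generate(
    inputs=input_ids,                 # flattened [B, T * group_size * (audio_channels + 1)]
    global_sampler=global_sampler,    # samples the text/global stream
    local_sampler=local_sampler,      # samples the per-channel speech tokens
    warmup_run=False,
)
# output_ids keeps the same flattened layout; each decoding step appends one block of
# group_size * (audio_channels + 1) tokens, exactly as assembled in slm_sample().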
src/mimo_audio/process_speechdata.py ADDED
@@ -0,0 +1,289 @@
1
+ #!/usr/bin/env python3
2
+ # Copyright 2025 Xiaomi Corporation.
3
+ # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
4
+
5
+ import torch
6
+ import torch.nn.functional as F
7
+ from typing import Tuple, Union, List
8
+
9
+
10
+ class InputSegment:
11
+
12
+ def __init__(
13
+ self,
14
+ text: str = "",
15
+ audio: torch.Tensor = None,
16
+ tokenized_text: torch.Tensor = None,
17
+ speech_zeroemb_idx: Union[int, List[int]] = 1024,
18
+ text_zeroemb_idx: int = 152067,
19
+ add_sosp_eosp=True,
20
+ ) -> None:
21
+ has_text = text is not None
22
+ has_tokenized_text = tokenized_text is not None
23
+ assert has_text or has_tokenized_text, "Text or tokenized text must be provided"
24
+
25
+ self.audio = audio
26
+ self.text = text
27
+ self.tokenized_text = tokenized_text
28
+ self.speech_zeroemb_idx = speech_zeroemb_idx
29
+ self.text_zeroemb_idx = text_zeroemb_idx
30
+ self.add_sosp_eosp = add_sosp_eosp
31
+
32
+ @staticmethod
33
+ def insert_between(tensor, i, value=-1):
34
+ return torch.scatter(
35
+ torch.full(
36
+ (1, tensor.shape[1] + (tensor.shape[1] - 1) * i + i),
37
+ value,
38
+ dtype=tensor.dtype,
39
+ ),
40
+ 1,
41
+ torch.arange(0, tensor.shape[1], dtype=torch.int64)[None] * (i + 1),
42
+ tensor,
43
+ )
44
+
45
+ def to_input_id(
46
+ self,
47
+ tokenizer,
48
+ group_size: int,
49
+ audio_channels: int = 8,
50
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
51
+ if self.audio is None:
52
+ if self.tokenized_text is None:
53
+ tokenized_text = tokenizer(
54
+ self.text,
55
+ return_tensors="pt",
56
+ truncation=True,
57
+ max_length=999999,
58
+ padding=False,
59
+ add_special_tokens=False,
60
+ )["input_ids"].int()
61
+ else:
62
+ tokenized_text = self.tokenized_text.unsqueeze(0)
63
+
64
+
65
+ if group_size > 1:
66
+ tokenized_text = self.insert_between(
67
+ tokenized_text, group_size - 1, value=-100
68
+ )
69
+
70
+
71
+ if isinstance(self.speech_zeroemb_idx, list):
72
+ audio_part_input_id = torch.zeros((audio_channels, tokenized_text.shape[1]), dtype=torch.int)
73
+ for i, idx in enumerate(self.speech_zeroemb_idx):
74
+ audio_part_input_id[i, :] = idx
75
+ else:
76
+ audio_part_input_id = torch.full(
77
+ (audio_channels, tokenized_text.shape[1]), self.speech_zeroemb_idx, dtype=torch.int
78
+ )
79
+
80
+
81
+ else:
82
+ sosp_token = (
83
+ tokenizer.convert_tokens_to_ids("<|sosp|>")
84
+ if self.add_sosp_eosp
85
+ else None
86
+ )
87
+ eosp_token = (
88
+ tokenizer.convert_tokens_to_ids("<|eosp|>")
89
+ if self.add_sosp_eosp
90
+ else None
91
+ )
92
+ audio_part = self.audio.reshape(-1, audio_channels).T # [audio_channels, seqlen]
93
+
94
+ assert (
95
+ audio_part.shape[1] % group_size == 0
96
+ ), f"Audio shape {audio_part.shape} is not divisible by group_size {group_size}"
97
+
98
+
99
+ text_len = audio_part.shape[1] // group_size
100
+ empty_token = self.text_zeroemb_idx
101
+ if empty_token is None:
102
+ empty_token = tokenizer.eod
103
+ tokenized_text = torch.full((1, text_len), empty_token, dtype=torch.int)
104
+
105
+ tokenized_text = (
106
+ torch.cat(
107
+ [
108
+ torch.tensor([[sosp_token]], dtype=torch.int),
109
+ tokenized_text,
110
+ torch.tensor([[eosp_token]], dtype=torch.int),
111
+ ],
112
+ dim=1,
113
+ )
114
+ if self.add_sosp_eosp
115
+ else tokenized_text
116
+ )
117
+ tokenized_text = self.insert_between(
118
+ tokenized_text, group_size - 1, value=-100
119
+ )
120
+
121
+
122
+ if self.add_sosp_eosp:
123
+ if isinstance(self.speech_zeroemb_idx, list):
124
+ sosp_part = torch.zeros((audio_channels, group_size), dtype=torch.int)
125
+ eosp_part = torch.zeros((audio_channels, group_size), dtype=torch.int)
126
+ for i, idx in enumerate(self.speech_zeroemb_idx):
127
+ sosp_part[i, :] = idx
128
+ eosp_part[i, :] = idx
129
+ audio_part_input_id = torch.cat([sosp_part, audio_part, eosp_part], dim=1)
130
+ else:
131
+ audio_part_input_id = torch.cat(
132
+ [
133
+ torch.full((audio_channels, group_size), self.speech_zeroemb_idx, dtype=torch.int),
134
+ audio_part,
135
+ torch.full((audio_channels, group_size), self.speech_zeroemb_idx, dtype=torch.int),
136
+ ],
137
+ dim=1,
138
+ )
139
+ else:
140
+ audio_part_input_id = audio_part
141
+
142
+
143
+
144
+ input_ids = torch.cat(
145
+ [tokenized_text, audio_part_input_id], dim=0
146
+ ) # [n_rvq + 1, seqlen]
147
+
148
+
149
+ return input_ids
150
+
151
+
152
+ class StreamingInputSegment:
153
+ def __init__(
154
+ self,
155
+ text: str = "",
156
+ audio: torch.Tensor = None,
157
+ tokenized_text: torch.Tensor = None,
158
+ speech_zeroemb_idx: Union[int, List[int]] = 1024,
159
+ text_zeroemb_idx: int = 152067,
160
+ text_segment_size: int = 5,
161
+ audio_segment_size: int = 5,
162
+ tokenizer=None,
163
+ group_size=None,
164
+ audio_channels=None,
165
+ ) -> None:
166
+ has_text = text is not None
167
+ has_tokenized_text = tokenized_text is not None
168
+ assert has_text or has_tokenized_text, "Text or tokenized text must be provided"
169
+
170
+ self.audio = audio
171
+ self.text = text
172
+ self.tokenized_text = tokenized_text
173
+ self.speech_zeroemb_idx = speech_zeroemb_idx
174
+ self.text_zeroemb_idx = text_zeroemb_idx
175
+ self.text_segment_size = text_segment_size
176
+ self.audio_segment_size = audio_segment_size
177
+ self.tokenizer = tokenizer
178
+ self.group_size = group_size
179
+ self.audio_channels = audio_channels
180
+
181
+ def to_input_id(
182
+ self,
183
+ tokenizer,
184
+ group_size: int,
185
+ audio_channels: int = 8,
186
+ ):
187
+ if self.tokenized_text is None:
188
+ tokenized_text = tokenizer(
189
+ self.text,
190
+ return_tensors="pt",
191
+ truncation=True,
192
+ max_length=999999,
193
+ padding=False,
194
+ add_special_tokens=False,
195
+ )["input_ids"].int() # [1, seqlen]
196
+ else:
197
+ tokenized_text = self.tokenized_text.unsqueeze(0)
198
+
199
+ tokenized_text = tokenized_text.squeeze(0)
200
+
201
+ text_segments = tokenized_text.split(self.text_segment_size, dim=0)
202
+ audio_segments = self.audio.split(self.audio_segment_size*group_size*audio_channels, dim=0)
203
+
204
+ tokenized_segments = []
205
+ tokenized_segments.append(
206
+ InputSegment(
207
+ text='<|sostm|>',
208
+ speech_zeroemb_idx=self.speech_zeroemb_idx,
209
+ text_zeroemb_idx=self.text_zeroemb_idx,
210
+ ),
211
+ )
212
+
213
+
214
+ eot_tokens = tokenizer(
215
+ "<|eot|>",
216
+ return_tensors="pt",
217
+ truncation=True,
218
+ max_length=999999,
219
+ padding=False,
220
+ add_special_tokens=False,
221
+ )["input_ids"][0].to(text_segments[-1])
222
+
223
+
224
+ text_segments = text_segments[:-1] + (torch.cat([text_segments[-1], eot_tokens], dim=0),)
225
+
226
+
227
+ length = min(len(text_segments), len(audio_segments))
228
+ for i in range(length):
229
+ text_segment = text_segments[i]
230
+ audio_segment = audio_segments[i]
231
+
232
+ tokenized_segments.append(
233
+ InputSegment(
234
+ tokenized_text=text_segment,
235
+ speech_zeroemb_idx=self.speech_zeroemb_idx,
236
+ text_zeroemb_idx=self.text_zeroemb_idx,
237
+ ),
238
+ )
239
+ tokenized_segments.append(
240
+ InputSegment(
241
+ audio=audio_segment,
242
+ add_sosp_eosp=False,
243
+ speech_zeroemb_idx=self.speech_zeroemb_idx,
244
+ text_zeroemb_idx=self.text_zeroemb_idx,
245
+ ),
246
+ )
247
+
248
+ for j in range(length, len(text_segments)):
249
+ tokenized_segments.append(
250
+ InputSegment(
251
+ tokenized_text=text_segments[j],
252
+ speech_zeroemb_idx=self.speech_zeroemb_idx,
253
+ text_zeroemb_idx=self.text_zeroemb_idx,
254
+ ),
255
+ )
256
+
257
+ for j in range(length, len(audio_segments)):
258
+ tokenized_segments.append(
259
+ InputSegment(
260
+ audio=audio_segments[j],
261
+ add_sosp_eosp=False,
262
+ speech_zeroemb_idx=self.speech_zeroemb_idx,
263
+ text_zeroemb_idx=self.text_zeroemb_idx,
264
+ ),
265
+ )
266
+
267
+ tokenized_segments.append(
268
+ InputSegment(
269
+ text="<|eostm|>",
270
+ speech_zeroemb_idx=self.speech_zeroemb_idx,
271
+ text_zeroemb_idx=self.text_zeroemb_idx,
272
+ ),
273
+ )
274
+
275
+
276
+ input_ids = [
277
+ seg.to_input_id(
278
+ self.tokenizer,
279
+ self.group_size,
280
+ self.audio_channels,
281
+ )
282
+ for seg in tokenized_segments
283
+ ]
284
+
285
+
286
+
287
+ input_ids = torch.cat(input_ids, dim=1).type(torch.int64) # [n_rvq + 1, seqlen]
288
+
289
+ return input_ids
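A small illustration of `InputSegment.insert_between`, which the classes above use to align one text token with every group of speech frames: with `i = group_size - 1`, each original token is followed by `group_size - 1` filler values (-100), so a row of n tokens becomes a row of n * group_size positions. The numbers below are illustrative only.

import torch

tokens = torch.tensor([[5, 7]])                                    # text row of 2 tokens, shape [1, 2]
spread = InputSegment.insert_between(tokens, i=3 - 1, value=-100)  # group_size = 3
print(spread)  # tensor([[   5, -100, -100,    7, -100, -100]]), shape [1, 6]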
src/mimo_audio/templates.py ADDED
@@ -0,0 +1,54 @@
1
+ # Copyright 2025 Xiaomi Corporation.
2
+ asr_zh_templates = [
3
+ "请将这段语音转换为文字",
4
+ "帮我识别这个音频文件中的内容",
5
+ "把这段录音转成文本",
6
+ "请转录这段语音",
7
+ "将音频内容转换成文字格式",
8
+ "识别并转写这段语音",
9
+ "把语音内容写成文字",
10
+ "转录这个音频片段",
11
+ "将这段对话转换为文本",
12
+ "麻烦帮我把这段录音整理成详细的文字记录",
13
+ ]
14
+
15
+ asr_en_templates = [
16
+ "Please transcribe this audio file",
17
+ "Convert this speech recording to text",
18
+ "Transcribe the following voice message",
19
+ "Turn this audio into readable text",
20
+ "Please convert the recording to written format",
21
+ "Transcribe what you hear in this audio",
22
+ "Convert this spoken content to text",
23
+ "Please write down what is said in this recording",
24
+ "Transcribe this voice recording",
25
+ "Could you please help me transcribe this important recording?",
26
+ "Would you mind converting this voice message into a readable text format?",
27
+ "I'd really appreciate it if you could turn this audio file into a written document",
28
+ ]
29
+
30
+ tts_zh_templates = [
31
+ "请将这段文字转换为语音",
32
+ "帮我把这个文本读出来",
33
+ "将这些文字生成音频",
34
+ "请朗读这段内容",
35
+ "把这段话转换成语音文件",
36
+ "生成这段文字的语音版本",
37
+ "请用语音播报这些内容",
38
+ "将文本转换为可听的音频",
39
+ "帮我朗读这段文字",
40
+ "把这些内容念出来",
41
+ ]
42
+
43
+ tts_en_templates = [
44
+ "Please convert this text to speech",
45
+ "Turn this writing into audio",
46
+ "Generate speech from this text",
47
+ "Read this content out loud",
48
+ "Convert these words to voice",
49
+ "Create an audio version of this text",
50
+ "Please vocalize this content",
51
+ "Turn this text into audible format",
52
+ "Help me convert this writing to speech",
53
+ "Make this text into spoken audio",
54
+ ]
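These lists are plain instruction strings; a data-building script might pick one at random for each ASR or TTS sample. The helper below is a hedged illustration only, not a function defined anywhere in this repository.

import random

def pick_template(task: str, lang: str) -> str:
    # illustrative helper; the name and signature are hypothetical
    table = {
        ("asr", "zh"): asr_zh_templates,
        ("asr", "en"): asr_en_templates,
        ("tts", "zh"): tts_zh_templates,
        ("tts", "en"): tts_en_templates,
    }
    return random.choice(table[(task, lang)])

print(pick_template("asr", "en"))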
src/mimo_audio_tokenizer/__init__.py ADDED
@@ -0,0 +1,6 @@
1
+ # Copyright 2025 Xiaomi Corporation.
2
+ from .modeling_audio_tokenizer import MiMoAudioTokenizer, StreamingConfig, StreamingCache
3
+ from .configuration_audio_tokenizer import MiMoAudioTokenizerConfig
4
+
5
+
6
+ __all__ = ['MiMoAudioTokenizer', 'StreamingConfig', 'StreamingCache', 'MiMoAudioTokenizerConfig']
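For orientation, a hedged sketch of consuming these exports, assuming the `src/` package is importable as `mimo_audio_tokenizer`; the checkpoint path is hypothetical, and `from_pretrained` is the standard loader that `MiMoAudioTokenizer` inherits from `PreTrainedModel` (see `modeling_audio_tokenizer.py` below).

from mimo_audio_tokenizer import MiMoAudioTokenizer, StreamingConfig, StreamingCache

tokenizer = MiMoAudioTokenizer.from_pretrained("path/to/mimo-audio-tokenizer")  # hypothetical path
streaming_config = StreamingConfig()   # defaults defined in modeling_audio_tokenizer.py
streaming_cache = StreamingCache()     # empty cache for a new stream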
src/mimo_audio_tokenizer/configuration_audio_tokenizer.py ADDED
@@ -0,0 +1,104 @@
1
+ # Copyright 2025 Xiaomi Corporation.
2
+ from transformers import PretrainedConfig
3
+
4
+
5
+ class MiMoAudioTokenizerConfig(PretrainedConfig):
6
+ model_type = "mimo_audio_tokenizer"
7
+
8
+ def __init__(
9
+ self,
10
+ max_audio_seconds: int = 1800,
11
+ stride_size: int = 2,
12
+ avg_pooler: int = 1,
13
+ d_model: int = 768,
14
+ scale_embedding: bool = True,
15
+ kernel_size: int = 3,
16
+ activation_function: str = "gelu",
17
+ encoder_layers: int = 8,
18
+ encoder_skip_layer_id: int = None,
19
+ encoder_attention_heads: int = 12,
20
+ encoder_ffn_dim: int = 3072,
21
+ encoder_causal: bool = False,
22
+ encoder_attn_window_size: list[int] = None,
23
+ decoder_layers: int = 8,
24
+ decoder_attention_heads: int = 12,
25
+ decoder_ffn_dim: int = 3072,
26
+ decoder_kernel_size: int = 3,
27
+ decoder_stride_size: int = 2,
28
+ decoder_causal: bool = True,
29
+ decoder_attn_window_size: list[int] = None,
30
+ nfft: int = 1024,
31
+ vocoder_dim: int = 512,
32
+ vocoder_intermediate_dim: int = 4096,
33
+ vocoder_num_layers: int = 30,
34
+ n_mels: int = 80,
35
+ sampling_rate: int = 24000,
36
+ hop_length: int = 240,
37
+ window_size: int = 1024,
38
+ vocoder_padding: str = "same",
39
+ fmin: int = 0,
40
+ fmax: int = None,
41
+ num_quantizers: int = 12,
42
+ codebook_size: list[int] = None,
43
+ threshold_ema_dead_code: int = 10,
44
+ position_embedding_type: str = "rope",
45
+ rope_theta: int = 10000,
46
+ rope_type: str = "default",
47
+ ln_type: str = "LayerNorm",
48
+ vocoder_attention_heads: int = 4,
49
+ vocoder_attn_window_size: list[int] = None,
50
+ **kwargs,
51
+ ):
52
+ super().__init__(**kwargs)
53
+ self.max_audio_seconds = max_audio_seconds
54
+ self.stride_size = stride_size
55
+ self.avg_pooler = avg_pooler
56
+ self.d_model = d_model
57
+ self.scale_embedding = scale_embedding
58
+ self.kernel_size = kernel_size
59
+ self.activation_function = activation_function
60
+ self.encoder_layers = encoder_layers
61
+ self.encoder_skip_layer_id = encoder_skip_layer_id
62
+ self.encoder_attention_heads = encoder_attention_heads
63
+ self.encoder_ffn_dim = encoder_ffn_dim
64
+ self.encoder_causal = encoder_causal
65
+ self.encoder_attn_window_size = (
66
+ encoder_attn_window_size
67
+ if encoder_attn_window_size is not None
68
+ else [-1, -1]
69
+ )
70
+ self.decoder_layers = decoder_layers
71
+ self.decoder_attention_heads = decoder_attention_heads
72
+ self.decoder_ffn_dim = decoder_ffn_dim
73
+ self.decoder_kernel_size = decoder_kernel_size
74
+ self.decoder_stride_size = decoder_stride_size
75
+ self.decoder_causal = decoder_causal
76
+ self.decoder_attn_window_size = (
77
+ decoder_attn_window_size
78
+ if decoder_attn_window_size is not None
79
+ else [-1, -1]
80
+ )
81
+ self.nfft = nfft
82
+ self.vocoder_dim = vocoder_dim
83
+ self.vocoder_intermediate_dim = vocoder_intermediate_dim
84
+ self.vocoder_num_layers = vocoder_num_layers
85
+ self.n_mels = n_mels
86
+ self.sampling_rate = sampling_rate
87
+ self.hop_length = hop_length
88
+ self.window_size = window_size
89
+ self.vocoder_padding = vocoder_padding
90
+ self.fmin = fmin
91
+ self.fmax = fmax
92
+ self.num_quantizers = num_quantizers
93
+ self.codebook_size = codebook_size if codebook_size is not None else [1024]
94
+ self.threshold_ema_dead_code = threshold_ema_dead_code
95
+ self.position_embedding_type = position_embedding_type
96
+ self.rope_theta = rope_theta
97
+ self.rope_type = rope_type
98
+ self.ln_type = ln_type
99
+ self.vocoder_attention_heads = vocoder_attention_heads
100
+ self.vocoder_attn_window_size = (
101
+ vocoder_attn_window_size
102
+ if vocoder_attn_window_size is not None
103
+ else [40, 10]
104
+ )
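A minimal instantiation sketch: every keyword below mirrors a parameter of the `__init__` above, and the printed values follow directly from its defaults and fallbacks.

config = MiMoAudioTokenizerConfig(
    sampling_rate=24000,
    num_quantizers=12,
    codebook_size=[1024],
)
print(config.d_model)                   # 768 (default)
print(config.encoder_attn_window_size)  # [-1, -1] (fallback when None is passed)
print(config.vocoder_attn_window_size)  # [40, 10] (fallback when None is passed)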
src/mimo_audio_tokenizer/modeling_audio_tokenizer.py ADDED
@@ -0,0 +1,857 @@
1
+ # Copyright 2025 Xiaomi Corporation.
2
+ import math
3
+
4
+ import numpy as np
5
+ import torch
6
+ import torch.nn as nn
7
+ from flash_attn import flash_attn_varlen_func
8
+ from torch.nn import functional as F
9
+ from transformers.activations import ACT2FN
10
+ from transformers.modeling_utils import PreTrainedModel
11
+
12
+ from .configuration_audio_tokenizer import MiMoAudioTokenizerConfig
13
+ from .modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update, apply_rotary_pos_emb
14
+ from .quantization import ResidualVectorQuantizer
15
+ from dataclasses import dataclass, field
16
+ from typing import List
17
+
18
+ def get_sequence_mask(inputs, inputs_length):
19
+ if inputs.dim() == 3:
20
+ bsz, tgt_len, _ = inputs.size()
21
+ else:
22
+ bsz, tgt_len = inputs_length.shape[0], torch.max(inputs_length)
23
+ sequence_mask = torch.arange(0, tgt_len).to(inputs.device)
24
+ sequence_mask = torch.lt(sequence_mask, inputs_length.reshape(bsz, 1)).view(
25
+ bsz, tgt_len, 1
26
+ )
27
+ unpacking_index = torch.cumsum(sequence_mask.to(torch.int64).view(-1), dim=0) - 1
28
+ return sequence_mask, unpacking_index
29
+
30
+
31
+ def unpack_hidden_states(
32
+ hidden_states, lengths, sequence_mask=None, unpacking_index=None
33
+ ):
34
+ bsz = lengths.shape[0]
35
+ if sequence_mask is None or unpacking_index is None:
36
+ sequence_mask, unpacking_index = get_sequence_mask(hidden_states, lengths)
37
+ hidden_states = torch.index_select(hidden_states, 0, unpacking_index).view(
38
+ bsz, torch.max(lengths), hidden_states.shape[-1]
39
+ )
40
+ hidden_states = torch.where(sequence_mask, hidden_states, 0)
41
+ return hidden_states
42
+
43
+
44
+ def get_position_ids(lengths):
45
+ total_len = lengths.sum()
46
+ offset = torch.cat([torch.zeros(1).to(lengths), lengths[:-1].cumsum(dim=0)])
47
+ offset = torch.repeat_interleave(offset, lengths)
48
+ position_ids = torch.arange(0, total_len).to(offset) - offset
49
+ return position_ids
50
+
51
+ @dataclass
52
+ class StreamingConfig:
53
+ seg_point: int = field(default=60 * 25)
54
+ process_seg_point: bool = field(default=True)
55
+ left_overlap: int = field(default=10 * 25)
56
+ right_overlap: int = field(default=40)
57
+ seg_point_left_overlap: int = field(default=0)
58
+
59
+ @dataclass
60
+ class StreamingCache:
61
+ hidden_states: List[torch.Tensor] = field(default=None)
62
+ processed_lengths: List[int] = field(default=None)
63
+
64
+ class ISTFT(nn.Module):
65
+ """
66
+ Custom implementation of ISTFT since torch.istft doesn't allow custom padding (other than `center=True`) with
67
+ windowing. This is because the NOLA (Nonzero Overlap Add) check fails at the edges.
68
+ See issue: https://github.com/pytorch/pytorch/issues/62323
69
+ Specifically, in the context of neural vocoding we are interested in "same" padding analogous to CNNs.
70
+ The NOLA constraint is met as we trim padded samples anyway.
71
+
72
+ Args:
73
+ n_fft (int): Size of Fourier transform.
74
+ hop_length (int): The distance between neighboring sliding window frames.
75
+ win_length (int): The size of window frame and STFT filter.
76
+ padding (str, optional): Type of padding. Options are "center" or "same". Defaults to "same".
77
+ """
78
+
79
+ def __init__(
80
+ self, n_fft: int, hop_length: int, win_length: int, padding: str = "same"
81
+ ):
82
+ super().__init__()
83
+ if padding not in ["center", "same"]:
84
+ raise ValueError("Padding must be 'center' or 'same'.")
85
+ self.padding = padding
86
+ self.n_fft = n_fft
87
+ self.hop_length = hop_length
88
+ self.win_length = win_length
89
+ window = torch.hann_window(win_length)
90
+ self.register_buffer("window", window)
91
+
92
+ def forward(self, spec: torch.Tensor) -> torch.Tensor:
93
+ """
94
+ Compute the Inverse Short Time Fourier Transform (ISTFT) of a complex spectrogram.
95
+
96
+ Args:
97
+ spec (Tensor): Input complex spectrogram of shape (B, N, T), where B is the batch size,
98
+ N is the number of frequency bins, and T is the number of time frames.
99
+
100
+ Returns:
101
+ Tensor: Reconstructed time-domain signal of shape (B, L), where L is the length of the output signal.
102
+ """
103
+ if self.padding == "center":
104
+ # Fallback to pytorch native implementation
105
+ return torch.istft(
106
+ spec,
107
+ self.n_fft,
108
+ self.hop_length,
109
+ self.win_length,
110
+ self.window,
111
+ center=True,
112
+ )
113
+ elif self.padding == "same":
114
+ pad = (self.win_length - self.hop_length) // 2
115
+ else:
116
+ raise ValueError("Padding must be 'center' or 'same'.")
117
+
118
+ assert spec.dim() == 3, "Expected a 3D tensor as input"
119
+ B, N, T = spec.shape
120
+
121
+ # Inverse FFT
122
+ ifft = torch.fft.irfft(spec, self.n_fft, dim=1, norm="backward")
123
+ ifft = ifft * self.window[None, :, None]
124
+
125
+ # Overlap and Add
126
+ output_size = (T - 1) * self.hop_length + self.win_length
127
+ y = torch.nn.functional.fold(
128
+ ifft,
129
+ output_size=(1, output_size),
130
+ kernel_size=(1, self.win_length),
131
+ stride=(1, self.hop_length),
132
+ )[:, 0, 0, pad:-pad]
133
+
134
+ # Window envelope
135
+ window_sq = self.window.square().expand(1, T, -1).transpose(1, 2)
136
+ window_envelope = torch.nn.functional.fold(
137
+ window_sq,
138
+ output_size=(1, output_size),
139
+ kernel_size=(1, self.win_length),
140
+ stride=(1, self.hop_length),
141
+ ).squeeze()[pad:-pad]
142
+
143
+ # Normalize
144
+ assert (window_envelope > 1e-11).all()
145
+ y = y / window_envelope
146
+
147
+ return y
148
+
149
+ class ISTFTHead(nn.Module):
150
+ """
151
+ ISTFT Head module for predicting STFT complex coefficients.
152
+
153
+ Args:
154
+ dim (int): Hidden dimension of the model.
155
+ n_fft (int): Size of Fourier transform.
156
+ hop_length (int): The distance between neighboring sliding window frames, which should align with
157
+ the resolution of the input features.
158
+ padding (str, optional): Type of padding. Options are "center" or "same". Defaults to "same".
159
+ """
160
+
161
+ def __init__(self, dim: int, n_fft: int, hop_length: int, padding: str = "same"):
162
+ super().__init__()
163
+ out_dim = n_fft + 2
164
+ self.out = torch.nn.Linear(dim, out_dim)
165
+ self.istft = ISTFT(
166
+ n_fft=n_fft, hop_length=hop_length, win_length=n_fft, padding=padding
167
+ )
168
+
169
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
170
+ """
171
+ Forward pass of the ISTFTHead module.
172
+
173
+ Args:
174
+ x (Tensor): Input tensor of shape (B, L, H), where B is the batch size,
175
+ L is the sequence length, and H denotes the model dimension.
176
+
177
+ Returns:
178
+ Tensor: Reconstructed time-domain audio signal of shape (B, T), where T is the length of the output signal.
179
+ """
180
+ x = self.out(x).transpose(1, 2)
181
+ mag, p = x.chunk(2, dim=1)
182
+ mag = torch.exp(mag)
183
+ mag = torch.clip(
184
+ mag, max=1e2
185
+ ) # safeguard to prevent excessively large magnitudes
186
+ # These two lines produce the real and imaginary parts (the phase wrapping happens here)
187
+ x = torch.cos(p)
188
+ y = torch.sin(p)
189
+ # recalculating the phase here would not produce anything new,
191
+ + # it would only cost time:
192
+ # phase = torch.atan2(y, x)
193
+ # S = mag * torch.exp(phase * 1j)
194
+ # it is better to produce the complex value directly
194
+ original_dtype = x.dtype
195
+ S = mag.float() * (x.float() + 1j * y.float())
196
+ audio = self.istft(S)
197
+ audio = audio.to(original_dtype)
198
+ return audio
199
+
200
+
201
+ class RotaryEmbedding(nn.Module):
202
+ def __init__(self, base, dim, max_seq_len, rope_type="default", device=None):
203
+ super().__init__()
204
+ self.max_seq_len = max_seq_len
205
+ self.rope_type = rope_type
206
+
207
+ self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
208
+
209
+ inv_freq, self.attention_scaling = self.rope_init_fn(
210
+ device=device, base=base, dim=dim
211
+ )
212
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
213
+ self.original_inv_freq = self.inv_freq
214
+
215
+ @torch.no_grad()
216
+ @dynamic_rope_update
217
+ def forward(self, x, position_ids):
218
+ inv_freq_expanded = self.inv_freq[:, None].float().expand(-1, 1).to(x.device)
219
+ position_ids_expanded = position_ids[None, :].float()
220
+
221
+ device_type = (
222
+ x.device.type
223
+ if isinstance(x.device.type, str) and x.device.type != "mps"
224
+ else "cpu"
225
+ )
226
+ with torch.autocast(device_type=device_type, enabled=False): # Force float32
227
+ freqs = (
228
+ inv_freq_expanded.float() @ position_ids_expanded.float()
229
+ ).transpose(0, 1)
230
+ emb = torch.cat((freqs, freqs), dim=-1)
231
+ cos = emb.cos() * self.attention_scaling
232
+ sin = emb.sin() * self.attention_scaling
233
+
234
+ return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
235
+
236
+ class RMSNorm(nn.Module):
237
+ def __init__(self, hidden_size, eps=1e-6):
238
+ """
239
+ RMSNorm is equivalent to T5LayerNorm
240
+ """
241
+ super().__init__()
242
+ self.weight = nn.Parameter(torch.ones(hidden_size))
243
+ self.variance_epsilon = eps
244
+
245
+ def forward(self, hidden_states):
246
+ variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
247
+ hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
248
+
249
+ if self.weight.dtype in [torch.float16, torch.bfloat16]:
250
+ hidden_states = hidden_states.to(self.weight.dtype)
251
+
252
+ return self.weight * hidden_states
253
+
254
+
255
+ LAYER_NORM = {"LayerNorm": nn.LayerNorm, "RMSNorm": RMSNorm}
256
+
257
+
258
+ class Attention(nn.Module):
259
+ def __init__(self, embed_dim, num_heads, window_size=(-1, -1), causal=False):
260
+ super().__init__()
261
+ self.embed_dim = embed_dim
262
+ self.num_heads = num_heads
263
+ self.head_dim = embed_dim // num_heads
264
+ self.window_size = window_size
265
+
266
+ self.k_proj = nn.Linear(embed_dim, embed_dim, bias=False)
267
+ self.v_proj = nn.Linear(embed_dim, embed_dim, bias=True)
268
+ self.q_proj = nn.Linear(embed_dim, embed_dim, bias=True)
269
+ self.out_proj = nn.Linear(embed_dim, embed_dim, bias=True)
270
+
271
+ self.causal = causal
272
+
273
+ def forward(
274
+ self,
275
+ hidden_states: torch.Tensor,
276
+ seq_len: torch.Tensor,
277
+ rope_position_embeddings=None,
278
+ ):
279
+ bsz, _ = hidden_states.size()
280
+
281
+ query_states = self.q_proj(hidden_states).view(
282
+ bsz, self.num_heads, self.head_dim
283
+ )
284
+ key_states = self.k_proj(hidden_states).view(bsz, self.num_heads, self.head_dim)
285
+ value_states = self.v_proj(hidden_states).view(
286
+ bsz, self.num_heads, self.head_dim
287
+ )
288
+
289
+ if rope_position_embeddings is not None:
290
+ cos, sin = rope_position_embeddings
291
+ query_states = apply_rotary_pos_emb(query_states, cos, sin)
292
+ key_states = apply_rotary_pos_emb(key_states, cos, sin)
293
+
294
+ cu_len = F.pad(torch.cumsum(seq_len, dim=0), (1, 0), "constant", 0).to(
295
+ torch.int32
296
+ )
297
+ max_seqlen = torch.max(seq_len).to(torch.int32).detach()
298
+ attn_output = flash_attn_varlen_func(
299
+ query_states,
300
+ key_states,
301
+ value_states,
302
+ cu_len,
303
+ cu_len,
304
+ max_seqlen,
305
+ max_seqlen,
306
+ causal=self.causal,
307
+ window_size=self.window_size,
308
+ )
309
+ attn_output = attn_output.reshape(bsz, self.embed_dim)
310
+ attn_output = self.out_proj(attn_output)
311
+ return attn_output
312
+
313
+
314
+ class TransformerLayer(nn.Module):
315
+ def __init__(
316
+ self,
317
+ act,
318
+ d_model,
319
+ encoder_attention_heads,
320
+ encoder_ffn_dim,
321
+ causal,
322
+ ln_type="LayerNorm",
323
+ attn_window_size=(-1, -1),
324
+ ):
325
+ super().__init__()
326
+ self.embed_dim = d_model
327
+ self.self_attn = Attention(
328
+ self.embed_dim, encoder_attention_heads, attn_window_size, causal
329
+ )
330
+
331
+ self.self_attn_layer_norm = LAYER_NORM[ln_type](self.embed_dim)
332
+
333
+ self.activation_fn = act
334
+ self.fc1 = nn.Linear(self.embed_dim, encoder_ffn_dim)
335
+ self.fc2 = nn.Linear(encoder_ffn_dim, self.embed_dim)
336
+
337
+ self.final_layer_norm = LAYER_NORM[ln_type](self.embed_dim)
338
+
339
+ def forward(
340
+ self,
341
+ hidden_states: torch.Tensor,
342
+ seq_len: torch.Tensor,
343
+ rope_position_embeddings: torch.Tensor,
344
+ ) -> torch.Tensor:
345
+ residual = hidden_states
346
+ hidden_states = self.self_attn_layer_norm(hidden_states)
347
+ hidden_states = self.self_attn(
348
+ hidden_states, seq_len, rope_position_embeddings=rope_position_embeddings
349
+ )
350
+ hidden_states = residual + hidden_states
351
+ residual = hidden_states
352
+ hidden_states = self.final_layer_norm(hidden_states)
353
+ hidden_states = self.activation_fn(self.fc1(hidden_states))
354
+ hidden_states = self.fc2(hidden_states)
355
+ hidden_states = residual + hidden_states
356
+
357
+ if (
358
+ hidden_states.dtype == torch.float16
359
+ or hidden_states.dtype == torch.bfloat16
360
+ ) and (torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any()):
361
+ clamp_value = torch.finfo(hidden_states.dtype).max - 1000
362
+ hidden_states = torch.clamp(
363
+ hidden_states, min=-clamp_value, max=clamp_value
364
+ )
365
+ return hidden_states
366
+
367
+
368
+ class TransformerVocos(nn.Module):
369
+ def __init__(self, config: MiMoAudioTokenizerConfig):
370
+ super().__init__()
371
+ self.config = config
372
+ self.max_source_positions = (
373
+ self.config.max_audio_seconds
374
+ * self.config.sampling_rate
375
+ // self.config.hop_length
376
+ )
377
+ self.embeddings = nn.Linear(config.n_mels, config.vocoder_dim, bias=False)
378
+
379
+ self.position_embedding = RotaryEmbedding(
380
+ config.rope_theta,
381
+ config.vocoder_dim // config.vocoder_attention_heads,
382
+ self.max_source_positions,
383
+ self.config.rope_type,
384
+ )
385
+
386
+ self.layers = nn.ModuleList(
387
+ [
388
+ TransformerLayer(
389
+ ACT2FN[self.config.activation_function],
390
+ self.config.vocoder_dim,
391
+ self.config.vocoder_attention_heads,
392
+ self.config.vocoder_intermediate_dim,
393
+ causal=False,
394
+ ln_type=self.config.ln_type,
395
+ attn_window_size=self.config.vocoder_attn_window_size,
396
+ )
397
+ for _ in range(self.config.vocoder_num_layers)
398
+ ]
399
+ )
400
+
401
+ self.layer_norm = LAYER_NORM[self.config.ln_type](self.config.vocoder_dim)
402
+ self.hop_size = self.config.hop_length
403
+ self.head = ISTFTHead(
404
+ self.config.vocoder_dim,
405
+ self.config.nfft,
406
+ self.config.hop_length,
407
+ self.config.vocoder_padding,
408
+ )
409
+
410
+ def forward(self, x: torch.Tensor, input_length):
411
+ x = x.transpose(1, 2)
412
+ attention_mask, unpacking_index = get_sequence_mask(x, input_length)
413
+ x = torch.masked_select(x, attention_mask).view(
414
+ torch.sum(input_length), self.config.n_mels
415
+ )
416
+ x = self.embeddings(x)
417
+ position_ids = torch.arange(0, x.size(0), device=x.device, dtype=torch.long)
418
+ rope_position_embeddings = self.position_embedding(x, position_ids)
419
+ for idx, layer in enumerate(self.layers):
420
+ x = layer(
421
+ x, input_length, rope_position_embeddings=rope_position_embeddings
422
+ )
423
+
424
+ x = self.layer_norm(x)
425
+ x = unpack_hidden_states(x, input_length, attention_mask, unpacking_index)
426
+ x = self.head(x)
427
+ output_length = input_length * self.hop_size
428
+ return x[:, None, :], output_length
429
+
430
+
431
+ class AudioEncoder(nn.Module):
432
+ def __init__(self, config: MiMoAudioTokenizerConfig):
433
+ super().__init__()
434
+ config._attn_implementation = "flash_attention_2"
435
+ self.config = config
436
+ self.max_source_positions = (
437
+ config.max_audio_seconds * config.sampling_rate // config.hop_length
438
+ ) // config.stride_size
439
+ self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0
440
+
441
+ self.skip_layer_idx = config.encoder_skip_layer_id
442
+ self.conv1 = nn.Conv1d(
443
+ config.n_mels, config.d_model, kernel_size=config.kernel_size, padding=1
444
+ )
445
+ self.conv2 = nn.Conv1d(
446
+ config.d_model,
447
+ config.d_model,
448
+ kernel_size=config.kernel_size,
449
+ stride=config.stride_size,
450
+ padding=1,
451
+ )
452
+
453
+ self.position_embedding = RotaryEmbedding(
454
+ config.rope_theta,
455
+ config.d_model // config.encoder_attention_heads,
456
+ self.max_source_positions,
457
+ config.rope_type,
458
+ )
459
+
460
+ self.layers = nn.ModuleList(
461
+ [
462
+ TransformerLayer(
463
+ ACT2FN[config.activation_function],
464
+ config.d_model,
465
+ config.encoder_attention_heads,
466
+ config.encoder_ffn_dim,
467
+ causal=self.config.encoder_causal,
468
+ ln_type=self.config.ln_type,
469
+ attn_window_size=self.config.encoder_attn_window_size,
470
+ )
471
+ for _ in range(config.encoder_layers)
472
+ ]
473
+ )
474
+
475
+ self.layer_norm = LAYER_NORM[config.ln_type](config.d_model)
476
+
477
+ if self.config.avg_pooler != 1:
478
+ self.down_sample_layer = nn.Sequential(
479
+ nn.Conv1d(
480
+ config.d_model,
481
+ config.d_model,
482
+ config.avg_pooler,
483
+ config.avg_pooler,
484
+ bias=False,
485
+ ),
486
+ nn.GELU(),
487
+ )
488
+ self.down_sample_norm = LAYER_NORM[config.ln_type](config.d_model)
489
+ else:
490
+ self.down_sample_layer = None
491
+
492
+ if self.config.num_quantizers != 0:
493
+ self.quantizer = ResidualVectorQuantizer(
494
+ dimension=self.config.d_model,
495
+ n_q=self.config.num_quantizers,
496
+ bins=self.config.codebook_size,
497
+ threshold_ema_dead_code=self.config.threshold_ema_dead_code,
498
+ )
499
+ else:
500
+ self.quantizer = None
501
+
502
+ def get_features(self, input_features, output_length):
503
+ input_features = input_features.to(self.conv1.weight)
504
+ inputs_embeds = nn.functional.gelu(self.conv1(input_features))
505
+ inputs_embeds = nn.functional.gelu(self.conv2(inputs_embeds))
506
+ inputs_embeds = inputs_embeds.permute(0, 2, 1)
507
+ bsz, tgt_len, _ = inputs_embeds.size()
508
+
509
+ hidden_states = inputs_embeds
510
+
511
+ position_ids = (
512
+ get_position_ids(output_length).long().to(input_features.device)
513
+ )
514
+ rope_position_embeddings = self.position_embedding(
515
+ input_features, position_ids
516
+ )
517
+
518
+ attention_mask, unpacking_index = get_sequence_mask(
519
+ hidden_states, output_length
520
+ )
521
+
522
+ hidden_states = torch.masked_select(hidden_states, attention_mask).view(
523
+ torch.sum(output_length), self.config.d_model
524
+ )
525
+
526
+ skip_connect_hidden_states = 0.0
527
+ for idx, encoder_layer in enumerate(self.layers):
528
+ hidden_states = encoder_layer(
529
+ hidden_states,
530
+ output_length,
531
+ rope_position_embeddings=rope_position_embeddings,
532
+ )
533
+ if (self.skip_layer_idx is not None) and idx == self.skip_layer_idx - 1:
534
+ skip_connect_hidden_states = hidden_states.clone()
535
+
536
+ hidden_states += skip_connect_hidden_states
537
+ hidden_states = self.layer_norm(hidden_states)
538
+
539
+ if self.down_sample_layer is not None:
540
+ hidden_states = torch.index_select(hidden_states, 0, unpacking_index).view(
541
+ bsz, tgt_len, self.config.d_model
542
+ )
543
+ if hidden_states.size(1) % self.config.avg_pooler:
544
+ pad_len = (
545
+ self.config.avg_pooler
546
+ - hidden_states.size(1) % self.config.avg_pooler
547
+ )
548
+ hidden_states = torch.nn.functional.pad(
549
+ hidden_states, (0, 0, 0, pad_len), mode="constant", value=0.0
550
+ )
551
+ tgt_len += pad_len
552
+ tgt_len = tgt_len // self.config.avg_pooler
553
+ hidden_states = self.down_sample_layer(hidden_states.transpose(1, 2))
554
+ output_length = (
555
+ output_length // self.config.avg_pooler
556
+ + (output_length % self.config.avg_pooler != 0).int()
557
+ )
558
+ hidden_states = hidden_states.transpose(1, 2)
559
+ attention_mask, unpacking_index = get_sequence_mask(
560
+ hidden_states, output_length
561
+ )
562
+ hidden_states = torch.masked_select(hidden_states, attention_mask).view(
563
+ torch.sum(output_length), self.config.d_model
564
+ )
565
+ hidden_states = self.down_sample_norm(hidden_states)
566
+
567
+ return (
568
+ hidden_states,
569
+ output_length,
570
+ attention_mask,
571
+ unpacking_index,
572
+ tgt_len,
573
+ bsz,
574
+ )
575
+
576
+ def get_output_length(self, mel_len):
577
+ tgt_len = mel_len + 3 - self.config.kernel_size
578
+ return (tgt_len + 2 - self.config.kernel_size) // self.config.stride_size + 1
579
+
580
+ @torch.no_grad()
581
+ def encode(
582
+ self,
583
+ input_features,
584
+ input_lens=None,
585
+ output_length=None,
586
+ return_codes_only=False,
587
+ n_q=None,
588
+ use_quantizer=True,
589
+ ):
590
+ if output_length is None:
591
+ output_length = self.get_output_length(input_lens)
592
+ input_features = unpack_hidden_states(input_features, input_lens)
593
+ hidden_states, output_length, attention_mask, unpacking_index, tgt_len, bsz = (
594
+ self.get_features(
595
+ input_features=input_features.transpose(1, 2),
596
+ output_length=output_length,
597
+ )
598
+ )
599
+
600
+ dtype = hidden_states.dtype
601
+
602
+ if use_quantizer and self.quantizer is not None:
603
+ self.quantizer.float()
604
+
605
+ codes = self.quantizer.encode(hidden_states.float(), n_q=n_q)
606
+ if return_codes_only:
607
+ return codes, output_length
608
+ hidden_states = self.quantizer.decode(codes)
609
+ hidden_states = hidden_states.to(dtype)
610
+ else:
611
+ codes = None
612
+
613
+ hidden_states_packed = hidden_states.clone()
614
+
615
+ # unpacking
616
+ hidden_states = torch.index_select(hidden_states, 0, unpacking_index).view(
617
+ bsz, tgt_len, self.config.d_model
618
+ )
619
+ hidden_states = torch.where(attention_mask, hidden_states, 0)
620
+ return hidden_states, hidden_states_packed, output_length, codes
621
+
622
+ @torch.no_grad()
623
+ def decode_vq(self, codes):
624
+ self.quantizer.float()
625
+ hidden_states = self.quantizer.decode(codes)
626
+
627
+ return hidden_states
628
+
629
+
630
+ class CausalConvTranspose1d(nn.Module):
631
+ def __init__(self, in_channels, out_channels, kernel_size, stride):
632
+ super().__init__()
633
+ self.conv = nn.ConvTranspose1d(in_channels, out_channels, kernel_size, stride)
634
+ self.norm = nn.GroupNorm(1, out_channels)
635
+ self.in_channels = in_channels
636
+ self.out_channels = out_channels
637
+
638
+ def forward(self, hidden_states, input_length, output_dim=None):
639
+ kernel_size = self.conv.kernel_size[0]
640
+ stride = self.conv.stride[0]
641
+ bsz = input_length.shape[0]
642
+
643
+ if output_dim is None:
644
+ output_dim = hidden_states.dim()
645
+ if hidden_states.dim() <= 2: # unpack sequence to 3d
646
+ sequence_mask, unpacking_index = get_sequence_mask(
647
+ hidden_states, input_length
648
+ )
649
+ hidden_states = torch.index_select(hidden_states, 0, unpacking_index).view(
650
+ bsz, torch.max(input_length), self.in_channels
651
+ )
652
+ hidden_states = torch.where(sequence_mask, hidden_states, 0)
653
+
654
+ hidden_states = hidden_states.transpose(2, 1) # (N, L, C) -> (N, C, L)
655
+ hidden_states = self.conv(hidden_states)
656
+ hidden_states = self.norm(hidden_states)
657
+ hidden_states = hidden_states.transpose(2, 1) # (N, C, L) -> (N, L, C)
658
+
659
+ causal_padding_right = max(0, kernel_size - stride)
660
+ hidden_states = hidden_states[
661
+ :, : hidden_states.shape[1] - causal_padding_right, :
662
+ ]
663
+ output_length = (input_length - 1) * stride + kernel_size - causal_padding_right
664
+ sequence_mask, _ = get_sequence_mask(hidden_states, output_length)
665
+ if output_dim <= 2:
666
+ hidden_states = torch.masked_select(hidden_states, sequence_mask).view(
667
+ -1, self.out_channels
668
+ )
669
+ else:
670
+ hidden_states = torch.where(sequence_mask, hidden_states, 0)
671
+ hidden_states = hidden_states[:, : torch.max(output_length), :]
672
+ return hidden_states, output_length
673
+
674
+
675
+ class AudioDecoder(nn.Module):
676
+ def __init__(self, config: MiMoAudioTokenizerConfig):
677
+ super().__init__()
678
+ self.config = config
679
+ self.max_source_positions = (
680
+ self.config.max_audio_seconds
681
+ * self.config.sampling_rate
682
+ // self.config.hop_length
683
+ )
684
+
685
+ if self.config.avg_pooler != 1:
686
+ self.dconv1 = CausalConvTranspose1d(
687
+ self.config.d_model,
688
+ self.config.d_model,
689
+ self.config.avg_pooler,
690
+ self.config.avg_pooler,
691
+ )
692
+ else:
693
+ self.dconv1 = None
694
+
695
+ self.position_embedding = RotaryEmbedding(
696
+ config.rope_theta,
697
+ config.d_model // config.decoder_attention_heads,
698
+ self.max_source_positions,
699
+ config.rope_type,
700
+ )
701
+
702
+ self.layers = nn.ModuleList(
703
+ [
704
+ TransformerLayer(
705
+ ACT2FN[self.config.activation_function],
706
+ self.config.d_model,
707
+ self.config.decoder_attention_heads,
708
+ self.config.decoder_ffn_dim,
709
+ causal=self.config.decoder_causal,
710
+ ln_type=self.config.ln_type,
711
+ attn_window_size=self.config.decoder_attn_window_size,
712
+ )
713
+ for _ in range(self.config.decoder_layers)
714
+ ]
715
+ )
716
+ self.layer_norm = LAYER_NORM[config.ln_type](self.config.d_model)
717
+ self.dconv2 = CausalConvTranspose1d(
718
+ self.config.d_model,
719
+ self.config.n_mels,
720
+ self.config.decoder_kernel_size,
721
+ self.config.decoder_stride_size,
722
+ )
723
+ self.vocoder = TransformerVocos(config)
724
+
725
+ def forward(
726
+ self,
727
+ audio_embed,
728
+ input_length,
729
+ ):
730
+ assert audio_embed.shape[-1] == self.config.d_model
731
+ audio_embed = audio_embed.to(self.layer_norm.weight)
732
+
733
+ if self.dconv1 is not None:
734
+ audio_embed, output_length = self.dconv1(
735
+ audio_embed, input_length, output_dim=3
736
+ )
737
+ _, tgt_len, _ = audio_embed.size()
738
+ else:
739
+ output_length = input_length
740
+ tgt_len = audio_embed.size(0)
741
+
742
+ hidden_states = audio_embed
743
+
744
+ position_ids = (
745
+ get_position_ids(output_length).long().to(hidden_states.device)
746
+ )
747
+ rope_position_embeddings = self.position_embedding(
748
+ hidden_states, position_ids
749
+ )
750
+
751
+
752
+ # packing hidden states
753
+ attention_mask, _ = get_sequence_mask(hidden_states, output_length)
754
+ hidden_states = torch.masked_select(hidden_states, attention_mask).view(
755
+ torch.sum(output_length), self.config.d_model
756
+ )
757
+
758
+ for idx, encoder_layer in enumerate(self.layers):
759
+ hidden_states = encoder_layer(
760
+ hidden_states,
761
+ output_length,
762
+ rope_position_embeddings=rope_position_embeddings,
763
+ )
764
+
765
+ hidden_states = self.layer_norm(hidden_states)
766
+
767
+ coarse_mel, output_length = self.dconv2(
768
+ hidden_states, output_length, output_dim=3
769
+ )
770
+
771
+ recon_wav, wav_length = self.vocoder(
772
+ x=coarse_mel.transpose(1, 2),
773
+ input_length=output_length,
774
+ )
775
+
776
+ return recon_wav
777
+
778
+
779
+ class MiMoAudioTokenizer(PreTrainedModel):
780
+ config_class = MiMoAudioTokenizerConfig
781
+
782
+ def __init__(self, config: MiMoAudioTokenizerConfig):
783
+ super().__init__(config)
784
+ self.config = config
785
+ self.sampling_rate = config.sampling_rate
786
+ self.encoder = AudioEncoder(config=config)
787
+ self.decoder = AudioDecoder(config=config)
788
+ self.downsample_rate = int(self.config.hop_length * 2 * self.config.avg_pooler)
789
+
790
+ def get_output_length(self, mel_len):
791
+ tgt_len = mel_len + 3 - self.config.kernel_size
792
+ return (tgt_len + 2 - self.config.kernel_size) // self.config.stride_size + 1
793
+
794
+ @torch.no_grad()
795
+ def encode(self, mels, input_lens, use_quantizer=True):
796
+ input_features = mels
797
+ encoder_output_length = self.get_output_length(input_lens)
798
+ hidden_states, hidden_states_packed, encoder_output_length, codes = (
799
+ self.encoder.encode(
800
+ input_features, input_lens=input_lens, use_quantizer=use_quantizer
801
+ )
802
+ )
803
+ return hidden_states, hidden_states_packed, encoder_output_length, codes
804
+
805
+ @torch.no_grad()
806
+ def decode(self, codes):
807
+ hidden_states = self.encoder.decode_vq(codes)
808
+ output = self.decoder(
809
+ hidden_states,
810
+ torch.tensor([hidden_states.size(0)], device=hidden_states.device),
811
+ )
812
+ return output
813
+
814
+ @torch.no_grad()
815
+ def streaming_decode(self, codes_chunks, chunk_input_lengths, history_cache=StreamingCache(), streaming_config=StreamingConfig(), last_chunk=False):
816
+ hidden_states = self.encoder.decode_vq(codes_chunks)
817
+ input_lengths = []
818
+ input_hidden_states = []
819
+ start_idx = 0
820
+ cache_hidden_states = []
821
+ for i, input_length in enumerate(chunk_input_lengths):
822
+ sample_hidden_states = hidden_states[start_idx:start_idx + input_length]
823
+ start_idx += input_length
824
+ if history_cache.hidden_states is not None:
825
+ sample_hidden_states = torch.cat([history_cache.hidden_states[i], sample_hidden_states], dim=0)
826
+ input_length += history_cache.hidden_states[i].size(0)
827
+ input_hidden_states.append(sample_hidden_states)
828
+ cache_hidden_states.append(sample_hidden_states.clone())
829
+ input_lengths.append(input_length)
830
+ input_hidden_states = torch.cat(input_hidden_states, dim=0)
831
+ input_lengths = torch.tensor(input_lengths, device=hidden_states.device)
832
+ output = self.decoder(input_hidden_states, input_lengths)
833
+ return_wavs = []
834
+ frames_per_token = self.config.avg_pooler * self.config.stride_size * self.config.hop_length
835
+ processed_lengths = []
836
+ for i, wav in enumerate(output):
837
+ wav = wav.float().detach().cpu()
838
+ start_idx = history_cache.processed_lengths[i] if history_cache.processed_lengths is not None else 0
839
+ if last_chunk:
840
+ return_wavs.append(wav[:, start_idx * frames_per_token:])
841
+ new_processed_length = input_lengths[i].item()
842
+ elif input_lengths[i].item() <= streaming_config.right_overlap:
843
+ return_wavs.append(None)
844
+ new_processed_length = 0
845
+ else:
846
+ end_idx = (input_lengths[i].item() - streaming_config.right_overlap)
847
+ wav = wav[:, start_idx * frames_per_token: end_idx * frames_per_token]
848
+ return_wavs.append(wav)
849
+ new_processed_length = end_idx
850
+ if input_lengths[i].item() > streaming_config.left_overlap:
851
+ cache_hidden_states[i] = cache_hidden_states[i][-streaming_config.left_overlap:]
852
+ new_processed_length -= (input_lengths[i].item() - streaming_config.left_overlap)
853
+ processed_lengths.append(new_processed_length)
854
+ history_cache.hidden_states = cache_hidden_states
855
+ history_cache.processed_lengths = processed_lengths
856
+
857
+ return return_wavs, history_cache
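For orientation, here is a minimal usage sketch of the tokenizer API above (illustrative only, not part of this commit). It assumes `tokenizer` is a loaded `MiMoAudioTokenizer`, that `mels`/`input_lens` come from the repository's mel feature extractor (not shown in this hunk), and that `StreamingCache`/`StreamingConfig` are the helpers defined earlier in this file.

def roundtrip(tokenizer, mels, input_lens):
    # encode() returns the continuous encoder states plus the RVQ codes (use_quantizer=True is the default).
    hidden, hidden_packed, enc_lens, codes = tokenizer.encode(mels, input_lens)
    # decode() maps the codes back through decode_vq -> AudioDecoder -> vocoder waveform.
    recon_wav = tokenizer.decode(codes)
    return codes, recon_wav

def stream(tokenizer, code_chunks, chunk_lengths):
    # Feed code chunks incrementally; the returned cache carries the left/right overlap
    # context between calls (see streaming_decode above).
    cache, cfg, wav_pieces = StreamingCache(), StreamingConfig(), []
    for i, (codes, lengths) in enumerate(zip(code_chunks, chunk_lengths)):
        pieces, cache = tokenizer.streaming_decode(
            codes, lengths,
            history_cache=cache,
            streaming_config=cfg,
            last_chunk=(i == len(code_chunks) - 1),
        )
        wav_pieces.append(pieces)
    return wav_pieces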
src/mimo_audio_tokenizer/modeling_rope_utils.py ADDED
@@ -0,0 +1,878 @@
1
+ # Copyright 2025 Xiaomi Corporation.
2
+ # Copyright 2024 The HuggingFace Team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import math
17
+ from functools import wraps
18
+ from typing import Optional
19
+
20
+ from transformers.configuration_utils import PretrainedConfig
21
+ from transformers.utils import is_torch_available, logging
22
+
23
+
24
+ logger = logging.get_logger(__name__)
25
+
26
+
27
+ if is_torch_available():
28
+ import torch
29
+
30
+
31
+ def dynamic_rope_update(rope_forward):
32
+ """
33
+ Decorator function to update the RoPE parameters in the forward pass, if the model is using a dynamic RoPE
34
+ (i.e. a RoPE implementation that may recompute its frequencies in the forward pass).
35
+
36
+ Args:
37
+ rope_forward (Callable):
38
+ The forward pass of the RoPE implementation.
39
+
40
+ Returns:
41
+ The decorated forward pass.
42
+ """
43
+
44
+ def longrope_frequency_update(self, position_ids, device):
45
+ """Longrope uses long factor if sequence is larger than original pretraining length, short otherwise."""
46
+ seq_len = torch.max(position_ids) + 1
47
+ if hasattr(self.config, "original_max_position_embeddings"):
48
+ original_max_position_embeddings = (
49
+ self.config.original_max_position_embeddings
50
+ )
51
+ else:
52
+ original_max_position_embeddings = self.config.max_position_embeddings
53
+ if seq_len > original_max_position_embeddings:
54
+ if not hasattr(self, "long_inv_freq"):
55
+ self.long_inv_freq, _ = self.rope_init_fn(
56
+ self.config, device, seq_len=original_max_position_embeddings + 1
57
+ )
58
+ self.register_buffer("inv_freq", self.long_inv_freq, persistent=False)
59
+ else:
60
+ # This .to() is needed if the model has been moved to a device after being initialized (because
61
+ # the buffer is automatically moved, but not the original copy)
62
+ self.original_inv_freq = self.original_inv_freq.to(device)
63
+ self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
64
+
65
+ def dynamic_frequency_update(self, position_ids, device):
66
+ """
67
+ dynamic RoPE layers should recompute `inv_freq` in the following situations:
68
+ 1 - growing beyond the cached sequence length (allow scaling)
69
+ 2 - the current sequence length is in the original scale (avoid losing precision with small sequences)
70
+ """
71
+ seq_len = torch.max(position_ids) + 1
72
+ if seq_len > self.max_seq_len_cached: # growth
73
+ inv_freq, self.attention_scaling = self.rope_init_fn(
74
+ self.config, device, seq_len=seq_len
75
+ )
76
+ self.register_buffer(
77
+ "inv_freq", inv_freq, persistent=False
78
+ ) # TODO joao: may break with compilation
79
+ self.max_seq_len_cached = seq_len
80
+
81
+ if (
82
+ seq_len < self.original_max_seq_len
83
+ and self.max_seq_len_cached > self.original_max_seq_len
84
+ ): # reset
85
+ # This .to() is needed if the model has been moved to a device after being initialized (because
86
+ # the buffer is automatically moved, but not the original copy)
87
+ self.original_inv_freq = self.original_inv_freq.to(device)
88
+ self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
89
+ self.max_seq_len_cached = self.original_max_seq_len
90
+
91
+ @wraps(rope_forward)
92
+ def wrapper(self, x, position_ids):
93
+ if "dynamic" in self.rope_type:
94
+ dynamic_frequency_update(self, position_ids, device=x.device)
95
+ elif self.rope_type == "longrope":
96
+ longrope_frequency_update(self, position_ids, device=x.device)
97
+ return rope_forward(self, x, position_ids)
98
+
99
+ return wrapper
100
+
101
+
102
+ def _compute_default_rope_parameters(
103
+ config: Optional[PretrainedConfig] = None,
104
+ device: Optional["torch.device"] = None,
105
+ seq_len: Optional[int] = None,
106
+ **rope_kwargs,
107
+ ) -> tuple["torch.Tensor", float]:
108
+ """
109
+ Computes the inverse frequencies according to the original RoPE implementation
110
+ Args:
111
+ config ([`~transformers.PretrainedConfig`]):
112
+ The model configuration.
113
+ device (`torch.device`):
114
+ The device to use for initialization of the inverse frequencies.
115
+ seq_len (`int`, *optional*):
116
+ The current sequence length. Unused for this type of RoPE.
117
+ rope_kwargs (`Dict`, *optional*):
118
+ BC compatibility with the previous RoPE class instantiation, will be removed in v4.45.
119
+ Returns:
120
+ Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
121
+ post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
122
+ """
123
+ if config is not None and len(rope_kwargs) > 0:
124
+ raise ValueError(
125
+ "Unexpected arguments: `**rope_kwargs` and `config` are mutually exclusive in "
126
+ f"`_compute_default_rope_parameters`, got `rope_kwargs`={rope_kwargs} and `config`={config}"
127
+ )
128
+ if len(rope_kwargs) > 0:
129
+ base = rope_kwargs["base"]
130
+ dim = rope_kwargs["dim"]
131
+ elif config is not None:
132
+ base = config.rope_theta
133
+ partial_rotary_factor = (
134
+ config.partial_rotary_factor
135
+ if hasattr(config, "partial_rotary_factor")
136
+ else 1.0
137
+ )
138
+ head_dim = (
139
+ getattr(config, "head_dim", None)
140
+ or config.hidden_size // config.num_attention_heads
141
+ )
142
+ dim = int(head_dim * partial_rotary_factor)
143
+
144
+ attention_factor = 1.0 # Unused in this type of RoPE
145
+
146
+ # Compute the inverse frequencies
147
+ inv_freq = 1.0 / (
148
+ base
149
+ ** (
150
+ torch.arange(0, dim, 2, dtype=torch.int64).to(
151
+ device=device, dtype=torch.float
152
+ )
153
+ / dim
154
+ )
155
+ )
156
+ return inv_freq, attention_factor
157
+
158
+
159
+ def _compute_linear_scaling_rope_parameters(
160
+ config: Optional[PretrainedConfig] = None,
161
+ device: Optional["torch.device"] = None,
162
+ seq_len: Optional[int] = None,
163
+ **rope_kwargs,
164
+ ) -> tuple["torch.Tensor", float]:
165
+ """
166
+ Computes the inverse frequencies with linear scaling. Credits to the Reddit user /u/kaiokendev
167
+ Args:
168
+ config ([`~transformers.PretrainedConfig`]):
169
+ The model configuration.
170
+ device (`torch.device`):
171
+ The device to use for initialization of the inverse frequencies.
172
+ seq_len (`int`, *optional*):
173
+ The current sequence length. Unused for this type of RoPE.
174
+ rope_kwargs (`Dict`, *optional*):
175
+ BC compatibility with the previous RoPE class instantiation, will be removed in v4.45.
176
+ Returns:
177
+ Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
178
+ post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
179
+ """
180
+ if config is not None and len(rope_kwargs) > 0:
181
+ raise ValueError(
182
+ "Unexpected arguments: `**rope_kwargs` and `config` are mutually exclusive in "
183
+ f"`_compute_linear_scaling_rope_parameters`, got `rope_kwargs`={rope_kwargs} and `config`={config}"
184
+ )
185
+ if len(rope_kwargs) > 0:
186
+ factor = rope_kwargs["factor"]
187
+ elif config is not None:
188
+ factor = config.rope_scaling["factor"]
189
+
190
+ # Gets the default RoPE parameters
191
+ inv_freq, attention_factor = _compute_default_rope_parameters(
192
+ config, device, seq_len, **rope_kwargs
193
+ )
194
+
195
+ # Then applies linear scaling to the frequencies.
196
+ # NOTE: originally, scaling was applied to the position_ids. However, we get `embs = inv_freq @ position_ids`, so
197
+ # applying scaling to the inverse frequencies is equivalent.
198
+ inv_freq /= factor
199
+ return inv_freq, attention_factor
200
+
201
+
202
+ def _compute_dynamic_ntk_parameters(
203
+ config: Optional[PretrainedConfig] = None,
204
+ device: Optional["torch.device"] = None,
205
+ seq_len: Optional[int] = None,
206
+ **rope_kwargs,
207
+ ) -> tuple["torch.Tensor", float]:
208
+ """
209
+ Computes the inverse frequencies with NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla
210
+ Args:
211
+ config ([`~transformers.PretrainedConfig`]):
212
+ The model configuration.
213
+ device (`torch.device`):
214
+ The device to use for initialization of the inverse frequencies.
215
+ seq_len (`int`, *optional*):
216
+ The current sequence length, used to update the dynamic RoPE at inference time.
217
+ rope_kwargs (`Dict`, *optional*):
218
+ BC compatibility with the previous RoPE class instantiation, will be removed in v4.45.
219
+ Returns:
220
+ Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
221
+ post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
222
+ """
223
+ # TODO (joao): use the new `original_max_position_embeddings` from rope_scaling
224
+ if config is not None and len(rope_kwargs) > 0:
225
+ raise ValueError(
226
+ "Unexpected arguments: `**rope_kwargs` and `config` are mutually exclusive in "
227
+ f"`_compute_dynamic_ntk_parameters`, got `rope_kwargs`={rope_kwargs} and `config`={config}"
228
+ )
229
+ if len(rope_kwargs) > 0:
230
+ base = rope_kwargs["base"]
231
+ dim = rope_kwargs["dim"]
232
+ max_position_embeddings = rope_kwargs["max_position_embeddings"]
233
+ factor = rope_kwargs["factor"]
234
+ elif config is not None:
235
+ base = config.rope_theta
236
+ partial_rotary_factor = (
237
+ config.partial_rotary_factor
238
+ if hasattr(config, "partial_rotary_factor")
239
+ else 1.0
240
+ )
241
+ head_dim = getattr(
242
+ config, "head_dim", config.hidden_size // config.num_attention_heads
243
+ )
244
+ dim = int(head_dim * partial_rotary_factor)
245
+ max_position_embeddings = config.max_position_embeddings
246
+ factor = config.rope_scaling["factor"]
247
+
248
+ attention_factor = 1.0 # Unused in this type of RoPE
249
+
250
+ # seq_len: default to max_position_embeddings, e.g. at init time
251
+ seq_len = (
252
+ seq_len
253
+ if seq_len is not None and seq_len > max_position_embeddings
254
+ else max_position_embeddings
255
+ )
256
+
257
+ # Compute the inverse frequencies
258
+ base = base * ((factor * seq_len / max_position_embeddings) - (factor - 1)) ** (
259
+ dim / (dim - 2)
260
+ )
261
+ inv_freq = 1.0 / (
262
+ base
263
+ ** (
264
+ torch.arange(0, dim, 2, dtype=torch.int64).to(
265
+ device=device, dtype=torch.float
266
+ )
267
+ / dim
268
+ )
269
+ )
270
+ return inv_freq, attention_factor
271
+
272
+
273
+ def _compute_yarn_parameters(
274
+ config: PretrainedConfig,
275
+ device: "torch.device",
276
+ seq_len: Optional[int] = None,
277
+ **rope_kwargs,
278
+ ) -> tuple["torch.Tensor", float]:
279
+ """
280
+ Computes the inverse frequencies with NTK scaling. Please refer to the
281
+ [original paper](https://huggingface.co/papers/2309.00071)
282
+ Args:
283
+ config ([`~transformers.PretrainedConfig`]):
284
+ The model configuration.
285
+ device (`torch.device`):
286
+ The device to use for initialization of the inverse frequencies.
287
+ seq_len (`int`, *optional*):
288
+ The current sequence length. Unused for this type of RoPE.
289
+ rope_kwargs (`Dict`, *optional*):
290
+ BC compatibility with the previous RoPE class instantiation, will be removed in v4.45.
291
+ Returns:
292
+ Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
293
+ post-processing scaling factor applied to the computed cos/sin.
294
+ """
295
+ # No need to keep BC with yarn, unreleased when this new pattern was created.
296
+ if len(rope_kwargs) > 0:
297
+ raise ValueError(
298
+ f"Unexpected arguments: `**rope_kwargs` should be unset in `_compute_yarn_parameters`, got {rope_kwargs}"
299
+ )
300
+
301
+ base = config.rope_theta
302
+ partial_rotary_factor = (
303
+ config.partial_rotary_factor
304
+ if hasattr(config, "partial_rotary_factor")
305
+ else 1.0
306
+ )
307
+ head_dim = getattr(
308
+ config, "head_dim", config.hidden_size // config.num_attention_heads
309
+ )
310
+ dim = int(head_dim * partial_rotary_factor)
311
+ factor = config.rope_scaling["factor"]
312
+ attention_factor = config.rope_scaling.get("attention_factor")
313
+ mscale = config.rope_scaling.get("mscale")
314
+ mscale_all_dim = config.rope_scaling.get("mscale_all_dim")
315
+
316
+ # NOTE: DeepSeek-V3 (and potentially other models) modify `max_position_embeddings` and have a
317
+ # `original_max_position_embeddings` field containing the pretrained value. They use the ratio between these two
318
+ # values to compute the default attention scaling factor, instead of using `factor`.
319
+ if "original_max_position_embeddings" in config.rope_scaling:
320
+ original_max_position_embeddings = config.rope_scaling[
321
+ "original_max_position_embeddings"
322
+ ]
323
+ factor = config.max_position_embeddings / original_max_position_embeddings
324
+ else:
325
+ original_max_position_embeddings = config.max_position_embeddings
326
+
327
+ def get_mscale(scale, mscale=1):
328
+ if scale <= 1:
329
+ return 1.0
330
+ return 0.1 * mscale * math.log(scale) + 1.0
331
+
332
+ # Sets the attention factor as suggested in the paper
333
+ if attention_factor is None:
334
+ if mscale and mscale_all_dim:
335
+ attention_factor = float(
336
+ get_mscale(factor, mscale) / get_mscale(factor, mscale_all_dim)
337
+ )
338
+ else:
339
+ attention_factor = get_mscale(factor)
340
+
341
+ # Optional config options
342
+ # beta_fast/beta_slow: as suggested in the paper, default to 32/1 (correspondingly)
343
+ beta_fast = config.rope_scaling.get("beta_fast") or 32
344
+ beta_slow = config.rope_scaling.get("beta_slow") or 1
345
+
346
+ # Compute the inverse frequencies
347
+ def find_correction_dim(num_rotations, dim, base, max_position_embeddings):
348
+ """Inverse dimension formula to find the dimension based on the number of rotations"""
349
+ return (
350
+ dim * math.log(max_position_embeddings / (num_rotations * 2 * math.pi))
351
+ ) / (2 * math.log(base))
352
+
353
+ def find_correction_range(low_rot, high_rot, dim, base, max_position_embeddings):
354
+ """Find dimension range bounds based on rotations"""
355
+ low = math.floor(
356
+ find_correction_dim(low_rot, dim, base, max_position_embeddings)
357
+ )
358
+ high = math.ceil(
359
+ find_correction_dim(high_rot, dim, base, max_position_embeddings)
360
+ )
361
+ return max(low, 0), min(high, dim - 1)
362
+
363
+ def linear_ramp_factor(min, max, dim):
364
+ if min == max:
365
+ max += 0.001 # Prevent singularity
366
+
367
+ linear_func = (torch.arange(dim, dtype=torch.float32) - min) / (max - min)
368
+ ramp_func = torch.clamp(linear_func, 0, 1)
369
+ return ramp_func
370
+
371
+ # Note on variable naming: "interpolation" comes from the original technique, where we interpolate the position IDs
372
+ # to expand the possible context length. In other words, interpolation = apply scaling factor.
373
+ pos_freqs = base ** (
374
+ torch.arange(0, dim, 2).to(device=device, dtype=torch.float) / dim
375
+ )
376
+ inv_freq_extrapolation = 1.0 / pos_freqs
377
+ inv_freq_interpolation = 1.0 / (factor * pos_freqs)
378
+
379
+ low, high = find_correction_range(
380
+ beta_fast, beta_slow, dim, base, original_max_position_embeddings
381
+ )
382
+
383
+ # Get n-dimensional rotational scaling corrected for extrapolation
384
+ inv_freq_extrapolation_factor = 1 - linear_ramp_factor(low, high, dim // 2).to(
385
+ device=device, dtype=torch.float
386
+ )
387
+ inv_freq = (
388
+ inv_freq_interpolation * (1 - inv_freq_extrapolation_factor)
389
+ + inv_freq_extrapolation * inv_freq_extrapolation_factor
390
+ )
391
+ return inv_freq, attention_factor
392
+
393
+
394
+ def _compute_longrope_parameters(
395
+ config: PretrainedConfig,
396
+ device: "torch.device",
397
+ seq_len: Optional[int] = None,
398
+ **rope_kwargs,
399
+ ) -> tuple["torch.Tensor", float]:
400
+ """
401
+ Computes the inverse frequencies with LongRoPE scaling. Please refer to the
402
+ [original implementation](https://github.com/microsoft/LongRoPE)
403
+ Args:
404
+ config ([`~transformers.PretrainedConfig`]):
405
+ The model configuration.
406
+ device (`torch.device`):
407
+ The device to use for initialization of the inverse frequencies.
408
+ seq_len (`int`, *optional*):
409
+ The current sequence length.
410
+ rope_kwargs (`Dict`, *optional*):
411
+ BC compatibility with the previous RoPE class instantiation, will be removed in v4.45.
412
+ Returns:
413
+ Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
414
+ post-processing scaling factor applied to the computed cos/sin.
415
+ """
416
+ # TODO (joao): use the new `original_max_position_embeddings` from rope_scaling
417
+ # No need to keep BC with longrope, unreleased when this new pattern was created.
418
+ if len(rope_kwargs) > 0:
419
+ raise ValueError(
420
+ "Unexpected arguments: `**rope_kwargs` should be unset in `_compute_longrope_parameters`, got "
421
+ f"{rope_kwargs}"
422
+ )
423
+
424
+ base = config.rope_theta
425
+ partial_rotary_factor = (
426
+ config.partial_rotary_factor
427
+ if hasattr(config, "partial_rotary_factor")
428
+ else 1.0
429
+ )
430
+ head_dim = getattr(
431
+ config, "head_dim", config.hidden_size // config.num_attention_heads
432
+ )
433
+ dim = int(head_dim * partial_rotary_factor)
434
+ long_factor = config.rope_scaling["long_factor"]
435
+ short_factor = config.rope_scaling["short_factor"]
436
+ factor = config.rope_scaling.get("factor")
437
+ attention_factor = config.rope_scaling.get("attention_factor")
438
+
439
+ # NOTE: Phi3 (and potentially other models) modify `max_position_embeddings` and have a
440
+ # `original_max_position_embeddings` field containing the pretrained value. They use the ratio between these two
441
+ # values to compute the default attention scaling factor, instead of using `factor`.
442
+ if hasattr(config, "original_max_position_embeddings"):
443
+ original_max_position_embeddings = config.original_max_position_embeddings
444
+ factor = (
445
+ config.max_position_embeddings / config.original_max_position_embeddings
446
+ )
447
+ else:
448
+ original_max_position_embeddings = config.max_position_embeddings
449
+
450
+ # Sets the attention factor as suggested in the paper
451
+ if attention_factor is None:
452
+ if factor <= 1.0:
453
+ attention_factor = 1.0
454
+ else:
455
+ attention_factor = math.sqrt(
456
+ 1 + math.log(factor) / math.log(original_max_position_embeddings)
457
+ )
458
+
459
+ # Compute the inverse frequencies -- scaled based on the target sequence length
460
+ if seq_len and seq_len > original_max_position_embeddings:
461
+ ext_factors = torch.tensor(long_factor, dtype=torch.float32, device=device)
462
+ else:
463
+ ext_factors = torch.tensor(short_factor, dtype=torch.float32, device=device)
464
+ inv_freq_shape = (
465
+ torch.arange(0, dim, 2, dtype=torch.int64, device=device).float() / dim
466
+ )
467
+ inv_freq = 1.0 / (ext_factors * base**inv_freq_shape)
468
+
469
+ return inv_freq, attention_factor
470
+
471
+
472
+ def _compute_llama3_parameters(
473
+ config: PretrainedConfig,
474
+ device: "torch.device",
475
+ seq_len: Optional[int] = None,
476
+ **rope_kwargs,
477
+ ) -> tuple["torch.Tensor", float]:
478
+ """
479
+ Computes the inverse frequencies for llama 3.1.
480
+
481
+ Args:
482
+ config ([`~transformers.PretrainedConfig`]):
483
+ The model configuration.
484
+ device (`torch.device`):
485
+ The device to use for initialization of the inverse frequencies.
486
+ seq_len (`int`, *optional*):
487
+ The current sequence length. Unused for this type of RoPE.
488
+ rope_kwargs (`Dict`, *optional*):
489
+ BC compatibility with the previous RoPE class instantiation, will be removed in v4.45.
490
+ Returns:
491
+ Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
492
+ post-processing scaling factor applied to the computed cos/sin.
493
+ """
494
+ # Gets the default RoPE parameters
495
+ inv_freq, attention_factor = _compute_default_rope_parameters(
496
+ config, device, seq_len, **rope_kwargs
497
+ )
498
+
499
+ factor = config.rope_scaling["factor"] # `8` in the original implementation
500
+ low_freq_factor = config.rope_scaling[
501
+ "low_freq_factor"
502
+ ] # `1` in the original implementation
503
+ high_freq_factor = config.rope_scaling[
504
+ "high_freq_factor"
505
+ ] # `4` in the original implementation
506
+ old_context_len = config.rope_scaling[
507
+ "original_max_position_embeddings"
508
+ ] # `8192` in the original implementation
509
+
510
+ low_freq_wavelen = old_context_len / low_freq_factor
511
+ high_freq_wavelen = old_context_len / high_freq_factor
512
+
513
+ wavelen = 2 * math.pi / inv_freq
514
+ # wavelen < high_freq_wavelen: do nothing
515
+ # wavelen > low_freq_wavelen: divide by factor
516
+ inv_freq_llama = torch.where(
517
+ wavelen > low_freq_wavelen, inv_freq / factor, inv_freq
518
+ )
519
+ # otherwise: interpolate between the two, using a smooth factor
520
+ smooth_factor = (old_context_len / wavelen - low_freq_factor) / (
521
+ high_freq_factor - low_freq_factor
522
+ )
523
+ smoothed_inv_freq = (
524
+ 1 - smooth_factor
525
+ ) * inv_freq_llama / factor + smooth_factor * inv_freq_llama
526
+ is_medium_freq = ~(wavelen < high_freq_wavelen) * ~(wavelen > low_freq_wavelen)
527
+ inv_freq_llama = torch.where(is_medium_freq, smoothed_inv_freq, inv_freq_llama)
528
+
529
+ return inv_freq_llama, attention_factor
530
+
531
+
532
+ # This maps the "rope_type" string field in rope config to the corresponding function to compute the RoPE parameters
533
+ # from the model config. You can append new {'rope_type': callable} pairs to this dictionary to enable custom RoPE
534
+ # parameterizations, as long as the callable has the same signature.
535
+ ROPE_INIT_FUNCTIONS = {
536
+ "default": _compute_default_rope_parameters,
537
+ "linear": _compute_linear_scaling_rope_parameters,
538
+ "dynamic": _compute_dynamic_ntk_parameters,
539
+ "yarn": _compute_yarn_parameters,
540
+ "longrope": _compute_longrope_parameters,
541
+ "llama3": _compute_llama3_parameters,
542
+ }
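As the comment above notes, this mapping is extensible. A hypothetical sketch of registering a custom type (the `my_linear` name and its halved frequencies are illustrative, not part of this commit):

# Hypothetical custom rope_type: reuse the default frequencies, then halve them.
def _compute_my_linear_parameters(config=None, device=None, seq_len=None, **rope_kwargs):
    inv_freq, attention_factor = _compute_default_rope_parameters(
        config, device, seq_len, **rope_kwargs
    )
    return inv_freq / 2.0, attention_factor

ROPE_INIT_FUNCTIONS["my_linear"] = _compute_my_linear_parameters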
543
+
544
+
545
+ def _check_received_keys(
546
+ rope_type: str,
547
+ received_keys: set,
548
+ required_keys: set,
549
+ optional_keys: Optional[set] = None,
550
+ ignore_keys: Optional[set] = None,
551
+ ):
552
+ """Compare the received keys in `config.rope_scaling` against the expected and optional keys"""
553
+ # BC: "rope_type" was originally "type" -- let's check for "rope_type" when "type" is present
554
+ if "type" in received_keys:
555
+ received_keys -= {"type"}
556
+ required_keys.add("rope_type")
557
+
558
+ # Some models need to store model-specific keys, and we don't want to throw warning at them
559
+ if ignore_keys is not None:
560
+ received_keys -= ignore_keys
561
+
562
+ missing_keys = required_keys - received_keys
563
+ if missing_keys:
564
+ raise KeyError(
565
+ f"Missing required keys in `rope_scaling` for 'rope_type'='{rope_type}': {missing_keys}"
566
+ )
567
+
568
+ if optional_keys is not None:
569
+ unused_keys = received_keys - required_keys - optional_keys
570
+ else:
571
+ unused_keys = received_keys - required_keys
572
+ if unused_keys:
573
+ logger.warning(
574
+ f"Unrecognized keys in `rope_scaling` for 'rope_type'='{rope_type}': {unused_keys}"
575
+ )
576
+
577
+
578
+ def _validate_default_rope_parameters(
579
+ config: PretrainedConfig, ignore_keys: Optional[set] = None
580
+ ):
581
+ rope_scaling = config.rope_scaling
582
+ rope_type = rope_scaling.get(
583
+ "rope_type", rope_scaling.get("type", None)
584
+ ) # BC: "rope_type" was originally "type"
585
+ required_keys = {"rope_type"}
586
+ received_keys = set(rope_scaling.keys())
587
+ _check_received_keys(
588
+ rope_type, received_keys, required_keys, ignore_keys=ignore_keys
589
+ )
590
+
591
+
592
+ def _validate_linear_scaling_rope_parameters(
593
+ config: PretrainedConfig, ignore_keys: Optional[set] = None
594
+ ):
595
+ rope_scaling = config.rope_scaling
596
+ rope_type = rope_scaling.get(
597
+ "rope_type", rope_scaling.get("type", None)
598
+ ) # BC: "rope_type" was originally "type"
599
+ required_keys = {"rope_type", "factor"}
600
+ received_keys = set(rope_scaling.keys())
601
+ _check_received_keys(
602
+ rope_type, received_keys, required_keys, ignore_keys=ignore_keys
603
+ )
604
+
605
+ factor = rope_scaling["factor"]
606
+ if factor is None or not isinstance(factor, float) or factor < 1.0:
607
+ logger.warning(
608
+ f"`rope_scaling`'s factor field must be a float >= 1, got {factor}"
609
+ )
610
+
611
+
612
+ def _validate_dynamic_scaling_rope_parameters(
613
+ config: PretrainedConfig, ignore_keys: Optional[set] = None
614
+ ):
615
+ rope_scaling = config.rope_scaling
616
+ rope_type = rope_scaling.get(
617
+ "rope_type", rope_scaling.get("type", None)
618
+ ) # BC: "rope_type" was originally "type"
619
+ required_keys = {"rope_type", "factor"}
620
+ # TODO (joao): update logic for the inclusion of `original_max_position_embeddings`
621
+ optional_keys = {"original_max_position_embeddings"}
622
+ received_keys = set(rope_scaling.keys())
623
+ _check_received_keys(
624
+ rope_type, received_keys, required_keys, optional_keys, ignore_keys=ignore_keys
625
+ )
626
+
627
+ factor = rope_scaling["factor"]
628
+ if factor is None or not isinstance(factor, float) or factor < 1.0:
629
+ logger.warning(
630
+ f"`rope_scaling`'s factor field must be a float >= 1, got {factor}"
631
+ )
632
+
633
+
634
+ def _validate_yarn_parameters(
635
+ config: PretrainedConfig, ignore_keys: Optional[set] = None
636
+ ):
637
+ rope_scaling = config.rope_scaling
638
+ rope_type = rope_scaling.get(
639
+ "rope_type", rope_scaling.get("type", None)
640
+ ) # BC: "rope_type" was originally "type"
641
+ required_keys = {"rope_type", "factor"}
642
+ optional_keys = {
643
+ "attention_factor",
644
+ "beta_fast",
645
+ "beta_slow",
646
+ "original_max_position_embeddings",
647
+ "mscale",
648
+ "mscale_all_dim",
649
+ }
650
+ received_keys = set(rope_scaling.keys())
651
+ _check_received_keys(
652
+ rope_type, received_keys, required_keys, optional_keys, ignore_keys=ignore_keys
653
+ )
654
+
655
+ factor = rope_scaling["factor"]
656
+ if factor is None or not isinstance(factor, float) or factor < 1.0:
657
+ logger.warning(
658
+ f"`rope_scaling`'s factor field must be a float >= 1, got {factor}"
659
+ )
660
+
661
+ attention_factor = rope_scaling.get("attention_factor")
662
+ if attention_factor is not None and (
663
+ not isinstance(attention_factor, float) or attention_factor < 0
664
+ ):
665
+ logger.warning(
666
+ f"`rope_scaling`'s attention_factor field must be a float greater than 0, got {attention_factor}"
667
+ )
668
+ beta_fast = rope_scaling.get("beta_fast")
669
+ if beta_fast is not None and not isinstance(beta_fast, float):
670
+ logger.warning(
671
+ f"`rope_scaling`'s beta_fast field must be a float, got {beta_fast}"
672
+ )
673
+ beta_slow = rope_scaling.get("beta_slow")
674
+ if beta_slow is not None and not isinstance(beta_slow, float):
675
+ logger.warning(
676
+ f"`rope_scaling`'s beta_slow field must be a float, got {beta_slow}"
677
+ )
678
+
679
+ if (beta_fast or 32) < (beta_slow or 1):
680
+ logger.warning(
681
+ f"`rope_scaling`'s beta_fast field must be greater than beta_slow, got beta_fast={beta_fast} "
682
+ f"(defaults to 32 if None) and beta_slow={beta_slow} (defaults to 1 if None)"
683
+ )
684
+
685
+
686
+ def _validate_longrope_parameters(
687
+ config: PretrainedConfig, ignore_keys: Optional[set] = None
688
+ ):
689
+ rope_scaling = config.rope_scaling
690
+ rope_type = rope_scaling.get(
691
+ "rope_type", rope_scaling.get("type", None)
692
+ ) # BC: "rope_type" was originally "type"
693
+ required_keys = {"rope_type", "short_factor", "long_factor"}
694
+ # TODO (joao): update logic for the inclusion of `original_max_position_embeddings`
695
+ optional_keys = {"attention_factor", "factor", "original_max_position_embeddings"}
696
+ received_keys = set(rope_scaling.keys())
697
+ _check_received_keys(
698
+ rope_type, received_keys, required_keys, optional_keys, ignore_keys=ignore_keys
699
+ )
700
+
701
+ partial_rotary_factor = (
702
+ config.partial_rotary_factor
703
+ if hasattr(config, "partial_rotary_factor")
704
+ else 1.0
705
+ )
706
+ head_dim = getattr(
707
+ config, "head_dim", config.hidden_size // config.num_attention_heads
708
+ )
709
+ dim = int(head_dim * partial_rotary_factor)
710
+
711
+ short_factor = rope_scaling.get("short_factor")
712
+ if not isinstance(short_factor, list) or not all(
713
+ isinstance(x, (int, float)) for x in short_factor
714
+ ):
715
+ logger.warning(
716
+ f"`rope_scaling`'s short_factor field must be a list of numbers, got {short_factor}"
717
+ )
718
+ if not len(short_factor) == dim // 2:
719
+ logger.warning(
720
+ f"`rope_scaling`'s short_factor field must have length {dim // 2}, got {len(short_factor)}"
721
+ )
722
+
723
+ long_factor = rope_scaling.get("long_factor")
724
+ if not isinstance(long_factor, list) or not all(
725
+ isinstance(x, (int, float)) for x in long_factor
726
+ ):
727
+ logger.warning(
728
+ f"`rope_scaling`'s long_factor field must be a list of numbers, got {long_factor}"
729
+ )
730
+ if not len(long_factor) == dim // 2:
731
+ logger.warning(
732
+ f"`rope_scaling`'s long_factor field must have length {dim // 2}, got {len(long_factor)}"
733
+ )
734
+
735
+ # Handle Phi3 divergence: prefer the use of `attention_factor` and/or `factor` over
736
+ # `original_max_position_embeddings` to compute internal variables. The latter lives outside `rope_scaling` and is
737
+ # unique to longrope (= undesirable)
738
+ if hasattr(config, "original_max_position_embeddings"):
739
+ logger.warning_once(
740
+ "This model has set a `original_max_position_embeddings` field, to be used together with "
741
+ "`max_position_embeddings` to determine a scaling factor. Please set the `factor` field of `rope_scaling`"
742
+ " with this ratio instead -- we recommend the use of this field over `original_max_position_embeddings`, "
743
+ "as it is compatible with most model architectures."
744
+ )
745
+ else:
746
+ factor = rope_scaling.get("factor")
747
+ if factor is None:
748
+ logger.warning("Missing required keys in `rope_scaling`: 'factor'")
749
+ elif not isinstance(factor, float) or factor < 1.0:
750
+ logger.warning(
751
+ f"`rope_scaling`'s factor field must be a float >= 1, got {factor}"
752
+ )
753
+
754
+ attention_factor = rope_scaling.get("attention_factor")
755
+ if attention_factor is not None:
756
+ if not isinstance(attention_factor, float) or attention_factor < 0.0:
757
+ logger.warning(
758
+ f"`rope_scaling`'s attention_factor field must be a float greater than 0, got {attention_factor}"
759
+ )
760
+
761
+
762
+ def _validate_llama3_parameters(
763
+ config: PretrainedConfig, ignore_keys: Optional[set] = None
764
+ ):
765
+ rope_scaling = config.rope_scaling
766
+ rope_type = rope_scaling.get(
767
+ "rope_type", rope_scaling.get("type", None)
768
+ ) # BC: "rope_type" was originally "type"
769
+ required_keys = {
770
+ "rope_type",
771
+ "factor",
772
+ "original_max_position_embeddings",
773
+ "low_freq_factor",
774
+ "high_freq_factor",
775
+ }
776
+ received_keys = set(rope_scaling.keys())
777
+ _check_received_keys(
778
+ rope_type, received_keys, required_keys, ignore_keys=ignore_keys
779
+ )
780
+
781
+ factor = rope_scaling["factor"]
782
+ if factor is None or not isinstance(factor, float) or factor < 1.0:
783
+ logger.warning(
784
+ f"`rope_scaling`'s factor field must be a float >= 1, got {factor}"
785
+ )
786
+
787
+ low_freq_factor = rope_scaling["low_freq_factor"]
788
+ high_freq_factor = rope_scaling["high_freq_factor"]
789
+ if low_freq_factor is None or not isinstance(low_freq_factor, float):
790
+ logger.warning(
791
+ f"`rope_scaling`'s low_freq_factor field must be a float, got {low_freq_factor}"
792
+ )
793
+ if high_freq_factor is None or not isinstance(high_freq_factor, float):
794
+ logger.warning(
795
+ f"`rope_scaling`'s high_freq_factor field must be a float, got {high_freq_factor}"
796
+ )
797
+ if high_freq_factor <= low_freq_factor:
798
+ logger.warning(
799
+ "`rope_scaling`'s high_freq_factor field must be greater than low_freq_factor, got high_freq_factor="
800
+ f"{high_freq_factor} and low_freq_factor={low_freq_factor}"
801
+ )
802
+
803
+ original_max_position_embeddings = rope_scaling["original_max_position_embeddings"]
804
+ if original_max_position_embeddings is None or not isinstance(
805
+ original_max_position_embeddings, int
806
+ ):
807
+ logger.warning(
808
+ "`rope_scaling`'s original_max_position_embeddings field must be an integer, got "
809
+ f"{original_max_position_embeddings}"
810
+ )
811
+ if original_max_position_embeddings >= config.max_position_embeddings:
812
+ logger.warning(
813
+ "`rope_scaling`'s original_max_position_embeddings field must be less than max_position_embeddings, got "
814
+ f"{original_max_position_embeddings} and max_position_embeddings={config.max_position_embeddings}"
815
+ )
816
+
817
+
818
+ # Like `ROPE_INIT_FUNCTIONS`, this validation function mapping can be dynamically updated for custom RoPE types.
819
+ ROPE_VALIDATION_FUNCTIONS = {
820
+ "default": _validate_default_rope_parameters,
821
+ "linear": _validate_linear_scaling_rope_parameters,
822
+ "dynamic": _validate_dynamic_scaling_rope_parameters,
823
+ "yarn": _validate_yarn_parameters,
824
+ "longrope": _validate_longrope_parameters,
825
+ "llama3": _validate_llama3_parameters,
826
+ }
827
+
828
+
829
+ def rope_config_validation(config: PretrainedConfig, ignore_keys: Optional[set] = None):
830
+ """
831
+ Validate the RoPE config arguments, given a `PretrainedConfig` object
832
+ """
833
+ rope_scaling = getattr(
834
+ config, "rope_scaling", None
835
+ ) # not a default parameter in `PretrainedConfig`
836
+ if rope_scaling is None:
837
+ return
838
+
839
+ # BC: "rope_type" was originally "type"
840
+ rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", "default"))
841
+ validation_fn = ROPE_VALIDATION_FUNCTIONS.get(rope_type)
842
+ if validation_fn is not None:
843
+ validation_fn(config, ignore_keys=ignore_keys)
844
+ else:
845
+ logger.warning(
846
+ f"Missing validation function mapping in `ROPE_VALIDATION_FUNCTIONS` for 'rope_type'='{rope_type}'"
847
+ )
848
+
849
+
850
+ def rotate_half(x):
851
+ """Rotates half the hidden dims of the input."""
852
+ x1 = x[..., : x.shape[-1] // 2]
853
+ x2 = x[..., x.shape[-1] // 2 :]
854
+ return torch.cat((-x2, x1), dim=-1)
855
+
856
+ def apply_rotary_pos_emb(x, cos, sin, position_ids=None, unsqueeze_dim=1):
857
+ """Applies Rotary Position Embedding to the input tensor.
858
+
859
+ Args:
860
+ x (`torch.Tensor`): The input tensor.
861
+ cos (`torch.Tensor`): The cosine part of the rotary embedding.
862
+ sin (`torch.Tensor`): The sine part of the rotary embedding.
863
+ position_ids (`torch.Tensor`, *optional*):
864
+ Deprecated and unused.
865
+ unsqueeze_dim (`int`, *optional*, defaults to 1):
866
+ The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
867
+ sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
868
+ that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
869
+ k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
870
+ cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
871
+ the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
872
+ Returns:
873
+ `torch.Tensor`: the input tensor rotated using the Rotary Position Embedding.
874
+ """
875
+ cos = cos.unsqueeze(unsqueeze_dim)
876
+ sin = sin.unsqueeze(unsqueeze_dim)
877
+ x_embed = (x * cos) + (rotate_half(x) * sin)
878
+ return x_embed
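To make the shape conventions concrete, here is a small self-contained sketch (arbitrary toy sizes, assumed import path) that builds inverse frequencies via the keyword path of `_compute_default_rope_parameters`, turns them into cos/sin, and rotates a query-shaped tensor with `apply_rotary_pos_emb`:

import torch

# Assumed import path (illustrative):
# from mimo_audio_tokenizer.modeling_rope_utils import (
#     _compute_default_rope_parameters, apply_rotary_pos_emb,
# )

batch, heads, seq_len, head_dim = 2, 4, 16, 64

# Keyword path (config=None) of _compute_default_rope_parameters needs `base` and `dim`.
inv_freq, attention_scaling = _compute_default_rope_parameters(base=10000.0, dim=head_dim)

position_ids = torch.arange(seq_len)[None, :].float()        # (1, seq_len)
freqs = position_ids[..., None] * inv_freq                   # (1, seq_len, head_dim // 2)
emb = torch.cat((freqs, freqs), dim=-1)                      # (1, seq_len, head_dim)
cos, sin = emb.cos() * attention_scaling, emb.sin() * attention_scaling

q = torch.randn(batch, heads, seq_len, head_dim)
# unsqueeze_dim=1 lets (1, seq_len, head_dim) broadcast over the heads axis of q.
q_rot = apply_rotary_pos_emb(q, cos, sin, unsqueeze_dim=1)
print(q_rot.shape)                                           # torch.Size([2, 4, 16, 64])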
src/mimo_audio_tokenizer/quantization.py ADDED
@@ -0,0 +1,480 @@
1
+ # Copyright 2025 Xiaomi Corporation.
2
+ import typing as tp
3
+
4
+ from einops import rearrange, repeat
5
+ import torch
6
+ from torch import nn
7
+ import torch.nn.functional as F
8
+
9
+ import torch.distributed as dist
10
+
11
+
12
+ def rank():
13
+ if dist.is_initialized():
14
+ return dist.get_rank()
15
+ else:
16
+ return 0
17
+
18
+
19
+ def world_size():
20
+ if dist.is_initialized():
21
+ return dist.get_world_size()
22
+ else:
23
+ return 1
24
+
25
+
26
+ def default(val: tp.Any, d: tp.Any) -> tp.Any:
27
+ return val if val is not None else d
28
+
29
+
30
+ def ema_inplace(moving_avg, new, decay: float):
31
+ if dist.is_initialized():
32
+ dist.all_reduce(new, op=dist.ReduceOp.SUM)
33
+ moving_avg.data.mul_(decay).add_(new, alpha=(1 - decay))
34
+
35
+
36
+ def laplace_smoothing(x, n_categories: int, epsilon: float = 1e-5):
37
+ return (x + epsilon) / (x.sum() + n_categories * epsilon)
38
+
39
+
40
+ def uniform_init(*shape: int):
41
+ t = torch.empty(shape)
42
+ nn.init.kaiming_uniform_(t)
43
+ return t
44
+
45
+
46
+ def sample_vectors(samples, num: int):
47
+ num_samples, device = samples.shape[0], samples.device
48
+
49
+ if num_samples >= num:
50
+ indices = torch.randperm(num_samples, device=device)[:num]
51
+ else:
52
+ indices = torch.randint(0, num_samples, (num,), device=device)
53
+
54
+ selected_samples = samples[indices]
55
+
56
+ if dist.is_initialized():
57
+
58
+ dist.broadcast(selected_samples, src=0)
59
+
60
+ return selected_samples
61
+
62
+
63
+ def kmeans(samples, num_clusters: int, num_iters: int = 10):
64
+ dim, dtype = samples.shape[-1], samples.dtype
65
+
66
+ means = sample_vectors(samples, num_clusters)
67
+
68
+ for _ in range(num_iters):
69
+ dists = -(
70
+ samples.pow(2).sum(1, keepdim=True)
71
+ - 2 * samples @ means.t()
72
+ + means.t().pow(2).sum(0, keepdim=True)
73
+ )
74
+
75
+ buckets = dists.max(dim=-1).indices
76
+ bins = torch.bincount(buckets, minlength=num_clusters)
77
+
78
+ new_means = buckets.new_zeros(num_clusters, dim, dtype=dtype)
79
+ new_means = new_means.scatter_add_(
80
+ 0, repeat(buckets, "n -> n d", d=dim), samples
81
+ )
82
+
83
+ if dist.is_initialized():
84
+ dist.all_reduce(bins, op=dist.ReduceOp.SUM)
85
+ dist.all_reduce(new_means, op=dist.ReduceOp.SUM)
86
+
87
+ zero_mask = bins == 0
88
+ bins_min_clamped = bins.masked_fill(zero_mask, 1)
89
+
90
+ new_means = new_means / bins_min_clamped[..., None]
91
+
92
+ means = torch.where(zero_mask[..., None], means, new_means)
93
+
94
+ return means, bins
95
+
96
+
97
+ class EuclideanCodebook(nn.Module):
98
+ """Codebook with Euclidean distance.
99
+ Args:
100
+ dim (int): Dimension.
101
+ codebook_size (int): Codebook size.
102
+ kmeans_init (bool): Whether to use k-means to initialize the codebooks.
103
+ If set to true, run the k-means algorithm on the first training batch and use
104
+ the learned centroids as initialization.
105
+ kmeans_iters (int): Number of iterations used for k-means algorithm at initialization.
106
+ decay (float): Decay for exponential moving average over the codebooks.
107
+ epsilon (float): Epsilon value for numerical stability.
108
+ threshold_ema_dead_code (int): Threshold for dead code expiration. Replace any codes
109
+ that have an exponential moving average cluster size less than the specified threshold with
110
+ randomly selected vector from the current batch.
111
+ """
112
+
113
+ def __init__(
114
+ self,
115
+ dim: int,
116
+ codebook_size: int,
117
+ kmeans_init: bool = False,
118
+ kmeans_iters: int = 10,
119
+ decay: float = 0.99,
120
+ epsilon: float = 1e-5,
121
+ threshold_ema_dead_code: int = 2,
122
+ ):
123
+ super().__init__()
124
+ self.decay = decay
125
+ init_fn: tp.Union[tp.Callable[..., torch.Tensor], tp.Any] = (
126
+ uniform_init if not kmeans_init else torch.zeros
127
+ )
128
+ embed = init_fn(codebook_size, dim)
129
+
130
+ self.codebook_size = codebook_size
131
+
132
+ self.kmeans_iters = kmeans_iters
133
+ self.epsilon = epsilon
134
+ self.threshold_ema_dead_code = threshold_ema_dead_code
135
+
136
+ self.register_buffer("inited", torch.Tensor([not kmeans_init]))
137
+ self.register_buffer("cluster_size", torch.zeros(codebook_size))
138
+ self.register_buffer("embed", embed)
139
+ self.register_buffer("embed_avg", embed.clone())
140
+
141
+ @torch.jit.ignore
142
+ def init_embed_(self, data):
143
+ if self.inited:
144
+ return
145
+
146
+ embed, cluster_size = kmeans(data, self.codebook_size, self.kmeans_iters)
147
+ self.embed.data.copy_(embed)
148
+ self.embed_avg.data.copy_(embed.clone())
149
+ self.cluster_size.data.copy_(cluster_size)
150
+ self.inited.data.copy_(torch.Tensor([True]))
151
+
152
+ def replace_(self, samples, mask):
153
+ # modified_codebook = torch.where(
154
+ # mask[..., None], sample_vectors(samples, self.codebook_size), self.embed
155
+ # )
156
+ replace_num = mask.sum()
157
+ modified_codebook = self.embed.clone()
158
+ modified_codebook[mask] = sample_vectors(samples, replace_num)
159
+ self.embed.data.copy_(modified_codebook)
160
+
161
+ def expire_codes_(self, batch_samples):
162
+ if self.threshold_ema_dead_code == 0:
163
+ return
164
+
165
+ expired_codes = self.cluster_size < self.threshold_ema_dead_code
166
+ if not torch.any(expired_codes):
167
+ return
168
+
169
+ batch_samples = rearrange(batch_samples, "... d -> (...) d")
170
+ self.replace_(batch_samples, mask=expired_codes)
171
+
172
+ def preprocess(self, x):
173
+ x = rearrange(x, "... d -> (...) d")
174
+ return x
175
+
176
+ def quantize(self, x):
177
+ embed = self.embed.t()
178
+ dist = -(
179
+ x.pow(2).sum(1, keepdim=True)
180
+ - 2 * x @ embed
181
+ + embed.pow(2).sum(0, keepdim=True)
182
+ )
183
+ embed_ind = dist.max(dim=-1).indices
184
+ return embed_ind
185
+
186
+ def postprocess_emb(self, embed_ind, shape):
187
+ return embed_ind.view(*shape[:-1])
188
+
189
+ def dequantize(self, embed_ind):
190
+ quantize = F.embedding(embed_ind, self.embed)
191
+ return quantize
192
+
193
+ def encode(self, x):
194
+ shape = x.shape
195
+ # pre-process
196
+ x = self.preprocess(x)
197
+ # quantize
198
+ embed_ind = self.quantize(x)
199
+ # post-process
200
+ embed_ind = self.postprocess_emb(embed_ind, shape)
201
+ return embed_ind
202
+
203
+ def decode(self, embed_ind):
204
+ quantize = self.dequantize(embed_ind)
205
+ return quantize
206
+
207
+ def forward(self, x):
208
+ shape, dtype = x.shape, x.dtype
209
+ x = self.preprocess(x)
210
+
211
+ self.init_embed_(x)
212
+
213
+ embed_ind = self.quantize(x)
214
+ embed_onehot = F.one_hot(embed_ind, self.codebook_size).type(dtype)
215
+ embed_ind = self.postprocess_emb(embed_ind, shape)
216
+ quantize = self.dequantize(embed_ind)
217
+
218
+ if self.training:
219
+ # We do the expiry of code at that point as buffers are in sync
220
+ # and all the workers will take the same decision.
221
+ self.expire_codes_(x)
222
+ ema_inplace(self.cluster_size, embed_onehot.sum(0), self.decay)
223
+ embed_sum = x.t() @ embed_onehot
224
+ ema_inplace(self.embed_avg, embed_sum.t().contiguous(), self.decay)
225
+ cluster_size = (
226
+ laplace_smoothing(self.cluster_size, self.codebook_size, self.epsilon)
227
+ * self.cluster_size.sum()
228
+ )
229
+ embed_normalized = self.embed_avg / cluster_size.unsqueeze(1)
230
+ self.embed.data.copy_(embed_normalized)
231
+
232
+ return quantize, embed_ind
233
+
234
+
235
+ class VectorQuantization(nn.Module):
236
+ """Vector quantization implementation.
237
+ Currently supports only euclidean distance.
238
+ Args:
239
+ dim (int): Dimension
240
+ codebook_size (int): Codebook size
241
+ codebook_dim (int): Codebook dimension. If not defined, uses the specified dimension in dim.
242
+ decay (float): Decay for exponential moving average over the codebooks.
243
+ epsilon (float): Epsilon value for numerical stability.
244
+ kmeans_init (bool): Whether to use kmeans to initialize the codebooks.
245
+ kmeans_iters (int): Number of iterations used for kmeans initialization.
246
+ threshold_ema_dead_code (int): Threshold for dead code expiration. Replace any codes
247
+ that have an exponential moving average cluster size less than the specified threshold with
248
+ randomly selected vector from the current batch.
249
+ commitment_weight (float): Weight for commitment loss.
250
+ """
251
+
252
+ def __init__(
253
+ self,
254
+ dim: int,
255
+ codebook_size: int,
256
+ codebook_dim: tp.Optional[int] = None,
257
+ decay: float = 0.99,
258
+ epsilon: float = 1e-5,
259
+ kmeans_init: bool = True,
260
+ kmeans_iters: int = 50,
261
+ threshold_ema_dead_code: int = 2,
262
+ commitment_weight: float = 1.0,
263
+ ):
264
+ super().__init__()
265
+ _codebook_dim: int = default(codebook_dim, dim)
266
+
267
+ requires_projection = _codebook_dim != dim
268
+ self.project_in = (
269
+ nn.Linear(dim, _codebook_dim) if requires_projection else nn.Identity()
270
+ )
271
+ self.project_out = (
272
+ nn.Linear(_codebook_dim, dim) if requires_projection else nn.Identity()
273
+ )
274
+
275
+ self.epsilon = epsilon
276
+ self.commitment_weight = commitment_weight
277
+
278
+ self._codebook = EuclideanCodebook(
279
+ dim=_codebook_dim,
280
+ codebook_size=codebook_size,
281
+ kmeans_init=kmeans_init,
282
+ kmeans_iters=kmeans_iters,
283
+ decay=decay,
284
+ epsilon=epsilon,
285
+ threshold_ema_dead_code=threshold_ema_dead_code,
286
+ )
287
+ self.codebook_size = codebook_size
288
+
289
+ @property
290
+ def codebook(self):
291
+ return self._codebook.embed
292
+
293
+ def encode(self, x):
294
+ # x = rearrange(x, "b d n -> b n d")
295
+ x = self.project_in(x)
296
+ embed_in = self._codebook.encode(x)
297
+ return embed_in
298
+
299
+ def decode(self, embed_ind):
300
+ quantize = self._codebook.decode(embed_ind)
301
+ quantize = self.project_out(quantize)
302
+ # quantize = rearrange(quantize, "b n d -> b d n")
303
+ return quantize
304
+
305
+ def forward(self, x):
306
+ device = x.device
307
+ x = self.project_in(x)
308
+
309
+ quantize, embed_ind = self._codebook(x)
310
+
311
+ if self.training:
312
+ quantize = x + (quantize - x).detach()
313
+
314
+ loss = torch.tensor([0.0], device=device, requires_grad=self.training)
315
+
316
+ if self.training:
317
+ if self.commitment_weight > 0:
318
+ commit_loss = F.mse_loss(quantize.detach(), x)
319
+ loss = loss + commit_loss * self.commitment_weight
320
+
321
+ quantize = self.project_out(quantize)
322
+ # quantize = rearrange(quantize, "b n d -> b d n")
323
+ return quantize, embed_ind, loss
324
+
325
+
326
+ class ResidualVectorQuantization(nn.Module):
327
+ """Residual vector quantization implementation.
328
+ Follows Algorithm 1. in https://arxiv.org/pdf/2107.03312.pdf
329
+ """
330
+
331
+ def __init__(self, *, num_quantizers, codebook_size, **kwargs):
332
+ super().__init__()
333
+ if isinstance(codebook_size, int):
334
+ codebook_size = [codebook_size] * num_quantizers
335
+ elif len(codebook_size) < num_quantizers:
336
+ codebook_size += [codebook_size[-1]] * (num_quantizers - len(codebook_size))
337
+ self.layers = nn.ModuleList(
338
+ [
339
+ VectorQuantization(codebook_size=codebook_size[i], **kwargs)
340
+ for i in range(num_quantizers)
341
+ ]
342
+ )
343
+
344
+ def forward(
345
+ self, x, n_q: tp.Optional[int] = None, layers: tp.Optional[list] = None
346
+ ):
347
+ quantized_out = 0.0
348
+ residual = x
349
+
350
+ all_losses = []
351
+ all_indices = []
352
+ out_quantized = []
353
+
354
+ n_q = n_q or len(self.layers)
355
+
356
+ for i, layer in enumerate(self.layers[:n_q]):
357
+ quantized, indices, loss = layer(residual)
358
+ residual = residual - quantized
359
+ quantized_out = quantized_out + quantized
360
+
361
+ all_indices.append(indices)
362
+ all_losses.append(loss)
363
+ if layers and i in layers:
364
+ out_quantized.append(quantized_out)
365
+
366
+ out_losses, out_indices = map(torch.stack, (all_losses, all_indices))
367
+ return quantized_out, out_indices, out_losses, out_quantized
368
+
369
+ def encode(
370
+ self, x: torch.Tensor, n_q: tp.Optional[int] = None, st: tp.Optional[int] = None
371
+ ) -> torch.Tensor:
372
+ residual = x
373
+ all_indices = []
374
+ n_q = len(self.layers) if n_q is None else n_q
375
+ st = 0 if st is None else st
376
+ for layer in self.layers[st:n_q]:
377
+ indices = layer.encode(residual)
378
+ quantized = layer.decode(indices)
379
+ residual = residual - quantized
380
+ all_indices.append(indices)
381
+ out_indices = torch.stack(all_indices)
382
+ return out_indices
383
+
384
+ def decode(self, q_indices: torch.Tensor, st: int = 0) -> torch.Tensor:
385
+ quantized_out = torch.tensor(0.0, device=q_indices.device)
386
+ for i, indices in enumerate(q_indices):
387
+ layer = self.layers[st + i]
388
+ quantized = layer.decode(indices)
389
+ quantized_out = quantized_out + quantized
390
+ return quantized_out
391
+
392
+
393
+ class ResidualVectorQuantizer(nn.Module):
394
+ """Residual Vector Quantizer.
395
+ Args:
396
+ dimension (int): Dimension of the codebooks.
397
+ n_q (int): Number of residual vector quantizers used.
398
+ bins (int): Codebook size.
399
+ decay (float): Decay for exponential moving average over the codebooks.
400
+ kmeans_init (bool): Whether to use kmeans to initialize the codebooks.
401
+ kmeans_iters (int): Number of iterations used for kmeans initialization.
402
+ threshold_ema_dead_code (int): Threshold for dead code expiration. Replace any codes
403
+ that have an exponential moving average cluster size less than the specified threshold with
404
+ randomly selected vector from the current batch.
405
+ """
406
+
407
+ def __init__(
408
+ self,
409
+ dimension: int = 256,
410
+ n_q: int = 8,
411
+ bins: int | list = 1024,
412
+ decay: float = 0.99,
413
+ kmeans_init: bool = True,
414
+ kmeans_iters: int = 50,
415
+ threshold_ema_dead_code: int = 2,
416
+ ):
417
+ super().__init__()
418
+ self.n_q = n_q
419
+ self.dimension = dimension
420
+ self.bins = bins
421
+ self.decay = decay
422
+ self.kmeans_init = kmeans_init
423
+ self.kmeans_iters = kmeans_iters
424
+ self.threshold_ema_dead_code = threshold_ema_dead_code
425
+ self.vq = ResidualVectorQuantization(
426
+ dim=self.dimension,
427
+ codebook_size=self.bins,
428
+ num_quantizers=self.n_q,
429
+ decay=self.decay,
430
+ kmeans_init=self.kmeans_init,
431
+ kmeans_iters=self.kmeans_iters,
432
+ threshold_ema_dead_code=self.threshold_ema_dead_code,
433
+ )
434
+
435
+ def forward(
436
+ self,
437
+ x: torch.Tensor,
438
+ n_q: tp.Optional[int] = None,
439
+ layers: tp.Optional[list] = None,
440
+ ):
441
+ """Residual vector quantization on the given input tensor.
442
+ Args:
443
+ x (torch.Tensor): Input tensor.
444
+ n_q (int): Number of quantizer used to quantize. Default: All quantizers.
445
+ layers (list): Layer that need to return quantized. Defalt: None.
446
+ Returns:
447
+ QuantizedResult:
448
+ The quantized (or approximately quantized) representation with
449
+ the associated numbert quantizers and layer quantized required to return.
450
+ """
451
+ n_q = n_q if n_q else self.n_q
452
+ quantized, codes, commit_loss, quantized_list = self.vq(
453
+ x, n_q=n_q, layers=layers
454
+ )
455
+ return quantized, codes, torch.mean(commit_loss), quantized_list
456
+
457
+ def encode(
458
+ self, x: torch.Tensor, n_q: tp.Optional[int] = None, st: tp.Optional[int] = None
459
+ ) -> torch.Tensor:
460
+ """Encode a given input tensor with the specified sample rate at the given bandwidth.
461
+ The RVQ encode method sets the appropriate number of quantizer to use
462
+ and returns indices for each quantizer.
463
+ Args:
464
+ x (torch.Tensor): Input tensor.
465
+ n_q (int): Number of quantizer used to quantize. Default: All quantizers.
466
+ st (int): Start to encode input from which layers. Default: 0.
467
+ """
468
+ n_q = n_q if n_q else self.n_q
469
+ st = st or 0
470
+ codes = self.vq.encode(x, n_q=n_q, st=st)
471
+ return codes
472
+
473
+ def decode(self, codes: torch.Tensor, st: int = 0) -> torch.Tensor:
474
+ """Decode the given codes to the quantized representation.
475
+ Args:
476
+ codes (torch.Tensor): Input indices for each quantizer.
477
+ st (int): Start to decode input codes from which layers. Default: 0.
478
+ """
479
+ quantized = self.vq.decode(codes, st=st)
480
+ return quantized
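Finally, a small end-to-end sketch of the residual VQ defined above (toy sizes, assumed import path; `kmeans_init=False` is chosen so the randomly initialized codebooks are usable without a warm-up training batch):

import torch

# Assumed import path (illustrative):
# from mimo_audio_tokenizer.quantization import ResidualVectorQuantizer

torch.manual_seed(0)
rvq = ResidualVectorQuantizer(dimension=64, n_q=2, bins=256, kmeans_init=False)
rvq.eval()

x = torch.randn(10, 64)          # (frames, dim); the codebook flattens leading dims
codes = rvq.encode(x)            # (n_q, frames) integer codebook indices
recon = rvq.decode(codes)        # (frames, dim): sum of per-stage codebook vectors
print(codes.shape, recon.shape)  # torch.Size([2, 10]) torch.Size([10, 64])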