update to 1.2
- app.py +420 -129
- fish_speech/configs/base.yaml +1 -0
- fish_speech/configs/firefly_gan_vq.yaml +34 -0
- fish_speech/configs/lora/r_8_alpha_16.yaml +4 -0
- fish_speech/configs/text2semantic_finetune.yaml +22 -18
- fish_speech/datasets/concat_repeat.py +53 -0
- fish_speech/datasets/semantic.py +496 -0
- fish_speech/datasets/vqgan.py +3 -1
- fish_speech/models/text2semantic/__init__.py +0 -3
- fish_speech/models/text2semantic/lit_module.py +22 -164
- fish_speech/models/text2semantic/llama.py +227 -70
- fish_speech/models/text2semantic/lora.py +92 -0
- fish_speech/models/vqgan/modules/firefly.py +88 -1
- fish_speech/models/vqgan/modules/fsq.py +1 -1
- fish_speech/text/__init__.py +2 -1
- fish_speech/text/chn_text_norm/.gitignore +114 -0
- fish_speech/text/chn_text_norm/README.md +36 -0
- fish_speech/text/chn_text_norm/__init__.py +0 -0
- fish_speech/text/chn_text_norm/basic_class.py +172 -0
- fish_speech/text/chn_text_norm/basic_constant.py +30 -0
- fish_speech/text/chn_text_norm/basic_util.py +342 -0
- fish_speech/text/chn_text_norm/cardinal.py +32 -0
- fish_speech/text/chn_text_norm/date.py +75 -0
- fish_speech/text/chn_text_norm/digit.py +32 -0
- fish_speech/text/chn_text_norm/fraction.py +35 -0
- fish_speech/text/chn_text_norm/money.py +43 -0
- fish_speech/text/chn_text_norm/percentage.py +33 -0
- fish_speech/text/chn_text_norm/telephone.py +51 -0
- fish_speech/text/chn_text_norm/text.py +177 -0
- fish_speech/text/clean.py +1 -5
- fish_speech/text/spliter.py +130 -0
- fish_speech/utils/file.py +1 -1
- fish_speech/utils/rich_utils.py +7 -3
- fish_speech/utils/spectrogram.py +122 -0
- tools/api.py +482 -0
- tools/auto_rerank.py +159 -0
- tools/llama/build_dataset.py +169 -0
- tools/llama/eval_in_context.py +171 -0
- tools/llama/generate.py +119 -180
- tools/llama/merge_lora.py +95 -0
- tools/llama/quantize.py +46 -64
- tools/llama/rebuild_tokenizer.py +57 -0
- tools/vqgan/create_train_split.py +83 -0
- tools/vqgan/extract_vq.py +227 -0
- tools/vqgan/inference.py +29 -26
app.py
CHANGED
@@ -5,7 +5,7 @@ import hydra
|
|
5 |
|
6 |
# Download if not exists
|
7 |
os.makedirs("checkpoints", exist_ok=True)
|
8 |
-
snapshot_download(repo_id="fishaudio/fish-speech-1", local_dir="./checkpoints/fish-speech-1")
|
9 |
|
10 |
print("All checkpoints downloaded")
|
11 |
|
@@ -23,6 +23,16 @@ from transformers import AutoTokenizer
|
|
23 |
|
24 |
from tools.llama.generate import launch_thread_safe_queue
|
25 |
from tools.vqgan.inference import load_model as load_vqgan_model
|
|
26 |
|
27 |
# Make einx happy
|
28 |
os.environ["EINX_FILTER_TRACEBACK"] = "false"
|
@@ -30,8 +40,8 @@ os.environ["EINX_FILTER_TRACEBACK"] = "false"
|
|
30 |
|
31 |
HEADER_MD = """# Fish Speech
|
32 |
|
33 |
-
## The demo in this space is version 1.
|
34 |
-
## 该 Demo 为 Fish Speech 1.
|
35 |
|
36 |
A text-to-speech model based on VQ-GAN and Llama developed by [Fish Audio](https://fish.audio).
|
37 |
由 [Fish Audio](https://fish.audio) 研发的基于 VQ-GAN 和 Llama 的多语种语音合成.
|
@@ -39,14 +49,14 @@ A text-to-speech model based on VQ-GAN and Llama developed by [Fish Audio](https
|
|
39 |
You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1).
|
40 |
你可以在 [这里](https://github.com/fishaudio/fish-speech) 找到源代码和 [这里](https://huggingface.co/fishaudio/fish-speech-1) 找到模型.
|
41 |
|
42 |
-
Related code
|
43 |
-
|
44 |
|
45 |
We are not responsible for any misuse of the model, please consider your local laws and regulations before using it.
|
46 |
我们不对模型的任何滥用负责,请在使用之前考虑您当地的法律法规.
|
47 |
|
48 |
-
The model running in this WebUI is Fish Speech V1 Medium SFT
|
49 |
-
在此 WebUI 中运行的模型是 Fish Speech V1 Medium SFT
|
50 |
"""
|
51 |
|
52 |
TEXTBOX_PLACEHOLDER = """Put your text here. 在此处输入文本."""
|
@@ -85,36 +95,27 @@ def inference(
|
|
85 |
top_p,
|
86 |
repetition_penalty,
|
87 |
temperature,
|
88 |
-
|
89 |
):
|
90 |
if args.max_gradio_length > 0 and len(text) > args.max_gradio_length:
|
91 |
-
return
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
reference_audio_content, _ = librosa.load(
|
98 |
-
reference_audio, sr=vqgan_model.sampling_rate, mono=True
|
99 |
-
)
|
100 |
-
audios = torch.from_numpy(reference_audio_content).to(vqgan_model.device)[
|
101 |
-
None, None, :
|
102 |
-
]
|
103 |
-
|
104 |
-
logger.info(
|
105 |
-
f"Loaded audio with {audios.shape[2] / vqgan_model.sampling_rate:.2f} seconds"
|
106 |
)
|
107 |
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
|
|
113 |
|
114 |
# LLAMA Inference
|
115 |
request = dict(
|
116 |
-
|
117 |
-
device=vqgan_model.device,
|
118 |
max_new_tokens=max_new_tokens,
|
119 |
text=text,
|
120 |
top_p=top_p,
|
@@ -123,43 +124,246 @@ def inference(
|
|
123 |
compile=args.compile,
|
124 |
iterative_prompt=chunk_length > 0,
|
125 |
chunk_length=chunk_length,
|
126 |
-
max_length=
|
127 |
-
speaker=speaker if speaker else None,
|
128 |
prompt_tokens=prompt_tokens if enable_reference_audio else None,
|
129 |
prompt_text=reference_text if enable_reference_audio else None,
|
130 |
)
|
131 |
|
132 |
-
|
133 |
-
|
134 |
-
|
|
|
|
|
|
|
135 |
)
|
136 |
-
llama_queue.put(payload)
|
137 |
|
138 |
-
|
|
|
|
|
|
|
|
|
139 |
while True:
|
140 |
-
result =
|
141 |
-
if result == "
|
142 |
-
|
143 |
-
continue
|
144 |
-
|
145 |
-
if result == "done":
|
146 |
-
if payload["success"] is False:
|
147 |
-
return None, build_html_error_message(payload["response"])
|
148 |
break
|
149 |
|
150 |
-
|
|
151 |
|
152 |
-
codes = torch.cat(codes, dim=1)
|
153 |
|
154 |
-
|
155 |
-
|
156 |
-
fake_audios = vqgan_model.decode(
|
157 |
-
indices=codes[None], feature_lengths=feature_lengths, return_audios=True
|
158 |
-
)[0, 0]
|
159 |
|
160 |
-
|
|
161 |
|
162 |
-
|
|
163 |
|
164 |
|
165 |
def build_app():
|
@@ -170,95 +374,179 @@ def build_app():
|
|
170 |
app.load(
|
171 |
None,
|
172 |
None,
|
173 |
-
js="() => {const params = new URLSearchParams(window.location.search);if (!params.has('__theme')) {params.set('__theme', '
|
|
|
174 |
)
|
175 |
|
176 |
# Inference
|
177 |
with gr.Row():
|
178 |
with gr.Column(scale=3):
|
179 |
text = gr.Textbox(
|
180 |
-
label="Input Text
|
|
181 |
)
|
182 |
|
183 |
with gr.Row():
|
184 |
-
|
|
185 |
chunk_length = gr.Slider(
|
186 |
-
label="Iterative Prompt Length, 0 means off
|
187 |
minimum=0,
|
188 |
-
maximum=
|
189 |
-
value=
|
190 |
step=8,
|
191 |
)
|
192 |
|
193 |
max_new_tokens = gr.Slider(
|
194 |
-
label="Maximum tokens per batch, 0 means no limit
|
195 |
-
minimum=
|
196 |
-
maximum=
|
197 |
-
value=
|
198 |
step=8,
|
199 |
)
|
200 |
|
201 |
top_p = gr.Slider(
|
202 |
-
label="Top-P",
|
|
203 |
)
|
204 |
|
205 |
repetition_penalty = gr.Slider(
|
206 |
label="Repetition Penalty",
|
207 |
-
minimum=
|
208 |
-
maximum=
|
209 |
-
value=1.
|
210 |
step=0.01,
|
211 |
)
|
212 |
|
213 |
temperature = gr.Slider(
|
214 |
label="Temperature",
|
215 |
-
minimum=0,
|
216 |
-
maximum=
|
217 |
value=0.7,
|
218 |
step=0.01,
|
219 |
)
|
220 |
|
221 |
-
|
222 |
-
label="Speaker / 说话人",
|
223 |
-
placeholder="Type name of the speaker / 输入说话人的名称",
|
224 |
-
lines=1,
|
225 |
-
)
|
226 |
-
|
227 |
-
with gr.Tab(label="Reference Audio / 参考音频"):
|
228 |
gr.Markdown(
|
229 |
-
|
230 |
)
|
231 |
|
232 |
enable_reference_audio = gr.Checkbox(
|
233 |
-
label="Enable Reference Audio
|
234 |
)
|
235 |
reference_audio = gr.Audio(
|
236 |
-
label="Reference Audio
|
237 |
type="filepath",
|
238 |
)
|
239 |
-
|
240 |
-
|
241 |
-
|
242 |
-
|
|
|
|
243 |
)
|
244 |
|
245 |
with gr.Column(scale=3):
|
246 |
-
|
247 |
-
|
248 |
-
|
249 |
-
|
|
|
|
250 |
|
|
|
|
251 |
with gr.Row():
|
252 |
with gr.Column(scale=3):
|
253 |
generate = gr.Button(
|
254 |
-
value="\U0001F3A7
|
|
|
255 |
)
|
256 |
|
|
|
|
257 |
# # Submit
|
258 |
generate.click(
|
259 |
-
|
260 |
[
|
261 |
-
|
262 |
enable_reference_audio,
|
263 |
reference_audio,
|
264 |
reference_text,
|
@@ -267,12 +555,29 @@ def build_app():
|
|
267 |
top_p,
|
268 |
repetition_penalty,
|
269 |
temperature,
|
270 |
-
|
|
|
271 |
],
|
272 |
-
[
|
273 |
concurrency_limit=1,
|
274 |
)
|
275 |
|
|
|
276 |
return app
|
277 |
|
278 |
|
@@ -281,74 +586,60 @@ def parse_args():
|
|
281 |
parser.add_argument(
|
282 |
"--llama-checkpoint-path",
|
283 |
type=Path,
|
284 |
-
default="checkpoints/
|
285 |
)
|
286 |
parser.add_argument(
|
287 |
-
"--
|
288 |
-
)
|
289 |
-
parser.add_argument(
|
290 |
-
"--vqgan-checkpoint-path",
|
291 |
type=Path,
|
292 |
-
default="checkpoints/
|
293 |
)
|
294 |
-
parser.add_argument("--
|
295 |
-
parser.add_argument("--tokenizer", type=str, default="fishaudio/fish-speech-1")
|
296 |
parser.add_argument("--device", type=str, default="cuda")
|
297 |
parser.add_argument("--half", action="store_true")
|
298 |
-
parser.add_argument("--max-length", type=int, default=2048)
|
299 |
parser.add_argument("--compile", action="store_true")
|
300 |
parser.add_argument("--max-gradio-length", type=int, default=0)
|
|
|
301 |
|
302 |
return parser.parse_args()
|
303 |
|
304 |
|
305 |
if __name__ == "__main__":
|
306 |
args = parse_args()
|
307 |
-
|
308 |
args.precision = torch.half if args.half else torch.bfloat16
|
309 |
-
args.compile = True
|
310 |
-
args.max_gradio_length = 1024
|
311 |
-
args.tokenizer = "./checkpoints/fish-speech-1"
|
312 |
-
args.llama_checkpoint_path = "./checkpoints/fish-speech-1/text2semantic-sft-medium-v1-4k.pth"
|
313 |
-
args.llama_config_name = "dual_ar_2_codebook_medium"
|
314 |
-
args.vqgan_checkpoint_path = "./checkpoints/fish-speech-1/vq-gan-group-fsq-2x1024.pth"
|
315 |
-
args.vqgan_config_name = "vqgan_pretrain"
|
316 |
|
317 |
logger.info("Loading Llama model...")
|
318 |
llama_queue = launch_thread_safe_queue(
|
319 |
-
config_name=args.llama_config_name,
|
320 |
checkpoint_path=args.llama_checkpoint_path,
|
321 |
device=args.device,
|
322 |
precision=args.precision,
|
323 |
-
max_length=args.max_length,
|
324 |
compile=args.compile,
|
325 |
)
|
326 |
-
llama_tokenizer = AutoTokenizer.from_pretrained(args.tokenizer)
|
327 |
logger.info("Llama model loaded, loading VQ-GAN model...")
|
328 |
|
329 |
-
|
330 |
-
config_name=args.
|
331 |
-
checkpoint_path=args.
|
332 |
device=args.device,
|
333 |
)
|
334 |
|
335 |
-
logger.info("
|
336 |
|
337 |
# Dry run to check if the model is loaded correctly and avoid the first-time latency
|
338 |
-
|
339 |
-
|
340 |
-
|
341 |
-
|
342 |
-
|
343 |
-
|
344 |
-
|
345 |
-
|
346 |
-
|
347 |
-
|
348 |
-
|
|
|
349 |
)
|
350 |
|
351 |
logger.info("Warming up done, launching the web UI...")
|
352 |
|
353 |
app = build_app()
|
354 |
-
app.launch(show_api=
|
|
|
5 |
|
6 |
# Download if not exists
|
7 |
os.makedirs("checkpoints", exist_ok=True)
|
8 |
+
snapshot_download(repo_id="fishaudio/fish-speech-1.2-sft", local_dir="./checkpoints/fish-speech-1.2-sft")
|
9 |
|
10 |
print("All checkpoints downloaded")
|
11 |
|
|
|
23 |
|
24 |
from tools.llama.generate import launch_thread_safe_queue
|
25 |
from tools.vqgan.inference import load_model as load_vqgan_model
|
26 |
+
from fish_speech.text.chn_text_norm.text import Text as ChnNormedText
|
27 |
+
from tools.api import decode_vq_tokens, encode_reference
|
28 |
+
from tools.auto_rerank import batch_asr, calculate_wer, is_chinese, load_model
|
29 |
+
from tools.llama.generate import (
|
30 |
+
GenerateRequest,
|
31 |
+
GenerateResponse,
|
32 |
+
WrappedGenerateResponse,
|
33 |
+
launch_thread_safe_queue,
|
34 |
+
)
|
35 |
+
from tools.vqgan.inference import load_model as load_decoder_model
|
36 |
|
37 |
# Make einx happy
|
38 |
os.environ["EINX_FILTER_TRACEBACK"] = "false"
|
|
|
40 |
|
41 |
HEADER_MD = """# Fish Speech
|
42 |
|
43 |
+
## The demo in this space is version 1.2. Please check [Fish Audio](https://fish.audio) for the best model.
|
44 |
+
## 该 Demo 为 Fish Speech 1.2 版本, 请在 [Fish Audio](https://fish.audio) 体验最新 DEMO.
|
45 |
|
46 |
A text-to-speech model based on VQ-GAN and Llama developed by [Fish Audio](https://fish.audio).
|
47 |
由 [Fish Audio](https://fish.audio) 研发的基于 VQ-GAN 和 Llama 的多语种语音合成.
|
|
|
49 |
You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1).
|
50 |
你可以在 [这里](https://github.com/fishaudio/fish-speech) 找到源代码和 [这里](https://huggingface.co/fishaudio/fish-speech-1) 找到模型.
|
51 |
|
52 |
+
Related code and weights are released under CC BY-NC-SA 4.0 License.
|
53 |
+
相关代码,权重使用 CC BY-NC-SA 4.0 许可证发布.
|
54 |
|
55 |
We are not responsible for any misuse of the model, please consider your local laws and regulations before using it.
|
56 |
我们不对模型的任何滥用负责,请在使用之前考虑您当地的法律法规.
|
57 |
|
58 |
+
The model running in this WebUI is Fish Speech V1.2 Medium SFT.
|
59 |
+
在此 WebUI 中运行的模型是 Fish Speech V1.2 Medium SFT.
|
60 |
"""
|
61 |
|
62 |
TEXTBOX_PLACEHOLDER = """Put your text here. 在此处输入文本."""
|
|
|
95 |
top_p,
|
96 |
repetition_penalty,
|
97 |
temperature,
|
98 |
+
streaming=False,
|
99 |
):
|
100 |
if args.max_gradio_length > 0 and len(text) > args.max_gradio_length:
|
101 |
+
return (
|
102 |
+
None,
|
103 |
+
None,
|
104 |
+
"Text is too long, please keep it under {} characters.".format(
|
105 |
+
args.max_gradio_length
|
106 |
+
),
|
|
|
|
107 |
)
|
108 |
|
109 |
+
# Parse reference audio aka prompt
|
110 |
+
prompt_tokens = encode_reference(
|
111 |
+
decoder_model=decoder_model,
|
112 |
+
reference_audio=reference_audio,
|
113 |
+
enable_reference_audio=enable_reference_audio,
|
114 |
+
)
|
115 |
|
116 |
# LLAMA Inference
|
117 |
request = dict(
|
118 |
+
device=decoder_model.device,
|
|
|
119 |
max_new_tokens=max_new_tokens,
|
120 |
text=text,
|
121 |
top_p=top_p,
|
|
|
124 |
compile=args.compile,
|
125 |
iterative_prompt=chunk_length > 0,
|
126 |
chunk_length=chunk_length,
|
127 |
+
max_length=2048,
|
|
|
128 |
prompt_tokens=prompt_tokens if enable_reference_audio else None,
|
129 |
prompt_text=reference_text if enable_reference_audio else None,
|
130 |
)
|
131 |
|
132 |
+
response_queue = queue.Queue()
|
133 |
+
llama_queue.put(
|
134 |
+
GenerateRequest(
|
135 |
+
request=request,
|
136 |
+
response_queue=response_queue,
|
137 |
+
)
|
138 |
)
|
|
|
139 |
|
140 |
+
if streaming:
|
141 |
+
yield wav_chunk_header(), None, None
|
142 |
+
|
143 |
+
segments = []
|
144 |
+
|
145 |
while True:
|
146 |
+
result: WrappedGenerateResponse = response_queue.get()
|
147 |
+
if result.status == "error":
|
148 |
+
yield None, None, build_html_error_message(result.response)
|
|
|
|
149 |
break
|
150 |
|
151 |
+
result: GenerateResponse = result.response
|
152 |
+
if result.action == "next":
|
153 |
+
break
|
154 |
+
|
155 |
+
with torch.autocast(
|
156 |
+
device_type=(
|
157 |
+
"cpu"
|
158 |
+
if decoder_model.device.type == "mps"
|
159 |
+
else decoder_model.device.type
|
160 |
+
),
|
161 |
+
dtype=args.precision,
|
162 |
+
):
|
163 |
+
fake_audios = decode_vq_tokens(
|
164 |
+
decoder_model=decoder_model,
|
165 |
+
codes=result.codes,
|
166 |
+
)
|
167 |
+
|
168 |
+
fake_audios = fake_audios.float().cpu().numpy()
|
169 |
+
segments.append(fake_audios)
|
170 |
+
|
171 |
+
if streaming:
|
172 |
+
yield (fake_audios * 32768).astype(np.int16).tobytes(), None, None
|
173 |
+
|
174 |
+
if len(segments) == 0:
|
175 |
+
return (
|
176 |
+
None,
|
177 |
+
None,
|
178 |
+
build_html_error_message(
|
179 |
+
"No audio generated, please check the input text."
|
180 |
+
),
|
181 |
+
)
|
182 |
+
|
183 |
+
# No matter streaming or not, we need to return the final audio
|
184 |
+
audio = np.concatenate(segments, axis=0)
|
185 |
+
yield None, (decoder_model.spec_transform.sample_rate, audio), None
|
186 |
+
|
187 |
+
if torch.cuda.is_available():
|
188 |
+
torch.cuda.empty_cache()
|
189 |
+
gc.collect()
|
190 |
+
|
191 |
+
|
192 |
+
def inference_with_auto_rerank(
|
193 |
+
text,
|
194 |
+
enable_reference_audio,
|
195 |
+
reference_audio,
|
196 |
+
reference_text,
|
197 |
+
max_new_tokens,
|
198 |
+
chunk_length,
|
199 |
+
top_p,
|
200 |
+
repetition_penalty,
|
201 |
+
temperature,
|
202 |
+
use_auto_rerank,
|
203 |
+
streaming=False,
|
204 |
+
):
|
205 |
+
|
206 |
+
max_attempts = 2 if use_auto_rerank else 1
|
207 |
+
best_wer = float("inf")
|
208 |
+
best_audio = None
|
209 |
+
best_sample_rate = None
|
210 |
+
|
211 |
+
for attempt in range(max_attempts):
|
212 |
+
audio_generator = inference(
|
213 |
+
text,
|
214 |
+
enable_reference_audio,
|
215 |
+
reference_audio,
|
216 |
+
reference_text,
|
217 |
+
max_new_tokens,
|
218 |
+
chunk_length,
|
219 |
+
top_p,
|
220 |
+
repetition_penalty,
|
221 |
+
temperature,
|
222 |
+
streaming=False,
|
223 |
+
)
|
224 |
+
|
225 |
+
# Fetch the audio data
|
226 |
+
for _ in audio_generator:
|
227 |
+
pass
|
228 |
+
_, (sample_rate, audio), message = _
|
229 |
+
|
230 |
+
if audio is None:
|
231 |
+
return None, None, message
|
232 |
+
|
233 |
+
if not use_auto_rerank:
|
234 |
+
return None, (sample_rate, audio), None
|
235 |
+
|
236 |
+
asr_result = batch_asr(asr_model, [audio], sample_rate)[0]
|
237 |
+
wer = calculate_wer(text, asr_result["text"])
|
238 |
+
if wer <= 0.3 and not asr_result["huge_gap"]:
|
239 |
+
return None, (sample_rate, audio), None
|
240 |
+
|
241 |
+
if wer < best_wer:
|
242 |
+
best_wer = wer
|
243 |
+
best_audio = audio
|
244 |
+
best_sample_rate = sample_rate
|
245 |
+
|
246 |
+
if attempt == max_attempts - 1:
|
247 |
+
break
|
248 |
+
|
249 |
+
return None, (best_sample_rate, best_audio), None
|
250 |
+
|
251 |
+
|
252 |
+
inference_stream = partial(inference, streaming=True)
|
253 |
+
|
254 |
+
n_audios = 4
|
255 |
+
|
256 |
+
global_audio_list = []
|
257 |
+
global_error_list = []
|
258 |
+
|
259 |
+
|
260 |
+
def inference_wrapper(
|
261 |
+
text,
|
262 |
+
enable_reference_audio,
|
263 |
+
reference_audio,
|
264 |
+
reference_text,
|
265 |
+
max_new_tokens,
|
266 |
+
chunk_length,
|
267 |
+
top_p,
|
268 |
+
repetition_penalty,
|
269 |
+
temperature,
|
270 |
+
batch_infer_num,
|
271 |
+
if_load_asr_model,
|
272 |
+
):
|
273 |
+
audios = []
|
274 |
+
errors = []
|
275 |
+
|
276 |
+
for _ in range(batch_infer_num):
|
277 |
+
result = inference_with_auto_rerank(
|
278 |
+
text,
|
279 |
+
enable_reference_audio,
|
280 |
+
reference_audio,
|
281 |
+
reference_text,
|
282 |
+
max_new_tokens,
|
283 |
+
chunk_length,
|
284 |
+
top_p,
|
285 |
+
repetition_penalty,
|
286 |
+
temperature,
|
287 |
+
if_load_asr_model,
|
288 |
+
)
|
289 |
+
|
290 |
+
_, audio_data, error_message = result
|
291 |
+
|
292 |
+
audios.append(
|
293 |
+
gr.Audio(value=audio_data if audio_data else None, visible=True),
|
294 |
+
)
|
295 |
+
errors.append(
|
296 |
+
gr.HTML(value=error_message if error_message else None, visible=True),
|
297 |
+
)
|
298 |
+
|
299 |
+
for _ in range(batch_infer_num, n_audios):
|
300 |
+
audios.append(
|
301 |
+
gr.Audio(value=None, visible=False),
|
302 |
+
)
|
303 |
+
errors.append(
|
304 |
+
gr.HTML(value=None, visible=False),
|
305 |
+
)
|
306 |
+
|
307 |
+
return None, *audios, *errors
|
308 |
+
|
309 |
+
|
310 |
+
def wav_chunk_header(sample_rate=44100, bit_depth=16, channels=1):
|
311 |
+
buffer = io.BytesIO()
|
312 |
+
|
313 |
+
with wave.open(buffer, "wb") as wav_file:
|
314 |
+
wav_file.setnchannels(channels)
|
315 |
+
wav_file.setsampwidth(bit_depth // 8)
|
316 |
+
wav_file.setframerate(sample_rate)
|
317 |
+
|
318 |
+
wav_header_bytes = buffer.getvalue()
|
319 |
+
buffer.close()
|
320 |
+
return wav_header_bytes
|
321 |
+
|
322 |
+
|
323 |
+
def normalize_text(user_input, use_normalization):
|
324 |
+
if use_normalization:
|
325 |
+
return ChnNormedText(raw_text=user_input).normalize()
|
326 |
+
else:
|
327 |
+
return user_input
|
328 |
+
|
329 |
+
|
330 |
+
asr_model = None
|
331 |
|
|
|
332 |
|
333 |
+
def change_if_load_asr_model(if_load):
|
334 |
+
global asr_model
|
|
|
|
|
|
|
335 |
|
336 |
+
if if_load:
|
337 |
+
gr.Warning("Loading faster whisper model...")
|
338 |
+
if asr_model is None:
|
339 |
+
asr_model = load_model()
|
340 |
+
return gr.Checkbox(label="Unload faster whisper model", value=if_load)
|
341 |
|
342 |
+
if if_load is False:
|
343 |
+
gr.Warning("Unloading faster whisper model...")
|
344 |
+
del asr_model
|
345 |
+
asr_model = None
|
346 |
+
if torch.cuda.is_available():
|
347 |
+
torch.cuda.empty_cache()
|
348 |
+
gc.collect()
|
349 |
+
return gr.Checkbox(label="Load faster whisper model", value=if_load)
|
350 |
+
|
351 |
+
|
352 |
+
def change_if_auto_label(if_load, if_auto_label, enable_ref, ref_audio, ref_text):
|
353 |
+
if if_load and asr_model is not None:
|
354 |
+
if (
|
355 |
+
if_auto_label
|
356 |
+
and enable_ref
|
357 |
+
and ref_audio is not None
|
358 |
+
and ref_text.strip() == ""
|
359 |
+
):
|
360 |
+
data, sample_rate = librosa.load(ref_audio)
|
361 |
+
res = batch_asr(asr_model, [data], sample_rate)[0]
|
362 |
+
ref_text = res["text"]
|
363 |
+
else:
|
364 |
+
gr.Warning("Whisper model not loaded!")
|
365 |
+
|
366 |
+
return gr.Textbox(value=ref_text)
|
367 |
|
368 |
|
369 |
def build_app():
|
|
|
374 |
app.load(
|
375 |
None,
|
376 |
None,
|
377 |
+
js="() => {const params = new URLSearchParams(window.location.search);if (!params.has('__theme')) {params.set('__theme', '%s');window.location.search = params.toString();}}"
|
378 |
+
% args.theme,
|
379 |
)
|
380 |
|
381 |
# Inference
|
382 |
with gr.Row():
|
383 |
with gr.Column(scale=3):
|
384 |
text = gr.Textbox(
|
385 |
+
label="Input Text", placeholder=TEXTBOX_PLACEHOLDER, lines=10
|
386 |
+
)
|
387 |
+
refined_text = gr.Textbox(
|
388 |
+
label="Realtime Transform Text",
|
389 |
+
placeholder=
|
390 |
+
"Normalization Result Preview (Currently Only Chinese)",
|
391 |
+
lines=5,
|
392 |
+
interactive=False,
|
393 |
)
|
394 |
|
395 |
with gr.Row():
|
396 |
+
if_refine_text = gr.Checkbox(
|
397 |
+
label="Text Normalization",
|
398 |
+
value=True,
|
399 |
+
scale=1,
|
400 |
+
)
|
401 |
+
|
402 |
+
if_load_asr_model = gr.Checkbox(
|
403 |
+
label="Load / Unload ASR model for auto-reranking",
|
404 |
+
value=False,
|
405 |
+
scale=3,
|
406 |
+
)
|
407 |
+
|
408 |
+
with gr.Row():
|
409 |
+
with gr.Tab(label="Advanced Config"):
|
410 |
chunk_length = gr.Slider(
|
411 |
+
label="Iterative Prompt Length, 0 means off",
|
412 |
minimum=0,
|
413 |
+
maximum=500,
|
414 |
+
value=100,
|
415 |
step=8,
|
416 |
)
|
417 |
|
418 |
max_new_tokens = gr.Slider(
|
419 |
+
label="Maximum tokens per batch, 0 means no limit",
|
420 |
+
minimum=0,
|
421 |
+
maximum=2048,
|
422 |
+
value=1024, # 0 means no limit
|
423 |
step=8,
|
424 |
)
|
425 |
|
426 |
top_p = gr.Slider(
|
427 |
+
label="Top-P",
|
428 |
+
minimum=0.6,
|
429 |
+
maximum=0.9,
|
430 |
+
value=0.7,
|
431 |
+
step=0.01,
|
432 |
)
|
433 |
|
434 |
repetition_penalty = gr.Slider(
|
435 |
label="Repetition Penalty",
|
436 |
+
minimum=1,
|
437 |
+
maximum=1.5,
|
438 |
+
value=1.2,
|
439 |
step=0.01,
|
440 |
)
|
441 |
|
442 |
temperature = gr.Slider(
|
443 |
label="Temperature",
|
444 |
+
minimum=0.6,
|
445 |
+
maximum=0.9,
|
446 |
value=0.7,
|
447 |
step=0.01,
|
448 |
)
|
449 |
|
450 |
+
with gr.Tab(label="Reference Audio"):
|
|
|
|
451 |
gr.Markdown(
|
452 |
+
"5 to 10 seconds of reference audio, useful for specifying speaker."
|
453 |
)
|
454 |
|
455 |
enable_reference_audio = gr.Checkbox(
|
456 |
+
label="Enable Reference Audio",
|
457 |
)
|
458 |
reference_audio = gr.Audio(
|
459 |
+
label="Reference Audio",
|
460 |
type="filepath",
|
461 |
)
|
462 |
+
with gr.Row():
|
463 |
+
if_auto_label = gr.Checkbox(
|
464 |
+
label="Auto Labeling",
|
465 |
+
min_width=100,
|
466 |
+
scale=0,
|
467 |
+
value=False,
|
468 |
+
)
|
469 |
+
reference_text = gr.Textbox(
|
470 |
+
label="Reference Text",
|
471 |
+
lines=1,
|
472 |
+
placeholder="在一无所知中,梦里的一天结束了,一个新的「轮回」便会开始。",
|
473 |
+
value="",
|
474 |
+
)
|
475 |
+
with gr.Tab(label="Batch Inference"):
|
476 |
+
batch_infer_num = gr.Slider(
|
477 |
+
label="Batch infer nums",
|
478 |
+
minimum=1,
|
479 |
+
maximum=n_audios,
|
480 |
+
step=1,
|
481 |
+
value=1,
|
482 |
)
|
483 |
|
484 |
with gr.Column(scale=3):
|
485 |
+
for _ in range(n_audios):
|
486 |
+
with gr.Row():
|
487 |
+
error = gr.HTML(
|
488 |
+
label="Error Message",
|
489 |
+
visible=True if _ == 0 else False,
|
490 |
+
)
|
491 |
+
global_error_list.append(error)
|
492 |
+
with gr.Row():
|
493 |
+
audio = gr.Audio(
|
494 |
+
label="Generated Audio",
|
495 |
+
type="numpy",
|
496 |
+
interactive=False,
|
497 |
+
visible=True if _ == 0 else False,
|
498 |
+
)
|
499 |
+
global_audio_list.append(audio)
|
500 |
|
501 |
+
with gr.Row():
|
502 |
+
stream_audio = gr.Audio(
|
503 |
+
label="Streaming Audio",
|
504 |
+
streaming=True,
|
505 |
+
autoplay=True,
|
506 |
+
interactive=False,
|
507 |
+
show_download_button=True,
|
508 |
+
)
|
509 |
with gr.Row():
|
510 |
with gr.Column(scale=3):
|
511 |
generate = gr.Button(
|
512 |
+
value="\U0001F3A7 " + "Generate", variant="primary"
|
513 |
+
)
|
514 |
+
generate_stream = gr.Button(
|
515 |
+
value="\U0001F3A7 " + "Streaming Generate",
|
516 |
+
variant="primary",
|
517 |
)
|
518 |
|
519 |
+
text.input(
|
520 |
+
fn=normalize_text, inputs=[text, if_refine_text], outputs=[refined_text]
|
521 |
+
)
|
522 |
+
|
523 |
+
if_load_asr_model.change(
|
524 |
+
fn=change_if_load_asr_model,
|
525 |
+
inputs=[if_load_asr_model],
|
526 |
+
outputs=[if_load_asr_model],
|
527 |
+
)
|
528 |
+
|
529 |
+
if_auto_label.change(
|
530 |
+
fn=lambda: gr.Textbox(value=""),
|
531 |
+
inputs=[],
|
532 |
+
outputs=[reference_text],
|
533 |
+
).then(
|
534 |
+
fn=change_if_auto_label,
|
535 |
+
inputs=[
|
536 |
+
if_load_asr_model,
|
537 |
+
if_auto_label,
|
538 |
+
enable_reference_audio,
|
539 |
+
reference_audio,
|
540 |
+
reference_text,
|
541 |
+
],
|
542 |
+
outputs=[reference_text],
|
543 |
+
)
|
544 |
+
|
545 |
# # Submit
|
546 |
generate.click(
|
547 |
+
inference_wrapper,
|
548 |
[
|
549 |
+
refined_text,
|
550 |
enable_reference_audio,
|
551 |
reference_audio,
|
552 |
reference_text,
|
|
|
555 |
top_p,
|
556 |
repetition_penalty,
|
557 |
temperature,
|
558 |
+
batch_infer_num,
|
559 |
+
if_load_asr_model,
|
560 |
],
|
561 |
+
[stream_audio, *global_audio_list, *global_error_list],
|
562 |
concurrency_limit=1,
|
563 |
)
|
564 |
|
565 |
+
generate_stream.click(
|
566 |
+
inference_stream,
|
567 |
+
[
|
568 |
+
refined_text,
|
569 |
+
enable_reference_audio,
|
570 |
+
reference_audio,
|
571 |
+
reference_text,
|
572 |
+
max_new_tokens,
|
573 |
+
chunk_length,
|
574 |
+
top_p,
|
575 |
+
repetition_penalty,
|
576 |
+
temperature,
|
577 |
+
],
|
578 |
+
[stream_audio, global_audio_list[0], global_error_list[0]],
|
579 |
+
concurrency_limit=10,
|
580 |
+
)
|
581 |
return app
|
582 |
|
583 |
|
|
|
586 |
parser.add_argument(
|
587 |
"--llama-checkpoint-path",
|
588 |
type=Path,
|
589 |
+
default="checkpoints/fish-speech-1.2-sft",
|
590 |
)
|
591 |
parser.add_argument(
|
592 |
+
"--decoder-checkpoint-path",
|
|
|
|
|
|
|
593 |
type=Path,
|
594 |
+
default="checkpoints/fish-speech-1.2-sft/firefly-gan-vq-fsq-4x1024-42hz-generator.pth",
|
595 |
)
|
596 |
+
parser.add_argument("--decoder-config-name", type=str, default="firefly_gan_vq")
|
|
|
597 |
parser.add_argument("--device", type=str, default="cuda")
|
598 |
parser.add_argument("--half", action="store_true")
|
|
|
599 |
parser.add_argument("--compile", action="store_true")
|
600 |
parser.add_argument("--max-gradio-length", type=int, default=0)
|
601 |
+
parser.add_argument("--theme", type=str, default="light")
|
602 |
|
603 |
return parser.parse_args()
|
604 |
|
605 |
|
606 |
if __name__ == "__main__":
|
607 |
args = parse_args()
|
|
|
608 |
args.precision = torch.half if args.half else torch.bfloat16
|
|
|
|
609 |
|
610 |
logger.info("Loading Llama model...")
|
611 |
llama_queue = launch_thread_safe_queue(
|
|
|
612 |
checkpoint_path=args.llama_checkpoint_path,
|
613 |
device=args.device,
|
614 |
precision=args.precision,
|
|
|
615 |
compile=args.compile,
|
616 |
)
|
|
|
617 |
logger.info("Llama model loaded, loading VQ-GAN model...")
|
618 |
|
619 |
+
decoder_model = load_decoder_model(
|
620 |
+
config_name=args.decoder_config_name,
|
621 |
+
checkpoint_path=args.decoder_checkpoint_path,
|
622 |
device=args.device,
|
623 |
)
|
624 |
|
625 |
+
logger.info("Decoder model loaded, warming up...")
|
626 |
|
627 |
# Dry run to check if the model is loaded correctly and avoid the first-time latency
|
628 |
+
list(
|
629 |
+
inference(
|
630 |
+
text="Hello, world!",
|
631 |
+
enable_reference_audio=False,
|
632 |
+
reference_audio=None,
|
633 |
+
reference_text="",
|
634 |
+
max_new_tokens=0,
|
635 |
+
chunk_length=100,
|
636 |
+
top_p=0.7,
|
637 |
+
repetition_penalty=1.2,
|
638 |
+
temperature=0.7,
|
639 |
+
)
|
640 |
)
|
641 |
|
642 |
logger.info("Warming up done, launching the web UI...")
|
643 |
|
644 |
app = build_app()
|
645 |
+
app.launch(show_api=True)
|
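Side note (not part of the diff): the streaming path added above yields a WAV header once, then raw 16-bit PCM chunks. A minimal consumption sketch, assuming the wav_chunk_header() helper defined in app.py and chunks given as float32 arrays in [-1, 1]; the function and file names here are illustrative only:

    import numpy as np

    def save_streamed_wav(chunks, path="streamed.wav"):
        # Write the header first; its size fields stay at zero, which most
        # players tolerate for progressively written / streamed WAV data.
        with open(path, "wb") as f:
            f.write(wav_chunk_header())
            for chunk in chunks:
                f.write((chunk * 32768).astype(np.int16).tobytes())
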
fish_speech/configs/base.yaml
CHANGED
@@ -17,6 +17,7 @@ trainer:
|
|
17 |
devices: auto
|
18 |
strategy:
|
19 |
_target_: lightning.pytorch.strategies.DDPStrategy
|
|
|
20 |
|
21 |
precision: bf16-mixed
|
22 |
|
|
|
17 |
devices: auto
|
18 |
strategy:
|
19 |
_target_: lightning.pytorch.strategies.DDPStrategy
|
20 |
+
process_group_backend: nccl # This should be overridden when training on Windows
|
21 |
|
22 |
precision: bf16-mixed
|
23 |
|
fish_speech/configs/firefly_gan_vq.yaml
ADDED
@@ -0,0 +1,34 @@
|
|
|
1 |
+
_target_: fish_speech.models.vqgan.modules.firefly.FireflyArchitecture
|
2 |
+
spec_transform:
|
3 |
+
_target_: fish_speech.utils.spectrogram.LogMelSpectrogram
|
4 |
+
sample_rate: 44100
|
5 |
+
n_mels: 160
|
6 |
+
n_fft: 2048
|
7 |
+
hop_length: 512
|
8 |
+
win_length: 2048
|
9 |
+
backbone:
|
10 |
+
_target_: fish_speech.models.vqgan.modules.firefly.ConvNeXtEncoder
|
11 |
+
input_channels: 160
|
12 |
+
depths: [3, 3, 9, 3]
|
13 |
+
dims: [128, 256, 384, 512]
|
14 |
+
drop_path_rate: 0.2
|
15 |
+
kernel_size: 7
|
16 |
+
head:
|
17 |
+
_target_: fish_speech.models.vqgan.modules.firefly.HiFiGANGenerator
|
18 |
+
hop_length: 512
|
19 |
+
upsample_rates: [8, 8, 2, 2, 2] # aka. strides
|
20 |
+
upsample_kernel_sizes: [16, 16, 4, 4, 4]
|
21 |
+
resblock_kernel_sizes: [3, 7, 11]
|
22 |
+
resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
|
23 |
+
num_mels: 512
|
24 |
+
upsample_initial_channel: 512
|
25 |
+
use_template: false
|
26 |
+
pre_conv_kernel_size: 13
|
27 |
+
post_conv_kernel_size: 13
|
28 |
+
quantizer:
|
29 |
+
_target_: fish_speech.models.vqgan.modules.fsq.DownsampleFiniteScalarQuantize
|
30 |
+
input_dim: 512
|
31 |
+
n_groups: 4
|
32 |
+
n_codebooks: 1
|
33 |
+
levels: [8, 5, 5, 5]
|
34 |
+
downsample_factor: [2]
|
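For context (not part of the diff): a config like this is instantiated with Hydra and then combined with the generator checkpoint; app.py does that through load_decoder_model from tools/vqgan/inference. A rough sketch of the same idea under stated assumptions (the checkpoint may nest its weights under a state_dict key, hence the .get and strict=False):

    import torch
    from hydra.utils import instantiate
    from omegaconf import OmegaConf

    cfg = OmegaConf.load("fish_speech/configs/firefly_gan_vq.yaml")
    decoder = instantiate(cfg)  # FireflyArchitecture with the backbone/head/quantizer above

    ckpt = torch.load(
        "checkpoints/fish-speech-1.2-sft/firefly-gan-vq-fsq-4x1024-42hz-generator.pth",
        map_location="cpu",
    )
    decoder.load_state_dict(ckpt.get("state_dict", ckpt), strict=False)
    decoder.eval()
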
fish_speech/configs/lora/r_8_alpha_16.yaml
ADDED
@@ -0,0 +1,4 @@
|
|
|
1 |
+
_target_: fish_speech.models.text2semantic.lora.LoraConfig
|
2 |
+
r: 8
|
3 |
+
lora_alpha: 16
|
4 |
+
lora_dropout: 0.01
|
fish_speech/configs/text2semantic_finetune.yaml
CHANGED
@@ -1,18 +1,16 @@
|
|
1 |
defaults:
|
2 |
- base
|
3 |
-
- model@model.model: dual_ar_2_codebook_small
|
4 |
- _self_
|
5 |
|
6 |
project: text2semantic_finetune_dual_ar
|
7 |
-
max_length:
|
8 |
-
|
9 |
-
resume_weights_only: true
|
10 |
|
11 |
# Lightning Trainer
|
12 |
trainer:
|
13 |
accumulate_grad_batches: 1
|
14 |
gradient_clip_val: 1.0
|
15 |
-
gradient_clip_algorithm:
|
16 |
max_steps: 1000
|
17 |
precision: bf16-true
|
18 |
limit_val_batches: 10
|
@@ -21,29 +19,31 @@ trainer:
|
|
21 |
# Dataset Configuration
|
22 |
tokenizer:
|
23 |
_target_: transformers.AutoTokenizer.from_pretrained
|
24 |
-
pretrained_model_name_or_path:
|
25 |
|
26 |
# Dataset Configuration
|
27 |
train_dataset:
|
28 |
-
_target_: fish_speech.datasets.
|
29 |
proto_files:
|
30 |
- data/protos
|
31 |
tokenizer: ${tokenizer}
|
|
|
32 |
max_length: ${max_length}
|
33 |
-
num_codebooks: ${model.model.config.num_codebooks}
|
34 |
use_speaker: false
|
|
|
35 |
|
36 |
val_dataset:
|
37 |
-
_target_: fish_speech.datasets.
|
38 |
proto_files:
|
39 |
- data/protos
|
40 |
tokenizer: ${tokenizer}
|
|
|
41 |
max_length: ${max_length}
|
42 |
-
num_codebooks: ${model.model.config.num_codebooks}
|
43 |
use_speaker: false
|
|
|
44 |
|
45 |
data:
|
46 |
-
_target_: fish_speech.datasets.
|
47 |
train_dataset: ${train_dataset}
|
48 |
val_dataset: ${val_dataset}
|
49 |
num_workers: 4
|
@@ -53,13 +53,18 @@ data:
|
|
53 |
|
54 |
# Model Configuration
|
55 |
model:
|
56 |
-
_target_: fish_speech.models.text2semantic.TextToSemantic
|
57 |
-
model:
|
|
|
|
|
|
|
|
|
|
|
58 |
|
59 |
optimizer:
|
60 |
_target_: torch.optim.AdamW
|
61 |
_partial_: true
|
62 |
-
lr: 1e-
|
63 |
weight_decay: 0
|
64 |
betas: [0.9, 0.95]
|
65 |
eps: 1e-5
|
@@ -68,12 +73,11 @@ model:
|
|
68 |
_target_: torch.optim.lr_scheduler.LambdaLR
|
69 |
_partial_: true
|
70 |
lr_lambda:
|
71 |
-
_target_: fish_speech.scheduler.
|
72 |
_partial_: true
|
73 |
-
num_warmup_steps:
|
74 |
-
num_training_steps: ${trainer.max_steps}
|
75 |
|
76 |
# Callbacks
|
77 |
callbacks:
|
78 |
model_checkpoint:
|
79 |
-
every_n_train_steps:
|
|
|
1 |
defaults:
|
2 |
- base
|
|
|
3 |
- _self_
|
4 |
|
5 |
project: text2semantic_finetune_dual_ar
|
6 |
+
max_length: 4096
|
7 |
+
pretrained_ckpt_path: checkpoints/fish-speech-1.2-sft
|
|
|
8 |
|
9 |
# Lightning Trainer
|
10 |
trainer:
|
11 |
accumulate_grad_batches: 1
|
12 |
gradient_clip_val: 1.0
|
13 |
+
gradient_clip_algorithm: "norm"
|
14 |
max_steps: 1000
|
15 |
precision: bf16-true
|
16 |
limit_val_batches: 10
|
|
|
19 |
# Dataset Configuration
|
20 |
tokenizer:
|
21 |
_target_: transformers.AutoTokenizer.from_pretrained
|
22 |
+
pretrained_model_name_or_path: ${pretrained_ckpt_path}
|
23 |
|
24 |
# Dataset Configuration
|
25 |
train_dataset:
|
26 |
+
_target_: fish_speech.datasets.semantic.AutoTextSemanticInstructionDataset
|
27 |
proto_files:
|
28 |
- data/protos
|
29 |
tokenizer: ${tokenizer}
|
30 |
+
causal: true
|
31 |
max_length: ${max_length}
|
|
|
32 |
use_speaker: false
|
33 |
+
interactive_prob: 0.7
|
34 |
|
35 |
val_dataset:
|
36 |
+
_target_: fish_speech.datasets.semantic.AutoTextSemanticInstructionDataset
|
37 |
proto_files:
|
38 |
- data/protos
|
39 |
tokenizer: ${tokenizer}
|
40 |
+
causal: true
|
41 |
max_length: ${max_length}
|
|
|
42 |
use_speaker: false
|
43 |
+
interactive_prob: 0.7
|
44 |
|
45 |
data:
|
46 |
+
_target_: fish_speech.datasets.semantic.SemanticDataModule
|
47 |
train_dataset: ${train_dataset}
|
48 |
val_dataset: ${val_dataset}
|
49 |
num_workers: 4
|
|
|
53 |
|
54 |
# Model Configuration
|
55 |
model:
|
56 |
+
_target_: fish_speech.models.text2semantic.lit_module.TextToSemantic
|
57 |
+
model:
|
58 |
+
_target_: fish_speech.models.text2semantic.llama.BaseTransformer.from_pretrained
|
59 |
+
path: ${pretrained_ckpt_path}
|
60 |
+
load_weights: true
|
61 |
+
max_length: ${max_length}
|
62 |
+
lora_config: null
|
63 |
|
64 |
optimizer:
|
65 |
_target_: torch.optim.AdamW
|
66 |
_partial_: true
|
67 |
+
lr: 1e-4
|
68 |
weight_decay: 0
|
69 |
betas: [0.9, 0.95]
|
70 |
eps: 1e-5
|
|
|
73 |
_target_: torch.optim.lr_scheduler.LambdaLR
|
74 |
_partial_: true
|
75 |
lr_lambda:
|
76 |
+
_target_: fish_speech.scheduler.get_constant_schedule_with_warmup_lr_lambda
|
77 |
_partial_: true
|
78 |
+
num_warmup_steps: 10
|
|
|
79 |
|
80 |
# Callbacks
|
81 |
callbacks:
|
82 |
model_checkpoint:
|
83 |
+
every_n_train_steps: ${trainer.val_check_interval}
|
fish_speech/datasets/concat_repeat.py
ADDED
@@ -0,0 +1,53 @@
|
|
1 |
+
import bisect
|
2 |
+
import random
|
3 |
+
from typing import Iterable
|
4 |
+
|
5 |
+
from torch.utils.data import Dataset, IterableDataset
|
6 |
+
|
7 |
+
|
8 |
+
class ConcatRepeatDataset(Dataset):
|
9 |
+
datasets: list[Dataset]
|
10 |
+
cumulative_sizes: list[int]
|
11 |
+
repeats: list[int]
|
12 |
+
|
13 |
+
@staticmethod
|
14 |
+
def cumsum(sequence, repeats):
|
15 |
+
r, s = [], 0
|
16 |
+
for dataset, repeat in zip(sequence, repeats):
|
17 |
+
l = len(dataset) * repeat
|
18 |
+
r.append(l + s)
|
19 |
+
s += l
|
20 |
+
return r
|
21 |
+
|
22 |
+
def __init__(self, datasets: Iterable[Dataset], repeats: list[int]):
|
23 |
+
super().__init__()
|
24 |
+
|
25 |
+
self.datasets = list(datasets)
|
26 |
+
self.repeats = repeats
|
27 |
+
|
28 |
+
assert len(self.datasets) > 0, "datasets should not be an empty iterable"
|
29 |
+
assert len(self.datasets) == len(
|
30 |
+
repeats
|
31 |
+
), "datasets and repeats should have the same length"
|
32 |
+
|
33 |
+
for d in self.datasets:
|
34 |
+
assert not isinstance(
|
35 |
+
d, IterableDataset
|
36 |
+
), "ConcatRepeatDataset does not support IterableDataset"
|
37 |
+
|
38 |
+
self.cumulative_sizes = self.cumsum(self.datasets, self.repeats)
|
39 |
+
|
40 |
+
def __len__(self):
|
41 |
+
return self.cumulative_sizes[-1]
|
42 |
+
|
43 |
+
def __getitem__(self, idx):
|
44 |
+
dataset_idx = bisect.bisect_right(self.cumulative_sizes, idx)
|
45 |
+
|
46 |
+
if dataset_idx == 0:
|
47 |
+
sample_idx = idx
|
48 |
+
else:
|
49 |
+
sample_idx = idx - self.cumulative_sizes[dataset_idx - 1]
|
50 |
+
|
51 |
+
dataset = self.datasets[dataset_idx]
|
52 |
+
|
53 |
+
return dataset[sample_idx % len(dataset)]
|
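Usage sketch (not part of the diff): ConcatRepeatDataset virtually repeats each dataset repeats[i] times and wraps indices with a modulo, which helps balance a small dataset against a large one. The toy datasets below are illustrative only:

    import torch
    from torch.utils.data import TensorDataset

    from fish_speech.datasets.concat_repeat import ConcatRepeatDataset

    small = TensorDataset(torch.arange(3))    # 3 samples
    large = TensorDataset(torch.arange(100))  # 100 samples

    ds = ConcatRepeatDataset([small, large], repeats=[10, 1])
    assert len(ds) == 3 * 10 + 100            # cumulative sizes: [30, 130]
    sample = ds[5]                            # index 5 maps back into `small` as 5 % 3
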
fish_speech/datasets/semantic.py
ADDED
@@ -0,0 +1,496 @@
|
|
1 |
+
import random
|
2 |
+
from dataclasses import dataclass
|
3 |
+
from itertools import chain
|
4 |
+
from pathlib import Path
|
5 |
+
from random import Random
|
6 |
+
from typing import Optional, Union
|
7 |
+
|
8 |
+
import numpy as np
|
9 |
+
import pyarrow.parquet as pq
|
10 |
+
import torch
|
11 |
+
import torch.nn.functional as F
|
12 |
+
from datasets.download.streaming_download_manager import xopen
|
13 |
+
from huggingface_hub import HfApi
|
14 |
+
from lightning import LightningDataModule
|
15 |
+
from torch.distributed import get_rank, get_world_size, is_initialized
|
16 |
+
from torch.utils.data import DataLoader, IterableDataset, get_worker_info
|
17 |
+
from transformers import AutoTokenizer
|
18 |
+
|
19 |
+
from fish_speech.conversation import CODEBOOK_PAD_TOKEN_ID
|
20 |
+
from fish_speech.datasets.protos.text_data_pb2 import SampledData
|
21 |
+
from fish_speech.datasets.protos.text_data_stream import read_pb_stream
|
22 |
+
from fish_speech.text.clean import clean_text
|
23 |
+
from fish_speech.utils import RankedLogger
|
24 |
+
from fish_speech.utils.braceexpand import braceexpand
|
25 |
+
|
26 |
+
log = RankedLogger(__name__, rank_zero_only=True)
|
27 |
+
|
28 |
+
|
29 |
+
def split_by_rank_worker(files):
|
30 |
+
# We need to know the total number of devices
|
31 |
+
# to split the data properly
|
32 |
+
|
33 |
+
total_devices = 1
|
34 |
+
if is_initialized():
|
35 |
+
total_devices = get_world_size()
|
36 |
+
|
37 |
+
worker_info = get_worker_info()
|
38 |
+
if worker_info is not None:
|
39 |
+
total_devices *= worker_info.num_workers
|
40 |
+
|
41 |
+
if len(files) < total_devices:
|
42 |
+
# Repeat the files N times to match the number of devices
|
43 |
+
files = files * (total_devices // len(files) + 1)
|
44 |
+
|
45 |
+
# DDP
|
46 |
+
if is_initialized():
|
47 |
+
files = files[get_rank() :: get_world_size()]
|
48 |
+
|
49 |
+
# Split by worker
|
50 |
+
if worker_info is not None:
|
51 |
+
files = files[worker_info.id :: worker_info.num_workers]
|
52 |
+
|
53 |
+
return files
|
54 |
+
|
55 |
+
|
56 |
+
class AutoTextSemanticInstructionDataset(IterableDataset):
|
57 |
+
"""
|
58 |
+
Auto Augment Dataset by Speaker
|
59 |
+
|
60 |
+
1. Random concatenate multiple sentences from the same speaker to form a longer sentence
|
61 |
+
2. Automatically normalize the text
|
62 |
+
|
63 |
+
For interactive mode, we use the following format (multiple sequences):
|
64 |
+
<s> [INST] [SPK: speaker] text [/INST] ... [INST] text [/INST] </s>
|
65 |
+
|
66 |
+
For non-interactive mode, we use the following format (one long sequence):
|
67 |
+
<s> [INST] text [/INST] ... </s>
|
68 |
+
"""
|
69 |
+
|
70 |
+
def __init__(
|
71 |
+
self,
|
72 |
+
proto_files: list[str],
|
73 |
+
seed: int = 42,
|
74 |
+
interactive_prob: float = 0.5,
|
75 |
+
max_length: int = 1024,
|
76 |
+
tokenizer: AutoTokenizer = None,
|
77 |
+
use_speaker: bool | float = True,
|
78 |
+
causal: bool = True,
|
79 |
+
num_codebooks: Optional[int] = None,
|
80 |
+
skip_text_prob: float = 0.0,
|
81 |
+
):
|
82 |
+
"""
|
83 |
+
Args:
|
84 |
+
proto_files: proto buf files if using local data
|
85 |
+
seed: random seed
|
86 |
+
interactive_prob: probability to use interactive mode
|
87 |
+
max_length: max length of the text
|
88 |
+
tokenizer: tokenizer
|
89 |
+
use_speaker: include speaker information in the prompt
|
90 |
+
causal: use causal sampling when using local data; disabling it falls back to random sampling
|
91 |
+
num_codebooks: number of codebooks, if None, it will be automatically detected
|
92 |
+
skip_text_prob: probability to skip the text (audio only), this only applies to interactive mode
|
93 |
+
"""
|
94 |
+
|
95 |
+
super().__init__()
|
96 |
+
|
97 |
+
assert 0 <= interactive_prob <= 1, "interactive_prob must be in [0, 1]"
|
98 |
+
|
99 |
+
self.seed = seed
|
100 |
+
self.max_length = max_length
|
101 |
+
self.tokenizer = tokenizer
|
102 |
+
self.interactive_prob = interactive_prob
|
103 |
+
self.use_speaker = use_speaker
|
104 |
+
self.proto_files = proto_files
|
105 |
+
self.causal = causal
|
106 |
+
self.num_codebooks = num_codebooks
|
107 |
+
self.skip_text_prob = skip_text_prob
|
108 |
+
|
109 |
+
self.semantic_token_id = self.tokenizer.convert_tokens_to_ids("<|semantic|>")
|
110 |
+
self.groups = None
|
111 |
+
|
112 |
+
def init_mock_data_server(self):
|
113 |
+
if self.groups is not None:
|
114 |
+
return
|
115 |
+
|
116 |
+
# Expand the proto files
|
117 |
+
expanded_proto_files = []
|
118 |
+
for filename in self.proto_files:
|
119 |
+
for i in braceexpand(filename):
|
120 |
+
i = Path(i)
|
121 |
+
if i.is_file():
|
122 |
+
expanded_proto_files.append(i)
|
123 |
+
elif i.is_dir():
|
124 |
+
expanded_proto_files.extend(i.rglob("*.proto"))
|
125 |
+
expanded_proto_files.extend(i.rglob("*.protos"))
|
126 |
+
else:
|
127 |
+
raise ValueError(f"{i} is not a file or directory")
|
128 |
+
|
129 |
+
expanded_proto_files = sorted(expanded_proto_files)
|
130 |
+
Random(self.seed).shuffle(expanded_proto_files)
|
131 |
+
|
132 |
+
self.groups = []
|
133 |
+
shard_proto_files = split_by_rank_worker(expanded_proto_files)
|
134 |
+
log.info(
|
135 |
+
f"Reading {len(shard_proto_files)} / {len(expanded_proto_files)} files"
|
136 |
+
)
|
137 |
+
|
138 |
+
count = 0
|
139 |
+
for filename in shard_proto_files:
|
140 |
+
with open(filename, "rb") as f:
|
141 |
+
for text_data in read_pb_stream(f):
|
142 |
+
self.groups.append(text_data)
|
143 |
+
count += 1
|
144 |
+
|
145 |
+
log.info(f"Read total {count} groups of data")
|
146 |
+
|
147 |
+
# Shuffle the lines
|
148 |
+
Random(self.seed).shuffle(self.groups)
|
149 |
+
self.group_weights = [len(i.sentences) for i in self.groups]
|
150 |
+
|
151 |
+
def __iter__(self):
|
152 |
+
while True:
|
153 |
+
yield self.augment()
|
154 |
+
|
155 |
+
def tokenize_sentence(self, sentence: str):
|
156 |
+
sentence = clean_text(sentence)
|
157 |
+
tokens = self.tokenizer.encode(
|
158 |
+
f"{sentence}",
|
159 |
+
max_length=10**6,
|
160 |
+
add_special_tokens=False,
|
161 |
+
truncation=False,
|
162 |
+
)
|
163 |
+
return sentence, len(tokens)
|
164 |
+
|
165 |
+
def sample_data(self):
|
166 |
+
if self.groups is None:
|
167 |
+
self.init_mock_data_server()
|
168 |
+
|
169 |
+
# Shuffle unique lines, estimate that each sample is at least 20 tokens
|
170 |
+
num_samples = self.max_length // 20
|
171 |
+
|
172 |
+
# choice group based on their number of samples
|
173 |
+
group = random.choices(self.groups, weights=self.group_weights, k=1)[0]
|
174 |
+
|
175 |
+
if self.causal:
|
176 |
+
# Sample in order
|
177 |
+
if num_samples >= len(group.sentences):
|
178 |
+
samples = group.sentences
|
179 |
+
else:
|
180 |
+
begin = random.randint(0, len(group.sentences) - num_samples)
|
181 |
+
samples = group.sentences[begin : begin + num_samples]
|
182 |
+
else:
|
183 |
+
samples = random.choices(
|
184 |
+
group.sentences, k=min(num_samples, len(group.sentences))
|
185 |
+
)
|
186 |
+
|
187 |
+
return SampledData(
|
188 |
+
source=group.source,
|
189 |
+
name=group.name,
|
190 |
+
samples=samples,
|
191 |
+
)
|
192 |
+
|
193 |
+
def augment(self):
|
194 |
+
final_text, final_semantic = [], []
|
195 |
+
response = self.sample_data()
|
196 |
+
if len(response.samples) == 0:
|
197 |
+
# Invalid group
|
198 |
+
return None
|
199 |
+
|
200 |
+
samples = list(response.samples)
|
201 |
+
idx = 0
|
202 |
+
use_interactive = random.random() < self.interactive_prob
|
203 |
+
|
204 |
+
if use_interactive is False:
|
205 |
+
# Random sample based on speaker using a truncated normal distribution
|
206 |
+
a = torch.tensor([0], dtype=torch.float32)
|
207 |
+
torch.nn.init.trunc_normal_(
|
208 |
+
a,
|
209 |
+
mean=self.max_length // 2,
|
210 |
+
std=self.max_length // 4,
|
211 |
+
a=10,
|
212 |
+
b=self.max_length,
|
213 |
+
)
|
214 |
+
remaining_tokens = a.long().item() - 4
|
215 |
+
else:
|
216 |
+
remaining_tokens = self.max_length
|
217 |
+
|
218 |
+
# Use speaker
|
219 |
+
if isinstance(self.use_speaker, float):
|
220 |
+
use_speaker = random.random() < self.use_speaker
|
221 |
+
else:
|
222 |
+
use_speaker = self.use_speaker
|
223 |
+
|
224 |
+
all_tokens, all_labels = [], []
|
225 |
+
while remaining_tokens > 0 and len(samples) > 0:
|
226 |
+
sentence = samples.pop(0)
|
227 |
+
|
228 |
+
text = random.choice(sentence.texts)
|
229 |
+
text, length = self.tokenize_sentence(text)
|
230 |
+
remaining_tokens -= length + len(sentence.semantics[0].values)
|
231 |
+
|
232 |
+
if use_interactive is False:
|
233 |
+
final_text.append(text)
|
234 |
+
final_semantic.append(sentence.semantics)
|
235 |
+
else:
|
236 |
+
# For interactive mode, we only apply speaker for the first sentence
|
237 |
+
# [INST] [SPK: speaker] text [/INST] ... [INST] text [/INST]
|
238 |
+
tokens, labels = self.pack_sentences(
|
239 |
+
sentences=[text],
|
240 |
+
semantics=[sentence.semantics],
|
241 |
+
speaker=response.name if use_speaker else None,
|
242 |
+
skip_text=random.random() < self.skip_text_prob,
|
243 |
+
)
|
244 |
+
|
245 |
+
all_tokens.append(tokens)
|
246 |
+
all_labels.append(labels)
|
247 |
+
|
248 |
+
idx += 1
|
249 |
+
|
250 |
+
if use_interactive is False:
|
251 |
+
tokens, labels = self.pack_sentences(
|
252 |
+
final_text,
|
253 |
+
semantics=final_semantic,
|
254 |
+
speaker=response.name if use_speaker else None,
|
255 |
+
)
|
256 |
+
all_tokens.append(tokens)
|
257 |
+
all_labels.append(labels)
|
258 |
+
|
259 |
+
tokens = torch.cat(all_tokens, dim=1)
|
260 |
+
labels = torch.cat(all_labels, dim=1)
|
261 |
+
|
262 |
+
# Verify that the length is correct
|
263 |
+
assert tokens.size(1) == labels.size(1), f"{tokens.size(1)} != {labels.size(1)}"
|
264 |
+
|
265 |
+
data = {"tokens": tokens, "labels": labels}
|
266 |
+
|
267 |
+
return data
|
268 |
+
|
269 |
+
def pack_sentences(
|
270 |
+
self,
|
271 |
+
sentences: list[str],
|
272 |
+
semantics: list,
|
273 |
+
speaker: Optional[str] = None,
|
274 |
+
skip_text: bool = False,
|
275 |
+
):
|
276 |
+
if speaker is None:
|
277 |
+
speaker = "assistant"
|
278 |
+
|
279 |
+
cated_sentences = " ".join(sentences)
|
280 |
+
if skip_text:
|
281 |
+
cated_sentences = "<|skip_text|>"
|
282 |
+
|
283 |
+
final_text = "<|im_start|>user\n" + cated_sentences + "<|im_end|>"
|
284 |
+
final_text = final_text + f"<|im_start|>{speaker}\n"
|
285 |
+
|
286 |
+
encoded = self.tokenizer.encode(
|
287 |
+
final_text,
|
288 |
+
add_special_tokens=False,
|
289 |
+
truncation=False,
|
290 |
+
max_length=10**6,
|
291 |
+
)
|
292 |
+
semantic_length = sum([len(i[0].values) for i in semantics])
|
293 |
+
prompt_length = len(encoded)
|
294 |
+
num_codebooks = (
|
295 |
+
len(semantics[0]) if self.num_codebooks is None else self.num_codebooks
|
296 |
+
)
|
297 |
+
|
298 |
+
# Pack the tokens and semantics (add <s> and </s> to semantic tokens)
|
299 |
+
tokens = (
|
300 |
+
encoded
|
301 |
+
+ [self.semantic_token_id] * semantic_length
|
302 |
+
+ self.tokenizer.convert_tokens_to_ids(["<|im_end|>"])
|
303 |
+
)
|
304 |
+
|
305 |
+
# Codebook bos/padding: 0, eos: 1
|
306 |
+
codes = [[CODEBOOK_PAD_TOKEN_ID] * prompt_length for _ in range(num_codebooks)]
|
307 |
+
for segment in semantics:
|
308 |
+
for book_idx, book in zip(range(num_codebooks), segment):
|
309 |
+
for j in book.values:
|
310 |
+
codes[book_idx].append(int(j) + 1)
|
311 |
+
|
312 |
+
for book in codes:
|
313 |
+
book.extend([CODEBOOK_PAD_TOKEN_ID] * 1)
|
314 |
+
|
315 |
+
tokens = [tokens] + codes
|
316 |
+
|
317 |
+
tokens = torch.tensor(tokens, dtype=torch.long)
|
318 |
+
labels = tokens.clone()
|
319 |
+
|
320 |
+
if skip_text:
|
321 |
+
# If text is not provided, the sentence is used for condition only, all labels are -100
|
322 |
+
torch.fill_(labels, -100)
|
323 |
+
return tokens, labels
|
324 |
+
|
325 |
+
# Mask out the <s> tokens for semantic, predict semantic tokens only
|
326 |
+
# Since we don't mask out the input tokens, the language modeling still works
|
327 |
+
labels[1:, :prompt_length] = -100
|
328 |
+
|
329 |
+
tokens = tokens[:, :-1]
|
330 |
+
labels = labels[:, 1:]
|
331 |
+
|
332 |
+
# Verify the padding is correct, and the last token is eos
|
333 |
+
assert (tokens[1:, :prompt_length] == CODEBOOK_PAD_TOKEN_ID).all()
|
334 |
+
assert (labels[1:, -1:] == CODEBOOK_PAD_TOKEN_ID).all()
|
335 |
+
|
336 |
+
return tokens, labels
|
337 |
+
|
338 |
+
|
339 |
+
@dataclass
|
340 |
+
class TextDataCollator:
|
341 |
+
tokenizer: AutoTokenizer
|
342 |
+
max_length: int = 1024
|
343 |
+
|
344 |
+
def __call__(self, examples):
|
345 |
+
if "negative_tokens" in examples:
|
346 |
+
positive_examples = []
|
347 |
+
negative_examples = []
|
348 |
+
|
349 |
+
for i in examples:
|
350 |
+
positive_examples.append(
|
351 |
+
{
|
352 |
+
"tokens": i["tokens"],
|
353 |
+
"labels": i["labels"],
|
354 |
+
}
|
355 |
+
)
|
356 |
+
negative_examples.append(
|
357 |
+
{
|
358 |
+
"tokens": i["negative_tokens"],
|
359 |
+
"labels": i["negative_labels"],
|
360 |
+
}
|
361 |
+
)
|
362 |
+
|
363 |
+
examples = positive_examples + negative_examples
|
364 |
+
|
365 |
+
return self.batchify(examples)
|
366 |
+
|
367 |
+
def batchify(self, examples, tokens_key="tokens", labels_key="labels"):
|
368 |
+
tokens, attention_masks, labels = [], [], []
|
369 |
+
|
370 |
+
# Calculate the max length
|
371 |
+
max_tokens_length = 0
|
372 |
+
for example in examples:
|
373 |
+
max_tokens_length = max(max_tokens_length, example[tokens_key].size(1))
|
374 |
+
max_tokens_length = min(max_tokens_length, self.max_length)
|
375 |
+
|
376 |
+
for example in examples:
|
377 |
+
_tokens = example[tokens_key][:, :max_tokens_length]
|
378 |
+
_labels = example[labels_key][:, :max_tokens_length]
|
379 |
+
_attention_mask = torch.ones((max_tokens_length,), dtype=torch.bool)
|
380 |
+
tokens_length = _tokens.size(1)
|
381 |
+
_attention_mask[:tokens_length] = False
|
382 |
+
|
383 |
+
assert tokens_length == _labels.size(
|
384 |
+
1
|
385 |
+
), f"{tokens_length} != {_labels.size(1)}"
|
386 |
+
|
387 |
+
if tokens_length < max_tokens_length:
|
388 |
+
_tokens = F.pad(
|
389 |
+
_tokens,
|
390 |
+
(0, max_tokens_length - tokens_length),
|
391 |
+
value=self.tokenizer.eos_token_id,
|
392 |
+
)
|
393 |
+
_tokens[1:, tokens_length:] = CODEBOOK_PAD_TOKEN_ID
|
394 |
+
_labels = F.pad(
|
395 |
+
_labels, (0, max_tokens_length - _labels.size(1)), value=-100
|
396 |
+
)
|
397 |
+
|
398 |
+
tokens.append(_tokens)
|
399 |
+
attention_masks.append(_attention_mask)
|
400 |
+
labels.append(_labels)
|
401 |
+
|
402 |
+
tokens = torch.stack(tokens, dim=0)
|
403 |
+
attention_masks = torch.stack(attention_masks, dim=0)
|
404 |
+
labels = torch.stack(labels, dim=0)
|
405 |
+
|
406 |
+
return {
|
407 |
+
"inputs": tokens,
|
408 |
+
"attention_masks": attention_masks,
|
409 |
+
"labels": labels,
|
410 |
+
}
|
411 |
+
|
412 |
+
|
413 |
+
class InterleaveDataset(IterableDataset):
|
414 |
+
def __init__(
|
415 |
+
self,
|
416 |
+
datasets: list[IterableDataset],
|
417 |
+
probabilities: list[float],
|
418 |
+
seed: int = 42,
|
419 |
+
):
|
420 |
+
super().__init__()
|
421 |
+
|
422 |
+
self.datasets = datasets
|
423 |
+
self.probabilities = probabilities
|
424 |
+
self.seed = seed
|
425 |
+
|
426 |
+
def __iter__(self):
|
427 |
+
rng = np.random.default_rng(self.seed)
|
428 |
+
dataset_iterators = [iter(dataset) for dataset in self.datasets]
|
429 |
+
|
430 |
+
while True:
|
431 |
+
# Random choice one
|
432 |
+
dataset_idx = rng.choice(len(self.datasets), p=self.probabilities)
|
433 |
+
dataset_iterator = dataset_iterators[dataset_idx]
|
434 |
+
|
435 |
+
try:
|
436 |
+
yield next(dataset_iterator)
|
437 |
+
except StopIteration:
|
438 |
+
# Exhausted, create a new iterator
|
439 |
+
dataset_iterators[dataset_idx] = iter(self.datasets[dataset_idx])
|
440 |
+
yield next(dataset_iterators[dataset_idx])
|
441 |
+
|
442 |
+
|
443 |
+
class SemanticDataModule(LightningDataModule):
|
444 |
+
def __init__(
|
445 |
+
self,
|
446 |
+
train_dataset: Union[AutoTextSemanticInstructionDataset, InterleaveDataset],
|
447 |
+
val_dataset: Union[AutoTextSemanticInstructionDataset, InterleaveDataset],
|
448 |
+
batch_size: int = 32,
|
449 |
+
tokenizer: AutoTokenizer = None,
|
450 |
+
max_length: int = 1024,
|
451 |
+
num_workers: int = 4,
|
452 |
+
):
|
453 |
+
super().__init__()
|
454 |
+
|
455 |
+
self.train_dataset = train_dataset
|
456 |
+
self.val_dataset = val_dataset
|
457 |
+
self.batch_size = batch_size
|
458 |
+
self.tokenizer = tokenizer
|
459 |
+
self.max_length = max_length
|
460 |
+
self.num_workers = num_workers
|
461 |
+
|
462 |
+
def train_dataloader(self):
|
463 |
+
return DataLoader(
|
464 |
+
self.train_dataset,
|
465 |
+
batch_size=self.batch_size,
|
466 |
+
collate_fn=TextDataCollator(self.tokenizer, self.max_length),
|
467 |
+
num_workers=self.num_workers,
|
468 |
+
persistent_workers=True,
|
469 |
+
)
|
470 |
+
|
471 |
+
def val_dataloader(self):
|
472 |
+
return DataLoader(
|
473 |
+
self.val_dataset,
|
474 |
+
batch_size=self.batch_size,
|
475 |
+
collate_fn=TextDataCollator(self.tokenizer, self.max_length),
|
476 |
+
num_workers=self.num_workers,
|
477 |
+
persistent_workers=True,
|
478 |
+
)
|
479 |
+
|
480 |
+
|
481 |
+
if __name__ == "__main__":
|
482 |
+
from tqdm import tqdm
|
483 |
+
|
484 |
+
ds = AutoTextSemanticInstructionDataset(
|
485 |
+
["data/protos"],
|
486 |
+
tokenizer=AutoTokenizer.from_pretrained("fishaudio/fish-speech-1"),
|
487 |
+
use_speaker=False,
|
488 |
+
interactive_prob=1.0,
|
489 |
+
skip_text_prob=0.5,
|
490 |
+
)
|
491 |
+
|
492 |
+
for i in ds:
|
493 |
+
print(ds.tokenizer.decode(i["tokens"][0], skip_special_tokens=False))
|
494 |
+
# i["labels"][0][i["labels"][0] == -100] = 0
|
495 |
+
# print(ds.tokenizer.decode(i["labels"][0], skip_special_tokens=False))
|
496 |
+
break
|
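A quick usage note, not part of the diff: the classes above compose into a mixed training stream by wrapping several datasets in an `InterleaveDataset` and handing that to the `SemanticDataModule`. The sketch below is hedged: the import path, the second proto directory, and the 0.8/0.2 mixing probabilities are illustrative assumptions, not values taken from this commit.

```python
# Hypothetical wiring example; paths and probabilities are illustrative only.
from transformers import AutoTokenizer

# Assumed module path for the file added above.
from fish_speech.datasets.semantic import (
    AutoTextSemanticInstructionDataset,
    InterleaveDataset,
    SemanticDataModule,
)

tokenizer = AutoTokenizer.from_pretrained("fishaudio/fish-speech-1")

ds_a = AutoTextSemanticInstructionDataset(
    ["data/protos"], tokenizer=tokenizer, use_speaker=False,
    interactive_prob=1.0, skip_text_prob=0.5,
)
ds_b = AutoTextSemanticInstructionDataset(
    ["data/protos-extra"],  # hypothetical second corpus
    tokenizer=tokenizer, use_speaker=False,
    interactive_prob=1.0, skip_text_prob=0.5,
)

# Sample ~80% of items from ds_a and ~20% from ds_b; an exhausted dataset is
# restarted transparently (see InterleaveDataset.__iter__ above).
train_ds = InterleaveDataset([ds_a, ds_b], probabilities=[0.8, 0.2], seed=42)

datamodule = SemanticDataModule(
    train_dataset=train_ds,
    val_dataset=ds_a,
    batch_size=8,
    tokenizer=tokenizer,
    max_length=1024,
)
```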
fish_speech/datasets/vqgan.py
CHANGED
@@ -28,7 +28,7 @@ class VQGANDataset(Dataset):
         self.files = [
             root / line.strip()
-            for line in filelist.read_text().splitlines()
+            for line in filelist.read_text(encoding="utf-8").splitlines()
             if line.strip()
         ]
         self.sample_rate = sample_rate
@@ -120,6 +120,7 @@ class VQGANDataModule(LightningDataModule):
             collate_fn=VQGANCollator(),
             num_workers=self.num_workers,
             shuffle=True,
+            persistent_workers=True,
         )

     def val_dataloader(self):
@@ -128,6 +129,7 @@ class VQGANDataModule(LightningDataModule):
             batch_size=self.val_batch_size,
             collate_fn=VQGANCollator(),
             num_workers=self.num_workers,
+            persistent_workers=True,
         )
fish_speech/models/text2semantic/__init__.py
CHANGED
@@ -1,3 +0,0 @@
-from .lit_module import TextToSemantic
-
-__all__ = ["TextToSemantic"]
fish_speech/models/text2semantic/lit_module.py
CHANGED
@@ -1,110 +1,40 @@
-from dataclasses import dataclass
 from typing import Any, Optional

 import lightning as L
-import loralib as lora
 import torch
 import torch.nn.functional as F
 from lightning.pytorch.utilities.types import OptimizerLRScheduler

 import fish_speech.utils as utils
+from fish_speech.conversation import CODEBOOK_PAD_TOKEN_ID
 from fish_speech.models.text2semantic.llama import NaiveTransformer

 log = utils.RankedLogger(__name__, rank_zero_only=True)


-@dataclass
-class LoraConfig:
-    r: int
-    lora_alpha: float
-    lora_dropout: float = 0.0
-
-
 class TextToSemantic(L.LightningModule):
     def __init__(
         self,
         model: NaiveTransformer,
         optimizer: Any,
         lr_scheduler: Any,
-        lora_config: Optional[LoraConfig] = None,
-        save_lora_only: bool = False,
-        use_dpo: bool = False,
-        dpo_beta: float = 0.2,
     ):
         super().__init__()

         self.model = model
         self.optimizer_builder = optimizer
         self.lr_scheduler_builder = lr_scheduler
-        self.lora_config = lora_config
-        self.save_lora_only = save_lora_only
-        self.use_dpo = use_dpo  # We don't support reference model yet
-        self.dpo_beta = dpo_beta
-
-        if self.lora_config is not None:
-            self.setup_lora()
-
-    def setup_lora(self):
-        # Replace the embedding layer with a LoRA layer
-        self.model.embeddings = lora.Embedding(
-            num_embeddings=self.model.embeddings.num_embeddings,
-            embedding_dim=self.model.embeddings.embedding_dim,
-            padding_idx=self.model.embeddings.padding_idx,
-            r=self.lora_config.r,
-            lora_alpha=self.lora_config.lora_alpha,
-        )
-
-        # Replace output layer with a LoRA layer
-        linears = [(self.model, "output")]
-
-        # Replace all linear layers with LoRA layers
-        for layer in self.model.layers:
-            linears.extend([(layer.attention, "wqkv"), (layer.attention, "wo")])
-            linears.extend(
-                [
-                    (layer.feed_forward, "w1"),
-                    (layer.feed_forward, "w2"),
-                    (layer.feed_forward, "w3"),
-                ]
-            )
-
-        if hasattr(self.model, "fast_layers"):
-            # Dual-AR model
-            linears.extend([(self.model, "fast_output")])
-
-            for layer in self.model.fast_layers:
-                linears.extend([(layer.attention, "wqkv"), (layer.attention, "wo")])
-                linears.extend(
-                    [
-                        (layer.feed_forward, "w1"),
-                        (layer.feed_forward, "w2"),
-                        (layer.feed_forward, "w3"),
-                    ]
-                )
-
-        for module, layer in linears:
-            updated_linear = lora.Linear(
-                in_features=getattr(module, layer).in_features,
-                out_features=getattr(module, layer).out_features,
-                bias=getattr(module, layer).bias,
-                r=self.lora_config.r,
-                lora_alpha=self.lora_config.lora_alpha,
-                lora_dropout=self.lora_config.lora_dropout,
-            )
-            setattr(module, layer, updated_linear)
-
-        # Mark only the LoRA layers as trainable
-        lora.mark_only_lora_as_trainable(self.model, bias="lora_only")

     def forward(self, x):
         return self.model(x)

     def on_save_checkpoint(self, checkpoint):
-        if self.lora_config is None or self.save_lora_only is False:
-            return
-
         # Save only LoRA parameters
         state_dict = checkpoint["state_dict"]
+        use_lora = any("lora" in name for name in state_dict.keys())
+        if not use_lora:
+            return
+
         for name in list(state_dict.keys()):
             if "lora" not in name:
                 state_dict.pop(name)
@@ -178,6 +108,11 @@ class TextToSemantic(L.LightningModule):
     def _step(self, batch, batch_idx, stage: str):
         is_train = stage == "train"

+        if is_train:
+            # Key part to make LoRA work
+            # Otherwise the parameters are merged, which leads to incorrect gradients
+            self.model.train()
+
         # Do positive and negative samples in the same batch to speed up training
         labels = batch["labels"]
         outputs = self.model(
@@ -187,92 +122,22 @@ class TextToSemantic(L.LightningModule):
         token_logits = outputs.token_logits
         codebook_logits = outputs.codebook_logits

-        if self.use_dpo:
-            # First half is positive, second half is negative
-            token_logits, negative_token_logits = token_logits.chunk(2)
-            codebook_logits, negative_codebook_logits = codebook_logits.chunk(2)
-            labels, negative_labels = labels.chunk(2)
-
         # Generate labels
         base_loss = F.cross_entropy(
-            token_logits.
+            token_logits.view(-1, token_logits.size(-1)),
             labels[:, 0].reshape(-1),
             ignore_index=-100,
         )

         codebook_labels = labels[:, 1 : 1 + self.model.config.num_codebooks].mT
         semantic_loss = F.cross_entropy(
-            codebook_logits.
+            codebook_logits.view(-1, codebook_logits.size(-1)),
             codebook_labels.reshape(-1),
             ignore_index=-100,
         )

         loss = base_loss + semantic_loss

-        # If we use dpo
-        if self.use_dpo:
-            negative_codebook_labels = negative_labels[
-                :, 1 : 1 + self.model.config.num_codebooks
-            ].mT
-
-            positive_codebook_logps = self.get_batch_logps(
-                codebook_logits, codebook_labels
-            )
-            negative_codebook_logps = self.get_batch_logps(
-                negative_codebook_logits, negative_codebook_labels
-            )
-
-            # TODO: implement the reference model, avoid screwing up the gradients
-            dpo_loss = -F.logsigmoid(
-                (positive_codebook_logps - negative_codebook_logps) * self.dpo_beta
-            ).mean()
-
-            chosen_rewards = self.dpo_beta * positive_codebook_logps.detach()
-            rejected_rewards = self.dpo_beta * negative_codebook_logps.detach()
-            reward_accuracy = (chosen_rewards > rejected_rewards).float().mean()
-            chosen_rewards, rejected_rewards = (
-                chosen_rewards.mean(),
-                rejected_rewards.mean(),
-            )
-
-            loss = loss + dpo_loss
-
-            self.log(
-                f"{stage}/dpo_loss",
-                dpo_loss,
-                on_step=is_train,
-                on_epoch=not is_train,
-                prog_bar=False,
-                logger=True,
-            )
-
-            self.log(
-                f"{stage}/chosen_rewards",
-                chosen_rewards,
-                on_step=is_train,
-                on_epoch=not is_train,
-                prog_bar=False,
-                logger=True,
-            )
-
-            self.log(
-                f"{stage}/rejected_rewards",
-                rejected_rewards,
-                on_step=is_train,
-                on_epoch=not is_train,
-                prog_bar=False,
-                logger=True,
-            )
-
-            self.log(
-                f"{stage}/reward_accuracy",
-                reward_accuracy,
-                on_step=is_train,
-                on_epoch=not is_train,
-                prog_bar=False,
-                logger=True,
-            )
-
         self.log(
             f"{stage}/loss",
             loss,
@@ -280,6 +145,7 @@ class TextToSemantic(L.LightningModule):
             on_epoch=not is_train,
             prog_bar=True,
             logger=True,
+            sync_dist=not is_train,
         )

         self.log(
@@ -289,6 +155,7 @@ class TextToSemantic(L.LightningModule):
             on_epoch=not is_train,
             prog_bar=False,
             logger=True,
+            sync_dist=not is_train,
         )

         self.log(
@@ -298,6 +165,7 @@ class TextToSemantic(L.LightningModule):
             on_epoch=not is_train,
             prog_bar=False,
             logger=True,
+            sync_dist=not is_train,
         )

         # Top-5 accuracy
@@ -309,31 +177,21 @@ class TextToSemantic(L.LightningModule):
             on_epoch=not is_train,
             prog_bar=True,
             logger=True,
+            sync_dist=not is_train,
         )

-        if self.model.config.num_codebooks != self.model.config.num_in_codebooks:
-            accuracy = self.get_accuracy(
-                codebook_logits[:, :, : self.model.config.num_in_codebooks],
-                codebook_labels[:, :, : self.model.config.num_in_codebooks],
-            )
-
-            self.log(
-                f"{stage}/top_5_accuracy_in",
-                accuracy,
-                on_step=is_train,
-                on_epoch=not is_train,
-                prog_bar=True,
-                logger=True,
-            )
-
         return loss

     def get_accuracy(self, logits, labels):
+        mask = (labels != -100) & (labels != CODEBOOK_PAD_TOKEN_ID)
+        if mask.sum() == 0:
+            return torch.tensor(0.0, device=logits.device)
+
         _, indices = logits.topk(5, dim=-1)
         correct = indices.eq(labels.unsqueeze(-1))
-        correct[
+        correct[~mask] = 0
         correct = correct.sum()
-        accuracy = correct /
+        accuracy = correct / mask.sum()

         return accuracy
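For readers checking the new masked top-5 accuracy, here is a self-contained sketch of the same computation on toy tensors. It is illustrative only: shapes are made up, and `0` is used as a stand-in for `CODEBOOK_PAD_TOKEN_ID`, whose real value comes from `fish_speech.conversation`.

```python
# Standalone sketch of the masked top-5 accuracy above (illustrative shapes only).
import torch

logits = torch.randn(2, 4, 3, 100)          # (batch, codebooks, seq, vocab)
labels = torch.randint(1, 100, (2, 4, 3))   # same layout without the vocab dim
labels[0, :, -1] = -100                     # pretend the last step is padding

mask = (labels != -100) & (labels != 0)     # 0 stands in for CODEBOOK_PAD_TOKEN_ID here
_, indices = logits.topk(5, dim=-1)         # top-5 predictions per position
correct = indices.eq(labels.unsqueeze(-1))  # hit if the label is among the top 5
correct[~mask] = 0                          # ignored/padded positions never count
accuracy = correct.sum() / mask.sum()       # normalize by real positions only
```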
fish_speech/models/text2semantic/llama.py
CHANGED
@@ -1,13 +1,25 @@
+import json
 import math
 from dataclasses import dataclass
+from pathlib import Path
 from typing import Optional

 import torch
 import torch.nn as nn
 from einops import rearrange
+from loguru import logger
 from torch import Tensor
 from torch.nn import functional as F
+from torch.nn.attention import SDPBackend, sdpa_kernel
 from torch.utils.checkpoint import checkpoint
+from transformers import AutoTokenizer
+
+from fish_speech.conversation import SEMANTIC_TOKEN
+from fish_speech.utils import RankedLogger
+
+from .lora import LoraConfig, setup_lora
+
+log = RankedLogger(__name__, rank_zero_only=True)


 def find_multiple(n: int, k: int) -> int:
@@ -18,6 +30,8 @@ def find_multiple(n: int, k: int) -> int:

 @dataclass
 class BaseModelArgs:
+    model_type: str = "base"
+
     vocab_size: int = 32000
     n_layer: int = 32
     n_head: int = 32
@@ -29,16 +43,19 @@ class BaseModelArgs:
     norm_eps: float = 1e-5
     max_seq_len: int = 2048
     dropout: float = 0.0
+    tie_word_embeddings: bool = True
+    attention_qkv_bias: bool = False

     # Codebook configs
     codebook_size: int = 160
     num_codebooks: int = 4
-    num_in_codebooks: Optional[int] = None
-    codebook_padding_idx: int = 0

     # Gradient checkpointing
     use_gradient_checkpointing: bool = True

+    # Initialize the model
+    initializer_range: float = 0.02
+
     def __post_init__(self):
         if self.n_local_heads == -1:
             self.n_local_heads = self.n_head
@@ -46,18 +63,41 @@ class BaseModelArgs:
         hidden_dim = 4 * self.dim
         n_hidden = int(2 * hidden_dim / 3)
         self.intermediate_size = find_multiple(n_hidden, 256)
-        if self.num_in_codebooks is None:
-            self.num_in_codebooks = self.num_codebooks
         self.head_dim = self.dim // self.n_head

+    @staticmethod
+    def from_pretrained(path: str):
+        path = Path(path)
+
+        if path.is_dir():
+            path = path / "config.json"
+
+        with open(path, "r", encoding="utf-8") as f:
+            data = json.load(f)
+
+        match data["model_type"]:
+            case "naive":
+                cls = NaiveModelArgs
+            case "dual_ar":
+                cls = DualARModelArgs
+            case _:
+                raise ValueError(f"Unknown model type: {data['model_type']}")
+
+        return cls(**data)
+
+    def save(self, path: str):
+        with open(path, "w") as f:
+            json.dump(self.__dict__, f, indent=4, sort_keys=True, ensure_ascii=False)
+

 @dataclass
 class NaiveModelArgs(BaseModelArgs):
+    model_type: str = "naive"


 @dataclass
 class DualARModelArgs(BaseModelArgs):
+    model_type: str = "dual_ar"
     n_fast_layer: int = 4
@@ -95,24 +135,35 @@ class BaseTransformerForwardResult:

 class BaseTransformer(nn.Module):
-    def __init__(
+    def __init__(
+        self, config: BaseModelArgs, tokenizer: AutoTokenizer, init_weights: bool = True
+    ) -> None:
         super().__init__()
         self.config = config
+        self.tokenizer = tokenizer
+
+        self.semantic_token_id = tokenizer.convert_tokens_to_ids(SEMANTIC_TOKEN)

         # Slow transformer
         self.embeddings = nn.Embedding(
-            config.vocab_size
+            config.vocab_size,
+            config.dim,
+        )
+        self.codebook_embeddings = nn.Embedding(
+            config.codebook_size * config.num_codebooks,
             config.dim,
         )
         self.layers = nn.ModuleList(
             TransformerBlock(config, use_sdpa=True) for _ in range(config.n_layer)
         )
         self.norm = RMSNorm(config.dim, eps=config.norm_eps)
+
+        if self.config.tie_word_embeddings is False:
+            self.output = nn.Linear(
+                config.dim,
+                config.vocab_size,
+                bias=False,
+            )

         self.register_buffer(
             "freqs_cis",
@@ -139,6 +190,9 @@ class BaseTransformer(nn.Module):
         self.max_batch_size = -1
         self.max_seq_len = -1

+        if init_weights:
+            self.apply(self._init_weights)
+
     def setup_caches(
         self, max_batch_size: int, max_seq_len: int, dtype: torch.dtype = torch.bfloat16
     ):
@@ -161,11 +215,9 @@ class BaseTransformer(nn.Module):

     def embed(self, x: Tensor) -> Tensor:
         vocab_embeds = [self.embeddings(x[:, 0])]
-        for i in range(self.config.
-            emb = self.
-            )
-            emb[x[:, i + 1] == self.config.codebook_padding_idx] = 0
+        for i in range(self.config.num_codebooks):
+            emb = self.codebook_embeddings(x[:, i + 1] + i * self.config.codebook_size)
+            emb[x[:, 0] != self.semantic_token_id] = 0
             vocab_embeds.append(emb)

         x = torch.stack(vocab_embeds, dim=3)
@@ -174,21 +226,23 @@ class BaseTransformer(nn.Module):
         return x

     def forward(
-        self,
+        self,
+        inp: Tensor,
+        key_padding_mask: Optional[Tensor] = None,
     ) -> BaseTransformerForwardResult:
-        # x: (batch, num_codebooks + 1, seq_len)
         seq_len = inp.size(2)

         # Here we want to merge the embeddings of the codebooks
         x = self.embed(inp)

-        mask = self.causal_mask[None, None, :seq_len, :seq_len]  # (B, N, Q, K)
         freqs_cis = self.freqs_cis[:seq_len]

         # Note that the causal mask here follows the definition of scaled_dot_product_attention
         # That is, FALSE means masked out
         # To maintain consistency, key_padding_mask use TRUE to mask out
+        mask = None
         if key_padding_mask is not None:
+            mask = self.causal_mask[None, None, :seq_len, :seq_len]  # (B, N, Q, K)
             mask = mask & key_padding_mask[:, None, None, :].logical_not()

         for layer in self.layers:
@@ -199,7 +253,11 @@ class BaseTransformer(nn.Module):

         # We got slow_out here
         slow_out = self.norm(x)
+
+        if self.config.tie_word_embeddings:
+            token_logits = F.linear(slow_out, self.embeddings.weight)
+        else:
+            token_logits = self.output(slow_out)

         return BaseTransformerForwardResult(
             logits=token_logits,
@@ -207,7 +265,10 @@ class BaseTransformer(nn.Module):
         )

     def forward_generate(
-        self,
+        self,
+        x: Tensor,
+        input_pos: Optional[Tensor] = None,
+        return_all: bool = False,
     ) -> BaseTransformerForwardResult:
         # This is used for generation, optimized for torch compile
         assert (
@@ -225,22 +286,117 @@ class BaseTransformer(nn.Module):
             x = layer(x, freqs_cis, mask, input_pos=input_pos)

         # If prefill, we only calculate the logits of last token
-        if x.size(1) > 1:
+        if x.size(1) > 1 and not return_all:
             x = x[:, -1:]

         # We got slow_out here
         slow_out = self.norm(x)
+
+        if self.config.tie_word_embeddings:
+            token_logits = F.linear(slow_out, self.embeddings.weight)
+        else:
+            token_logits = self.output(slow_out)

         return BaseTransformerForwardResult(
             logits=token_logits,
             hidden_states=x,
         )

+    def _init_weights(self, module):
+        std = self.config.initializer_range
+        if isinstance(module, nn.Linear):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+
+    @staticmethod
+    def from_pretrained(
+        path: str,
+        load_weights: bool = False,
+        max_length: int | None = None,
+        lora_config: LoraConfig | None = None,
+        rope_base: int | None = None,
+    ) -> "BaseTransformer":
+        config = BaseModelArgs.from_pretrained(str(path))
+        if max_length is not None:
+            config.max_seq_len = max_length
+            log.info(f"Override max_seq_len to {max_length}")
+
+        if rope_base is not None:
+            config.rope_base = rope_base
+            log.info(f"Override rope_base to {rope_base}")
+
+        match config.model_type:
+            case "naive":
+                model_cls = NaiveTransformer
+            case "dual_ar":
+                model_cls = DualARTransformer
+            case _:
+                raise ValueError(f"Unknown model type: {config.model_type}")
+
+        tokenizer = AutoTokenizer.from_pretrained(str(path))
+        log.info(f"Loading model from {path}, config: {config}")
+        model = model_cls(config, tokenizer=tokenizer)
+
+        if lora_config is not None:
+            setup_lora(model, lora_config)
+            log.info(f"LoRA setup: {lora_config}")
+
+        if load_weights is False:
+            log.info("Randomly initialized model")
+        else:
+
+            if "int8" in str(Path(path)):
+                logger.info("Using int8 weight-only quantization!")
+                from tools.llama.quantize import WeightOnlyInt8QuantHandler
+
+                simple_quantizer = WeightOnlyInt8QuantHandler(model)
+                model = simple_quantizer.convert_for_runtime()
+
+            if "int4" in str(Path(path)):
+                logger.info("Using int4 quantization!")
+                path_comps = path.name.split("-")
+                assert path_comps[-2].startswith("g")
+                groupsize = int(path_comps[-2][1:])
+                from tools.llama.quantize import WeightOnlyInt4QuantHandler
+
+                simple_quantizer = WeightOnlyInt4QuantHandler(model, groupsize)
+                model = simple_quantizer.convert_for_runtime()
+
+            weights = torch.load(
+                Path(path) / "model.pth", map_location="cpu", mmap=True
+            )
+            err = model.load_state_dict(weights, strict=False, assign=True)
+            log.info(f"Loaded weights with error: {err}")
+
+        return model
+
+    def save_pretrained(self, path: str, drop_lora: bool = False):
+        path = Path(path)
+        path.mkdir(parents=True, exist_ok=True)
+
+        self.config.save(path / "config.json")
+        state_dict = self.state_dict()
+
+        if drop_lora:
+            for key in list(state_dict.keys()):
+                if "lora" not in key:
+                    continue
+
+                state_dict.pop(key)
+                log.info(f"Drop LoRA parameter: {key}")
+
+        torch.save(state_dict, path / "model.pth")
+        self.tokenizer.save_pretrained(path)
+

 class NaiveTransformer(BaseTransformer):
-    def __init__(self, config: NaiveModelArgs) -> None:
-        super().__init__(config)
+    def __init__(self, config: NaiveModelArgs, tokenizer: AutoTokenizer) -> None:
+        super().__init__(config, init_weights=False, tokenizer=tokenizer)

         self.codebook_norm = RMSNorm(config.dim, eps=config.norm_eps)
         self.codebook_output = nn.Linear(
@@ -249,6 +405,8 @@ class NaiveTransformer(BaseTransformer):
             bias=False,
         )

+        self.apply(self._init_weights)
+
     def decode(self, result: BaseTransformerForwardResult) -> TransformerForwardResult:
         token_logits = result.logits
         x = result.hidden_states
@@ -265,9 +423,14 @@ class NaiveTransformer(BaseTransformer):
         )

     def forward(
-        self,
+        self,
+        inp: Tensor,
+        key_padding_mask: Optional[Tensor] = None,
     ) -> TransformerForwardResult:
-        result = super().forward(
+        result = super().forward(
+            inp=inp,
+            key_padding_mask=key_padding_mask,
+        )
         return self.decode(result)

     def forward_generate(
@@ -278,13 +441,11 @@ class NaiveTransformer(BaseTransformer):

 class DualARTransformer(BaseTransformer):
-    def __init__(self, config:
-        super().__init__(config)
+    def __init__(self, config: NaiveModelArgs, tokenizer: AutoTokenizer) -> None:
+        super().__init__(config, init_weights=False, tokenizer=tokenizer)

         # Fast transformer
-        self.fast_embeddings = nn.Embedding(
-            config.codebook_size, config.dim, padding_idx=config.codebook_padding_idx
-        )
+        self.fast_embeddings = nn.Embedding(config.codebook_size, config.dim)

         # The equivalent bs is so large that sdpa doesn't work
         self.fast_layers = nn.ModuleList(
@@ -297,6 +458,8 @@ class DualARTransformer(BaseTransformer):
             bias=False,
         )

+        self.apply(self._init_weights)
+
     def setup_caches(
         self, max_batch_size: int, max_seq_len: int, dtype: torch.dtype = torch.bfloat16
     ):
@@ -316,7 +479,9 @@ class DualARTransformer(BaseTransformer):
         )

     def forward(
-        self,
+        self,
+        inp: Tensor,
+        key_padding_mask: Optional[Tensor] = None,
     ) -> TransformerForwardResult:
         parent_result = super().forward(inp, key_padding_mask)
         token_logits = parent_result.logits
@@ -331,7 +496,7 @@ class DualARTransformer(BaseTransformer):

         # Drop the last token and rotate left
         codebooks = inp[:, 1:-1, 1:]
-        codebooks = F.pad(codebooks, (0, 1), value=
+        codebooks = F.pad(codebooks, (0, 1), value=0)
         codebook_embeddings = self.fast_embeddings(codebooks)
         x = torch.cat([x[:, None], codebook_embeddings], dim=1)
         b, s = x.size(0), x.size(2)
@@ -339,7 +504,12 @@ class DualARTransformer(BaseTransformer):

         # Remove padded part
         codebooks = rearrange(codebooks, "b n s -> (b s) n")
-        codebook_mask = (codebooks ==
+        codebook_mask = (codebooks == 0).all(dim=-1)
+
+        if torch.all(codebook_mask):
+            # If all codebooks are padded, we keep first 8 to make sure the model runs
+            codebook_mask[:8] = False
+
         x_bs, x_len = x.size(0), x.size(1)
         x = x[~codebook_mask]
@@ -422,7 +592,9 @@ class Attention(nn.Module):

         total_head_dim = (config.n_head + 2 * config.n_local_heads) * config.head_dim
         # key, query, value projections for all heads, but in a batch
-        self.wqkv = nn.Linear(
+        self.wqkv = nn.Linear(
+            config.dim, total_head_dim, bias=config.attention_qkv_bias
+        )
         self.wo = nn.Linear(config.dim, config.dim, bias=False)
         self.kv_cache = None
@@ -469,13 +641,24 @@ class Attention(nn.Module):
         v = v.repeat_interleave(self.n_head // self.n_local_heads, dim=1)

         if self.use_sdpa:
+            if mask is None:
+                with sdpa_kernel(SDPBackend.FLASH_ATTENTION):
+                    y = F.scaled_dot_product_attention(
+                        q,
+                        k,
+                        v,
+                        dropout_p=self.dropout if self.training else 0.0,
+                        is_causal=True,
+                        # No third party attn_mask here to use flash_attention
+                    )
+            else:
+                y = F.scaled_dot_product_attention(
+                    q,
+                    k,
+                    v,
+                    attn_mask=mask,
+                    dropout_p=self.dropout if self.training else 0.0,
+                )
         else:
             y = self.eq_scaled_dot_product_attention(
                 q,
@@ -567,29 +750,3 @@ def apply_rotary_emb(x: Tensor, freqs_cis: Tensor) -> Tensor:

         x_out2 = x_out2.flatten(3)
         return x_out2.type_as(x)
-
-
-if __name__ == "__main__":
-    args = DualARModelArgs(
-        max_seq_len=4096,
-        vocab_size=32312,
-        n_layer=12,
-        n_fast_layer=4,
-        n_head=12,
-        dim=768,
-        rope_base=10000,
-        norm_eps=1e-5,
-        codebook_size=128,
-        num_codebooks=4,
-    )
-
-    model = DualARTransformer(args)
-    model = model.cuda().bfloat16()
-    print("Total params:", sum(i.numel() for i in model.parameters()) / 1024 / 1024)
-
-    inputs = torch.randint(0, 100, (2, 5, 128)).cuda()
-    key_padding_mask = torch.zeros(2, 128).bool().cuda()
-    key_padding_mask[0, 2:] = True
-    x1 = model(inputs, key_padding_mask=key_padding_mask)
-    print(x1.token_logits.shape)
-    print(x1.codebook_logits.shape)
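For orientation, a minimal sketch of how the new checkpoint helpers above would be called. The checkpoint directory name and the LoRA hyperparameters are illustrative assumptions, not values this diff prescribes; the only API used is what the diff itself adds (`from_pretrained`, `save_pretrained`, `LoraConfig`).

```python
# Hypothetical usage of BaseTransformer.from_pretrained / save_pretrained.
from fish_speech.models.text2semantic.llama import BaseTransformer
from fish_speech.models.text2semantic.lora import LoraConfig

model = BaseTransformer.from_pretrained(
    "checkpoints/fish-speech-1.2",               # assumed dir with config.json / model.pth / tokenizer
    load_weights=True,
    lora_config=LoraConfig(r=8, lora_alpha=16),  # example values
)

# ... fine-tune the LoRA parameters here ...

# drop_lora=True strips the lora_* tensors so only base weights are written out.
model.save_pretrained("results/my-finetune", drop_lora=True)
```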
fish_speech/models/text2semantic/lora.py
ADDED
@@ -0,0 +1,92 @@
from dataclasses import dataclass

import loralib as lora


@dataclass
class LoraConfig:
    r: int
    lora_alpha: float
    lora_dropout: float = 0.0


def setup_lora(model, lora_config):
    # Replace the embedding layer with a LoRA layer
    model.embeddings = lora.Embedding(
        num_embeddings=model.embeddings.num_embeddings,
        embedding_dim=model.embeddings.embedding_dim,
        padding_idx=model.embeddings.padding_idx,
        r=lora_config.r,
        lora_alpha=lora_config.lora_alpha,
    )

    model.codebook_embeddings = lora.Embedding(
        num_embeddings=model.codebook_embeddings.num_embeddings,
        embedding_dim=model.codebook_embeddings.embedding_dim,
        padding_idx=model.codebook_embeddings.padding_idx,
        r=lora_config.r,
        lora_alpha=lora_config.lora_alpha,
    )

    # Replace output layer with a LoRA layer
    linears = [(model, "output")]

    # Replace all linear layers with LoRA layers
    for layer in model.layers:
        linears.extend([(layer.attention, "wqkv"), (layer.attention, "wo")])
        linears.extend(
            [
                (layer.feed_forward, "w1"),
                (layer.feed_forward, "w2"),
                (layer.feed_forward, "w3"),
            ]
        )

    if hasattr(model, "fast_layers"):
        model.fast_embeddings = lora.Embedding(
            num_embeddings=model.fast_embeddings.num_embeddings,
            embedding_dim=model.fast_embeddings.embedding_dim,
            padding_idx=model.fast_embeddings.padding_idx,
            r=lora_config.r,
            lora_alpha=lora_config.lora_alpha,
        )

        # Dual-AR model
        linears.append((model, "fast_output"))

        for layer in model.fast_layers:
            linears.extend([(layer.attention, "wqkv"), (layer.attention, "wo")])
            linears.extend(
                [
                    (layer.feed_forward, "w1"),
                    (layer.feed_forward, "w2"),
                    (layer.feed_forward, "w3"),
                ]
            )

    for module, layer in linears:
        updated_linear = lora.Linear(
            in_features=getattr(module, layer).in_features,
            out_features=getattr(module, layer).out_features,
            bias=getattr(module, layer).bias,
            r=lora_config.r,
            lora_alpha=lora_config.lora_alpha,
            lora_dropout=lora_config.lora_dropout,
        )
        setattr(module, layer, updated_linear)

    # Mark only the LoRA layers as trainable
    lora.mark_only_lora_as_trainable(model, bias="none")


def get_merged_state_dict(model):
    # This line will merge the state dict of the model and the LoRA parameters
    model.eval()

    # Then we need to remove the LoRA parameters from the state dict
    state_dict = model.state_dict()
    for name in list(state_dict.keys()):
        if "lora" in name:
            state_dict.pop(name)

    return state_dict
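A hedged end-to-end sketch of the LoRA flow this file enables: wrap a loaded transformer with adapters, train only those adapters, then export a merged, LoRA-free state dict. The checkpoint path and hyperparameters are example values; `loralib` merging on `eval()` is relied on here, as the comment in `get_merged_state_dict` above describes.

```python
# Hypothetical fine-tuning flow around setup_lora / get_merged_state_dict.
import torch

from fish_speech.models.text2semantic.llama import BaseTransformer
from fish_speech.models.text2semantic.lora import (
    LoraConfig,
    get_merged_state_dict,
    setup_lora,
)

model = BaseTransformer.from_pretrained(
    "checkpoints/fish-speech-1.2", load_weights=True  # assumed checkpoint dir
)

lora_config = LoraConfig(r=8, lora_alpha=16, lora_dropout=0.01)  # example values
setup_lora(model, lora_config)  # only the lora_A / lora_B tensors stay trainable

# ... training loop updates just the LoRA parameters ...

# loralib folds the LoRA deltas into the base weights when the model enters
# eval(); the helper then drops the now-redundant lora_* entries so the result
# loads into a plain (non-LoRA) model.
merged = get_merged_state_dict(model)
torch.save(merged, "model.pth")
```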
fish_speech/models/vqgan/modules/firefly.py
CHANGED
@@ -1,5 +1,6 @@
 # A inference only version of the FireflyGAN model

+import math
 from functools import partial
 from math import prod
 from typing import Callable
@@ -13,6 +14,8 @@ from torch.nn.utils.parametrizations import weight_norm
 from torch.nn.utils.parametrize import remove_parametrizations
 from torch.utils.checkpoint import checkpoint

+from fish_speech.models.vqgan.utils import sequence_mask
+

 def init_weights(m, mean=0.0, std=0.01):
     classname = m.__class__.__name__
@@ -474,6 +477,89 @@ class ConvNeXtEncoder(nn.Module):
         return self.norm(x)


+class FireflyArchitecture(nn.Module):
+    def __init__(
+        self,
+        backbone: nn.Module,
+        head: nn.Module,
+        quantizer: nn.Module,
+        spec_transform: nn.Module,
+    ):
+        super().__init__()
+
+        self.backbone = backbone
+        self.head = head
+        self.quantizer = quantizer
+        self.spec_transform = spec_transform
+
+    def forward(self, x: torch.Tensor, template=None, mask=None) -> torch.Tensor:
+        if self.spec_transform is not None:
+            x = self.spec_transform(x)
+
+        x = self.backbone(x)
+        if mask is not None:
+            x = x * mask
+
+        if self.quantizer is not None:
+            vq_result = self.quantizer(x)
+            x = vq_result.z
+
+            if mask is not None:
+                x = x * mask
+
+        x = self.head(x, template=template)
+
+        if x.ndim == 2:
+            x = x[:, None, :]
+
+        if self.vq is not None:
+            return x, vq_result
+
+        return x
+
+    def encode(self, audios, audio_lengths):
+        audios = audios.float()
+
+        mels = self.spec_transform(audios)
+        mel_lengths = audio_lengths // self.spec_transform.hop_length
+        mel_masks = sequence_mask(mel_lengths, mels.shape[2])
+        mel_masks_float_conv = mel_masks[:, None, :].float()
+        mels = mels * mel_masks_float_conv
+
+        # Encode
+        encoded_features = self.backbone(mels) * mel_masks_float_conv
+        feature_lengths = mel_lengths // math.prod(self.quantizer.downsample_factor)
+
+        return self.quantizer.encode(encoded_features), feature_lengths
+
+    def decode(self, indices, feature_lengths) -> torch.Tensor:
+        factor = math.prod(self.quantizer.downsample_factor)
+        mel_masks = sequence_mask(feature_lengths * factor, indices.shape[2] * factor)
+        mel_masks_float_conv = mel_masks[:, None, :].float()
+
+        audio_masks = sequence_mask(
+            feature_lengths * factor * self.spec_transform.hop_length,
+            indices.shape[2] * factor * self.spec_transform.hop_length,
+        )
+        audio_masks_float_conv = audio_masks[:, None, :].float()
+
+        z = self.quantizer.decode(indices) * mel_masks_float_conv
+        x = self.head(z) * audio_masks_float_conv
+
+        return x
+
+    def remove_parametrizations(self):
+        if hasattr(self.backbone, "remove_parametrizations"):
+            self.backbone.remove_parametrizations()
+
+        if hasattr(self.head, "remove_parametrizations"):
+            self.head.remove_parametrizations()
+
+    @property
+    def device(self):
+        return next(self.parameters()).device
+
+
 class FireflyBase(nn.Module):
     def __init__(self, ckpt_path: str = None, pretrained: bool = True):
         super().__init__()
@@ -500,11 +586,12 @@ class FireflyBase(nn.Module):
         )

         if ckpt_path is not None:
+            state_dict = torch.load(ckpt_path, map_location="cpu")
         elif pretrained:
             state_dict = torch.hub.load_state_dict_from_url(
                 "https://github.com/fishaudio/vocoder/releases/download/1.0.0/firefly-gan-base-generator.ckpt",
                 map_location="cpu",
+                model_dir="checkpoints",
             )

         if "state_dict" in state_dict:
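As a quick check on the `encode`/`decode` pair above, here is the length bookkeeping they perform, worked through with example numbers. The hop length and downsample factors below are illustrative assumptions, not values read from this commit's configs.

```python
# Length bookkeeping sketch for FireflyArchitecture.encode/decode above.
# hop_length and downsample_factor are example values only.
hop_length = 512
downsample_factor = (2, 2)           # prod = 4

audio_len = 48_000                   # samples
mel_len = audio_len // hop_length    # frames seen by the backbone -> 93
feature_len = mel_len // (downsample_factor[0] * downsample_factor[1])  # codes -> 23

# decode() walks the same chain backwards to size its masks:
recon_mel_len = feature_len * 4                  # 92 mel frames
recon_audio_len = recon_mel_len * hop_length     # 47_104 samples
```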
fish_speech/models/vqgan/modules/fsq.py
CHANGED
@@ -20,7 +20,7 @@ class DownsampleFiniteScalarQuantize(nn.Module):
     def __init__(
         self,
         input_dim: int = 512,
-        n_codebooks: int =
+        n_codebooks: int = 1,
         n_groups: int = 1,
         levels: tuple[int] = (8, 5, 5, 5),  # Approximate 2**10
         downsample_factor: tuple[int] = (2, 2),
fish_speech/text/__init__.py
CHANGED
@@ -1,3 +1,4 @@
 from .clean import clean_text
+from .spliter import split_text

-__all__ = ["clean_text"]
+__all__ = ["clean_text", "split_text"]
fish_speech/text/chn_text_norm/.gitignore
ADDED
@@ -0,0 +1,114 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/

# JetBrains PyCharm
.idea

# Customize
references
url.txt

# Git
.git
fish_speech/text/chn_text_norm/README.md
ADDED
@@ -0,0 +1,36 @@
# This account is no longer in use, see [Atomicoo](https://github.com/atomicoo) for my latest works.

# Chn Text Norm

This is a repository for Chinese text normalization (no longer maintained).

## Quick Start ##

### Git Clone Repo ###

git clone this repo into the root directory of the project that needs to use it.

    cd /path/to/proj
    git clone https://github.com/Joee1995/chn-text-norm.git

After that, your doc tree should be:
```
proj                       # root of your project
|--- chn_text_norm        # this chn-text-norm tool
     |--- text.py
     |--- ...
|--- text_normalize.py    # your text normalization code
|--- ...
```

### How to Use ? ###

    # text_normalize.py
    from chn_text_norm.text import *

    raw_text = 'your raw text'
    text = Text(raw_text=raw_text).normalize()

### How to add quantums ###

    Open test.py and you will see how it is done.
fish_speech/text/chn_text_norm/__init__.py
ADDED
File without changes
fish_speech/text/chn_text_norm/basic_class.py
ADDED
@@ -0,0 +1,172 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
"""基本类
|
3 |
+
中文字符类
|
4 |
+
中文数字/数位类
|
5 |
+
中文数字类
|
6 |
+
中文数位类
|
7 |
+
中文数字系统类
|
8 |
+
中文数学符号类
|
9 |
+
*中文其他符号类
|
10 |
+
"""
|
11 |
+
|
12 |
+
__author__ = "Zhiyang Zhou <zyzhou@stu.xmu.edu.cn>"
|
13 |
+
__data__ = "2019-05-02"
|
14 |
+
|
15 |
+
from fish_speech.text.chn_text_norm.basic_constant import NUMBERING_TYPES
|
16 |
+
|
17 |
+
|
18 |
+
class ChineseChar(object):
|
19 |
+
"""
|
20 |
+
中文字符
|
21 |
+
每个字符对应简体和繁体,
|
22 |
+
e.g. 简体 = '负', 繁体 = '負'
|
23 |
+
转换时可转换为简体或繁体
|
24 |
+
"""
|
25 |
+
|
26 |
+
def __init__(self, simplified, traditional):
|
27 |
+
self.simplified = simplified
|
28 |
+
self.traditional = traditional
|
29 |
+
self.__repr__ = self.__str__
|
30 |
+
|
31 |
+
def __str__(self):
|
32 |
+
return self.simplified or self.traditional or None
|
33 |
+
|
34 |
+
def __repr__(self):
|
35 |
+
return self.__str__()
|
36 |
+
|
37 |
+
|
38 |
+
class ChineseNumberUnit(ChineseChar):
|
39 |
+
"""
|
40 |
+
中文数字/数位字符
|
41 |
+
每个字符除繁简体外还有一个额外的大写字符
|
42 |
+
e.g. '陆' 和 '陸'
|
43 |
+
"""
|
44 |
+
|
45 |
+
def __init__(self, power, simplified, traditional, big_s, big_t):
|
46 |
+
super(ChineseNumberUnit, self).__init__(simplified, traditional)
|
47 |
+
self.power = power
|
48 |
+
self.big_s = big_s
|
49 |
+
self.big_t = big_t
|
50 |
+
|
51 |
+
def __str__(self):
|
52 |
+
return "10^{}".format(self.power)
|
53 |
+
|
54 |
+
@classmethod
|
55 |
+
def create(cls, index, value, numbering_type=NUMBERING_TYPES[1], small_unit=False):
|
56 |
+
|
57 |
+
if small_unit:
|
58 |
+
return ChineseNumberUnit(
|
59 |
+
power=index + 1,
|
60 |
+
simplified=value[0],
|
61 |
+
traditional=value[1],
|
62 |
+
big_s=value[1],
|
63 |
+
big_t=value[1],
|
64 |
+
)
|
65 |
+
elif numbering_type == NUMBERING_TYPES[0]:
|
66 |
+
return ChineseNumberUnit(
|
67 |
+
power=index + 8,
|
68 |
+
simplified=value[0],
|
69 |
+
traditional=value[1],
|
70 |
+
big_s=value[0],
|
71 |
+
big_t=value[1],
|
72 |
+
)
|
73 |
+
elif numbering_type == NUMBERING_TYPES[1]:
|
74 |
+
return ChineseNumberUnit(
|
75 |
+
power=(index + 2) * 4,
|
76 |
+
simplified=value[0],
|
77 |
+
traditional=value[1],
|
78 |
+
big_s=value[0],
|
79 |
+
big_t=value[1],
|
80 |
+
)
|
81 |
+
elif numbering_type == NUMBERING_TYPES[2]:
|
82 |
+
return ChineseNumberUnit(
|
83 |
+
power=pow(2, index + 3),
|
84 |
+
simplified=value[0],
|
85 |
+
traditional=value[1],
|
86 |
+
big_s=value[0],
|
87 |
+
big_t=value[1],
|
88 |
+
)
|
89 |
+
else:
|
90 |
+
raise ValueError(
|
91 |
+
"Counting type should be in {0} ({1} provided).".format(
|
92 |
+
NUMBERING_TYPES, numbering_type
|
93 |
+
)
|
94 |
+
)
|
95 |
+
|
96 |
+
|
97 |
+
class ChineseNumberDigit(ChineseChar):
|
98 |
+
"""
|
99 |
+
中文数字字符
|
100 |
+
"""
|
101 |
+
|
102 |
+
def __init__(
|
103 |
+
self, value, simplified, traditional, big_s, big_t, alt_s=None, alt_t=None
|
104 |
+
):
|
105 |
+
super(ChineseNumberDigit, self).__init__(simplified, traditional)
|
106 |
+
self.value = value
|
107 |
+
self.big_s = big_s
|
108 |
+
self.big_t = big_t
|
109 |
+
self.alt_s = alt_s
|
110 |
+
self.alt_t = alt_t
|
111 |
+
|
112 |
+
def __str__(self):
|
113 |
+
return str(self.value)
|
114 |
+
|
115 |
+
@classmethod
|
116 |
+
def create(cls, i, v):
|
117 |
+
return ChineseNumberDigit(i, v[0], v[1], v[2], v[3])
|
118 |
+
|
119 |
+
|
120 |
+
class ChineseMath(ChineseChar):
|
121 |
+
"""
|
122 |
+
中文数位字符
|
123 |
+
"""
|
124 |
+
|
125 |
+
def __init__(self, simplified, traditional, symbol, expression=None):
|
126 |
+
super(ChineseMath, self).__init__(simplified, traditional)
|
127 |
+
self.symbol = symbol
|
128 |
+
self.expression = expression
|
129 |
+
self.big_s = simplified
|
130 |
+
self.big_t = traditional
|
131 |
+
|
132 |
+
|
133 |
+
CC, CNU, CND, CM = ChineseChar, ChineseNumberUnit, ChineseNumberDigit, ChineseMath
|
134 |
+
|
135 |
+
|
136 |
+
class NumberSystem(object):
|
137 |
+
"""
|
138 |
+
中文数字系统
|
139 |
+
"""
|
140 |
+
|
141 |
+
pass
|
142 |
+
|
143 |
+
|
144 |
+
class MathSymbol(object):
|
145 |
+
"""
|
146 |
+
用于中文数字系统的数学符号 (繁/简体), e.g.
|
147 |
+
positive = ['正', '正']
|
148 |
+
negative = ['负', '負']
|
149 |
+
point = ['点', '點']
|
150 |
+
"""
|
151 |
+
|
152 |
+
def __init__(self, positive, negative, point):
|
153 |
+
self.positive = positive
|
154 |
+
self.negative = negative
|
155 |
+
self.point = point
|
156 |
+
|
157 |
+
def __iter__(self):
|
158 |
+
for v in self.__dict__.values():
|
159 |
+
yield v
|
160 |
+
|
161 |
+
|
162 |
+
# class OtherSymbol(object):
|
163 |
+
# """
|
164 |
+
# 其他符号
|
165 |
+
# """
|
166 |
+
#
|
167 |
+
# def __init__(self, sil):
|
168 |
+
# self.sil = sil
|
169 |
+
#
|
170 |
+
# def __iter__(self):
|
171 |
+
# for v in self.__dict__.values():
|
172 |
+
# yield v
|
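A quick sketch (editor's note, not part of the commit) of how `ChineseNumberUnit.create` above assigns a power of ten to each unit character under the three numbering types; the arithmetic below mirrors the branches in `create`.

```python
# Illustration of the power assignment in ChineseNumberUnit.create.
# Larger units 亿, 兆, 京, ... take index 0, 1, 2, ...
for index in range(4):
    low = index + 8           # 'low':  亿=10^8,  兆=10^9,  京=10^10, ...
    mid = (index + 2) * 4     # 'mid':  亿=10^8,  兆=10^12, 京=10^16, ...
    high = pow(2, index + 3)  # 'high': 亿=10^8,  兆=10^16, 京=10^32, ...
    print(index, low, mid, high)

# Small units 十, 百, 千, 万 always get power = index + 1, i.e. 10^1 .. 10^4.
```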
fish_speech/text/chn_text_norm/basic_constant.py
ADDED
@@ -0,0 +1,30 @@
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
"""基本常量
|
3 |
+
中文数字/数位/符号字符常量
|
4 |
+
"""
|
5 |
+
|
6 |
+
__author__ = "Zhiyang Zhou <zyzhou@stu.xmu.edu.cn>"
|
7 |
+
__data__ = "2019-05-02"
|
8 |
+
|
9 |
+
CHINESE_DIGIS = "零一二三四五六七八九"
|
10 |
+
BIG_CHINESE_DIGIS_SIMPLIFIED = "零壹贰叁肆伍陆柒捌玖"
|
11 |
+
BIG_CHINESE_DIGIS_TRADITIONAL = "零壹貳參肆伍陸柒捌玖"
|
12 |
+
SMALLER_BIG_CHINESE_UNITS_SIMPLIFIED = "十百千万"
|
13 |
+
SMALLER_BIG_CHINESE_UNITS_TRADITIONAL = "拾佰仟萬"
|
14 |
+
LARGER_CHINESE_NUMERING_UNITS_SIMPLIFIED = "亿兆京垓秭穰沟涧正载"
|
15 |
+
LARGER_CHINESE_NUMERING_UNITS_TRADITIONAL = "億兆京垓秭穰溝澗正載"
|
16 |
+
SMALLER_CHINESE_NUMERING_UNITS_SIMPLIFIED = "十百千万"
|
17 |
+
SMALLER_CHINESE_NUMERING_UNITS_TRADITIONAL = "拾佰仟萬"
|
18 |
+
|
19 |
+
ZERO_ALT = "〇"
|
20 |
+
ONE_ALT = "幺"
|
21 |
+
TWO_ALTS = ["两", "兩"]
|
22 |
+
|
23 |
+
POSITIVE = ["正", "正"]
|
24 |
+
NEGATIVE = ["负", "負"]
|
25 |
+
POINT = ["点", "點"]
|
26 |
+
# PLUS = [u'加', u'加']
|
27 |
+
# SIL = [u'杠', u'槓']
|
28 |
+
|
29 |
+
# 中文数字系统类型
|
30 |
+
NUMBERING_TYPES = ["low", "mid", "high"]
|
fish_speech/text/chn_text_norm/basic_util.py
ADDED
@@ -0,0 +1,342 @@
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
"""基本方法
|
3 |
+
创建中文数字系统 方法
|
4 |
+
中文字符串 <=> 数字串 方法
|
5 |
+
数字串 <=> 中文字符串 方法
|
6 |
+
"""
|
7 |
+
|
8 |
+
__author__ = "Zhiyang Zhou <zyzhou@stu.xmu.edu.cn>"
|
9 |
+
__data__ = "2019-05-02"
|
10 |
+
|
11 |
+
from fish_speech.text.chn_text_norm.basic_class import *
|
12 |
+
from fish_speech.text.chn_text_norm.basic_constant import *
|
13 |
+
|
14 |
+
|
15 |
+
def create_system(numbering_type=NUMBERING_TYPES[1]):
|
16 |
+
"""
|
17 |
+
根据数字系统类型返回创建相应的数字系统,默认为 mid
|
18 |
+
NUMBERING_TYPES = ['low', 'mid', 'high']: 中文数字系统类型
|
19 |
+
low: '兆' = '亿' * '十' = $10^{9}$, '京' = '兆' * '十', etc.
|
20 |
+
mid: '兆' = '亿' * '万' = $10^{12}$, '京' = '兆' * '万', etc.
|
21 |
+
high: '兆' = '亿' * '亿' = $10^{16}$, '京' = '兆' * '兆', etc.
|
22 |
+
返回对应的数字系统
|
23 |
+
"""
|
24 |
+
|
25 |
+
# chinese number units of '亿' and larger
|
26 |
+
all_larger_units = zip(
|
27 |
+
LARGER_CHINESE_NUMERING_UNITS_SIMPLIFIED,
|
28 |
+
LARGER_CHINESE_NUMERING_UNITS_TRADITIONAL,
|
29 |
+
)
|
30 |
+
larger_units = [
|
31 |
+
CNU.create(i, v, numbering_type, False) for i, v in enumerate(all_larger_units)
|
32 |
+
]
|
33 |
+
# chinese number units of '十, 百, 千, 万'
|
34 |
+
all_smaller_units = zip(
|
35 |
+
SMALLER_CHINESE_NUMERING_UNITS_SIMPLIFIED,
|
36 |
+
SMALLER_CHINESE_NUMERING_UNITS_TRADITIONAL,
|
37 |
+
)
|
38 |
+
smaller_units = [
|
39 |
+
CNU.create(i, v, small_unit=True) for i, v in enumerate(all_smaller_units)
|
40 |
+
]
|
41 |
+
# digis
|
42 |
+
chinese_digis = zip(
|
43 |
+
CHINESE_DIGIS,
|
44 |
+
CHINESE_DIGIS,
|
45 |
+
BIG_CHINESE_DIGIS_SIMPLIFIED,
|
46 |
+
BIG_CHINESE_DIGIS_TRADITIONAL,
|
47 |
+
)
|
48 |
+
digits = [CND.create(i, v) for i, v in enumerate(chinese_digis)]
|
49 |
+
digits[0].alt_s, digits[0].alt_t = ZERO_ALT, ZERO_ALT
|
50 |
+
digits[1].alt_s, digits[1].alt_t = ONE_ALT, ONE_ALT
|
51 |
+
digits[2].alt_s, digits[2].alt_t = TWO_ALTS[0], TWO_ALTS[1]
|
52 |
+
|
53 |
+
# symbols
|
54 |
+
positive_cn = CM(POSITIVE[0], POSITIVE[1], "+", lambda x: x)
|
55 |
+
negative_cn = CM(NEGATIVE[0], NEGATIVE[1], "-", lambda x: -x)
|
56 |
+
point_cn = CM(POINT[0], POINT[1], ".", lambda x, y: float(str(x) + "." + str(y)))
|
57 |
+
# sil_cn = CM(SIL[0], SIL[1], '-', lambda x, y: float(str(x) + '-' + str(y)))
|
58 |
+
system = NumberSystem()
|
59 |
+
system.units = smaller_units + larger_units
|
60 |
+
system.digits = digits
|
61 |
+
system.math = MathSymbol(positive_cn, negative_cn, point_cn)
|
62 |
+
# system.symbols = OtherSymbol(sil_cn)
|
63 |
+
return system
|
64 |
+
|
65 |
+
|
66 |
+
def chn2num(chinese_string, numbering_type=NUMBERING_TYPES[1]):
|
67 |
+
|
68 |
+
def get_symbol(char, system):
|
69 |
+
for u in system.units:
|
70 |
+
if char in [u.traditional, u.simplified, u.big_s, u.big_t]:
|
71 |
+
return u
|
72 |
+
for d in system.digits:
|
73 |
+
if char in [
|
74 |
+
d.traditional,
|
75 |
+
d.simplified,
|
76 |
+
d.big_s,
|
77 |
+
d.big_t,
|
78 |
+
d.alt_s,
|
79 |
+
d.alt_t,
|
80 |
+
]:
|
81 |
+
return d
|
82 |
+
for m in system.math:
|
83 |
+
if char in [m.traditional, m.simplified]:
|
84 |
+
return m
|
85 |
+
|
86 |
+
def string2symbols(chinese_string, system):
|
87 |
+
int_string, dec_string = chinese_string, ""
|
88 |
+
for p in [system.math.point.simplified, system.math.point.traditional]:
|
89 |
+
if p in chinese_string:
|
90 |
+
int_string, dec_string = chinese_string.split(p)
|
91 |
+
break
|
92 |
+
return [get_symbol(c, system) for c in int_string], [
|
93 |
+
get_symbol(c, system) for c in dec_string
|
94 |
+
]
|
95 |
+
|
96 |
+
def correct_symbols(integer_symbols, system):
|
97 |
+
"""
|
98 |
+
一百八 to 一百八十
|
99 |
+
一亿一千三百万 to 一亿 一千万 三百万
|
100 |
+
"""
|
101 |
+
|
102 |
+
if integer_symbols and isinstance(integer_symbols[0], CNU):
|
103 |
+
if integer_symbols[0].power == 1:
|
104 |
+
integer_symbols = [system.digits[1]] + integer_symbols
|
105 |
+
|
106 |
+
if len(integer_symbols) > 1:
|
107 |
+
if isinstance(integer_symbols[-1], CND) and isinstance(
|
108 |
+
integer_symbols[-2], CNU
|
109 |
+
):
|
110 |
+
integer_symbols.append(
|
111 |
+
CNU(integer_symbols[-2].power - 1, None, None, None, None)
|
112 |
+
)
|
113 |
+
|
114 |
+
result = []
|
115 |
+
unit_count = 0
|
116 |
+
for s in integer_symbols:
|
117 |
+
if isinstance(s, CND):
|
118 |
+
result.append(s)
|
119 |
+
unit_count = 0
|
120 |
+
elif isinstance(s, CNU):
|
121 |
+
current_unit = CNU(s.power, None, None, None, None)
|
122 |
+
unit_count += 1
|
123 |
+
|
124 |
+
if unit_count == 1:
|
125 |
+
result.append(current_unit)
|
126 |
+
elif unit_count > 1:
|
127 |
+
for i in range(len(result)):
|
128 |
+
if (
|
129 |
+
isinstance(result[-i - 1], CNU)
|
130 |
+
and result[-i - 1].power < current_unit.power
|
131 |
+
):
|
132 |
+
result[-i - 1] = CNU(
|
133 |
+
result[-i - 1].power + current_unit.power,
|
134 |
+
None,
|
135 |
+
None,
|
136 |
+
None,
|
137 |
+
None,
|
138 |
+
)
|
139 |
+
return result
|
140 |
+
|
141 |
+
def compute_value(integer_symbols):
|
142 |
+
"""
|
143 |
+
Compute the value.
|
144 |
+
When current unit is larger than previous unit, current unit * all previous units will be used as all previous units.
|
145 |
+
e.g. '两千万' = 2000 * 10000 not 2000 + 10000
|
146 |
+
"""
|
147 |
+
value = [0]
|
148 |
+
last_power = 0
|
149 |
+
for s in integer_symbols:
|
150 |
+
if isinstance(s, CND):
|
151 |
+
value[-1] = s.value
|
152 |
+
elif isinstance(s, CNU):
|
153 |
+
value[-1] *= pow(10, s.power)
|
154 |
+
if s.power > last_power:
|
155 |
+
value[:-1] = list(map(lambda v: v * pow(10, s.power), value[:-1]))
|
156 |
+
last_power = s.power
|
157 |
+
value.append(0)
|
158 |
+
return sum(value)
|
159 |
+
|
160 |
+
system = create_system(numbering_type)
|
161 |
+
int_part, dec_part = string2symbols(chinese_string, system)
|
162 |
+
int_part = correct_symbols(int_part, system)
|
163 |
+
int_str = str(compute_value(int_part))
|
164 |
+
dec_str = "".join([str(d.value) for d in dec_part])
|
165 |
+
if dec_part:
|
166 |
+
return "{0}.{1}".format(int_str, dec_str)
|
167 |
+
else:
|
168 |
+
return int_str
|
169 |
+
|
170 |
+
|
171 |
+
def num2chn(
|
172 |
+
number_string,
|
173 |
+
numbering_type=NUMBERING_TYPES[1],
|
174 |
+
big=False,
|
175 |
+
traditional=False,
|
176 |
+
alt_zero=False,
|
177 |
+
alt_one=False,
|
178 |
+
alt_two=True,
|
179 |
+
use_zeros=True,
|
180 |
+
use_units=True,
|
181 |
+
):
|
182 |
+
|
183 |
+
def get_value(value_string, use_zeros=True):
|
184 |
+
|
185 |
+
striped_string = value_string.lstrip("0")
|
186 |
+
|
187 |
+
# record nothing if all zeros
|
188 |
+
if not striped_string:
|
189 |
+
return []
|
190 |
+
|
191 |
+
# record one digits
|
192 |
+
elif len(striped_string) == 1:
|
193 |
+
if use_zeros and len(value_string) != len(striped_string):
|
194 |
+
return [system.digits[0], system.digits[int(striped_string)]]
|
195 |
+
else:
|
196 |
+
return [system.digits[int(striped_string)]]
|
197 |
+
|
198 |
+
# recursively record multiple digits
|
199 |
+
else:
|
200 |
+
result_unit = next(
|
201 |
+
u for u in reversed(system.units) if u.power < len(striped_string)
|
202 |
+
)
|
203 |
+
result_string = value_string[: -result_unit.power]
|
204 |
+
return (
|
205 |
+
get_value(result_string)
|
206 |
+
+ [result_unit]
|
207 |
+
+ get_value(striped_string[-result_unit.power :])
|
208 |
+
)
|
209 |
+
|
210 |
+
system = create_system(numbering_type)
|
211 |
+
|
212 |
+
int_dec = number_string.split(".")
|
213 |
+
if len(int_dec) == 1:
|
214 |
+
int_string = int_dec[0]
|
215 |
+
dec_string = ""
|
216 |
+
elif len(int_dec) == 2:
|
217 |
+
int_string = int_dec[0]
|
218 |
+
dec_string = int_dec[1]
|
219 |
+
else:
|
220 |
+
raise ValueError(
|
221 |
+
"invalid input num string with more than one dot: {}".format(number_string)
|
222 |
+
)
|
223 |
+
|
224 |
+
if use_units and len(int_string) > 1:
|
225 |
+
result_symbols = get_value(int_string)
|
226 |
+
else:
|
227 |
+
result_symbols = [system.digits[int(c)] for c in int_string]
|
228 |
+
dec_symbols = [system.digits[int(c)] for c in dec_string]
|
229 |
+
if dec_string:
|
230 |
+
result_symbols += [system.math.point] + dec_symbols
|
231 |
+
|
232 |
+
if alt_two:
|
233 |
+
liang = CND(
|
234 |
+
2,
|
235 |
+
system.digits[2].alt_s,
|
236 |
+
system.digits[2].alt_t,
|
237 |
+
system.digits[2].big_s,
|
238 |
+
system.digits[2].big_t,
|
239 |
+
)
|
240 |
+
for i, v in enumerate(result_symbols):
|
241 |
+
if isinstance(v, CND) and v.value == 2:
|
242 |
+
next_symbol = (
|
243 |
+
result_symbols[i + 1] if i < len(result_symbols) - 1 else None
|
244 |
+
)
|
245 |
+
previous_symbol = result_symbols[i - 1] if i > 0 else None
|
246 |
+
if isinstance(next_symbol, CNU) and isinstance(
|
247 |
+
previous_symbol, (CNU, type(None))
|
248 |
+
):
|
249 |
+
if next_symbol.power != 1 and (
|
250 |
+
(previous_symbol is None) or (previous_symbol.power != 1)
|
251 |
+
):
|
252 |
+
result_symbols[i] = liang
|
253 |
+
|
254 |
+
# if big is True, '两' will not be used and `alt_two` has no impact on output
|
255 |
+
if big:
|
256 |
+
attr_name = "big_"
|
257 |
+
if traditional:
|
258 |
+
attr_name += "t"
|
259 |
+
else:
|
260 |
+
attr_name += "s"
|
261 |
+
else:
|
262 |
+
if traditional:
|
263 |
+
attr_name = "traditional"
|
264 |
+
else:
|
265 |
+
attr_name = "simplified"
|
266 |
+
|
267 |
+
result = "".join([getattr(s, attr_name) for s in result_symbols])
|
268 |
+
|
269 |
+
# if not use_zeros:
|
270 |
+
# result = result.strip(getattr(system.digits[0], attr_name))
|
271 |
+
|
272 |
+
if alt_zero:
|
273 |
+
result = result.replace(
|
274 |
+
getattr(system.digits[0], attr_name), system.digits[0].alt_s
|
275 |
+
)
|
276 |
+
|
277 |
+
if alt_one:
|
278 |
+
result = result.replace(
|
279 |
+
getattr(system.digits[1], attr_name), system.digits[1].alt_s
|
280 |
+
)
|
281 |
+
|
282 |
+
for i, p in enumerate(POINT):
|
283 |
+
if result.startswith(p):
|
284 |
+
return CHINESE_DIGIS[0] + result
|
285 |
+
|
286 |
+
# ^10, 11, .., 19
|
287 |
+
if (
|
288 |
+
len(result) >= 2
|
289 |
+
and result[1]
|
290 |
+
in [
|
291 |
+
SMALLER_CHINESE_NUMERING_UNITS_SIMPLIFIED[0],
|
292 |
+
SMALLER_CHINESE_NUMERING_UNITS_TRADITIONAL[0],
|
293 |
+
]
|
294 |
+
and result[0]
|
295 |
+
in [
|
296 |
+
CHINESE_DIGIS[1],
|
297 |
+
BIG_CHINESE_DIGIS_SIMPLIFIED[1],
|
298 |
+
BIG_CHINESE_DIGIS_TRADITIONAL[1],
|
299 |
+
]
|
300 |
+
):
|
301 |
+
result = result[1:]
|
302 |
+
|
303 |
+
return result
|
304 |
+
|
305 |
+
|
306 |
+
if __name__ == "__main__":
|
307 |
+
|
308 |
+
# 测试程序
|
309 |
+
all_chinese_number_string = (
|
310 |
+
CHINESE_DIGIS
|
311 |
+
+ BIG_CHINESE_DIGIS_SIMPLIFIED
|
312 |
+
+ BIG_CHINESE_DIGIS_TRADITIONAL
|
313 |
+
+ LARGER_CHINESE_NUMERING_UNITS_SIMPLIFIED
|
314 |
+
+ LARGER_CHINESE_NUMERING_UNITS_TRADITIONAL
|
315 |
+
+ SMALLER_CHINESE_NUMERING_UNITS_SIMPLIFIED
|
316 |
+
+ SMALLER_CHINESE_NUMERING_UNITS_TRADITIONAL
|
317 |
+
+ ZERO_ALT
|
318 |
+
+ ONE_ALT
|
319 |
+
+ "".join(TWO_ALTS + POSITIVE + NEGATIVE + POINT)
|
320 |
+
)
|
321 |
+
|
322 |
+
print("num:", chn2num("一万零四百零三点八零五"))
|
323 |
+
print("num:", chn2num("一亿六点三"))
|
324 |
+
print("num:", chn2num("一亿零六点三"))
|
325 |
+
print("num:", chn2num("两千零一亿六点三"))
|
326 |
+
# print('num:', chn2num('一零零八六'))
|
327 |
+
print("txt:", num2chn("10260.03", alt_zero=True))
|
328 |
+
print("txt:", num2chn("20037.090", numbering_type="low", traditional=True))
|
329 |
+
print("txt:", num2chn("100860001.77", numbering_type="high", big=True))
|
330 |
+
print(
|
331 |
+
"txt:",
|
332 |
+
num2chn(
|
333 |
+
"059523810880",
|
334 |
+
alt_one=True,
|
335 |
+
alt_two=False,
|
336 |
+
use_zeros=True,
|
337 |
+
# note: use_lzeros / use_rzeros are not parameters accepted by num2chn
|
338 |
+
use_units=False,
|
339 |
+
),
|
340 |
+
)
|
341 |
+
|
342 |
+
print(all_chinese_number_string)
|
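A minimal round-trip sketch (editor's example, not in the diff) of the two converters defined above; the import path assumes the package layout introduced by this commit.

```python
from fish_speech.text.chn_text_norm.basic_util import chn2num, num2chn

# Arabic numeral string -> Chinese reading (default 'mid' numbering system) -> back again.
as_chinese = num2chn("10403.805")   # e.g. 一万零四百零三点八零五
as_digits = chn2num(as_chinese)     # "10403.805"
print(as_chinese, as_digits)
```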
fish_speech/text/chn_text_norm/cardinal.py
ADDED
@@ -0,0 +1,32 @@
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
"""CARDINAL类 (包含小数DECIMAL类)
|
3 |
+
纯数 <=> 中文字符串 方法
|
4 |
+
中文字符串 <=> 纯数 方法
|
5 |
+
"""
|
6 |
+
|
7 |
+
__author__ = "Zhiyang Zhou <zyzhou@stu.xmu.edu.cn>"
|
8 |
+
__data__ = "2019-05-03"
|
9 |
+
|
10 |
+
from fish_speech.text.chn_text_norm.basic_util import *
|
11 |
+
|
12 |
+
|
13 |
+
class Cardinal:
|
14 |
+
"""
|
15 |
+
CARDINAL类
|
16 |
+
"""
|
17 |
+
|
18 |
+
def __init__(self, cardinal=None, chntext=None):
|
19 |
+
self.cardinal = cardinal
|
20 |
+
self.chntext = chntext
|
21 |
+
|
22 |
+
def chntext2cardinal(self):
|
23 |
+
return chn2num(self.chntext)
|
24 |
+
|
25 |
+
def cardinal2chntext(self):
|
26 |
+
return num2chn(self.cardinal)
|
27 |
+
|
28 |
+
|
29 |
+
if __name__ == "__main__":
|
30 |
+
|
31 |
+
# 测试程序
|
32 |
+
print(Cardinal(cardinal="21357.230").cardinal2chntext())
|
fish_speech/text/chn_text_norm/date.py
ADDED
@@ -0,0 +1,75 @@
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
"""DATE类
|
3 |
+
日期 <=> 中文字符串 方法
|
4 |
+
中文字符串 <=> 日期 方法
|
5 |
+
"""
|
6 |
+
|
7 |
+
__author__ = "Zhiyang Zhou <zyzhou@stu.xmu.edu.cn>"
|
8 |
+
__data__ = "2019-05-07"
|
9 |
+
|
10 |
+
from fish_speech.text.chn_text_norm.cardinal import Cardinal
|
11 |
+
from fish_speech.text.chn_text_norm.digit import Digit
|
12 |
+
|
13 |
+
|
14 |
+
class Date:
|
15 |
+
"""
|
16 |
+
DATE类
|
17 |
+
"""
|
18 |
+
|
19 |
+
def __init__(self, date=None, chntext=None):
|
20 |
+
self.date = date
|
21 |
+
self.chntext = chntext
|
22 |
+
|
23 |
+
# def chntext2date(self):
|
24 |
+
# chntext = self.chntext
|
25 |
+
# try:
|
26 |
+
# year, other = chntext.strip().split('年', maxsplit=1)
|
27 |
+
# year = Digit(chntext=year).digit2chntext() + '年'
|
28 |
+
# except ValueError:
|
29 |
+
# other = chntext
|
30 |
+
# year = ''
|
31 |
+
# if other:
|
32 |
+
# try:
|
33 |
+
# month, day = other.strip().split('月', maxsplit=1)
|
34 |
+
# month = Cardinal(chntext=month).chntext2cardinal() + '月'
|
35 |
+
# except ValueError:
|
36 |
+
# day = chntext
|
37 |
+
# month = ''
|
38 |
+
# if day:
|
39 |
+
# day = Cardinal(chntext=day[:-1]).chntext2cardinal() + day[-1]
|
40 |
+
# else:
|
41 |
+
# month = ''
|
42 |
+
# day = ''
|
43 |
+
# date = year + month + day
|
44 |
+
# self.date = date
|
45 |
+
# return self.date
|
46 |
+
|
47 |
+
def date2chntext(self):
|
48 |
+
date = self.date
|
49 |
+
try:
|
50 |
+
year, other = date.strip().split("年", maxsplit=1)
|
51 |
+
year = Digit(digit=year).digit2chntext() + "年"
|
52 |
+
except ValueError:
|
53 |
+
other = date
|
54 |
+
year = ""
|
55 |
+
if other:
|
56 |
+
try:
|
57 |
+
month, day = other.strip().split("月", maxsplit=1)
|
58 |
+
month = Cardinal(cardinal=month).cardinal2chntext() + "月"
|
59 |
+
except ValueError:
|
60 |
+
day = date
|
61 |
+
month = ""
|
62 |
+
if day:
|
63 |
+
day = Cardinal(cardinal=day[:-1]).cardinal2chntext() + day[-1]
|
64 |
+
else:
|
65 |
+
month = ""
|
66 |
+
day = ""
|
67 |
+
chntext = year + month + day
|
68 |
+
self.chntext = chntext
|
69 |
+
return self.chntext
|
70 |
+
|
71 |
+
|
72 |
+
if __name__ == "__main__":
|
73 |
+
|
74 |
+
# 测试
|
75 |
+
print(Date(date="09年3月16日").date2chntext())
|
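For orientation (editor's note): `date2chntext` reads the year digit by digit through `Digit`, while the month and day go through `Cardinal`, so a four-digit year is not read as a cardinal number.

```python
from fish_speech.text.chn_text_norm.date import Date

# The year becomes 二零一九 (digit by digit); month and day become cardinals.
print(Date(date="2019年5月7日").date2chntext())  # expected: 二零一九年五月七日
```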
fish_speech/text/chn_text_norm/digit.py
ADDED
@@ -0,0 +1,32 @@
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
"""DIGIT类
|
3 |
+
数字串 <=> 中文字符串 方法
|
4 |
+
中文字符串 <=> 数字串 方法
|
5 |
+
"""
|
6 |
+
|
7 |
+
__author__ = "Zhiyang Zhou <zyzhou@stu.xmu.edu.cn>"
|
8 |
+
__data__ = "2019-05-03"
|
9 |
+
|
10 |
+
from fish_speech.text.chn_text_norm.basic_util import *
|
11 |
+
|
12 |
+
|
13 |
+
class Digit:
|
14 |
+
"""
|
15 |
+
DIGIT类
|
16 |
+
"""
|
17 |
+
|
18 |
+
def __init__(self, digit=None, chntext=None):
|
19 |
+
self.digit = digit
|
20 |
+
self.chntext = chntext
|
21 |
+
|
22 |
+
# def chntext2digit(self):
|
23 |
+
# return chn2num(self.chntext)
|
24 |
+
|
25 |
+
def digit2chntext(self):
|
26 |
+
return num2chn(self.digit, alt_two=False, use_units=False)
|
27 |
+
|
28 |
+
|
29 |
+
if __name__ == "__main__":
|
30 |
+
|
31 |
+
# 测试程序
|
32 |
+
print(Digit(digit="2016").digit2chntext())
|
fish_speech/text/chn_text_norm/fraction.py
ADDED
@@ -0,0 +1,35 @@
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
"""FRACTION类
|
3 |
+
分数 <=> 中文字符串 方法
|
4 |
+
中文字符串 <=> 分数 方法
|
5 |
+
"""
|
6 |
+
|
7 |
+
__author__ = "Zhiyang Zhou <zyzhou@stu.xmu.edu.cn>"
|
8 |
+
__data__ = "2019-05-03"
|
9 |
+
|
10 |
+
from fish_speech.text.chn_text_norm.basic_util import *
|
11 |
+
|
12 |
+
|
13 |
+
class Fraction:
|
14 |
+
"""
|
15 |
+
FRACTION类
|
16 |
+
"""
|
17 |
+
|
18 |
+
def __init__(self, fraction=None, chntext=None):
|
19 |
+
self.fraction = fraction
|
20 |
+
self.chntext = chntext
|
21 |
+
|
22 |
+
def chntext2fraction(self):
|
23 |
+
denominator, numerator = self.chntext.split("分之")
|
24 |
+
return chn2num(numerator) + "/" + chn2num(denominator)
|
25 |
+
|
26 |
+
def fraction2chntext(self):
|
27 |
+
numerator, denominator = self.fraction.split("/")
|
28 |
+
return num2chn(denominator) + "分之" + num2chn(numerator)
|
29 |
+
|
30 |
+
|
31 |
+
if __name__ == "__main__":
|
32 |
+
|
33 |
+
# 测试程序
|
34 |
+
print(Fraction(fraction="2135/7230").fraction2chntext())
|
35 |
+
print(Fraction(chntext="五百八十一分之三百六十九").chntext2fraction())
|
fish_speech/text/chn_text_norm/money.py
ADDED
@@ -0,0 +1,43 @@
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
"""MONEY类
|
3 |
+
金钱 <=> 中文字符串 方法
|
4 |
+
中文字符串 <=> 金钱 方法
|
5 |
+
"""
|
6 |
+
import re
|
7 |
+
|
8 |
+
__author__ = "Zhiyang Zhou <zyzhou@stu.xmu.edu.cn>"
|
9 |
+
__data__ = "2019-05-08"
|
10 |
+
|
11 |
+
from fish_speech.text.chn_text_norm.cardinal import Cardinal
|
12 |
+
|
13 |
+
|
14 |
+
class Money:
|
15 |
+
"""
|
16 |
+
MONEY类
|
17 |
+
"""
|
18 |
+
|
19 |
+
def __init__(self, money=None, chntext=None):
|
20 |
+
self.money = money
|
21 |
+
self.chntext = chntext
|
22 |
+
|
23 |
+
# def chntext2money(self):
|
24 |
+
# return self.money
|
25 |
+
|
26 |
+
def money2chntext(self):
|
27 |
+
money = self.money
|
28 |
+
pattern = re.compile(r"(\d+(\.\d+)?)")
|
29 |
+
matchers = pattern.findall(money)
|
30 |
+
if matchers:
|
31 |
+
for matcher in matchers:
|
32 |
+
money = money.replace(
|
33 |
+
matcher[0], Cardinal(cardinal=matcher[0]).cardinal2chntext()
|
34 |
+
)
|
35 |
+
self.chntext = money
|
36 |
+
return self.chntext
|
37 |
+
|
38 |
+
|
39 |
+
if __name__ == "__main__":
|
40 |
+
|
41 |
+
# 测试
|
42 |
+
print(Money(money="21.5万元").money2chntext())
|
43 |
+
print(Money(money="230块5毛").money2chntext())
|
fish_speech/text/chn_text_norm/percentage.py
ADDED
@@ -0,0 +1,33 @@
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
"""PERCENTAGE类
|
3 |
+
百分数 <=> 中文字符串 方法
|
4 |
+
中文字符串 <=> 百分数 方法
|
5 |
+
"""
|
6 |
+
|
7 |
+
__author__ = "Zhiyang Zhou <zyzhou@stu.xmu.edu.cn>"
|
8 |
+
__data__ = "2019-05-06"
|
9 |
+
|
10 |
+
from fish_speech.text.chn_text_norm.basic_util import *
|
11 |
+
|
12 |
+
|
13 |
+
class Percentage:
|
14 |
+
"""
|
15 |
+
PERCENTAGE类
|
16 |
+
"""
|
17 |
+
|
18 |
+
def __init__(self, percentage=None, chntext=None):
|
19 |
+
self.percentage = percentage
|
20 |
+
self.chntext = chntext
|
21 |
+
|
22 |
+
def chntext2percentage(self):
|
23 |
+
return chn2num(self.chntext.strip().strip("百分之")) + "%"
|
24 |
+
|
25 |
+
def percentage2chntext(self):
|
26 |
+
return "百分之" + num2chn(self.percentage.strip().strip("%"))
|
27 |
+
|
28 |
+
|
29 |
+
if __name__ == "__main__":
|
30 |
+
|
31 |
+
# 测试程序
|
32 |
+
print(Percentage(chntext="百分之五十六点零三").chntext2percentage())
|
33 |
+
print(Percentage(percentage="65.3%").percentage2chntext())
|
fish_speech/text/chn_text_norm/telephone.py
ADDED
@@ -0,0 +1,51 @@
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
"""TELEPHONE类
|
3 |
+
电话号码 <=> 中文字符串 方法
|
4 |
+
中文字符串 <=> 电话号码 方法
|
5 |
+
"""
|
6 |
+
|
7 |
+
__author__ = "Zhiyang Zhou <zyzhou@stu.xmu.edu.cn>"
|
8 |
+
__data__ = "2019-05-03"
|
9 |
+
|
10 |
+
from fish_speech.text.chn_text_norm.basic_util import *
|
11 |
+
|
12 |
+
|
13 |
+
class TelePhone:
|
14 |
+
"""
|
15 |
+
TELEPHONE类
|
16 |
+
"""
|
17 |
+
|
18 |
+
def __init__(self, telephone=None, raw_chntext=None, chntext=None):
|
19 |
+
self.telephone = telephone
|
20 |
+
self.raw_chntext = raw_chntext
|
21 |
+
self.chntext = chntext
|
22 |
+
|
23 |
+
# def chntext2telephone(self):
|
24 |
+
# sil_parts = self.raw_chntext.split('<SIL>')
|
25 |
+
# self.telephone = '-'.join([
|
26 |
+
# str(chn2num(p)) for p in sil_parts
|
27 |
+
# ])
|
28 |
+
# return self.telephone
|
29 |
+
|
30 |
+
def telephone2chntext(self, fixed=False):
|
31 |
+
|
32 |
+
if fixed:
|
33 |
+
sil_parts = self.telephone.split("-")
|
34 |
+
self.raw_chntext = "<SIL>".join(
|
35 |
+
[num2chn(part, alt_two=False, use_units=False) for part in sil_parts]
|
36 |
+
)
|
37 |
+
self.chntext = self.raw_chntext.replace("<SIL>", "")
|
38 |
+
else:
|
39 |
+
sp_parts = self.telephone.strip("+").split()
|
40 |
+
self.raw_chntext = "<SP>".join(
|
41 |
+
[num2chn(part, alt_two=False, use_units=False) for part in sp_parts]
|
42 |
+
)
|
43 |
+
self.chntext = self.raw_chntext.replace("<SP>", "")
|
44 |
+
return self.chntext
|
45 |
+
|
46 |
+
|
47 |
+
if __name__ == "__main__":
|
48 |
+
|
49 |
+
# 测试程序
|
50 |
+
print(TelePhone(telephone="0595-23980880").telephone2chntext(fixed=True))
|
51 |
+
# print(TelePhone(raw_chntext='零五九五杠二三八六五零九八').chntext2telephone())
|
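A short usage sketch (editor's example): landline numbers are split on "-" when `fixed=True`, mobile numbers on spaces, and every group is then read digit by digit via `num2chn(..., use_units=False)`.

```python
from fish_speech.text.chn_text_norm.telephone import TelePhone

print(TelePhone(telephone="0595-23980880").telephone2chntext(fixed=True))
print(TelePhone(telephone="+86 139 0000 0000").telephone2chntext())
```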
fish_speech/text/chn_text_norm/text.py
ADDED
@@ -0,0 +1,177 @@
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
"""
|
3 |
+
TEXT类
|
4 |
+
"""
|
5 |
+
|
6 |
+
__author__ = "Zhiyang Zhou <zyzhou@stu.xmu.edu.cn>"
|
7 |
+
__data__ = "2019-05-03"
|
8 |
+
|
9 |
+
import re
|
10 |
+
|
11 |
+
from fish_speech.text.chn_text_norm.cardinal import Cardinal
|
12 |
+
from fish_speech.text.chn_text_norm.date import Date
|
13 |
+
from fish_speech.text.chn_text_norm.digit import Digit
|
14 |
+
from fish_speech.text.chn_text_norm.fraction import Fraction
|
15 |
+
from fish_speech.text.chn_text_norm.money import Money
|
16 |
+
from fish_speech.text.chn_text_norm.percentage import Percentage
|
17 |
+
from fish_speech.text.chn_text_norm.telephone import TelePhone
|
18 |
+
|
19 |
+
CURRENCY_NAMES = (
|
20 |
+
"(人民币|美元|日元|英镑|欧元|马克|法郎|加拿大元|澳元|港币|先令|芬兰马克|爱尔兰镑|"
|
21 |
+
"里拉|荷兰盾|埃斯库多|比塞塔|印尼盾|林吉特|新西兰元|比索|卢布|新加坡元|韩元|泰铢)"
|
22 |
+
)
|
23 |
+
CURRENCY_UNITS = "((亿|千万|百万|万|千|百)|(亿|千万|百万|万|千|百|)元|(亿|千万|百万|万|千|百|)块|角|毛|分)"
|
24 |
+
COM_QUANTIFIERS = (
|
25 |
+
"(匹|张|座|回|场|尾|条|个|首|阙|阵|网|炮|顶|丘|棵|只|支|袭|辆|挑|担|颗|壳|窠|曲|墙|群|腔|"
|
26 |
+
"砣|座|客|贯|扎|捆|刀|令|打|手|罗|坡|山|岭|江|溪|钟|队|单|双|对|出|口|头|脚|板|跳|枝|件|贴|"
|
27 |
+
"针|线|管|名|位|身|堂|课|本|页|家|户|层|丝|毫|厘|分|钱|两|斤|担|铢|石|钧|锱|忽|(千|毫|微)克|"
|
28 |
+
"毫|厘|分|寸|尺|丈|里|寻|常|铺|程|(千|分|厘|毫|微)米|撮|勺|合|升|斗|石|盘|碗|碟|叠|桶|笼|盆|"
|
29 |
+
"盒|杯|钟|斛|锅|簋|篮|盘|桶|罐|瓶|壶|卮|盏|箩|箱|煲|啖|袋|钵|年|月|日|季|刻|时|周|天|秒|分|旬|"
|
30 |
+
"纪|岁|世|更|夜|春|夏|秋|冬|代|伏|辈|丸|泡|粒|颗|幢|堆|条|根|支|道|面|片|张|颗|块|人|抽)"
|
31 |
+
)
|
32 |
+
|
33 |
+
|
34 |
+
class Text:
|
35 |
+
"""
|
36 |
+
Text类
|
37 |
+
"""
|
38 |
+
|
39 |
+
def __init__(self, raw_text, norm_text=None):
|
40 |
+
self.raw_text = "^" + raw_text + "$"
|
41 |
+
self.norm_text = norm_text
|
42 |
+
|
43 |
+
def _particular(self):
|
44 |
+
text = self.norm_text
|
45 |
+
pattern = re.compile(r"(([a-zA-Z]+)二([a-zA-Z]+))")
|
46 |
+
matchers = pattern.findall(text)
|
47 |
+
if matchers:
|
48 |
+
# print('particular')
|
49 |
+
for matcher in matchers:
|
50 |
+
text = text.replace(matcher[0], matcher[1] + "2" + matcher[2], 1)
|
51 |
+
self.norm_text = text
|
52 |
+
return self.norm_text
|
53 |
+
|
54 |
+
def normalize(self):
|
55 |
+
text = self.raw_text
|
56 |
+
|
57 |
+
# 规范化日期
|
58 |
+
pattern = re.compile(
|
59 |
+
r"\D+((([089]\d|(19|20)\d{2})年)?(\d{1,2}月(\d{1,2}[日号])?)?)"
|
60 |
+
)
|
61 |
+
matchers = pattern.findall(text)
|
62 |
+
if matchers:
|
63 |
+
# print('date')
|
64 |
+
for matcher in matchers:
|
65 |
+
text = text.replace(matcher[0], Date(date=matcher[0]).date2chntext(), 1)
|
66 |
+
|
67 |
+
# 规范化金钱
|
68 |
+
pattern = re.compile(
|
69 |
+
r"\D+((\d+(\.\d+)?)[多余几]?"
|
70 |
+
+ CURRENCY_UNITS
|
71 |
+
+ "(\d"
|
72 |
+
+ CURRENCY_UNITS
|
73 |
+
+ "?)?)"
|
74 |
+
)
|
75 |
+
matchers = pattern.findall(text)
|
76 |
+
if matchers:
|
77 |
+
# print('money')
|
78 |
+
for matcher in matchers:
|
79 |
+
text = text.replace(
|
80 |
+
matcher[0], Money(money=matcher[0]).money2chntext(), 1
|
81 |
+
)
|
82 |
+
|
83 |
+
# 规范化固话/手机号码
|
84 |
+
# 手机
|
85 |
+
# http://www.jihaoba.com/news/show/13680
|
86 |
+
# 移动:139、138、137、136、135、134、159、158、157、150、151、152、188、187、182、183、184、178、198
|
87 |
+
# 联通:130、131、132、156、155、186、185、176
|
88 |
+
# 电信:133、153、189、180、181、177
|
89 |
+
pattern = re.compile(r"\D((\+?86 ?)?1([38]\d|5[0-35-9]|7[678]|9[89])\d{8})\D")
|
90 |
+
matchers = pattern.findall(text)
|
91 |
+
if matchers:
|
92 |
+
# print('telephone')
|
93 |
+
for matcher in matchers:
|
94 |
+
text = text.replace(
|
95 |
+
matcher[0], TelePhone(telephone=matcher[0]).telephone2chntext(), 1
|
96 |
+
)
|
97 |
+
# 固话
|
98 |
+
pattern = re.compile(r"\D((0(10|2[1-3]|[3-9]\d{2})-?)?[1-9]\d{6,7})\D")
|
99 |
+
matchers = pattern.findall(text)
|
100 |
+
if matchers:
|
101 |
+
# print('fixed telephone')
|
102 |
+
for matcher in matchers:
|
103 |
+
text = text.replace(
|
104 |
+
matcher[0],
|
105 |
+
TelePhone(telephone=matcher[0]).telephone2chntext(fixed=True),
|
106 |
+
1,
|
107 |
+
)
|
108 |
+
|
109 |
+
# 规范化分数
|
110 |
+
pattern = re.compile(r"(\d+/\d+)")
|
111 |
+
matchers = pattern.findall(text)
|
112 |
+
if matchers:
|
113 |
+
# print('fraction')
|
114 |
+
for matcher in matchers:
|
115 |
+
text = text.replace(
|
116 |
+
matcher, Fraction(fraction=matcher).fraction2chntext(), 1
|
117 |
+
)
|
118 |
+
|
119 |
+
# 规范化百分数
|
120 |
+
text = text.replace("%", "%")
|
121 |
+
pattern = re.compile(r"(\d+(\.\d+)?%)")
|
122 |
+
matchers = pattern.findall(text)
|
123 |
+
if matchers:
|
124 |
+
# print('percentage')
|
125 |
+
for matcher in matchers:
|
126 |
+
text = text.replace(
|
127 |
+
matcher[0],
|
128 |
+
Percentage(percentage=matcher[0]).percentage2chntext(),
|
129 |
+
1,
|
130 |
+
)
|
131 |
+
|
132 |
+
# 规范化纯数+量词
|
133 |
+
pattern = re.compile(r"(\d+(\.\d+)?)[多余几]?" + COM_QUANTIFIERS)
|
134 |
+
matchers = pattern.findall(text)
|
135 |
+
if matchers:
|
136 |
+
# print('cardinal+quantifier')
|
137 |
+
for matcher in matchers:
|
138 |
+
text = text.replace(
|
139 |
+
matcher[0], Cardinal(cardinal=matcher[0]).cardinal2chntext(), 1
|
140 |
+
)
|
141 |
+
|
142 |
+
# 规范化数字编号
|
143 |
+
pattern = re.compile(r"(\d{4,32})")
|
144 |
+
matchers = pattern.findall(text)
|
145 |
+
if matchers:
|
146 |
+
# print('digit')
|
147 |
+
for matcher in matchers:
|
148 |
+
text = text.replace(matcher, Digit(digit=matcher).digit2chntext(), 1)
|
149 |
+
|
150 |
+
# 规范化纯数
|
151 |
+
pattern = re.compile(r"(\d+(\.\d+)?)")
|
152 |
+
matchers = pattern.findall(text)
|
153 |
+
if matchers:
|
154 |
+
# print('cardinal')
|
155 |
+
for matcher in matchers:
|
156 |
+
text = text.replace(
|
157 |
+
matcher[0], Cardinal(cardinal=matcher[0]).cardinal2chntext(), 1
|
158 |
+
)
|
159 |
+
|
160 |
+
self.norm_text = text
|
161 |
+
self._particular()
|
162 |
+
|
163 |
+
return self.norm_text.lstrip("^").rstrip("$")
|
164 |
+
|
165 |
+
|
166 |
+
if __name__ == "__main__":
|
167 |
+
|
168 |
+
# 测试程序
|
169 |
+
print(Text(raw_text="固话:0595-23865596或23880880。").normalize())
|
170 |
+
print(Text(raw_text="手机:+86 19859213959或15659451527。").normalize())
|
171 |
+
print(Text(raw_text="分数:32477/76391。").normalize())
|
172 |
+
print(Text(raw_text="百分数:80.03%。").normalize())
|
173 |
+
print(Text(raw_text="编号:31520181154418。").normalize())
|
174 |
+
print(Text(raw_text="纯数:2983.07克或12345.60米。").normalize())
|
175 |
+
print(Text(raw_text="日期:1999年2月20日或09年3月15号。").normalize())
|
176 |
+
print(Text(raw_text="金钱:12块5,34.5元,20.1万").normalize())
|
177 |
+
print(Text(raw_text="特殊:O2O或B2C。").normalize())
|
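Editor's note (not part of the commit): `normalize` applies the rules in a fixed order (dates, money, mobile and landline numbers, fractions, percentages, number plus quantifier, long digit strings, and finally any remaining cardinals), and the `^`/`$` sentinels added in `__init__` give the `\D`-anchored patterns something to match at the string boundaries. A quick sanity check along the lines of the tests above:

```python
from fish_speech.text.chn_text_norm.text import Text

# Mixes a price, a landline number and a percentage in one sentence.
print(Text(raw_text="票价:35.5元,电话0595-23865596,上涨20%。").normalize())
```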
fish_speech/text/clean.py
CHANGED
@@ -18,7 +18,6 @@ SYMBOLS_MAPPING = {
|
|
18 |
"·": ",",
|
19 |
"、": ",",
|
20 |
"...": "…",
|
21 |
-
"$": ".",
|
22 |
"“": "'",
|
23 |
"”": "'",
|
24 |
"‘": "'",
|
@@ -62,12 +61,9 @@ REMOVE_UNKNOWN_SYMBOL_REGEX = re.compile(
|
|
62 |
def clean_text(text):
|
63 |
# Clean the text
|
64 |
text = text.strip()
|
65 |
-
|
66 |
-
text = re.sub(r"<p:(.*?)>", r"<PPP\1PPP>", text)
|
67 |
# Replace all chinese symbols with their english counterparts
|
68 |
text = REPLACE_SYMBOL_REGEX.sub(lambda x: SYMBOLS_MAPPING[x.group()], text)
|
69 |
text = REMOVE_UNKNOWN_SYMBOL_REGEX.sub("", text)
|
70 |
-
# Replace <PPP(.*?)PPP> with <p:(.*?)>
|
71 |
-
text = re.sub(r"<PPP(.*?)PPP>", r"<p:\1>", text)
|
72 |
|
73 |
return text
|
|
|
18 |
"·": ",",
|
19 |
"、": ",",
|
20 |
"...": "…",
|
|
|
21 |
"“": "'",
|
22 |
"”": "'",
|
23 |
"‘": "'",
|
|
|
61 |
def clean_text(text):
|
62 |
# Clean the text
|
63 |
text = text.strip()
|
64 |
+
|
|
|
65 |
# Replace all chinese symbols with their english counterparts
|
66 |
text = REPLACE_SYMBOL_REGEX.sub(lambda x: SYMBOLS_MAPPING[x.group()], text)
|
67 |
text = REMOVE_UNKNOWN_SYMBOL_REGEX.sub("", text)
|
|
|
|
|
68 |
|
69 |
return text
|
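The hunk above drops the `$` mapping and the `<p:...>` protection round-trip, so `clean_text` is now a plain strip-and-substitute pass. A small check (editor's sketch; the full symbol table lives in `SYMBOLS_MAPPING` above this hunk):

```python
from fish_speech.text.clean import clean_text

# Chinese punctuation is mapped to its ASCII counterpart; symbols outside the
# allowed set are removed by REMOVE_UNKNOWN_SYMBOL_REGEX.
print(clean_text(" 你好,世界。 "))
```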
fish_speech/text/spliter.py
ADDED
@@ -0,0 +1,130 @@
1 |
+
import re
|
2 |
+
import string
|
3 |
+
|
4 |
+
from fish_speech.text.clean import clean_text
|
5 |
+
|
6 |
+
|
7 |
+
def utf_8_len(text):
|
8 |
+
return len(text.encode("utf-8"))
|
9 |
+
|
10 |
+
|
11 |
+
def break_text(texts, length, splits: set):
|
12 |
+
for text in texts:
|
13 |
+
if utf_8_len(text) <= length:
|
14 |
+
yield text
|
15 |
+
continue
|
16 |
+
|
17 |
+
curr = ""
|
18 |
+
for char in text:
|
19 |
+
curr += char
|
20 |
+
|
21 |
+
if char in splits:
|
22 |
+
yield curr
|
23 |
+
curr = ""
|
24 |
+
|
25 |
+
if curr:
|
26 |
+
yield curr
|
27 |
+
|
28 |
+
|
29 |
+
def break_text_by_length(texts, length):
|
30 |
+
for text in texts:
|
31 |
+
if utf_8_len(text) <= length:
|
32 |
+
yield text
|
33 |
+
continue
|
34 |
+
|
35 |
+
curr = ""
|
36 |
+
for char in text:
|
37 |
+
curr += char
|
38 |
+
|
39 |
+
if utf_8_len(curr) >= length:
|
40 |
+
yield curr
|
41 |
+
curr = ""
|
42 |
+
|
43 |
+
if curr:
|
44 |
+
yield curr
|
45 |
+
|
46 |
+
|
47 |
+
def add_cleaned(curr, segments):
|
48 |
+
curr = curr.strip()
|
49 |
+
if curr and not all(c.isspace() or c in string.punctuation for c in curr):
|
50 |
+
segments.append(curr)
|
51 |
+
|
52 |
+
|
53 |
+
def protect_float(text):
|
54 |
+
# Turns 3.14 into <3_f_14> to prevent splitting
|
55 |
+
return re.sub(r"(\d+)\.(\d+)", r"<\1_f_\2>", text)
|
56 |
+
|
57 |
+
|
58 |
+
def unprotect_float(text):
|
59 |
+
# Turns <3_f_14> into 3.14
|
60 |
+
return re.sub(r"<(\d+)_f_(\d+)>", r"\1.\2", text)
|
61 |
+
|
62 |
+
|
63 |
+
def split_text(text, length):
|
64 |
+
text = clean_text(text)
|
65 |
+
|
66 |
+
# Break the text into pieces with following rules:
|
67 |
+
# 1. Split the text at ".", "!", "?" if text is NOT a float
|
68 |
+
# 2. If the text is longer than length, split at ","
|
69 |
+
# 3. If the text is still longer than length, split at " "
|
70 |
+
# 4. If the text is still longer than length, split at any character to length
|
71 |
+
|
72 |
+
texts = [text]
|
73 |
+
texts = map(protect_float, texts)
|
74 |
+
texts = break_text(texts, length, {".", "!", "?"})
|
75 |
+
texts = map(unprotect_float, texts)
|
76 |
+
texts = break_text(texts, length, {","})
|
77 |
+
texts = break_text(texts, length, {" "})
|
78 |
+
texts = list(break_text_by_length(texts, length))
|
79 |
+
|
80 |
+
# Then, merge the texts into segments with length <= length
|
81 |
+
segments = []
|
82 |
+
curr = ""
|
83 |
+
|
84 |
+
for text in texts:
|
85 |
+
if utf_8_len(curr) + utf_8_len(text) <= length:
|
86 |
+
curr += text
|
87 |
+
else:
|
88 |
+
add_cleaned(curr, segments)
|
89 |
+
curr = text
|
90 |
+
|
91 |
+
if curr:
|
92 |
+
add_cleaned(curr, segments)
|
93 |
+
|
94 |
+
return segments
|
95 |
+
|
96 |
+
|
97 |
+
if __name__ == "__main__":
|
98 |
+
# Test the split_text function
|
99 |
+
|
100 |
+
text = "This is a test sentence. This is another test sentence. And a third one."
|
101 |
+
|
102 |
+
assert split_text(text, 50) == [
|
103 |
+
"This is a test sentence.",
|
104 |
+
"This is another test sentence. And a third one.",
|
105 |
+
]
|
106 |
+
assert split_text("a,aaaaaa3.14", 10) == ["a,", "aaaaaa3.14"]
|
107 |
+
assert split_text(" ", 10) == []
|
108 |
+
assert split_text("a", 10) == ["a"]
|
109 |
+
|
110 |
+
text = "This is a test sentence with only commas, and no dots, and no exclamation marks, and no question marks, and no newlines."
|
111 |
+
assert split_text(text, 50) == [
|
112 |
+
"This is a test sentence with only commas,",
|
113 |
+
"and no dots, and no exclamation marks,",
|
114 |
+
"and no question marks, and no newlines.",
|
115 |
+
]
|
116 |
+
|
117 |
+
text = "This is a test sentence This is a test sentence This is a test sentence. This is a test sentence, This is a test sentence, This is a test sentence."
|
118 |
+
# First half split at " ", second half split at ","
|
119 |
+
assert split_text(text, 50) == [
|
120 |
+
"This is a test sentence This is a test sentence",
|
121 |
+
"This is a test sentence. This is a test sentence,",
|
122 |
+
"This is a test sentence, This is a test sentence.",
|
123 |
+
]
|
124 |
+
|
125 |
+
text = "这是一段很长的中文文本,而且没有句号,也没有感叹号,也没有问号,也没有换行符。"
|
126 |
+
assert split_text(text, 50) == [
|
127 |
+
"这是一段很长的中文文本,",
|
128 |
+
"而且没有句号,也没有感叹号,",
|
129 |
+
"也没有问号,也没有换行符.",
|
130 |
+
]
|
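One detail worth calling out (editor's note): the `length` budget in `split_text` is measured in UTF-8 bytes via `utf_8_len`, not in characters, so each CJK character costs 3 toward the limit, and decimals are shielded from the sentence split by `protect_float`/`unprotect_float`.

```python
from fish_speech.text.spliter import split_text, utf_8_len

print(utf_8_len("abc"))   # 3 bytes
print(utf_8_len("你好"))   # 6 bytes: each CJK character is 3 bytes in UTF-8
print(split_text("你好。世界。", 8))  # roughly: each sentence ends up in its own segment
```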
fish_speech/utils/file.py
CHANGED
@@ -44,7 +44,7 @@ def list_files(
|
|
44 |
if not path.exists():
|
45 |
raise FileNotFoundError(f"Directory {path} does not exist.")
|
46 |
|
47 |
-
files = [file for ext in extensions for file in path.
|
48 |
|
49 |
if sort:
|
50 |
files = natsorted(files)
|
|
|
44 |
if not path.exists():
|
45 |
raise FileNotFoundError(f"Directory {path} does not exist.")
|
46 |
|
47 |
+
files = [file for ext in extensions for file in path.rglob(f"*{ext}")]
|
48 |
|
49 |
if sort:
|
50 |
files = natsorted(files)
|
fish_speech/utils/rich_utils.py
CHANGED
@@ -43,9 +43,13 @@ def print_config_tree(
|
|
43 |
|
44 |
# add fields from `print_order` to queue
|
45 |
for field in print_order:
|
46 |
-
|
47 |
-
|
48 |
-
|
|
|
|
|
|
|
|
|
49 |
)
|
50 |
|
51 |
# add all the other fields to queue (not specified in `print_order`)
|
|
|
43 |
|
44 |
# add fields from `print_order` to queue
|
45 |
for field in print_order:
|
46 |
+
(
|
47 |
+
queue.append(field)
|
48 |
+
if field in cfg
|
49 |
+
else log.warning(
|
50 |
+
f"Field '{field}' not found in config. "
|
51 |
+
+ f"Skipping '{field}' config printing..."
|
52 |
+
)
|
53 |
)
|
54 |
|
55 |
# add all the other fields to queue (not specified in `print_order`)
|
fish_speech/utils/spectrogram.py
ADDED
@@ -0,0 +1,122 @@
1 |
+
import torch
|
2 |
+
import torchaudio.functional as F
|
3 |
+
from torch import Tensor, nn
|
4 |
+
from torchaudio.transforms import MelScale
|
5 |
+
|
6 |
+
|
7 |
+
class LinearSpectrogram(nn.Module):
|
8 |
+
def __init__(
|
9 |
+
self,
|
10 |
+
n_fft=2048,
|
11 |
+
win_length=2048,
|
12 |
+
hop_length=512,
|
13 |
+
center=False,
|
14 |
+
mode="pow2_sqrt",
|
15 |
+
):
|
16 |
+
super().__init__()
|
17 |
+
|
18 |
+
self.n_fft = n_fft
|
19 |
+
self.win_length = win_length
|
20 |
+
self.hop_length = hop_length
|
21 |
+
self.center = center
|
22 |
+
self.mode = mode
|
23 |
+
|
24 |
+
self.register_buffer("window", torch.hann_window(win_length), persistent=False)
|
25 |
+
|
26 |
+
def forward(self, y: Tensor) -> Tensor:
|
27 |
+
if y.ndim == 3:
|
28 |
+
y = y.squeeze(1)
|
29 |
+
|
30 |
+
y = torch.nn.functional.pad(
|
31 |
+
y.unsqueeze(1),
|
32 |
+
(
|
33 |
+
(self.win_length - self.hop_length) // 2,
|
34 |
+
(self.win_length - self.hop_length + 1) // 2,
|
35 |
+
),
|
36 |
+
mode="reflect",
|
37 |
+
).squeeze(1)
|
38 |
+
|
39 |
+
spec = torch.stft(
|
40 |
+
y,
|
41 |
+
self.n_fft,
|
42 |
+
hop_length=self.hop_length,
|
43 |
+
win_length=self.win_length,
|
44 |
+
window=self.window,
|
45 |
+
center=self.center,
|
46 |
+
pad_mode="reflect",
|
47 |
+
normalized=False,
|
48 |
+
onesided=True,
|
49 |
+
return_complex=True,
|
50 |
+
)
|
51 |
+
|
52 |
+
spec = torch.view_as_real(spec)
|
53 |
+
|
54 |
+
if self.mode == "pow2_sqrt":
|
55 |
+
spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
|
56 |
+
|
57 |
+
return spec
|
58 |
+
|
59 |
+
|
60 |
+
class LogMelSpectrogram(nn.Module):
|
61 |
+
def __init__(
|
62 |
+
self,
|
63 |
+
sample_rate=44100,
|
64 |
+
n_fft=2048,
|
65 |
+
win_length=2048,
|
66 |
+
hop_length=512,
|
67 |
+
n_mels=128,
|
68 |
+
center=False,
|
69 |
+
f_min=0.0,
|
70 |
+
f_max=None,
|
71 |
+
):
|
72 |
+
super().__init__()
|
73 |
+
|
74 |
+
self.sample_rate = sample_rate
|
75 |
+
self.n_fft = n_fft
|
76 |
+
self.win_length = win_length
|
77 |
+
self.hop_length = hop_length
|
78 |
+
self.center = center
|
79 |
+
self.n_mels = n_mels
|
80 |
+
self.f_min = f_min
|
81 |
+
self.f_max = f_max or float(sample_rate // 2)
|
82 |
+
|
83 |
+
self.spectrogram = LinearSpectrogram(n_fft, win_length, hop_length, center)
|
84 |
+
|
85 |
+
fb = F.melscale_fbanks(
|
86 |
+
n_freqs=self.n_fft // 2 + 1,
|
87 |
+
f_min=self.f_min,
|
88 |
+
f_max=self.f_max,
|
89 |
+
n_mels=self.n_mels,
|
90 |
+
sample_rate=self.sample_rate,
|
91 |
+
norm="slaney",
|
92 |
+
mel_scale="slaney",
|
93 |
+
)
|
94 |
+
self.register_buffer(
|
95 |
+
"fb",
|
96 |
+
fb,
|
97 |
+
persistent=False,
|
98 |
+
)
|
99 |
+
|
100 |
+
def compress(self, x: Tensor) -> Tensor:
|
101 |
+
return torch.log(torch.clamp(x, min=1e-5))
|
102 |
+
|
103 |
+
def decompress(self, x: Tensor) -> Tensor:
|
104 |
+
return torch.exp(x)
|
105 |
+
|
106 |
+
def apply_mel_scale(self, x: Tensor) -> Tensor:
|
107 |
+
return torch.matmul(x.transpose(-1, -2), self.fb).transpose(-1, -2)
|
108 |
+
|
109 |
+
def forward(
|
110 |
+
self, x: Tensor, return_linear: bool = False, sample_rate: int = None
|
111 |
+
) -> Tensor:
|
112 |
+
if sample_rate is not None and sample_rate != self.sample_rate:
|
113 |
+
x = F.resample(x, orig_freq=sample_rate, new_freq=self.sample_rate)
|
114 |
+
|
115 |
+
linear = self.spectrogram(x)
|
116 |
+
x = self.apply_mel_scale(linear)
|
117 |
+
x = self.compress(x)
|
118 |
+
|
119 |
+
if return_linear:
|
120 |
+
return x, self.compress(linear)
|
121 |
+
|
122 |
+
return x
|
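A short usage sketch (editor's example): with `center=False` plus the reflect padding applied in `LinearSpectrogram`, a waveform of N samples produces about N // hop_length frames.

```python
import torch

from fish_speech.utils.spectrogram import LogMelSpectrogram

mel_transform = LogMelSpectrogram(sample_rate=44100, n_fft=2048, hop_length=512, n_mels=128)

wav = torch.randn(1, 44100)  # one second of audio, shape (batch, samples)
mel = mel_transform(wav)     # log-mel features, shape (batch, n_mels, frames)
print(mel.shape)             # torch.Size([1, 128, 86]) -- 44100 // 512 ≈ 86 frames
```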
tools/api.py
ADDED
@@ -0,0 +1,482 @@
1 |
+
import base64
|
2 |
+
import io
|
3 |
+
import json
|
4 |
+
import queue
|
5 |
+
import random
|
6 |
+
import traceback
|
7 |
+
import wave
|
8 |
+
from argparse import ArgumentParser
|
9 |
+
from http import HTTPStatus
|
10 |
+
from pathlib import Path
|
11 |
+
from typing import Annotated, Literal, Optional
|
12 |
+
|
13 |
+
import librosa
|
14 |
+
import numpy as np
|
15 |
+
import pyrootutils
|
16 |
+
import soundfile as sf
|
17 |
+
import torch
|
18 |
+
from kui.asgi import (
|
19 |
+
Body,
|
20 |
+
HTTPException,
|
21 |
+
HttpView,
|
22 |
+
JSONResponse,
|
23 |
+
Kui,
|
24 |
+
OpenAPI,
|
25 |
+
StreamResponse,
|
26 |
+
)
|
27 |
+
from kui.asgi.routing import MultimethodRoutes
|
28 |
+
from loguru import logger
|
29 |
+
from pydantic import BaseModel, Field
|
30 |
+
|
31 |
+
pyrootutils.setup_root(__file__, indicator=".project-root", pythonpath=True)
|
32 |
+
|
33 |
+
# from fish_speech.models.vqgan.lit_module import VQGAN
|
34 |
+
from fish_speech.models.vqgan.modules.firefly import FireflyArchitecture
|
35 |
+
from tools.auto_rerank import batch_asr, calculate_wer, is_chinese, load_model
|
36 |
+
from tools.llama.generate import (
|
37 |
+
GenerateRequest,
|
38 |
+
GenerateResponse,
|
39 |
+
WrappedGenerateResponse,
|
40 |
+
launch_thread_safe_queue,
|
41 |
+
)
|
42 |
+
from tools.vqgan.inference import load_model as load_decoder_model
|
43 |
+
|
44 |
+
|
45 |
+
def wav_chunk_header(sample_rate=44100, bit_depth=16, channels=1):
|
46 |
+
buffer = io.BytesIO()
|
47 |
+
|
48 |
+
with wave.open(buffer, "wb") as wav_file:
|
49 |
+
wav_file.setnchannels(channels)
|
50 |
+
wav_file.setsampwidth(bit_depth // 8)
|
51 |
+
wav_file.setframerate(sample_rate)
|
52 |
+
|
53 |
+
wav_header_bytes = buffer.getvalue()
|
54 |
+
buffer.close()
|
55 |
+
return wav_header_bytes
|
56 |
+
|
57 |
+
|
58 |
+
# Define utils for web server
|
59 |
+
async def http_execption_handler(exc: HTTPException):
|
60 |
+
return JSONResponse(
|
61 |
+
dict(
|
62 |
+
statusCode=exc.status_code,
|
63 |
+
message=exc.content,
|
64 |
+
error=HTTPStatus(exc.status_code).phrase,
|
65 |
+
),
|
66 |
+
exc.status_code,
|
67 |
+
exc.headers,
|
68 |
+
)
|
69 |
+
|
70 |
+
|
71 |
+
async def other_exception_handler(exc: "Exception"):
|
72 |
+
traceback.print_exc()
|
73 |
+
|
74 |
+
status = HTTPStatus.INTERNAL_SERVER_ERROR
|
75 |
+
return JSONResponse(
|
76 |
+
dict(statusCode=status, message=str(exc), error=status.phrase),
|
77 |
+
status,
|
78 |
+
)
|
79 |
+
|
80 |
+
|
81 |
+
def load_audio(reference_audio, sr):
|
82 |
+
if len(reference_audio) > 255 or not Path(reference_audio).exists():
|
83 |
+
try:
|
84 |
+
audio_data = base64.b64decode(reference_audio)
|
85 |
+
reference_audio = io.BytesIO(audio_data)
|
86 |
+
except base64.binascii.Error:
|
87 |
+
raise ValueError("Invalid path or base64 string")
|
88 |
+
|
89 |
+
audio, _ = librosa.load(reference_audio, sr=sr, mono=True)
|
90 |
+
return audio
|
91 |
+
|
92 |
+
|
93 |
+
def encode_reference(*, decoder_model, reference_audio, enable_reference_audio):
|
94 |
+
if enable_reference_audio and reference_audio is not None:
|
95 |
+
# Load audios, and prepare basic info here
|
96 |
+
reference_audio_content = load_audio(
|
97 |
+
reference_audio, decoder_model.spec_transform.sample_rate
|
98 |
+
)
|
99 |
+
|
100 |
+
audios = torch.from_numpy(reference_audio_content).to(decoder_model.device)[
|
101 |
+
None, None, :
|
102 |
+
]
|
103 |
+
audio_lengths = torch.tensor(
|
104 |
+
[audios.shape[2]], device=decoder_model.device, dtype=torch.long
|
105 |
+
)
|
106 |
+
logger.info(
|
107 |
+
f"Loaded audio with {audios.shape[2] / decoder_model.spec_transform.sample_rate:.2f} seconds"
|
108 |
+
)
|
109 |
+
|
110 |
+
# VQ Encoder
|
111 |
+
if isinstance(decoder_model, FireflyArchitecture):
|
112 |
+
prompt_tokens = decoder_model.encode(audios, audio_lengths)[0][0]
|
113 |
+
|
114 |
+
logger.info(f"Encoded prompt: {prompt_tokens.shape}")
|
115 |
+
else:
|
116 |
+
prompt_tokens = None
|
117 |
+
logger.info("No reference audio provided")
|
118 |
+
|
119 |
+
return prompt_tokens
|
120 |
+
|
121 |
+
|
122 |
+
def decode_vq_tokens(
|
123 |
+
*,
|
124 |
+
decoder_model,
|
125 |
+
codes,
|
126 |
+
):
|
127 |
+
feature_lengths = torch.tensor([codes.shape[1]], device=decoder_model.device)
|
128 |
+
logger.info(f"VQ features: {codes.shape}")
|
129 |
+
|
130 |
+
if isinstance(decoder_model, FireflyArchitecture):
|
131 |
+
# VQGAN Inference
|
132 |
+
return decoder_model.decode(
|
133 |
+
indices=codes[None],
|
134 |
+
feature_lengths=feature_lengths,
|
135 |
+
).squeeze()
|
136 |
+
|
137 |
+
raise ValueError(f"Unknown model type: {type(decoder_model)}")
|
138 |
+
|
139 |
+
|
140 |
+
routes = MultimethodRoutes(base_class=HttpView)
|
141 |
+
|
142 |
+
|
143 |
+
def get_random_paths(base_path, data, speaker, emotion):
|
144 |
+
if base_path and data and speaker and emotion and (Path(base_path).exists()):
|
145 |
+
if speaker in data and emotion in data[speaker]:
|
146 |
+
files = data[speaker][emotion]
|
147 |
+
lab_files = [f for f in files if f.endswith(".lab")]
|
148 |
+
wav_files = [f for f in files if f.endswith(".wav")]
|
149 |
+
|
150 |
+
if lab_files and wav_files:
|
151 |
+
selected_lab = random.choice(lab_files)
|
152 |
+
selected_wav = random.choice(wav_files)
|
153 |
+
|
154 |
+
lab_path = Path(base_path) / speaker / emotion / selected_lab
|
155 |
+
wav_path = Path(base_path) / speaker / emotion / selected_wav
|
156 |
+
if lab_path.exists() and wav_path.exists():
|
157 |
+
return lab_path, wav_path
|
158 |
+
|
159 |
+
return None, None
|
160 |
+
|
161 |
+
|
162 |
+
def load_json(json_file):
|
163 |
+
if not json_file:
|
164 |
+
logger.info("Not using a json file")
|
165 |
+
return None
|
166 |
+
try:
|
167 |
+
with open(json_file, "r", encoding="utf-8") as file:
|
168 |
+
data = json.load(file)
|
169 |
+
except FileNotFoundError:
|
170 |
+
logger.warning(f"ref json not found: {json_file}")
|
171 |
+
data = None
|
172 |
+
except Exception as e:
|
173 |
+
logger.warning(f"Loading json failed: {e}")
|
174 |
+
data = None
|
175 |
+
return data
|
176 |
+
|
177 |
+
|
178 |
+
class InvokeRequest(BaseModel):
|
179 |
+
text: str = "你说的对, 但是原神是一款由米哈游自主研发的开放世界手游."
|
180 |
+
reference_text: Optional[str] = None
|
181 |
+
reference_audio: Optional[str] = None
|
182 |
+
max_new_tokens: int = 1024
|
183 |
+
chunk_length: Annotated[int, Field(ge=0, le=500, strict=True)] = 100
|
184 |
+
top_p: Annotated[float, Field(ge=0.1, le=1.0, strict=True)] = 0.7
|
185 |
+
repetition_penalty: Annotated[float, Field(ge=0.9, le=2.0, strict=True)] = 1.2
|
186 |
+
temperature: Annotated[float, Field(ge=0.1, le=1.0, strict=True)] = 0.7
|
187 |
+
emotion: Optional[str] = None
|
188 |
+
format: Literal["wav", "mp3", "flac"] = "wav"
|
189 |
+
streaming: bool = False
|
190 |
+
    ref_json: Optional[str] = "ref_data.json"
    ref_base: Optional[str] = "ref_data"
    speaker: Optional[str] = None


def get_content_type(audio_format):
    if audio_format == "wav":
        return "audio/wav"
    elif audio_format == "flac":
        return "audio/flac"
    elif audio_format == "mp3":
        return "audio/mpeg"
    else:
        return "application/octet-stream"


@torch.inference_mode()
def inference(req: InvokeRequest):
    # Parse reference audio aka prompt
    prompt_tokens = None

    ref_data = load_json(req.ref_json)
    ref_base = req.ref_base

    lab_path, wav_path = get_random_paths(ref_base, ref_data, req.speaker, req.emotion)

    if lab_path and wav_path:
        with open(lab_path, "r", encoding="utf-8") as lab_file:
            ref_text = lab_file.read()
        req.reference_audio = wav_path
        req.reference_text = ref_text
        logger.info("ref_path: " + str(wav_path))
        logger.info("ref_text: " + ref_text)

    # Parse reference audio aka prompt
    prompt_tokens = encode_reference(
        decoder_model=decoder_model,
        reference_audio=req.reference_audio,
        enable_reference_audio=req.reference_audio is not None,
    )
    logger.info(f"ref_text: {req.reference_text}")
    # LLAMA Inference
    request = dict(
        device=decoder_model.device,
        max_new_tokens=req.max_new_tokens,
        text=req.text,
        top_p=req.top_p,
        repetition_penalty=req.repetition_penalty,
        temperature=req.temperature,
        compile=args.compile,
        iterative_prompt=req.chunk_length > 0,
        chunk_length=req.chunk_length,
        max_length=2048,
        prompt_tokens=prompt_tokens,
        prompt_text=req.reference_text,
    )

    response_queue = queue.Queue()
    llama_queue.put(
        GenerateRequest(
            request=request,
            response_queue=response_queue,
        )
    )

    if req.streaming:
        yield wav_chunk_header()

    segments = []
    while True:
        result: WrappedGenerateResponse = response_queue.get()
        if result.status == "error":
            raise result.response
            break

        result: GenerateResponse = result.response
        if result.action == "next":
            break

        with torch.autocast(
            device_type=decoder_model.device.type, dtype=args.precision
        ):
            fake_audios = decode_vq_tokens(
                decoder_model=decoder_model,
                codes=result.codes,
            )

        fake_audios = fake_audios.float().cpu().numpy()

        if req.streaming:
            yield (fake_audios * 32768).astype(np.int16).tobytes()
        else:
            segments.append(fake_audios)

    if req.streaming:
        return

    if len(segments) == 0:
        raise HTTPException(
            HTTPStatus.INTERNAL_SERVER_ERROR,
            content="No audio generated, please check the input text.",
        )

    fake_audios = np.concatenate(segments, axis=0)
    yield fake_audios


def auto_rerank_inference(req: InvokeRequest, use_auto_rerank: bool = True):
    if not use_auto_rerank:
        # If auto_rerank is disabled, call the original inference function directly
        return inference(req)

    zh_model, en_model = load_model()
    max_attempts = 5
    best_wer = float("inf")
    best_audio = None

    for attempt in range(max_attempts):
        # Call the original inference function
        audio_generator = inference(req)
        fake_audios = next(audio_generator)

        asr_result = batch_asr(
            zh_model if is_chinese(req.text) else en_model, [fake_audios], 44100
        )[0]
        wer = calculate_wer(req.text, asr_result["text"])

        if wer <= 0.1 and not asr_result["huge_gap"]:
            return fake_audios

        if wer < best_wer:
            best_wer = wer
            best_audio = fake_audios

        if attempt == max_attempts - 1:
            break

    return best_audio


async def inference_async(req: InvokeRequest):
    for chunk in inference(req):
        yield chunk


async def buffer_to_async_generator(buffer):
    yield buffer


@routes.http.post("/v1/invoke")
async def api_invoke_model(
    req: Annotated[InvokeRequest, Body(exclusive=True)],
):
    """
    Invoke model and generate audio
    """

    if args.max_text_length > 0 and len(req.text) > args.max_text_length:
        raise HTTPException(
            HTTPStatus.BAD_REQUEST,
            content=f"Text is too long, max length is {args.max_text_length}",
        )

    if req.streaming and req.format != "wav":
        raise HTTPException(
            HTTPStatus.BAD_REQUEST,
            content="Streaming only supports WAV format",
        )

    if req.streaming:
        return StreamResponse(
            iterable=inference_async(req),
            headers={
                "Content-Disposition": f"attachment; filename=audio.{req.format}",
            },
            content_type=get_content_type(req.format),
        )
    else:
        fake_audios = next(inference(req))
        buffer = io.BytesIO()
        sf.write(
            buffer,
            fake_audios,
            decoder_model.spec_transform.sample_rate,
            format=req.format,
        )

        return StreamResponse(
            iterable=buffer_to_async_generator(buffer.getvalue()),
            headers={
                "Content-Disposition": f"attachment; filename=audio.{req.format}",
            },
            content_type=get_content_type(req.format),
        )


@routes.http.post("/v1/health")
async def api_health():
    """
    Health check
    """

    return JSONResponse({"status": "ok"})


def parse_args():
    parser = ArgumentParser()
    parser.add_argument(
        "--llama-checkpoint-path",
        type=str,
        default="checkpoints/fish-speech-1.2-sft",
    )
    parser.add_argument(
        "--decoder-checkpoint-path",
        type=str,
        default="checkpoints/fish-speech-1.2-sft/firefly-gan-vq-fsq-4x1024-42hz-generator.pth",
    )
    parser.add_argument("--decoder-config-name", type=str, default="firefly_gan_vq")
    parser.add_argument("--device", type=str, default="cuda")
    parser.add_argument("--half", action="store_true")
    parser.add_argument("--compile", action="store_true")
    parser.add_argument("--max-text-length", type=int, default=0)
    parser.add_argument("--listen", type=str, default="127.0.0.1:8000")
    parser.add_argument("--workers", type=int, default=1)
    parser.add_argument("--use-auto-rerank", type=bool, default=True)

    return parser.parse_args()


# Define Kui app
openapi = OpenAPI(
    {
        "title": "Fish Speech API",
    },
).routes

app = Kui(
    routes=routes + openapi[1:],  # Remove the default route
    exception_handlers={
        HTTPException: http_execption_handler,
        Exception: other_exception_handler,
    },
    cors_config={},
)


if __name__ == "__main__":
    import threading

    import uvicorn

    args = parse_args()
    args.precision = torch.half if args.half else torch.bfloat16

    logger.info("Loading Llama model...")
    llama_queue = launch_thread_safe_queue(
        checkpoint_path=args.llama_checkpoint_path,
        device=args.device,
        precision=args.precision,
        compile=args.compile,
    )
    logger.info("Llama model loaded, loading VQ-GAN model...")

    decoder_model = load_decoder_model(
        config_name=args.decoder_config_name,
        checkpoint_path=args.decoder_checkpoint_path,
        device=args.device,
    )

    logger.info("VQ-GAN model loaded, warming up...")

    # Dry run to check if the model is loaded correctly and avoid the first-time latency
    list(
        inference(
            InvokeRequest(
                text="Hello world.",
                reference_text=None,
                reference_audio=None,
                max_new_tokens=0,
                top_p=0.7,
                repetition_penalty=1.2,
                temperature=0.7,
                emotion=None,
                format="wav",
                ref_base=None,
                ref_json=None,
            )
        )
    )

    logger.info(f"Warming up done, starting server at http://{args.listen}")
    host, port = args.listen.split(":")
    uvicorn.run(app, host=host, port=int(port), workers=args.workers, log_level="info")
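For reference, a minimal client sketch against the /v1/invoke route defined above. The endpoint path and field names come from this file; the host, port and field values are illustrative assumptions (the server defaults to --listen 127.0.0.1:8000), not part of this commit.

import requests

# Assumed host/port, matching the default --listen value above.
resp = requests.post(
    "http://127.0.0.1:8000/v1/invoke",
    json={
        "text": "Hello, Fish Speech!",  # InvokeRequest.text
        "format": "wav",                # must be "wav" when streaming is enabled
        "streaming": False,
        "top_p": 0.7,
        "repetition_penalty": 1.2,
        "temperature": 0.7,
    },
)
resp.raise_for_status()
with open("audio.wav", "wb") as f:
    f.write(resp.content)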
tools/auto_rerank.py
ADDED
@@ -0,0 +1,159 @@
import os

os.environ["MODELSCOPE_CACHE"] = ".cache/"

import string
import time
from threading import Lock

import librosa
import numpy as np
import opencc
import torch
from faster_whisper import WhisperModel

t2s_converter = opencc.OpenCC("t2s")


def load_model(*, device="cuda"):
    model = WhisperModel(
        "medium",
        device=device,
        compute_type="float16",
        download_root="faster_whisper",
    )
    print("faster_whisper loaded!")
    return model


@torch.no_grad()
def batch_asr_internal(model: WhisperModel, audios, sr):
    resampled_audios = []
    for audio in audios:
        if isinstance(audio, np.ndarray):
            audio = torch.from_numpy(audio).float()

        if audio.dim() > 1:
            audio = audio.squeeze()

        assert audio.dim() == 1
        audio_np = audio.numpy()
        resampled_audio = librosa.resample(audio_np, orig_sr=sr, target_sr=16000)
        resampled_audios.append(resampled_audio)

    trans_results = []

    for resampled_audio in resampled_audios:
        segments, info = model.transcribe(
            resampled_audio,
            language=None,
            beam_size=5,
            initial_prompt="Punctuation is needed in any language.",
        )
        trans_results.append(list(segments))

    results = []
    for trans_res, audio in zip(trans_results, audios):
        duration = len(audio) / sr * 1000
        huge_gap = False
        max_gap = 0.0

        text = None
        last_tr = None

        for tr in trans_res:
            delta = tr.text.strip()
            if tr.id > 1:
                max_gap = max(tr.start - last_tr.end, max_gap)
                text += delta
            else:
                text = delta

            last_tr = tr
            if max_gap > 3.0:
                huge_gap = True
                break

        sim_text = t2s_converter.convert(text)
        results.append(
            {
                "text": sim_text,
                "duration": duration,
                "huge_gap": huge_gap,
            }
        )

    return results


global_lock = Lock()


def batch_asr(model, audios, sr):
    return batch_asr_internal(model, audios, sr)


def is_chinese(text):
    return True


def calculate_wer(text1, text2, debug=False):
    chars1 = remove_punctuation(text1)
    chars2 = remove_punctuation(text2)

    m, n = len(chars1), len(chars2)

    if m > n:
        chars1, chars2 = chars2, chars1
        m, n = n, m

    prev = list(range(m + 1))  # row 0 distance: [0, 1, 2, ...]
    curr = [0] * (m + 1)

    for j in range(1, n + 1):
        curr[0] = j
        for i in range(1, m + 1):
            if chars1[i - 1] == chars2[j - 1]:
                curr[i] = prev[i - 1]
            else:
                curr[i] = min(prev[i], curr[i - 1], prev[i - 1]) + 1
        prev, curr = curr, prev

    edits = prev[m]
    tot = max(len(chars1), len(chars2))
    wer = edits / tot

    if debug:
        print(" gt: ", chars1)
        print(" pred: ", chars2)
        print(" edits/tot = wer: ", edits, "/", tot, "=", wer)

    return wer


def remove_punctuation(text):
    chinese_punctuation = (
        " \n\t”“!?。。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃《》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—"
        '‛""„‟…‧﹏'
    )
    all_punctuation = string.punctuation + chinese_punctuation
    translator = str.maketrans("", "", all_punctuation)
    text_without_punctuation = text.translate(translator)
    return text_without_punctuation


if __name__ == "__main__":
    model = load_model()
    audios = [
        librosa.load("44100.wav", sr=44100)[0],
        librosa.load("lengyue.wav", sr=44100)[0],
    ]
    print(np.array(audios[0]))
    print(batch_asr(model, audios, 44100))

    start_time = time.time()
    for _ in range(10):
        print(batch_asr(model, audios, 44100))
    print("Time taken:", time.time() - start_time)
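The WER helper above is a character-level edit distance normalised by the longer string, with punctuation and whitespace stripped first. A quick sanity check (illustrative only, not part of the repo):

from tools.auto_rerank import calculate_wer

assert calculate_wer("hello world", "hello world") == 0.0
# Two of four characters differ after punctuation removal -> 0.5
print(calculate_wer("你好世界", "你好地球"))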
tools/llama/build_dataset.py
ADDED
@@ -0,0 +1,169 @@
import itertools
import os
import re
from collections import defaultdict
from functools import partial
from multiprocessing import Pool
from pathlib import Path

import click
import numpy as np
from loguru import logger
from tqdm import tqdm

from fish_speech.datasets.protos.text_data_pb2 import Semantics, Sentence, TextData
from fish_speech.datasets.protos.text_data_stream import pack_pb_stream
from fish_speech.utils.file import load_filelist

# To avoid CPU overload
os.environ["MKL_NUM_THREADS"] = "1"
os.environ["OMP_NUM_THREADS"] = "1"


def task_generator_folder(root: Path, text_extension: str):
    files = list(tqdm(Path(root).rglob("*.npy"), desc=f"Loading {root}"))
    files = sorted(files)

    grouped_files = defaultdict(list)
    for file in tqdm(files, desc=f"Grouping {root}"):
        p = str(file.parent)
        speaker = file.parent.name

        try:
            if isinstance(text_extension, str):
                texts = [file.with_suffix(text_extension).read_text(encoding="utf-8")]
            else:
                texts = [
                    file.with_suffix(ext).read_text(encoding="utf-8")
                    for ext in text_extension
                ]
        except Exception as e:
            logger.error(f"Failed to read text {file}: {e}")
            continue

        grouped_files[p].append((speaker, file, texts))

    logger.info(
        f"Found {len(grouped_files)} groups in {root}, {list(grouped_files.keys())[:5]}..."
    )

    for i in grouped_files.values():
        subset = [(f, t) for _, f, t in i]
        yield i[0][0], subset, "folder"


def task_generator_filelist(filelist):
    grouped_files = defaultdict(list)
    for filename, speaker, _, text in load_filelist(filelist):
        grouped_files[speaker].append((Path(filename), [text]))

    logger.info(f"Found {len(grouped_files)} groups in {filelist}")
    for speaker, values in grouped_files.items():
        yield speaker, values, "filelist"


def run_task(task):
    name, subset, source = task

    # Parse the files
    sentences = []
    for file, texts in subset:
        np_file = file.with_suffix(".npy")
        if np_file.exists() is False:
            logger.warning(f"Can't find {np_file}")
            continue

        new_texts = []

        for text in texts:
            # Simple cleaning: replace { xxx } and < xxx > with space
            text = re.sub(r"\{.*?\}", " ", text)
            text = re.sub(r"<.*?>", " ", text)
            text = re.sub(r"\s+", " ", text)
            new_texts.append(text)

        try:
            semantics = np.load(np_file)
        except Exception as e:
            logger.error(f"Failed to parse {file}: {e}")
            continue

        if isinstance(semantics, np.ndarray):
            semantics = semantics.tolist()

        sentences.append(
            Sentence(
                texts=new_texts,
                semantics=[Semantics(values=s) for s in semantics],
            )
        )

    # Pack the sentences
    return pack_pb_stream(
        TextData(
            source=source,
            name=name,
            sentences=sentences,
        )
    )


@click.command()
@click.option(
    "--input",
    type=click.Path(path_type=Path),
    required=True,
    help="A folder containing the dataset or a filelist",
    multiple=True,
)
@click.option(
    "--output", type=click.Path(path_type=Path), default="data/quantized-dataset-ft"
)
@click.option("--num-workers", type=int, default=16)
@click.option("--text-extension", type=str, default=[".txt"], multiple=True)
@click.option(
    "--shard-size", type=int, default=10, help="The maximum size of each shard in mb"
)
def main(input, output, num_workers, text_extension, shard_size):
    generator_fns = []

    for f in input:
        assert f.exists(), f"{f} not found"

        if f.is_dir():
            generator_fn = task_generator_folder(f, text_extension)
        else:
            generator_fn = task_generator_filelist(f)

        generator_fns.append(generator_fn)

    generator_fn = itertools.chain(*generator_fns)
    output.mkdir(parents=True, exist_ok=True)

    dataset_fp = None
    tar_idx = 0
    written_size = 0

    with Pool(num_workers) as p:
        for result in tqdm(p.imap_unordered(run_task, generator_fn)):
            if dataset_fp is None:
                dataset_fp = open(Path(output) / f"{tar_idx:08d}.protos", "wb")

            dataset_fp.write(result)
            written_size += len(result)

            if written_size > shard_size * 1024 * 1024:
                logger.info(f"Finished writing {tar_idx} shards to {output}")
                dataset_fp.close()
                dataset_fp = None
                written_size = 0
                tar_idx += 1

    if dataset_fp is not None:
        dataset_fp.close()

    logger.info(f"Finished writing {tar_idx + 1} shards to {output}")


if __name__ == "__main__":
    main()
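Typical usage is a single command over a folder of extracted .npy semantic codes with matching transcription files; the paths below are placeholders, not part of this commit:

python tools/llama/build_dataset.py --input data/demo --output data/protos --text-extension .lab --num-workers 16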
tools/llama/eval_in_context.py
ADDED
@@ -0,0 +1,171 @@
import pyrootutils
import torch
import torch.nn.functional as F
from matplotlib import pyplot as plt
from transformers import AutoTokenizer

# register eval resolver and root
pyrootutils.setup_root(__file__, indicator=".project-root", pythonpath=True)

from torch.utils.data import DataLoader

from fish_speech.datasets.semantic import AutoAugTextDataset, TextDataCollator
from tools.llama.generate import load_model


def smooth(
    scalars: list[float], weight: float
) -> list[float]:  # Weight between 0 and 1
    last = scalars[0]  # First value in the plot (first timestep)
    smoothed = list()
    for point in scalars:
        smoothed_val = last * weight + (1 - weight) * point  # Calculate smoothed value
        smoothed.append(smoothed_val)  # Save it
        last = smoothed_val  # Anchor the last smoothed value

    return smoothed


@torch.inference_mode()
def analyze_one_model(loader, config, weight, max_length):
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = load_model(
        config,
        weight,
        device,
        torch.bfloat16,
        max_length,
        compile=False,
    )[0]

    current_step = 0
    model.eval()

    semantic_loss_sum = torch.zeros(
        max_length,
        dtype=torch.float32,
        device=device,
    )
    counter = torch.zeros(
        max_length,
        dtype=torch.long,
        device=device,
    )

    for batch in loader:
        batch = {k: v.to(device) for k, v in batch.items()}

        labels = batch["labels"]
        outputs = model(
            inp=batch["inputs"],
            key_padding_mask=batch["attention_masks"],
        )

        token_logits = outputs.token_logits
        codebook_logits = outputs.codebook_logits

        # Generate labels
        base_loss = F.cross_entropy(
            token_logits.reshape(-1, token_logits.size(-1)),
            labels[:, 0].reshape(-1),
            ignore_index=-100,
            reduction="none",
        )

        codebook_labels = labels[:, 1 : 1 + model.config.num_codebooks].mT
        semantic_loss = F.cross_entropy(
            codebook_logits.reshape(-1, codebook_logits.size(-1)),
            codebook_labels.reshape(-1),
            ignore_index=-100,
            reduction="none",
        )

        base_loss = base_loss.reshape(labels[:, 0].shape)
        semantic_loss = semantic_loss.reshape(codebook_labels.shape)

        semantic_loss_frame = semantic_loss.mean(-1)
        pad_pos = codebook_labels.sum(-1) == -100 * model.config.num_codebooks

        for loss_sample, pad in zip(semantic_loss_frame, pad_pos):
            semantic_loss_sum[~pad] += loss_sample[~pad]
            counter[~pad] += 1

        current_step += 1
        if current_step == 10:
            break

    semantic_loss = semantic_loss.cpu()
    counter = counter.cpu()
    xs, ys = [], []

    for i, (loss, count) in enumerate(zip(semantic_loss_sum, counter)):
        if count > 0:
            xs.append(i)
            ys.append((loss / count).item())  # for better loss visualization

    smoothed_ys = smooth(ys, 0.95)

    # Unload model
    del model
    torch.cuda.empty_cache()

    return xs, ys, smoothed_ys


def main():
    tokenizer = AutoTokenizer.from_pretrained("fishaudio/fish-speech-1")
    max_length = 4096

    ds = AutoAugTextDataset(
        ["data/protos/sft/云天河"],
        tokenizer=tokenizer,
        use_speaker=False,
        interactive_prob=1.0,
        max_length=max_length,
    )

    loader = DataLoader(
        ds,
        batch_size=8,
        collate_fn=TextDataCollator(tokenizer, max_length=max_length),
        num_workers=0,
        shuffle=False,
    )

    plt.figure(figsize=(10, 5), dpi=200)

    plt.xlabel("Frame")
    plt.ylabel("Loss")
    plt.yscale("log")
    plt.title("Semantic Loss")
    plt.grid(which="both", axis="both")
    plt.xlim(0, max_length)

    tests = [
        (
            "pertrain-medium",
            "dual_ar_2_codebook_medium",
            "checkpoints/text2semantic-pretrain-medium-2k-v1.pth",
        ),
        (
            "sft-medium",
            "dual_ar_2_codebook_medium",
            "checkpoints/text2semantic-sft-medium-v1.1-4k.pth",
        ),
        (
            "sft-large",
            "dual_ar_2_codebook_large",
            "checkpoints/text2semantic-sft-large-v1.1-4k.pth",
        ),
    ]

    for name, config, weight in tests:
        xs, _, smoothed_ys = analyze_one_model(loader, config, weight, max_length)
        plt.plot(xs, smoothed_ys, label=name)

    plt.legend()
    plt.savefig("semantic_loss.png")


if __name__ == "__main__":
    main()
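The smooth() helper above is a standard exponential moving average: for example, smooth([1.0, 0.0, 0.0, 0.0], weight=0.5) returns [1.0, 0.5, 0.25, 0.125], which is what flattens the per-frame loss curves before plotting.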
tools/llama/generate.py
CHANGED
@@ -2,8 +2,9 @@ import os
 import queue
 import threading
 import time
+from dataclasses import dataclass
 from pathlib import Path
-from typing import Optional, Tuple, Union
+from typing import Literal, Optional, Tuple, Union
 
 import click
 import hydra
@@ -11,14 +12,11 @@ import numpy as np
 import torch
 import torch._dynamo.config
 import torch._inductor.config
-from hydra import compose, initialize
-from hydra.utils import instantiate
 from loguru import logger
 from tqdm import tqdm
-from transformers import AutoTokenizer
 
-from fish_speech.
-from fish_speech.text
+from fish_speech.conversation import CODEBOOK_PAD_TOKEN_ID
+from fish_speech.text import clean_text, split_text
 
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
 torch._inductor.config.coordinate_descent_tuning = True
@@ -29,7 +27,11 @@ if hasattr(torch._inductor.config, "fx_graph_cache"):
     torch._inductor.config.fx_graph_cache = True
 
 
-from fish_speech.models.text2semantic.llama import
+from fish_speech.models.text2semantic.llama import (
+    BaseTransformer,
+    DualARTransformer,
+    NaiveTransformer,
+)
 
 
 def multinomial_sample_one_no_sync(
@@ -94,7 +96,9 @@ def decode_one_token_ar(
     codebooks = [
         sample(
             x.logits,
-            previous_tokens=
+            previous_tokens=(
+                previous_tokens[0] if previous_tokens is not None else None
+            ),  # Disable repetition penalty for the token codebook
             **sampling_kwargs,
         )[0]
     ]
@@ -159,7 +163,6 @@ def decode_n_tokens(
     cur_token: torch.Tensor,
     input_pos: torch.Tensor,
     num_new_tokens: int,
-    eos_token_id: int = 2,
     im_end_id: int = 4,
     decode_one_token=decode_one_token_naive,
     **sampling_kwargs,
@@ -195,11 +198,7 @@ def decode_n_tokens(
             model.config.num_codebooks + 1, -1
         )
 
-        if (
-            cur_token[0, 0, -1] == eos_token_id
-            or cur_token[0, 0, -1] == im_end_id
-            or (cur_token[0, 1:, -1] == CODEBOOK_EOS_TOKEN_ID).any()
-        ):
+        if cur_token[0, 0, -1] == im_end_id:
             break
 
     return previous_tokens[:, : i + 1]
@@ -212,7 +211,6 @@ def generate(
     model: NaiveTransformer,
    prompt: torch.Tensor,
    max_new_tokens: int,
-    eos_token_id: int = 2,
    im_end_id: int = 4,
    decode_one_token=decode_one_token_naive,
    **sampling_kwargs,
@@ -253,6 +251,7 @@ def generate(
        if isinstance(model, NaiveTransformer)
        else decode_one_token_ar
    )
+
    next_token = prefill_decode(
        model, prompt.view(1, codebook_dim, -1), input_pos, **sampling_kwargs
    )
@@ -264,7 +263,6 @@ def generate(
        next_token.view(1, codebook_dim, -1),
        input_pos,
        max_new_tokens - 1,
-        eos_token_id=eos_token_id,
        im_end_id=im_end_id,
        decode_one_token=decode_one_token,
        **sampling_kwargs,
@@ -279,22 +277,12 @@ def generate(
 def encode_tokens(
    tokenizer,
    string,
-    bos=True,
    device="cuda",
    prompt_tokens=None,
-    speaker=None,
    num_codebooks=4,
 ):
    string = clean_text(string)
-
-    if speaker is None:
-        speaker = "assistant"
-
-    string = (
-        f"<|im_start|>user<|im_sep|>{string}<|im_end|><|im_start|>{speaker}<|im_sep|>"
-    )
-    if bos:
-        string = f"<|begin_of_sequence|>{string}"
+    string = f"<|im_start|>user\n{string}<|im_end|><|im_start|>assistant\n"
 
    new_tokens = tokenizer.encode(
        string,
@@ -322,7 +310,7 @@ def encode_tokens(
        prompt_tokens = prompt_tokens[0]
 
    assert prompt_tokens.ndim == 2
-    data = prompt_tokens +
+    data = prompt_tokens + 1
 
    if prompt_tokens.shape[0] > num_codebooks:
        logger.warning(
@@ -330,13 +318,9 @@ def encode_tokens(
        )
        data = data[:num_codebooks]
 
-    # Add
+    # Add pad token for each codebook
    data = torch.cat(
-        (
-            data,
-            torch.ones((data.size(0), 1), dtype=torch.int, device=device)
-            * CODEBOOK_EOS_TOKEN_ID,
-        ),
+        (data, torch.zeros((data.size(0), 1), dtype=torch.int, device=device)),
        dim=1,
    )
@@ -354,49 +338,13 @@ def encode_tokens(
    return prompt
 
 
-def load_model(
-    with initialize(version_base="1.3", config_path="../../fish_speech/configs/model"):
-        cfg = compose(
-            config_name=config_name, overrides=[f"config.max_seq_len={max_length}"]
-        )
-
-    model: Union[NaiveTransformer, DualARTransformer] = instantiate(cfg)
-
-    if "int8" in str(checkpoint_path):
-        logger.info("Using int8 weight-only quantization!")
-        from quantize import WeightOnlyInt8QuantHandler
-
-        simple_quantizer = WeightOnlyInt8QuantHandler(model)
-        model = simple_quantizer.convert_for_runtime()
-
-    if "int4" in str(checkpoint_path):
-        logger.info("Using int4 quantization!")
-        path_comps = checkpoint_path.name.split(".")
-        assert path_comps[-2].startswith("g")
-        groupsize = int(path_comps[-2][1:])
-        from quantize import WeightOnlyInt4QuantHandler
-
-        simple_quantizer = WeightOnlyInt4QuantHandler(model, groupsize)
-        model = simple_quantizer.convert_for_runtime()
-
-    checkpoint = torch.load(str(checkpoint_path), map_location="cpu")
-    if "state_dict" in checkpoint:
-        checkpoint = checkpoint["state_dict"]
-
-    if any(k.startswith("model.") for k in checkpoint):
-        checkpoint = {
-            k.replace("model.", ""): v
-            for k, v in checkpoint.items()
-            if k.startswith("model.")
-        }
-
-    model.load_state_dict(checkpoint, assign=True)
+def load_model(checkpoint_path, device, precision, compile=False):
+    model: Union[NaiveTransformer, DualARTransformer] = BaseTransformer.from_pretrained(
+        checkpoint_path, load_weights=True
+    )
 
    model = model.to(device=device, dtype=precision)
-    logger.info("Restored model from checkpoint")
+    logger.info(f"Restored model from checkpoint")
 
    if isinstance(model, DualARTransformer):
        decode_one_token = decode_one_token_ar
@@ -414,29 +362,16 @@ def load_model(
    return model.eval(), decode_one_token
 
 
-        curr += char
-        if char not in [".", ",", "!", "?"]:
-            continue
-
-        if len(curr) >= min_length:
-            segments.append(curr)
-            curr = ""
-
-    if curr:
-        segments.append(curr)
-
-    return segments
+@dataclass
+class GenerateResponse:
+    action: Literal["sample", "next"]
+    codes: Optional[torch.Tensor] = None
+    text: Optional[str] = None
 
 
 def generate_long(
    *,
    model,
-    tokenizer: callable,
    device: str | torch.device,
    decode_one_token: callable,
    text: str,
@@ -448,42 +383,49 @@ def generate_long(
    compile: bool = False,
    iterative_prompt: bool = True,
    max_length: int = 2048,
-    chunk_length: int =
-    prompt_tokens: Optional[torch.Tensor] = None,
-    is_streaming: bool = False,
+    chunk_length: int = 150,
+    prompt_text: Optional[str | list[str]] = None,
+    prompt_tokens: Optional[torch.Tensor | list[torch.Tensor]] = None,
 ):
    assert 0 < top_p <= 1, "top_p must be in (0, 1]"
    assert 0 < repetition_penalty < 2, "repetition_penalty must be in (0, 2)"
    assert 0 < temperature < 2, "temperature must be in (0, 2)"
 
+    use_prompt = prompt_text is not None and prompt_tokens is not None
+    if use_prompt and isinstance(prompt_text, str):
+        prompt_text = [prompt_text]
+        prompt_tokens = [prompt_tokens]
+
+    assert use_prompt is False or len(prompt_text) == len(
+        prompt_tokens
+    ), "Prompt text and tokens must have the same length"
+
    model_size = sum(p.numel() for p in model.parameters() if p.requires_grad)
+    tokenizer = model.tokenizer
    im_end_id = tokenizer.convert_tokens_to_ids("<|im_end|>")
 
-    use_prompt = prompt_text is not None and prompt_tokens is not None
    encoded = []
    texts = split_text(text, chunk_length) if iterative_prompt else [text]
+    encoded_prompts = []
 
    if use_prompt:
+        for idx, (t, c) in enumerate(zip(prompt_text, prompt_tokens)):
+            encoded_prompts.append(
+                encode_tokens(
+                    tokenizer,
+                    string=t,
+                    device=device,
+                    prompt_tokens=c,
+                    num_codebooks=model.config.num_codebooks,
+                )
+            )
 
    for idx, text in enumerate(texts):
        encoded.append(
            encode_tokens(
                tokenizer,
                string=text,
-                bos=idx == 0 and not use_prompt,
                device=device,
-                speaker=speaker,
                num_codebooks=model.config.num_codebooks,
            )
        )
@@ -502,7 +444,6 @@ def generate_long(
        torch.cuda.synchronize()
 
    global_encoded = []
-    all_codes = []
    seg_idx = 0
 
    while seg_idx < len(encoded):
@@ -519,7 +460,9 @@ def generate_long(
        count = 0
        for i, length in enumerate(lengths):
            count += length
-            if count + length > max_length - 1024
+            if count + length > max_length - 1024 - sum(
+                t.shape[1] for t in encoded_prompts
+            ):
                break
 
            if i != 0 and i % 2 == 0:
@@ -532,7 +475,7 @@ def generate_long(
        partial_encoded = global_encoded
 
        if use_prompt:
-            partial_encoded =
+            partial_encoded = encoded_prompts + partial_encoded
 
        cat_encoded = torch.cat(partial_encoded, dim=1)
        prompt_length = cat_encoded.size(1)
@@ -542,7 +485,6 @@ def generate_long(
            model=model,
            prompt=cat_encoded,
            max_new_tokens=max_new_tokens,
-            eos_token_id=tokenizer.eos_token_id,
            im_end_id=im_end_id,
            decode_one_token=decode_one_token,
            temperature=temperature,
@@ -574,76 +516,66 @@ def generate_long(
 
        # Put the generated tokens
        # since there is <im_end> and <eos> tokens, we remove last 2 tokens
-        codes = y[1:, prompt_length:-
-        codes = codes - 2
+        codes = y[1:, prompt_length:-1].clone()
+        codes = codes - 1
        assert (codes >= 0).all(), f"Negative code found"
 
        decoded = y[:, prompt_length:-1].clone()
-        if decoded[0, -1] != im_end_id:  # <im_end>
-            val = [[im_end_id]] + [[CODEBOOK_EOS_TOKEN_ID]] * (decoded.size(0) - 1)
-            decoded = torch.cat(
-                (decoded, torch.tensor(val, device=device, dtype=torch.int)), dim=1
-            )
-
        # But for global encoding, we should keep the <im_end> token
+
        global_encoded.append(decoded)
+        assert (codes >= 0).all(), f"Negative code found: {codes}"
+        yield GenerateResponse(action="sample", codes=codes, text=texts[seg_idx])
+        seg_idx += 1
 
-            yield codes
-        else:
-            all_codes.append(codes)
-
-        seg_idx += 1
+    # This indicates the end of the current sample
+    yield GenerateResponse(action="next")
 
 
+@dataclass
+class WrappedGenerateResponse:
+    status: Literal["success", "error"]
+    response: Optional[GenerateResponse | Exception] = None
+
+
+@dataclass
+class GenerateRequest:
+    request: dict
+    response_queue: queue.Queue
 
 
 def launch_thread_safe_queue(
-    config_name,
    checkpoint_path,
    device,
    precision,
-    compile=False,
+    compile: bool = False,
 ):
    input_queue = queue.Queue()
    init_event = threading.Event()
 
    def worker():
        model, decode_one_token = load_model(
+            checkpoint_path, device, precision, compile=compile
        )
        init_event.set()
 
        while True:
-            item = input_queue.get()
+            item: GenerateRequest | None = input_queue.get()
            if item is None:
                break
 
-            kwargs = item
-            response_queue = item
+            kwargs = item.request
+            response_queue = item.response_queue
 
            try:
-                item["success"] = True
                for chunk in generate_long(
                    model=model, decode_one_token=decode_one_token, **kwargs
                ):
-                    response_queue.put(
+                    response_queue.put(
+                        WrappedGenerateResponse(status="success", response=chunk)
+                    )
            except Exception as e:
-                item["response"] = e
-
-                response_queue.put("done")
+                response_queue.put(WrappedGenerateResponse(status="error", response=e))
 
    threading.Thread(target=worker, daemon=True).start()
    init_event.wait()
@@ -657,57 +589,58 @@ def launch_thread_safe_queue(
    type=str,
    default="你说的对, 但是原神是一款由米哈游自主研发的开放世界手游.",
 )
-@click.option("--prompt-text", type=str, default=None)
+@click.option("--prompt-text", type=str, default=None, multiple=True)
 @click.option(
    "--prompt-tokens",
+    type=click.Path(path_type=Path, exists=True),
+    default=None,
+    multiple=True,
 )
 @click.option("--num-samples", type=int, default=1)
 @click.option("--max-new-tokens", type=int, default=0)
 @click.option("--top-p", type=float, default=0.7)
-@click.option("--repetition-penalty", type=float, default=1.
+@click.option("--repetition-penalty", type=float, default=1.2)
 @click.option("--temperature", type=float, default=0.7)
 @click.option(
    "--checkpoint-path",
    type=click.Path(path_type=Path, exists=True),
-    default="
+    default="checkpoints/fish-speech-1.2-sft",
 )
-@click.option("--
-@click.option("--tokenizer", type=str, default="fishaudio/fish-speech-1")
+@click.option("--device", type=str, default="cuda")
 @click.option("--compile/--no-compile", default=False)
 @click.option("--seed", type=int, default=42)
-@click.option("--speaker", type=str, default=None)
 @click.option("--half/--no-half", default=False)
 @click.option("--iterative-prompt/--no-iterative-prompt", default=True)
-@click.option("--
-@click.option("--chunk-length", type=int, default=30)
+@click.option("--chunk-length", type=int, default=100)
 def main(
    text: str,
-    prompt_text: Optional[str],
-    prompt_tokens: Optional[Path],
+    prompt_text: Optional[list[str]],
+    prompt_tokens: Optional[list[Path]],
    num_samples: int,
    max_new_tokens: int,
    top_p: int,
    repetition_penalty: float,
    temperature: float,
    checkpoint_path: Path,
-    tokenizer: str,
+    device: str,
    compile: bool,
    seed: int,
-    speaker: Optional[str],
    half: bool,
    iterative_prompt: bool,
-    max_length: int,
    chunk_length: int,
 ) -> None:
-    device = "cuda"
 
    precision = torch.half if half else torch.bfloat16
 
+    if prompt_text is not None and len(prompt_text) != len(prompt_tokens):
+        raise ValueError(
+            f"Number of prompt text ({len(prompt_text)}) and prompt tokens ({len(prompt_tokens)}) should be the same"
+        )
+
    logger.info("Loading model ...")
    t0 = time.time()
    model, decode_one_token = load_model(
+        checkpoint_path, device, precision, compile=compile
    )
 
    if torch.cuda.is_available():
@@ -715,13 +648,9 @@ def main(
 
    logger.info(f"Time to load model: {time.time() - t0:.02f} seconds")
 
-    prompt_tokens
-    torch.from_numpy(np.load(
-    if prompt_tokens is not None
-    else None
-    )
+    if prompt_tokens is not None:
+        prompt_tokens = [torch.from_numpy(np.load(p)).to(device) for p in prompt_tokens]
 
-    tokenizer = AutoTokenizer.from_pretrained(tokenizer)
    torch.manual_seed(seed)
 
    if torch.cuda.is_available():
@@ -737,19 +666,29 @@ def main(
        top_p=top_p,
        repetition_penalty=repetition_penalty,
        temperature=temperature,
-        tokenizer=tokenizer,
        compile=compile,
-        speaker=speaker,
        iterative_prompt=iterative_prompt,
-        max_length=max_length,
        chunk_length=chunk_length,
        prompt_text=prompt_text,
        prompt_tokens=prompt_tokens,
    )
 
+    idx = 0
+    codes = []
+
+    for response in generator:
+        if response.action == "sample":
+            codes.append(response.codes)
+            logger.info(f"Sampled text: {response.text}")
+        elif response.action == "next":
+            if codes:
+                np.save(f"codes_{idx}.npy", torch.cat(codes, dim=1).cpu().numpy())
+                logger.info(f"Saved codes to codes_{idx}.npy")
+            logger.info(f"Next sample")
+            codes = []
+            idx += 1
+        else:
+            logger.error(f"Error: {response}")
 
 
 if __name__ == "__main__":
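With the new interface, an illustrative invocation (paths and text are placeholders, not part of this commit) passes the checkpoint directory instead of a single .pth file, plus optional prompt text/token pairs, where the .npy file holds semantic codes extracted from the reference audio:

python tools/llama/generate.py --text "要转换的文本" --prompt-text "参考文本" --prompt-tokens fake.npy --checkpoint-path checkpoints/fish-speech-1.2-sft --num-samples 2

The sampled codes for each sample are written to codes_0.npy, codes_1.npy, ... as shown in main() above.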
tools/llama/merge_lora.py
ADDED
@@ -0,0 +1,95 @@
import shutil
from copy import deepcopy
from pathlib import Path

import click
import hydra
import torch
from hydra import compose, initialize
from hydra.utils import instantiate
from loguru import logger

from fish_speech.models.text2semantic.llama import BaseTransformer
from fish_speech.models.text2semantic.lora import get_merged_state_dict


@click.command()
@click.option("--lora-config", type=str, default="r_8_alpha_16")
@click.option("--base-weight", type=str, default="checkpoints/fish-speech-1.2-sft")
@click.option("--lora-weight", type=str, required=True)
@click.option("--output", type=str, required=True)
def merge(lora_config, base_weight, lora_weight, output):
    output = Path(output)
    logger.info(
        f"Merging {base_weight} and {lora_weight} into {output} with {lora_config}"
    )

    with initialize(version_base="1.3", config_path="../../fish_speech/configs/lora"):
        cfg = compose(config_name=lora_config)

    lora_config = instantiate(cfg)
    logger.info(f"Loaded lora model with config {lora_config}")

    llama_model = BaseTransformer.from_pretrained(
        path=base_weight,
        load_weights=True,
        lora_config=lora_config,
    )
    logger.info(f"Loaded llama model")

    llama_state_dict = llama_model.state_dict()
    llama_state_dict = {k: v for k, v in llama_state_dict.items() if "lora" not in k}
    llama_state_dict_copy = deepcopy(llama_state_dict)
    lora_state_dict = torch.load(lora_weight, map_location="cpu")

    if "state_dict" in llama_state_dict:
        llama_state_dict = llama_state_dict["state_dict"]

    if "state_dict" in lora_state_dict:
        lora_state_dict = lora_state_dict["state_dict"]

    # remove prefix model.
    if any(k.startswith("model.") for k in llama_state_dict.keys()):
        llama_state_dict = {
            k.replace("model.", ""): v
            for k, v in llama_state_dict.items()
            if k.startswith("model.")
        }
    if any(k.startswith("model.") for k in lora_state_dict.keys()):
        lora_state_dict = {
            k.replace("model.", ""): v
            for k, v in lora_state_dict.items()
            if k.startswith("model.")
        }

    logger.info(f"Found {len(llama_state_dict)} keys in llama model")
    logger.info(f"Found {len(lora_state_dict)} keys in lora model")

    merged_state_dict = llama_state_dict | lora_state_dict
    llama_model.load_state_dict(merged_state_dict, strict=True)
    logger.info(f"Merged model loaded")

    # Trigger eval mode to merge lora
    llama_model.eval()
    llama_model.save_pretrained(output, drop_lora=True)
    logger.info(f"Saved merged model to {output}, validating")

    new_state_dict = torch.load(output / "model.pth", map_location="cpu")
    original_keys = set(llama_state_dict_copy.keys())
    merged_keys = set(new_state_dict.keys())

    assert original_keys == merged_keys, "Keys should be same"

    for key in original_keys:
        diff_l1 = (new_state_dict[key] - llama_state_dict_copy[key]).abs().sum().item()
        if diff_l1 != 0:
            break
    else:
        logger.error("Merged model is same as the original model")
        exit(1)

    logger.info("Merged model is different from the original model, check passed")


if __name__ == "__main__":
    merge()
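An illustrative merge command (the LoRA checkpoint and output paths are placeholders, not part of this commit):

python tools/llama/merge_lora.py --lora-config r_8_alpha_16 --base-weight checkpoints/fish-speech-1.2-sft --lora-weight results/lora/checkpoints/step_000001000.ckpt --output checkpoints/merged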
tools/llama/quantize.py
CHANGED
@@ -1,16 +1,20 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
+import datetime
+import shutil

 # This source code is licensed under the license found in the
 # LICENSE file in the root directory of this source tree.
 import time
 from pathlib import Path

+import click
 import torch
 import torch.nn as nn
 import torch.nn.functional as F

-from fish_speech.models.text2semantic.llama import
+from fish_speech.models.text2semantic.llama import find_multiple
+from tools.llama.generate import load_model

 ##### Quantization Primitives ######

@@ -414,13 +418,26 @@ class WeightOnlyInt4Linear(torch.nn.Module):
     )


-def
+def generate_folder_name():
+    now = datetime.datetime.now()
+    folder_name = now.strftime("%Y%m%d_%H%M%S")
+    return folder_name
+
+
+@click.command()
+@click.option(
+    "--checkpoint-path",
+    type=click.Path(path_type=Path, exists=True),
+    default="checkpoints/fish-speech-1.2-sft",
+)
+@click.option(
+    "--mode", type=str, default="int8", help="type of quantization to perform"
+)
+@click.option(
+    "--groupsize", type=int, default=128, help="Group size for int4 quantization."
+)
+@click.option("--timestamp", type=str, default="None", help="When to do quantization")
+def quantize(checkpoint_path: Path, mode: str, groupsize: int, timestamp: str) -> None:

     device = "cpu"
     precision = torch.bfloat16
@@ -428,31 +445,14 @@ def quantize(
     print("Loading model ...")
     t0 = time.time()

-            rope_base=10000,
-            norm_eps=1e-5,
-            num_codebooks=4,  # single codebook
-            codebook_size=168,  # codebook size 160 + 2 special tokens
-        )
-    )
-
-    checkpoint = torch.load(str(checkpoint_path), mmap=True, weights_only=True)
-    if "state_dict" in checkpoint:
-        checkpoint = checkpoint["state_dict"]
-    checkpoint = {
-        k.replace("model.", ""): v
-        for k, v in checkpoint.items()
-        if k.startswith("model.")
-    }
-    model.load_state_dict(checkpoint, assign=True)
-    model = model.to(dtype=precision, device=device)
+    model, _ = load_model(
+        checkpoint_path=checkpoint_path,
+        device=device,
+        precision=precision,
+        compile=False,
+    )
+    vq_model = "firefly-gan-vq-fsq-4x1024-42hz-generator.pth"
+    now = timestamp if timestamp != "None" else generate_folder_name()

     if mode == "int8":
         print(
@@ -461,10 +461,12 @@ def quantize(
         quant_handler = WeightOnlyInt8QuantHandler(model)
         quantized_state_dict = quant_handler.create_quantized_state_dict()

-        dir_name = checkpoint_path
+        dir_name = checkpoint_path
+        dst_name = Path(f"checkpoints/fs-1.2-int8-{now}")
+        shutil.copytree(str(dir_name.resolve()), str(dst_name.resolve()))
+        if (dst_name / vq_model).exists():
+            (dst_name / vq_model).unlink()
+        quantize_path = dst_name / "model.pth"

     elif mode == "int4":
         print(
@@ -473,10 +475,12 @@ def quantize(
         quant_handler = WeightOnlyInt4QuantHandler(model, groupsize)
         quantized_state_dict = quant_handler.create_quantized_state_dict()

-        dir_name = checkpoint_path
+        dir_name = checkpoint_path
+        dst_name = Path(f"checkpoints/fs-1.2-int4-g{groupsize}-{now}")
+        shutil.copytree(str(dir_name.resolve()), str(dst_name.resolve()))
+        if (dst_name / vq_model).exists():
+            (dst_name / vq_model).unlink()
+        quantize_path = dst_name / "model.pth"

     else:
         raise ValueError(
@@ -490,26 +494,4 @@ def quantize(


 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Quantize a model.")
-    parser.add_argument(
-        "--checkpoint_path",
-        type=Path,
-        default=Path("checkpoints/meta-llama/Llama-2-7b-chat-hf/model.pth"),
-        help="Path to the model checkpoint to be quantized.",
-    )
-    parser.add_argument(
-        "--mode",
-        "-q",
-        type=str,
-        default="int8",
-        choices=["int8", "int4"],
-        help="type of quantization to perform",
-    )
-    parser.add_argument(
-        "--groupsize", type=int, default=32, help="Group size for int4 quantization."
-    )
-
-    args = parser.parse_args()
-    quantize(args.checkpoint_path, args.mode, args.groupsize)
+    quantize()
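The script now loads the model through the shared load_model helper, copies the checkpoint directory to a timestamped folder such as checkpoints/fs-1.2-int8-<timestamp>, drops the VQ generator weights from the copy, and writes the quantized model.pth there. The int8 path itself is GPT-fast-style weight-only quantization: each linear weight is stored as int8 with one scale per output channel while activations stay in the original precision. The sketch below only illustrates that underlying arithmetic; it is not the repository's WeightOnlyInt8QuantHandler:

import torch
import torch.nn.functional as F

def quantize_weight_int8(w: torch.Tensor):
    # Symmetric per-output-channel quantization: one scale per row of the weight
    scale = w.abs().amax(dim=1, keepdim=True) / 127.0
    q = torch.clamp(torch.round(w / scale), -128, 127).to(torch.int8)
    return q, scale.squeeze(1)

def int8_linear(x: torch.Tensor, q: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
    # Dequantize on the fly; activations keep their original dtype
    return F.linear(x, q.to(x.dtype) * scale[:, None])

w, x = torch.randn(16, 32), torch.randn(4, 32)
q, s = quantize_weight_int8(w)
print((int8_linear(x, q, s) - x @ w.t()).abs().max())  # small quantization error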
tools/llama/rebuild_tokenizer.py
ADDED
@@ -0,0 +1,57 @@
+from tokenizers import Tokenizer, decoders, models, pre_tokenizers, processors, trainers
+from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
+
+# Initialize a tokenizer
+tokenizer = Tokenizer(models.BPE())
+
+# Customize pre-tokenization and decoding
+tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
+tokenizer.decoder = decoders.ByteLevel()
+tokenizer.post_processor = processors.ByteLevel(trim_offsets=False)
+
+# Don't train the tokenizer
+trainer = trainers.BpeTrainer(
+    vocab_size=0,
+    min_frequency=2,
+    initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
+    special_tokens=[
+        "<|begin_of_sequence|>",
+        "<|end_of_sequence|>",
+        "<|im_start|>",
+        "<|im_sep|>",  # system, user, assistant, etc.
+        "<|im_end|>",
+        "<|semantic|>",  # audio features
+        "<|pad|>",
+    ],
+)
+
+# <|im_start|>user<|im_sep|>...<|im_end|>
+# <|im_start|>assistant<|im_sep|><|semantic|><|semantic|><|semantic|><|semantic|><|semantic|><|im_end|>
+tokenizer.train_from_iterator([], trainer=trainer)
+
+print(len(tokenizer.get_vocab()))
+x = tokenizer.encode(
+    "Hello, how are you? dfgnviadfjoiviouajeiodfjv 你好世界 🈶<|semantic|>"
+).ids
+print(x, len(x))
+print(tokenizer.decode(x, skip_special_tokens=True))
+
+
+tokenizer = PreTrainedTokenizerFast(
+    tokenizer_object=tokenizer,
+    pad_token="<|pad|>",
+    bos_token="<|begin_of_sequence|>",
+    eos_token="<|end_of_sequence|>",
+)
+
+# Try tokenizing a new sequence
+sequence = "All around, too, lay vast quantities of the costliest merchandise, and treasures were heaped in every cranny of the rocks, but all these things only added to the desolation of the scene. 测试中文, 你好世界 🈶<|semantic|>"
+encoded = tokenizer(sequence).input_ids
+
+print("Test encoding....")
+print(f"\tSentence: {sequence}")
+print(f"\tEncoded: {encoded}")
+print(f"\tDecoded: {tokenizer.batch_decode(encoded)}")
+print(f"\tDecoded: {tokenizer.decode(encoded)}")
+
+tokenizer.push_to_hub("fishaudio/fish-speech-1", private=True)
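Because the trainer is run on an empty iterator with vocab_size=0, the resulting tokenizer contains only the byte-level alphabet plus the chat and audio special tokens, so text is encoded byte by byte. A short sketch of consuming it with the prompt layout the comments above describe (the local path is an assumption; any directory containing the pushed tokenizer files works):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("checkpoints/fish-speech-1.2-sft")

# <|im_start|>user<|im_sep|>...<|im_end|> followed by an assistant turn of <|semantic|> tokens
prompt = (
    "<|im_start|>user<|im_sep|>Hello world.<|im_end|>"
    "<|im_start|>assistant<|im_sep|>"
)
ids = tokenizer(prompt).input_ids
print(len(ids), tokenizer.decode(ids))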
tools/vqgan/create_train_split.py
ADDED
@@ -0,0 +1,83 @@
+import math
+from pathlib import Path
+from random import Random
+
+import click
+from loguru import logger
+from pydub import AudioSegment
+from tqdm import tqdm
+
+from fish_speech.utils.file import AUDIO_EXTENSIONS, list_files, load_filelist
+
+
+@click.command()
+@click.argument("root", type=click.Path(exists=True, path_type=Path))
+@click.option("--val-ratio", type=float, default=None)
+@click.option("--val-count", type=int, default=None)
+@click.option("--filelist", default=None, type=Path)
+@click.option("--min-duration", default=None, type=float)
+@click.option("--max-duration", default=None, type=float)
+def main(root, val_ratio, val_count, filelist, min_duration, max_duration):
+    if filelist:
+        files = [i[0] for i in load_filelist(filelist)]
+    else:
+        files = list_files(root, AUDIO_EXTENSIONS, recursive=True, sort=True)
+
+    if min_duration is None and max_duration is None:
+        filtered_files = list(map(str, [file.relative_to(root) for file in files]))
+    else:
+        filtered_files = []
+        for file in tqdm(files):
+            try:
+                audio = AudioSegment.from_file(str(file))
+                duration = len(audio) / 1000.0
+
+                if min_duration is not None and duration < min_duration:
+                    logger.info(
+                        f"Skipping {file} due to duration {duration:.2f} < {min_duration:.2f}"
+                    )
+                    continue
+
+                if max_duration is not None and duration > max_duration:
+                    logger.info(
+                        f"Skipping {file} due to duration {duration:.2f} > {max_duration:.2f}"
+                    )
+                    continue
+
+                filtered_files.append(str(file.relative_to(root)))
+            except Exception as e:
+                logger.info(f"Error processing {file}: {e}")
+
+    logger.info(
+        f"Found {len(files)} files, remaining {len(filtered_files)} files after filtering"
+    )
+
+    Random(42).shuffle(filtered_files)
+
+    if val_count is None and val_ratio is None:
+        logger.info("Validation ratio and count not specified, using min(20%, 100)")
+        val_size = min(100, math.ceil(len(filtered_files) * 0.2))
+    elif val_count is not None and val_ratio is not None:
+        logger.error("Cannot specify both val_count and val_ratio")
+        return
+    elif val_count is not None:
+        if val_count < 1 or val_count > len(filtered_files):
+            logger.error("val_count must be between 1 and number of files")
+            return
+        val_size = val_count
+    else:
+        val_size = math.ceil(len(filtered_files) * val_ratio)
+
+    logger.info(f"Using {val_size} files for validation")
+
+    with open(root / "vq_train_filelist.txt", "w", encoding="utf-8") as f:
+        f.write("\n".join(filtered_files[val_size:]))
+
+    with open(root / "vq_val_filelist.txt", "w", encoding="utf-8") as f:
+        f.write("\n".join(filtered_files[:val_size]))
+
+    logger.info("Done")
+
+
+if __name__ == "__main__":
+    main()
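When neither --val-ratio nor --val-count is given, the script caps the validation split at whichever is smaller: 20% of the filtered files or 100 files. A quick illustration of that default rule:

import math

def default_val_size(n_files: int) -> int:
    # min(20%, 100), matching the script's behaviour when neither option is set
    return min(100, math.ceil(n_files * 0.2))

for n in (50, 480, 5000):
    print(n, default_val_size(n))  # -> 10, 96, 100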
tools/vqgan/extract_vq.py
ADDED
@@ -0,0 +1,227 @@
+import os
+import subprocess as sp
+import sys
+import time
+from datetime import timedelta
+from functools import lru_cache
+from pathlib import Path
+from random import Random
+
+import click
+import numpy as np
+import torch
+import torchaudio
+from hydra import compose, initialize
+from hydra.utils import instantiate
+from lightning import LightningModule
+from loguru import logger
+from omegaconf import OmegaConf
+
+from fish_speech.utils.file import AUDIO_EXTENSIONS, list_files, load_filelist
+
+# register eval resolver
+OmegaConf.register_new_resolver("eval", eval)
+# This file is used to convert the audio files to text files using the Whisper model.
+# It's mainly used to generate the training data for the VQ model.
+
+
+RANK = int(os.environ.get("SLURM_PROCID", 0))
+WORLD_SIZE = int(os.environ.get("SLURM_NTASKS", 1))
+
+logger_format = (
+    "<green>{time:YYYY-MM-DD HH:mm:ss.SSS}</green> | "
+    "<level>{level: <8}</level> | "
+    "<cyan>{name}</cyan>:<cyan>{function}</cyan>:<cyan>{line}</cyan> | "
+    "{extra[rank]} - <level>{message}</level>"
+)
+logger.configure(extra={"rank": f"RANK: {RANK} / {WORLD_SIZE}"})
+logger.remove()
+logger.add(sys.stderr, format=logger_format)
+
+
+@lru_cache(maxsize=1)
+def get_model(
+    config_name: str = "firefly_gan_vq",
+    checkpoint_path: str = "checkpoints/fish-speech-1.2-sft/firefly-gan-vq-fsq-4x1024-42hz-generator.pth",
+    device: str | torch.device = "cuda",
+):
+    with initialize(version_base="1.3", config_path="../../fish_speech/configs"):
+        cfg = compose(config_name=config_name)
+
+    model = instantiate(cfg)
+    state_dict = torch.load(
+        checkpoint_path,
+        map_location=device,
+    )
+    if "state_dict" in state_dict:
+        state_dict = state_dict["state_dict"]
+
+    if any("generator" in k for k in state_dict):
+        state_dict = {
+            k.replace("generator.", ""): v
+            for k, v in state_dict.items()
+            if "generator." in k
+        }
+
+    model.load_state_dict(state_dict, strict=False)
+    model.eval()
+    model.to(device)
+
+    logger.info(f"Loaded model")
+    return model
+
+
+@torch.inference_mode()
+def process_batch(files: list[Path], model) -> float:
+    wavs = []
+    audio_lengths = []
+    new_files = []
+    max_length = total_time = 0
+
+    for file in files:
+        try:
+            wav, sr = torchaudio.load(
+                str(file), backend="sox" if sys.platform == "linux" else "soundfile"
+            )  # Need to install libsox-dev
+        except Exception as e:
+            logger.error(f"Error reading {file}: {e}")
+            continue
+
+        if wav.shape[0] > 1:
+            wav = wav.mean(dim=0, keepdim=True)
+
+        wav = torchaudio.functional.resample(
+            wav.cuda(), sr, model.spec_transform.sample_rate
+        )[0]
+        total_time += len(wav) / model.spec_transform.sample_rate
+        max_length = max(max_length, len(wav))
+
+        wavs.append(wav)
+        audio_lengths.append(len(wav))
+        new_files.append(file)
+
+    files = new_files
+
+    # Pad to max length
+    for i, wav in enumerate(wavs):
+        wavs[i] = torch.nn.functional.pad(wav, (0, max_length - len(wav)), "constant")
+
+    audios = torch.stack(wavs, dim=0)[:, None]
+    audio_lengths = torch.tensor(audio_lengths, device=model.device, dtype=torch.long)
+
+    # Calculate lengths
+    indices, feature_lengths = model.encode(audios, audio_lengths)
+
+    # Save to disk
+    outputs = indices.cpu().numpy()
+
+    for file, length, feature, audio_length in zip(
+        files, feature_lengths, outputs, audio_lengths
+    ):
+        feature = feature[:, :length]
+
+        # (T,)
+        with open(file.with_suffix(".npy"), "wb") as f:
+            np.save(f, feature)
+
+    return total_time
+
+
+@click.command()
+@click.argument("folder")
+@click.option("--num-workers", default=1)
+@click.option("--config-name", default="firefly_gan_vq")
+@click.option(
+    "--checkpoint-path",
+    default="checkpoints/fish-speech-1.2-sft/firefly-gan-vq-fsq-4x1024-42hz-generator.pth",
+)
+@click.option("--batch-size", default=64)
+@click.option("--filelist", default=None, type=Path)
+def main(
+    folder: str,
+    num_workers: int,
+    config_name: str,
+    checkpoint_path: str,
+    batch_size: int,
+    filelist: Path,
+):
+    if num_workers > 1 and WORLD_SIZE != num_workers:
+        assert WORLD_SIZE == 1, "You should either use SLURM or this launcher, not both"
+
+        logger.info(f"Spawning {num_workers} workers")
+
+        if torch.cuda.is_available():
+            visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES", None)
+            if visible_devices is None:
+                visible_devices = list(range(torch.cuda.device_count()))
+            else:
+                visible_devices = visible_devices.split(",")
+        else:
+            # Set to empty string to avoid using GPU
+            visible_devices = [""]
+
+        processes = []
+        for i in range(num_workers):
+            env = os.environ.copy()
+            env["CUDA_VISIBLE_DEVICES"] = str(visible_devices[i % len(visible_devices)])
+            env["SLURM_PROCID"] = str(i)
+            env["SLURM_NTASKS"] = str(num_workers)
+
+            processes.append(
+                sp.Popen(
+                    [sys.executable] + sys.argv.copy(),
+                    env=env,
+                )
+            )
+
+        for p in processes:
+            p.wait()
+
+        logger.info(f"All workers finished")
+        return
+
+    # This is a worker
+    logger.info(f"Starting worker")
+    if filelist:
+        files = [i[0] for i in load_filelist(filelist)]
+    else:
+        files = list_files(folder, AUDIO_EXTENSIONS, recursive=True, sort=False)
+
+    print(f"Found {len(files)} files")
+    files = [Path(f) for f in files if not Path(f).with_suffix(".npy").exists()]
+
+    total_files = len(files)
+    files = files[RANK::WORLD_SIZE]
+    logger.info(f"Processing {len(files)}/{total_files} files")
+
+    # Batch processing
+    total_time = 0
+    begin_time = time.time()
+    processed_files = 0
+    model = get_model(config_name, checkpoint_path)
+
+    for n_batch, idx in enumerate(range(0, len(files), batch_size)):
+        batch = files[idx : idx + batch_size]
+        batch_time = process_batch(batch, model)
+
+        total_time += batch_time
+        processed_files += len(batch)
+
+        if (n_batch + 1) % 10 == 0:
+            eta = (
+                (time.time() - begin_time)
+                / processed_files
+                * (len(files) - processed_files)
+            )
+            logger.info(
+                f"Processed {processed_files} files, {total_time / 3600:.2f} hours of audio, "
+                + f"ETA: {timedelta(seconds=round(eta))}s"
+            )
+
+    logger.info(
+        f"Finished processing {len(files)} files, {total_time / 3600:.2f} hours of audio"
+    )
+
+
+if __name__ == "__main__":
+    main()
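The launcher/worker split above leans on strided sharding: each worker processes files[RANK::WORLD_SIZE], so the shards are disjoint and together cover every file with no coordination beyond two environment variables. A tiny illustration of that slicing (the file names are made up):

files = [f"clip_{i}.wav" for i in range(10)]
world_size = 3

shards = [files[rank::world_size] for rank in range(world_size)]
print(shards)

# Shards are disjoint and complete
assert sorted(sum(shards, [])) == sorted(files)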
tools/vqgan/inference.py
CHANGED
@@ -2,13 +2,12 @@ from pathlib import Path

 import click
 import hydra
-import librosa
 import numpy as np
 import soundfile as sf
 import torch
+import torchaudio
 from hydra import compose, initialize
 from hydra.utils import instantiate
-from lightning import LightningModule
 from loguru import logger
 from omegaconf import OmegaConf

@@ -23,20 +22,26 @@ def load_model(config_name, checkpoint_path, device="cuda"):
     with initialize(version_base="1.3", config_path="../../fish_speech/configs"):
         cfg = compose(config_name=config_name)

-    model
+    model = instantiate(cfg)
     state_dict = torch.load(
         checkpoint_path,
-        map_location=
+        map_location=device,
     )
     if "state_dict" in state_dict:
         state_dict = state_dict["state_dict"]

+    if any("generator" in k for k in state_dict):
+        state_dict = {
+            k.replace("generator.", ""): v
+            for k, v in state_dict.items()
+            if "generator." in k
+        }
+
+    result = model.load_state_dict(state_dict, strict=False)
     model.eval()
     model.to(device)
-    logger.info("Restored model from checkpoint")

+    logger.info(f"Loaded model: {result}")
     return model


@@ -51,11 +56,10 @@ def load_model(config_name, checkpoint_path, device="cuda"):
 @click.option(
     "--output-path", "-o", default="fake.wav", type=click.Path(path_type=Path)
 )
-@click.option("--config-name",
+@click.option("--config-name", default="firefly_gan_vq")
 @click.option(
     "--checkpoint-path",
-    default="checkpoints/vq-gan-group-fsq-2x1024.pth",
+    default="checkpoints/fish-speech-1.2-sft/firefly-gan-vq-fsq-4x1024-42hz-generator.pth",
 )
 @click.option(
     "--device",
@@ -67,21 +71,22 @@ def main(input_path, output_path, config_name, checkpoint_path, device):

     if input_path.suffix in AUDIO_EXTENSIONS:
         logger.info(f"Processing in-place reconstruction of {input_path}")
+
         # Load audio
-        audio,
+        audio, sr = torchaudio.load(str(input_path))
+        if audio.shape[0] > 1:
+            audio = audio.mean(0, keepdim=True)
+        audio = torchaudio.functional.resample(
+            audio, sr, model.spec_transform.sample_rate
         )
-
+
+        audios = audio[None].to(device)
         logger.info(
-            f"Loaded audio with {audios.shape[2] / model.
+            f"Loaded audio with {audios.shape[2] / model.spec_transform.sample_rate:.2f} seconds"
         )

         # VQ Encoder
-        audio_lengths = torch.tensor(
-            [audios.shape[2]], device=model.device, dtype=torch.long
-        )
+        audio_lengths = torch.tensor([audios.shape[2]], device=device, dtype=torch.long)
         indices = model.encode(audios, audio_lengths)[0][0]

         logger.info(f"Generated indices of shape {indices.shape}")
@@ -91,17 +96,15 @@ def main(input_path, output_path, config_name, checkpoint_path, device):
     elif input_path.suffix == ".npy":
         logger.info(f"Processing precomputed indices from {input_path}")
         indices = np.load(input_path)
-        indices = torch.from_numpy(indices).to(
+        indices = torch.from_numpy(indices).to(device).long()
         assert indices.ndim == 2, f"Expected 2D indices, got {indices.ndim}"
     else:
         raise ValueError(f"Unknown input type: {input_path}")

     # Restore
-    feature_lengths = torch.tensor([indices.shape[1]], device=
-    fake_audios = model.decode(
-    )
-    audio_time = fake_audios.shape[-1] / model.sampling_rate
+    feature_lengths = torch.tensor([indices.shape[1]], device=device)
+    fake_audios = model.decode(indices=indices[None], feature_lengths=feature_lengths)
+    audio_time = fake_audios.shape[-1] / model.spec_transform.sample_rate

     logger.info(
         f"Generated audio of shape {fake_audios.shape}, equivalent to {audio_time:.2f} seconds from {indices.shape[1]} features, features/second: {indices.shape[1] / audio_time:.2f}"
@@ -109,7 +112,7 @@ def main(input_path, output_path, config_name, checkpoint_path, device):

     # Save audio
     fake_audio = fake_audios[0, 0].float().cpu().numpy()
-    sf.write(output_path, fake_audio, model.
+    sf.write(output_path, fake_audio, model.spec_transform.sample_rate)
     logger.info(f"Saved audio to {output_path}")
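With these changes the script performs the full Firefly GAN VQ round trip through torchaudio instead of librosa: resample to model.spec_transform.sample_rate, encode to integer codebook indices, then decode back to a waveform. A condensed sketch of that flow, assuming the default checkpoint layout used above and a local reference.wav (both are assumptions):

import torch
import torchaudio

from tools.vqgan.inference import load_model

model = load_model(
    "firefly_gan_vq",
    "checkpoints/fish-speech-1.2-sft/firefly-gan-vq-fsq-4x1024-42hz-generator.pth",
    device="cuda",
)

# Load, mix down to mono, and resample to the model's rate
wav, sr = torchaudio.load("reference.wav")
wav = torchaudio.functional.resample(
    wav.mean(0, keepdim=True), sr, model.spec_transform.sample_rate
)

audios = wav[None].cuda()
audio_lengths = torch.tensor([audios.shape[2]], device="cuda", dtype=torch.long)
indices = model.encode(audios, audio_lengths)[0][0]  # 2D tensor: codebooks x frames

feature_lengths = torch.tensor([indices.shape[1]], device="cuda")
fake_audios = model.decode(indices=indices[None], feature_lengths=feature_lengths)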