Spaces · Sleeping

lengyue233 committed · Commit 12b4214 · 1 parent: 662d788

Update model to large sft

Files changed:
- app.py                             +53 -37
- tools/extract_model.py              +0 -21
- tools/llama/build_dataset.py        +0 -165
- tools/llama/generate.py            +64 -8
- tools/llama/rebuild_tokenizer.py    +0 -57
- tools/merge_asr_files.py            +0 -55
- tools/vqgan/create_train_split.py   +0 -54
- tools/vqgan/extract_vq.py           +0 -213
- tools/whisper_asr.py                +0 -113
app.py
CHANGED

@@ -1,34 +1,26 @@
 import subprocess as sp
 import os
+from huggingface_hub import hf_hub_download

 # Download if not exists
 os.makedirs("checkpoints", exist_ok=True)
-
-if not os.path.exists("checkpoints/text2semantic-medium-v1-2k.pth"):
-    print("Downloading text2semantic-medium-v1-2k.pth")
-    sp.run(["wget", "-q", "-O", "checkpoints/text2semantic-medium-v1-2k.pth", os.environ["CKPT_SEMANTIC"]])
-
-if not os.path.exists("checkpoints/vq-gan-group-fsq-2x1024.pth"):
-    print("Downloading vq-gan-group-fsq-2x1024.pth")
-    sp.run(["wget", "-q", "-O", "checkpoints/vq-gan-group-fsq-2x1024.pth", os.environ["CKPT_VQGAN"]])
+hf_hub_download("fishaudio/fish-speech-1", "./checkpoints/fish-speech-1")

 print("All checkpoints downloaded")

 import html
+import os
+import threading
 from argparse import ArgumentParser
-from io import BytesIO
 from pathlib import Path

 import gradio as gr
 import librosa
-import spaces
 import torch
 from loguru import logger
-from torchaudio import functional as AF
 from transformers import AutoTokenizer

-from tools.llama.generate import
-from tools.llama.generate import load_model as load_llama_model
+from tools.llama.generate import launch_thread_safe_queue
 from tools.vqgan.inference import load_model as load_vqgan_model

 # Make einx happy
@@ -52,16 +44,30 @@ We are not responsible for any misuse of the model, please consider your local l

 TEXTBOX_PLACEHOLDER = """Put your text here. 在此处输入文本."""

+try:
+    import spaces
+
+    GPU_DECORATOR = spaces.GPU
+except ImportError:
+
+    def GPU_DECORATOR(func):
+        def wrapper(*args, **kwargs):
+            return func(*args, **kwargs)
+
+        return wrapper
+

 def build_html_error_message(error):
     return f"""
-    <div style="color: red;
+    <div style="color: red;
+    font-weight: bold;">
     {html.escape(error)}
     </div>
     """


-@
+@GPU_DECORATOR
+@torch.inference_mode()
 def inference(
     text,
     enable_reference_audio,
@@ -73,13 +79,10 @@ def inference(
     top_p,
     repetition_penalty,
     temperature,
-    speaker
+    speaker,
 ):
-    if len(reference_text) > 100:
-        return None, "Ref text is too long, please keep it under 100 characters."
-
     if args.max_gradio_length > 0 and len(text) > args.max_gradio_length:
-        return None, "Text is too long, please keep it under
+        return None, f"Text is too long, please keep it under {args.max_gradio_length} characters."

     # Parse reference audio aka prompt
     prompt_tokens = None
@@ -103,11 +106,9 @@ def inference(
         prompt_tokens = vqgan_model.encode(audios, audio_lengths)[0][0]

     # LLAMA Inference
-
-        model=llama_model,
+    request = dict(
         tokenizer=llama_tokenizer,
         device=vqgan_model.device,
-        decode_one_token=decode_one_token,
         max_new_tokens=max_new_tokens,
         text=text,
         top_k=int(top_k) if top_k > 0 else None,
@@ -123,7 +124,18 @@ def inference(
         prompt_text=reference_text if enable_reference_audio else None,
     )

-
+    payload = dict(
+        event=threading.Event(),
+        request=request,
+    )
+    llama_queue.put(payload)
+
+    # Wait for the result
+    payload["event"].wait()
+    if payload["success"] is False:
+        raise payload["response"]
+
+    codes = payload["response"][0]

     # VQGAN Inference
     feature_lengths = torch.tensor([codes.shape[1]], device=vqgan_model.device)
@@ -151,9 +163,7 @@ def build_app():
         with gr.Row():
             with gr.Column(scale=3):
                 text = gr.Textbox(
-                    label="Input Text / 输入文本",
-                    placeholder=TEXTBOX_PLACEHOLDER,
-                    lines=15,
+                    label="Input Text / 输入文本", placeholder=TEXTBOX_PLACEHOLDER, lines=15
                 )

                 with gr.Row():
@@ -198,11 +208,11 @@ def build_app():
                         step=0.01,
                     )

-
-
-
-
-
+                    speaker = gr.Textbox(
+                        label="Speaker / 说话人",
+                        placeholder="Type name of the speaker / 输入说话人的名称",
+                        lines=1,
+                    )

                 with gr.Tab(label="Reference Audio / 参考音频"):
                     gr.Markdown(
@@ -248,7 +258,7 @@ def build_app():
                 top_p,
                 repetition_penalty,
                 temperature,
-
+                speaker,
             ],
             [audio, error],
             concurrency_limit=1,
@@ -262,10 +272,10 @@ def parse_args():
    parser.add_argument(
        "--llama-checkpoint-path",
        type=Path,
-       default="checkpoints/text2semantic-
+       default="checkpoints/text2semantic-sft-large-v1-4k.pth",
    )
    parser.add_argument(
-       "--llama-config-name", type=str, default="
+       "--llama-config-name", type=str, default="dual_ar_2_codebook_large"
    )
    parser.add_argument(
        "--vqgan-checkpoint-path",
@@ -278,7 +288,7 @@ def parse_args():
    parser.add_argument("--half", action="store_true")
    parser.add_argument("--max-length", type=int, default=2048)
    parser.add_argument("--compile", action="store_true")
-   parser.add_argument("--max-gradio-length", type=int, default=
+   parser.add_argument("--max-gradio-length", type=int, default=0)

    return parser.parse_args()

@@ -288,9 +298,15 @@ if __name__ == "__main__":

    args.precision = torch.half if args.half else torch.bfloat16
    args.compile = True
+   args.max_gradio_length = 1024
+   args.tokenizer = "./checkpoints/fish-speech-1"
+   args.llama_checkpoint_path = "./checkpoints/text2semantic-sft-large-v1-4k.pth"
+   args.llama_config_name = "dual_ar_2_codebook_large"
+   args.vqgan_checkpoint_path = "./checkpoints/vq-gan-group-fsq-2x1024.pth"
+   args.vqgan_config_name = "vqgan_pretrain"

    logger.info("Loading Llama model...")
-
+   llama_queue = launch_thread_safe_queue(
        config_name=args.llama_config_name,
        checkpoint_path=args.llama_checkpoint_path,
        device=args.device,
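The inference path above no longer calls the Llama model directly: the Gradio handler hands a request to a background worker through a queue and blocks on a threading.Event until the worker fills in success/response on the same payload. The snippet below is a minimal, self-contained sketch of that handshake pattern only; the worker and its fake "generation" step are illustrative stand-ins, not code from this repository.

import queue
import threading

def worker(q: queue.Queue):
    while True:
        item = q.get()
        if item is None:  # sentinel used to stop the worker
            break
        try:
            item["success"] = True
            item["response"] = item["request"]["text"].upper()  # stand-in for generation
        except Exception as e:
            item["success"] = False
            item["response"] = e
        item["event"].set()  # wake up the caller waiting on this payload

q = queue.Queue()
threading.Thread(target=worker, args=(q,), daemon=True).start()

payload = {"event": threading.Event(), "request": {"text": "hello"}}
q.put(payload)
payload["event"].wait()        # block until the worker has written the result
print(payload["response"])     # "HELLO" on success, or the captured exception
q.put(None)                    # shut the worker down

The same contract appears in launch_thread_safe_queue in tools/llama/generate.py below: the worker sets success and response on the payload and then sets the event, so the request/response pair never has to cross threads any other way.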
tools/extract_model.py
DELETED (entire file, 21 lines)

import click
import torch
from loguru import logger


@click.command()
@click.argument("model_path")
@click.argument("output_path")
def main(model_path, output_path):
    if model_path == output_path:
        logger.error("Model path and output path are the same")
        return

    logger.info(f"Loading model from {model_path}")
    state_dict = torch.load(model_path, map_location="cpu")["state_dict"]
    torch.save(state_dict, output_path)
    logger.info(f"Model saved to {output_path}")


if __name__ == "__main__":
    main()
tools/llama/build_dataset.py
DELETED (entire file, 165 lines)

import itertools
import os
import re
from collections import defaultdict
from functools import partial
from multiprocessing import Pool
from pathlib import Path

import click
import numpy as np
from loguru import logger
from tqdm import tqdm

from fish_speech.datasets.protos.text_data_pb2 import Semantics, Sentence, TextData
from fish_speech.datasets.protos.text_data_stream import pack_pb_stream
from fish_speech.utils.file import load_filelist

# To avoid CPU overload
os.environ["MKL_NUM_THREADS"] = "1"
os.environ["OMP_NUM_THREADS"] = "1"


def task_generator_folder(root: Path, text_extension: str):
    files = list(tqdm(Path(root).rglob("*.npy"), desc=f"Loading {root}"))
    files = sorted(files)

    grouped_files = defaultdict(list)
    for file in tqdm(files, desc=f"Grouping {root}"):
        p = str(file.parent)

        try:
            if isinstance(text_extension, str):
                texts = [file.with_suffix(text_extension).read_text()]
            else:
                texts = [file.with_suffix(ext).read_text() for ext in text_extension]
        except Exception as e:
            logger.error(f"Failed to read text {file}: {e}")
            continue

        grouped_files[p].append((file, texts))

    logger.info(
        f"Found {len(grouped_files)} groups in {root}, {list(grouped_files.keys())[:5]}..."
    )
    for name, subset in grouped_files.items():
        yield name, subset, "folder"


def task_generator_filelist(filelist):
    grouped_files = defaultdict(list)
    for filename, speaker, _, text in load_filelist(filelist):
        grouped_files[speaker].append((Path(filename), [text]))

    logger.info(f"Found {len(grouped_files)} groups in {filelist}")
    for speaker, values in grouped_files.items():
        yield speaker, values, "filelist"


def run_task(task):
    name, subset, source = task

    # Parse the files
    sentences = []
    for file in subset:
        file, texts = file

        np_file = file.with_suffix(".npy")
        if np_file.exists() is False:
            logger.warning(f"Can't find {np_file}")
            continue

        new_texts = []

        for text in texts:
            # Simple cleaning: replace { xxx } and < xxx > with space
            text = re.sub(r"\{.*?\}", " ", text)
            text = re.sub(r"<.*?>", " ", text)
            text = re.sub(r"\s+", " ", text)
            new_texts.append(text)

        try:
            semantics = np.load(np_file)
        except Exception as e:
            logger.error(f"Failed to parse {file}: {e}")
            continue

        if isinstance(semantics, np.ndarray):
            semantics = semantics.tolist()

        sentences.append(
            Sentence(
                texts=new_texts,
                semantics=[Semantics(values=s) for s in semantics],
            )
        )

    # Pack the sentences
    return pack_pb_stream(
        TextData(
            source=source,
            name=name,
            sentences=sentences,
        )
    )


@click.command()
@click.option(
    "--input",
    type=click.Path(path_type=Path),
    required=True,
    help="A folder containing the dataset or a filelist",
    multiple=True,
)
@click.option(
    "--output", type=click.Path(path_type=Path), default="data/quantized-dataset-ft"
)
@click.option("--num-workers", type=int, default=16)
@click.option("--text-extension", type=str, default=[".txt"], multiple=True)
@click.option(
    "--shard-size", type=int, default=10, help="The maximum size of each shard in mb"
)
def main(input, output, num_workers, text_extension, shard_size):
    generator_fns = []

    for f in input:
        assert f.exists(), f"{f} not found"

        if f.is_dir():
            generator_fn = task_generator_folder(f, text_extension)
        else:
            generator_fn = task_generator_filelist(f)

        generator_fns.append(generator_fn)

    generator_fn = itertools.chain(*generator_fns)
    output.mkdir(parents=True, exist_ok=True)

    dataset_fp = None
    tar_idx = 0
    written_size = 0

    with Pool(num_workers) as p:
        for result in tqdm(p.imap_unordered(run_task, generator_fn)):
            if dataset_fp is None:
                dataset_fp = open(Path(output) / f"{tar_idx:08d}.protos", "wb")

            dataset_fp.write(result)
            written_size += len(result)

            if written_size > shard_size * 1024 * 1024:
                logger.info(f"Finished writing {tar_idx} shards to {output}")
                dataset_fp.close()
                dataset_fp = None
                written_size = 0
                tar_idx += 1

    if dataset_fp is not None:
        dataset_fp.close()

    logger.info(f"Finished writing {tar_idx + 1} shards to {output}")


if __name__ == "__main__":
    main()
tools/llama/generate.py
CHANGED

@@ -1,9 +1,12 @@
 import os
+import queue
+import threading
 import time
 from pathlib import Path
 from typing import Optional, Tuple, Union

 import click
+import hydra
 import numpy as np
 import torch
 import torch._dynamo.config
@@ -361,6 +364,7 @@ def encode_tokens(
 def load_model(
     config_name, checkpoint_path, device, precision, max_length, compile=False
 ):
+    hydra.core.global_hydra.GlobalHydra.instance().clear()
     with initialize(version_base="1.3", config_path="../../fish_speech/configs/model"):
         cfg = compose(
             config_name=config_name, overrides=[f"config.max_seq_len={max_length}"]
@@ -456,6 +460,7 @@ def generate_long(
     speaker: Optional[str] = None,
     prompt_text: Optional[str] = None,
     prompt_tokens: Optional[torch.Tensor] = None,
+    is_streaming: bool = False,
 ):
     model_size = sum(p.numel() for p in model.parameters() if p.requires_grad)
     im_end_id = tokenizer.convert_tokens_to_ids("<|im_end|>")
@@ -496,6 +501,10 @@ def generate_long(
     all_codes = []
     seg_idx = 0

+    if use_prompt:
+        seg_idx = 1
+        global_encoded.append(encoded[0])
+
     while seg_idx < len(encoded):
         logger.info(
             f"Generating sentence {seg_idx + 1}/{len(encoded)} of sample {sample_idx + 1}/{num_samples}"
@@ -562,10 +571,7 @@ def generate_long(
         codes = y[1:, prompt_length:-2].clone()

         codes = codes - 2
-
-            global_encoded.pop()
-            logger.warning(f"Negative code found: {codes}, retrying ...")
-            continue
+        assert (codes >= 0).all(), f"Negative code found"

         decoded = y[:, prompt_length:-1].clone()
         if decoded[0, -1] != im_end_id:  # <im_end>
@@ -576,13 +582,63 @@ def generate_long(

         # But for global encoding, we should keep the <im_end> token
         global_encoded.append(decoded)
-
+
+        if is_streaming:
+            assert (codes >= 0).all(), f"Negative code found: {codes}"
+            yield codes
+        else:
+            all_codes.append(codes)
+
         seg_idx += 1

-
-
+    if is_streaming:
+        # This indicates the end of the current sample
+        yield None
+    else:
+        all_codes = torch.cat(all_codes, dim=1)
+        assert (all_codes >= 0).all(), f"Negative code found: {codes}"
+        yield all_codes
+
+
+def launch_thread_safe_queue(
+    config_name,
+    checkpoint_path,
+    device,
+    precision,
+    max_length,
+    compile=False,
+):
+    input_queue = queue.Queue()
+
+    def worker():
+        model, decode_one_token = load_model(
+            config_name, checkpoint_path, device, precision, max_length, compile=compile
+        )
+
+        while True:
+            item = input_queue.get()
+            if item is None:
+                break
+
+            kwargs = item["request"]
+            event = item["event"]
+
+            try:
+                item["success"] = True
+                item["response"] = list(
+                    generate_long(
+                        model=model, decode_one_token=decode_one_token, **kwargs
+                    )
+                )
+            except Exception as e:
+                item["success"] = False
+                item["response"] = e
+
+            event.set()
+
+    threading.Thread(target=worker, daemon=True).start()

-
+    return input_queue


 @click.command()
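With this change generate_long is a generator: when is_streaming is true it yields each sentence's codes as soon as they are decoded and yields None to mark the end of a sample; otherwise it concatenates all segments and yields one tensor per sample. The toy generator below illustrates only that yield contract with plain strings, it is not the model code and the names are made up for the example.

from typing import Iterator, List, Optional

def generate_samples(samples: List[List[str]], is_streaming: bool) -> Iterator[Optional[str]]:
    for segments in samples:
        collected = []
        for seg in segments:
            codes = seg.upper()           # stand-in for one segment's semantic codes
            if is_streaming:
                yield codes               # emit each segment as soon as it is ready
            else:
                collected.append(codes)
        if is_streaming:
            yield None                    # end-of-sample sentinel, as in generate_long
        else:
            yield " ".join(collected)     # one combined result per sample

print(list(generate_samples([["hello", "world"]], is_streaming=True)))
# ['HELLO', 'WORLD', None]
print(list(generate_samples([["hello", "world"]], is_streaming=False)))
# ['HELLO WORLD']

In the queue worker above, the whole generator is drained with list(...), which is why app.py can read the first yielded tensor from payload["response"][0] after the event fires.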
tools/llama/rebuild_tokenizer.py
DELETED (entire file, 57 lines)

from tokenizers import Tokenizer, decoders, models, pre_tokenizers, processors, trainers
from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast

# Initialize a tokenizer
tokenizer = Tokenizer(models.BPE())

# Customize pre-tokenization and decoding
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
tokenizer.decoder = decoders.ByteLevel()
tokenizer.post_processor = processors.ByteLevel(trim_offsets=False)

# Don't train the tokenizer
trainer = trainers.BpeTrainer(
    vocab_size=0,
    min_frequency=2,
    initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
    special_tokens=[
        "<|begin_of_sequence|>",
        "<|end_of_sequence|>",
        "<|im_start|>",
        "<|im_sep|>",  # system, user, assistant, etc.
        "<|im_end|>",
        "<|semantic|>",  # audio features
        "<|pad|>",
    ],
)

# <|im_start|>user<|im_sep|>...<|im_end|>
# <|im_start|>assistant<|im_sep|><|semantic|><|semantic|><|semantic|><|semantic|><|semantic|><|im_end|>
tokenizer.train_from_iterator([], trainer=trainer)

print(len(tokenizer.get_vocab()))
x = tokenizer.encode(
    "Hello, how are you? dfgnviadfjoiviouajeiodfjv 你好世界 🈶<|semantic|>"
).ids
print(x, len(x))
print(tokenizer.decode(x, skip_special_tokens=True))


tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,
    pad_token="<|pad|>",
    bos_token="<|begin_of_sequence|>",
    eos_token="<|end_of_sequence|>",
)

# Try tokenizing a new sequence
sequence = "All around, too, lay vast quantities of the costliest merchandise, and treasures were heaped in every cranny of the rocks, but all these things only added to the desolation of the scene. 测试中文, 你好世界 🈶<|semantic|>"
encoded = tokenizer(sequence).input_ids

print("Test encoding....")
print(f"\tSentence: {sequence}")
print(f"\tEncoded: {encoded}")
print(f"\tDecoded: {tokenizer.batch_decode(encoded)}")
print(f"\tDecoded: {tokenizer.decode(encoded)}")

tokenizer.push_to_hub("fishaudio/fish-speech-1", private=True)
tools/merge_asr_files.py
DELETED (entire file, 55 lines)

import os
from pathlib import Path

from pydub import AudioSegment
from tqdm import tqdm

from fish_speech.utils.file import AUDIO_EXTENSIONS, list_files


def merge_and_delete_files(save_dir, original_files):
    save_path = Path(save_dir)
    audio_slice_files = list_files(
        path=save_dir, extensions=AUDIO_EXTENSIONS.union([".lab"]), recursive=True
    )
    audio_files = {}
    label_files = {}
    for file_path in tqdm(audio_slice_files, desc="Merging audio files"):
        rel_path = Path(file_path).relative_to(save_path)
        (save_path / rel_path.parent).mkdir(parents=True, exist_ok=True)
        if file_path.suffix == ".wav":
            prefix = rel_path.parent / file_path.stem.rsplit("-", 1)[0]
            if prefix == rel_path.parent / file_path.stem:
                continue
            audio = AudioSegment.from_wav(file_path)
            if prefix in audio_files.keys():
                audio_files[prefix] = audio_files[prefix] + audio
            else:
                audio_files[prefix] = audio

        elif file_path.suffix == ".lab":
            prefix = rel_path.parent / file_path.stem.rsplit("-", 1)[0]
            if prefix == rel_path.parent / file_path.stem:
                continue
            with open(file_path, "r", encoding="utf-8") as f:
                label = f.read()
            if prefix in label_files.keys():
                label_files[prefix] = label_files[prefix] + ", " + label
            else:
                label_files[prefix] = label

    for prefix, audio in audio_files.items():
        output_audio_path = save_path / f"{prefix}.wav"
        audio.export(output_audio_path, format="wav")

    for prefix, label in label_files.items():
        output_label_path = save_path / f"{prefix}.lab"
        with open(output_label_path, "w", encoding="utf-8") as f:
            f.write(label)

    for file_path in original_files:
        os.remove(file_path)


if __name__ == "__main__":
    merge_and_delete_files("/made/by/spicysama/laziman", [__file__])
tools/vqgan/create_train_split.py
DELETED (entire file, 54 lines)

import math
from pathlib import Path
from random import Random

import click
from loguru import logger
from tqdm import tqdm

from fish_speech.utils.file import AUDIO_EXTENSIONS, list_files, load_filelist


@click.command()
@click.argument("root", type=click.Path(exists=True, path_type=Path))
@click.option("--val-ratio", type=float, default=None)
@click.option("--val-count", type=int, default=None)
@click.option("--filelist", default=None, type=Path)
def main(root, val_ratio, val_count, filelist):
    if filelist:
        files = [i[0] for i in load_filelist(filelist)]
    else:
        files = list_files(root, AUDIO_EXTENSIONS, recursive=True, sort=True)

    logger.info(f"Found {len(files)} files")
    files = [str(file.relative_to(root)) for file in tqdm(files)]

    Random(42).shuffle(files)

    if val_count is None and val_ratio is None:
        logger.info("Validation ratio and count not specified, using min(20%, 100)")
        val_size = min(100, math.ceil(len(files) * 0.2))
    elif val_count is not None and val_ratio is not None:
        logger.error("Cannot specify both val_count and val_ratio")
        return
    elif val_count is not None:
        if val_count < 1 or val_count > len(files):
            logger.error("val_count must be between 1 and number of files")
            return
        val_size = val_count
    else:
        val_size = math.ceil(len(files) * val_ratio)

    logger.info(f"Using {val_size} files for validation")

    with open(root / "vq_train_filelist.txt", "w", encoding="utf-8") as f:
        f.write("\n".join(files[val_size:]))

    with open(root / "vq_val_filelist.txt", "w", encoding="utf-8") as f:
        f.write("\n".join(files[:val_size]))

    logger.info("Done")


if __name__ == "__main__":
    main()
tools/vqgan/extract_vq.py
DELETED (entire file, 213 lines)

import os
import subprocess as sp
import sys
import time
from datetime import timedelta
from functools import lru_cache
from pathlib import Path
from random import Random

import click
import numpy as np
import torch
import torchaudio
from hydra import compose, initialize
from hydra.utils import instantiate
from lightning import LightningModule
from loguru import logger
from omegaconf import OmegaConf

from fish_speech.utils.file import AUDIO_EXTENSIONS, list_files, load_filelist

# register eval resolver
OmegaConf.register_new_resolver("eval", eval)
# This file is used to convert the audio files to text files using the Whisper model.
# It's mainly used to generate the training data for the VQ model.


RANK = int(os.environ.get("SLURM_PROCID", 0))
WORLD_SIZE = int(os.environ.get("SLURM_NTASKS", 1))

logger_format = (
    "<green>{time:YYYY-MM-DD HH:mm:ss.SSS}</green> | "
    "<level>{level: <8}</level> | "
    "<cyan>{name}</cyan>:<cyan>{function}</cyan>:<cyan>{line}</cyan> | "
    "{extra[rank]} - <level>{message}</level>"
)
logger.configure(extra={"rank": f"RANK: {RANK} / {WORLD_SIZE}"})
logger.remove()
logger.add(sys.stderr, format=logger_format)


@lru_cache(maxsize=1)
def get_model(
    config_name: str = "vqgan_pretrain",
    checkpoint_path: str = "checkpoints/vqgan/step_000380000.ckpt",
):
    with initialize(version_base="1.3", config_path="../../fish_speech/configs"):
        cfg = compose(config_name=config_name)

    model: LightningModule = instantiate(cfg.model)
    state_dict = torch.load(
        checkpoint_path,
        map_location=model.device,
    )
    if "state_dict" in state_dict:
        state_dict = state_dict["state_dict"]

    model.load_state_dict(state_dict, strict=False)
    model.eval()
    model.cuda()

    logger.info(f"Loaded model")
    return model


@torch.inference_mode()
def process_batch(files: list[Path], model) -> float:
    wavs = []
    audio_lengths = []
    new_files = []
    max_length = total_time = 0

    for file in files:
        try:
            wav, sr = torchaudio.load(
                str(file), backend="sox"
            )  # Need to install libsox-dev
        except Exception as e:
            logger.error(f"Error reading {file}: {e}")
            continue

        if wav.shape[0] > 1:
            wav = wav.mean(dim=0, keepdim=True)

        wav = torchaudio.functional.resample(wav.cuda(), sr, model.sampling_rate)[0]
        total_time += len(wav) / model.sampling_rate
        max_length = max(max_length, len(wav))

        wavs.append(wav)
        audio_lengths.append(len(wav))
        new_files.append(file)

    files = new_files

    # Pad to max length
    for i, wav in enumerate(wavs):
        wavs[i] = torch.nn.functional.pad(wav, (0, max_length - len(wav)), "constant")

    audios = torch.stack(wavs, dim=0)[:, None]
    audio_lengths = torch.tensor(audio_lengths, device=model.device, dtype=torch.long)

    # Calculate lengths
    indices, feature_lengths = model.encode(audios, audio_lengths)

    # Save to disk
    outputs = indices.cpu().numpy()

    for file, length, feature, audio_length in zip(
        files, feature_lengths, outputs, audio_lengths
    ):
        feature = feature[:, :length]

        # (T,)
        with open(file.with_suffix(".npy"), "wb") as f:
            np.save(f, feature)

    return total_time


@click.command()
@click.argument("folder")
@click.option("--num-workers", default=1)
@click.option("--config-name", default="vqgan_pretrain")
@click.option(
    "--checkpoint-path",
    default="checkpoints/vq-gan-group-fsq-8x1024-wn-20x768-30kh.pth",
)
@click.option("--batch-size", default=64)
@click.option("--filelist", default=None, type=Path)
def main(
    folder: str,
    num_workers: int,
    config_name: str,
    checkpoint_path: str,
    batch_size: int,
    filelist: Path,
):
    if num_workers > 1 and WORLD_SIZE != num_workers:
        assert WORLD_SIZE == 1, "You should either use SLURM or this launcher, not both"

        logger.info(f"Spawning {num_workers} workers")

        visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES", None)
        if visible_devices is None:
            visible_devices = list(range(torch.cuda.device_count()))
        else:
            visible_devices = visible_devices.split(",")

        processes = []
        for i in range(num_workers):
            env = os.environ.copy()
            env["CUDA_VISIBLE_DEVICES"] = str(visible_devices[i % len(visible_devices)])
            env["SLURM_PROCID"] = str(i)
            env["SLURM_NTASKS"] = str(num_workers)

            processes.append(
                sp.Popen(
                    [sys.executable] + sys.argv.copy(),
                    env=env,
                )
            )

        for p in processes:
            p.wait()

        logger.info(f"All workers finished")
        return

    # This is a worker
    logger.info(f"Starting worker")
    if filelist:
        files = [i[0] for i in load_filelist(filelist)]
    else:
        files = list_files(folder, AUDIO_EXTENSIONS, recursive=True, sort=False)

    print(f"Found {len(files)} files")
    # files = [Path(f) for f in files if not Path(f).with_suffix(".npy").exists()]

    total_files = len(files)
    files = files[RANK::WORLD_SIZE]
    logger.info(f"Processing {len(files)}/{total_files} files")

    # Batch processing
    total_time = 0
    begin_time = time.time()
    processed_files = 0
    model = get_model(config_name, checkpoint_path)

    for n_batch, idx in enumerate(range(0, len(files), batch_size)):
        batch = files[idx : idx + batch_size]
        batch_time = process_batch(batch, model)

        total_time += batch_time
        processed_files += len(batch)

        if (n_batch + 1) % 10 == 0:
            eta = (
                (time.time() - begin_time)
                / processed_files
                * (len(files) - processed_files)
            )
            logger.info(
                f"Processed {processed_files} files, {total_time / 3600:.2f} hours of audio, "
                + f"ETA: {timedelta(seconds=round(eta))}s"
            )

    logger.info(
        f"Finished processing {len(files)} files, {total_time / 3600:.2f} hours of audio"
    )


if __name__ == "__main__":
    main()
tools/whisper_asr.py
DELETED (entire file, 113 lines)

"""
Used to transcribe all audio files in one folder into another folder.
e.g.
Directory structure:
--pre_data_root
----SP_1
------01.wav
------02.wav
------......
----SP_2
------01.wav
------02.wav
------......
Use
python tools/whisper_asr.py --audio_dir pre_data_root/SP_1 --save_dir data/SP_1
to transcribe the first speaker.

Use
python tools/whisper_asr.py --audio_dir pre_data_root/SP_2 --save_dir data/SP_2
to transcribe the second speaker.

Note: Be aware of your audio sample rate, which defaults to 44.1kHz.
"""
from pathlib import Path

import click
import librosa
import soundfile as sf
import whisper
from loguru import logger
from merge_asr_files import merge_and_delete_files
from tqdm import tqdm

from fish_speech.utils.file import AUDIO_EXTENSIONS, list_files


@click.command()
@click.option("--model-size", default="large", help="Size of the Whisper model")
@click.option("--audio-dir", required=True, help="Directory containing audio files")
@click.option(
    "--save-dir", required=True, help="Directory to save processed audio files"
)
@click.option(
    "--sample-rate",
    default=None,
    type=int,
    help="Output sample rate, default to input sample rate",
)
@click.option("--device", default="cuda", help="Device to use")
@click.option("--language", default="ZH", help="Language of the transcription")
def main(model_size, audio_dir, save_dir, sample_rate, device, language):
    logger.info("Loading / Downloading OpenAI Whisper model...")
    model = whisper.load_model(
        name=model_size,
        device=device,
        download_root=str(Path(".cache/whisper").resolve()),
    )
    logger.info("Model loaded.")

    save_path = Path(save_dir)
    save_path.mkdir(parents=True, exist_ok=True)
    original_files = []
    audio_files = list_files(
        path=audio_dir, extensions=AUDIO_EXTENSIONS, recursive=True
    )
    for file_path in tqdm(audio_files, desc="Processing audio file"):
        file_stem = file_path.stem
        file_suffix = file_path.suffix

        rel_path = Path(file_path).relative_to(audio_dir)
        (save_path / rel_path.parent).mkdir(parents=True, exist_ok=True)

        if (save_path / rel_path.parent / f"{rel_path.stem}.wav").exists() and (
            save_path / rel_path.parent / f"{rel_path.stem}.lab"
        ).exists():
            continue

        audio, sr = librosa.load(file_path, sr=sample_rate, mono=False)
        transcription = model.transcribe(str(file_path), language=language)

        for segment in transcription.get("segments", []):
            id, text, start, end = (
                segment["id"],
                segment["text"],
                segment["start"],
                segment["end"],
            )

            extract = audio[..., int(start * sr) : int(end * sr)]
            audio_save_path = (
                save_path / rel_path.parent / f"{file_stem}-{id}{file_suffix}"
            )
            sf.write(
                audio_save_path,
                extract,
                samplerate=sr,
            )
            original_files.append(audio_save_path)

            transcript_save_path = save_path / rel_path.parent / f"{file_stem}-{id}.lab"
            with open(
                transcript_save_path,
                "w",
                encoding="utf-8",
            ) as f:
                f.write(text)
            original_files.append(transcript_save_path)

    merge_and_delete_files(save_dir, original_files)


if __name__ == "__main__":
    main()