rinflan committed on
Commit
5f84dff
1 Parent(s): 7d24597

Upload 17 files

Files changed (15)
  1. .gitignore +155 -0
  2. Dockerfile +16 -0
  3. LICENSE +201 -0
  4. README.en.md +151 -0
  5. flask_api.py +63 -0
  6. inference.py +425 -0
  7. inference_svs.py +237 -0
  8. inference_vst.py +217 -0
  9. poetry.lock +0 -0
  10. poetry.toml +2 -0
  11. pyproject.toml +51 -0
  12. requirements.txt +103 -0
  13. train.py +228 -0
  14. tst +1 -0
  15. 开始处理.bat +4 -0
.gitignore ADDED
@@ -0,0 +1,155 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ pip-wheel-metadata/
24
+ share/python-wheels/
25
+ *.egg-info/
26
+ .installed.cfg
27
+ *.egg
28
+ MANIFEST
29
+
30
+ # PyInstaller
31
+ # Usually these files are written by a python script from a template
32
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
33
+ *.manifest
34
+ *.spec
35
+
36
+ # Installer logs
37
+ pip-log.txt
38
+ pip-delete-this-directory.txt
39
+
40
+ # Unit test / coverage reports
41
+ htmlcov/
42
+ .tox/
43
+ .nox/
44
+ .coverage
45
+ .coverage.*
46
+ .cache
47
+ nosetests.xml
48
+ coverage.xml
49
+ *.cover
50
+ *.py,cover
51
+ .hypothesis/
52
+ .pytest_cache/
53
+
54
+ # Text tool
55
+ tools/text/create_symbol_dict.py
56
+
57
+ # Translations
58
+ *.mo
59
+ *.pot
60
+
61
+ # Django stuff:
62
+ *.log
63
+ local_settings.py
64
+ db.sqlite3
65
+ db.sqlite3-journal
66
+
67
+ # Flask stuff:
68
+ instance/
69
+ .webassets-cache
70
+
71
+ # Scrapy stuff:
72
+ .scrapy
73
+
74
+ # Sphinx documentation
75
+ docs/_build/
76
+
77
+ # PyBuilder
78
+ target/
79
+
80
+ # Jupyter Notebook
81
+ .ipynb_checkpoints
82
+
83
+ # IPython
84
+ profile_default/
85
+ ipython_config.py
86
+
87
+ # pyenv
88
+ .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
98
+ __pypackages__/
99
+
100
+ # Celery stuff
101
+ celerybeat-schedule
102
+ celerybeat.pid
103
+
104
+ # SageMath parsed files
105
+ *.sage.py
106
+
107
+ # Environments
108
+ .env
109
+ .venv
110
+ env/
111
+ venv/
112
+ ENV/
113
+ env.bak/
114
+ venv.bak/
115
+
116
+ # Spyder project settings
117
+ .spyderproject
118
+ .spyproject
119
+
120
+ # Rope project settings
121
+ .ropeproject
122
+
123
+ # mkdocs documentation
124
+ /site
125
+
126
+ # mypy
127
+ .mypy_cache/
128
+ .dmypy.json
129
+ dmypy.json
130
+
131
+ # Pyre type checker
132
+ .pyre/
133
+
134
+ data
135
+ dataset
136
+ .vscode
137
+
138
+ *.pt
139
+ *.pth
140
+ hifigan/model
141
+ output
142
+ lightning_logs
143
+ logs
144
+ wandb
145
+ *.ckpt
146
+ checkpoints
147
+ filelists
148
+ raw
149
+ results
150
+
151
+ configs/exp_*.py
152
+ exp_*.sh
153
+ .DS_Store
154
+ .vscode
155
+ exported
Dockerfile ADDED
@@ -0,0 +1,16 @@
1
+ FROM nvidia/cuda:11.7.0-cudnn8-devel-ubuntu22.04 AS fish-diffusion
2
+
3
+ # Install Poetry
4
+ RUN apt-get update && apt-get install -y git curl python3 python3-pip build-essential ffmpeg libsm6 libxext6
5
+ RUN curl -sSL https://install.python-poetry.org | python3 -
6
+ ENV PATH="/root/.local/bin:${PATH}"
7
+ RUN poetry config virtualenvs.create false
8
+
9
+ # Install dependencies
10
+ WORKDIR /root
11
+
12
+ RUN pip3 install torch torchvision torchaudio
13
+ RUN git clone https://github.com/fishaudio/fish-diffusion.git && cd fish-diffusion && poetry install
14
+
15
+ WORKDIR /root/fish-diffusion
16
+ RUN python3 tools/download_nsf_hifigan.py --agree-license
LICENSE ADDED
@@ -0,0 +1,201 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [2023] [Fish Audio]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
README.en.md ADDED
@@ -0,0 +1,151 @@
1
+ <div align="center">
2
+
3
+ <img alt="LOGO" src="https://cdn.jsdelivr.net/gh/fishaudio/fish-diffusion@main/images/logo_512x512.png" width="256" height="256" />
4
+
5
+ # Fish Diffusion
6
+
7
+ <div>
8
+ <a href="https://github.com/fishaudio/fish-diffusion/actions/workflows/ci.yml">
9
+ <img alt="Build Status" src="https://img.shields.io/github/actions/workflow/status/fishaudio/fish-diffusion/ci.yml?style=flat-square&logo=GitHub">
10
+ </a>
11
+ <a href="https://hub.docker.com/r/lengyue233/fish-diffusion">
12
+ <img alt="Docker Hub" src="https://img.shields.io/docker/cloud/build/lengyue233/fish-diffusion?style=flat-square&logo=Docker&logoColor=white">
13
+ </a>
14
+ <a href="https://discord.gg/wbYSRBrW2E">
15
+ <img alt="Discord" src="https://img.shields.io/discord/1044927142900809739?color=%23738ADB&label=Discord&logo=discord&logoColor=white&style=flat-square">
16
+ </a>
17
+ </div>
18
+
19
+ </div>
20
+
21
+ ------
22
+
23
+ An easy-to-understand TTS / SVS / SVC training framework.
24
+
25
+ > Check our [Wiki](https://fishaudio.github.io/fish-diffusion/) to get started!
26
+
27
+ [中文文档](README.md)
28
+
29
+ ## Summary
30
+ This repository uses a diffusion model to solve various voice generation tasks. Compared with the original diff-svc repository, its main advantages are:
31
+ Supports multiple speakers
32
+ Simpler, easier-to-understand code structure with all modules decoupled
33
+ Supports the [44.1 kHz DiffSinger community vocoder](https://openvpi.github.io/vocoders/)
34
+ Supports multi-machine, multi-device training as well as half-precision training, saving training time and memory
35
+
36
+ ## Preparing the environment
37
+ The following commands need to be executed in a conda environment with Python 3.10.
38
+
39
+ ```bash
40
+ # Install PyTorch related core dependencies, skip if installed
41
+ # Reference: https://pytorch.org/get-started/locally/
42
+ conda install pytorch torchvision torchaudio pytorch-cuda=11.7 -c pytorch -c nvidia
43
+
44
+ # Install Poetry dependency management tool, skip if installed
45
+ # Reference: https://python-poetry.org/docs/#installation
46
+ curl -sSL https://install.python-poetry.org | python3 -
47
+
48
+ # Install the project dependencies
49
+ poetry install
50
+ ```
51
+
52
+ ## Vocoder preparation
53
+ Fish Diffusion requires the [OpenVPI 44.1 kHz NSF-HiFiGAN](https://github.com/openvpi/vocoders/releases/tag/nsf-hifigan-v1) vocoder to generate audio.
54
+
55
+ ### Automatic download
56
+ ```bash
57
+ python tools/download_nsf_hifigan.py
58
+ ```
59
+
60
+ If you are using the script to download the model, you can use the `--agree-license` parameter to agree to the [CC BY-NC-SA 4.0](https://creativecommons.org/licenses/by-nc-sa/4.0/) license.
61
+
62
+ ```bash
63
+ python tools/download_nsf_hifigan.py --agree-license
64
+ ```
65
+
66
+ ### Manual download
67
+ Download and unzip `nsf_hifigan_20221211.zip` from the [44.1 kHz vocoder release](https://github.com/openvpi/vocoders/releases/tag/nsf-hifigan-v1).
68
+
69
+ Copy the `nsf_hifigan` folder into the `checkpoints` directory (create it if it does not exist).
70
+
71
+ ## Dataset preparation
72
+ Put the dataset into the `dataset` directory with the following file structure:
73
+
74
+ ```shell
75
+ dataset
76
+ ├───train
77
+ │ ├───xxx1-xxx1.wav
78
+ │ ├───...
79
+ │ ├───Lxx-0xx8.wav
80
+ │ └───speaker0 (Subdirectory is also supported)
81
+ │ └───xxx1-xxx1.wav
82
+ └───valid
83
+ ├───xx2-0xxx2.wav
84
+ ├───...
85
+ └───xxx7-xxx007.wav
86
+ ```
87
+
88
+ ```bash
89
+ # Extract all data features, such as pitch, text features, mel features, etc.
90
+ python tools/preprocessing/extract_features.py --config configs/svc_hubert_soft.py --path dataset --clean
91
+ ```
92
+
93
+ ## Baseline training
94
+ > The project is under active development, please back up your config file
95
+ > The project is under active development, please back up your config file
96
+ > The project is under active development, please back up your config file
97
+
98
+ ```bash
99
+ # Single machine single card / multi-card training
100
+ python train.py --config configs/svc_hubert_soft.py
101
+
102
+ # Resume training
103
+ python train.py --config configs/svc_hubert_soft.py --resume [checkpoint]
104
+
105
+ # Fine-tune the pre-trained model
106
+ # Note: You should adjust the learning rate scheduler in the config file to warmup_cosine_finetune
107
+ python train.py --config configs/svc_hubert_soft.py --pretrained [checkpoint]
108
+ ```
109
+
110
+ ## Inference
111
+ ```bash
112
+ # Shell inference; use --help to see more parameters
113
+ python inference.py --config [config] \
114
+ --checkpoint [checkpoint] \
115
+ --input [input audio] \
116
+ --output [output audio]
117
+
118
+
119
+ # Gradio web inference; other CLI parameters become the default values in the web UI
120
+ python inference/gradio_inference.py --config [config] \
121
+ --checkpoint [checkpoint] \
122
+ --gradio
123
+ ```
124
+
125
+ ## Convert a DiffSVC model to Fish Diffusion
126
+ ```bash
127
+ python tools/diff_svc_converter.py --config configs/svc_hubert_soft_diff_svc.py \
128
+ --input-path [DiffSVC ckpt] \
129
+ --output-path [Fish Diffusion ckpt]
130
+ ```
131
+
132
+ ## Contributing
133
+ If you have any questions, please submit an issue or pull request.
134
+ You should run `tools/lint.sh` before submitting a pull request.
135
+
136
+ A live preview of the documentation can be generated with:
137
+ ```bash
138
+ sphinx-autobuild docs docs/_build/html
139
+ ```
140
+
141
+ ## Credits
142
+ + [diff-svc original](https://github.com/prophesier/diff-svc)
143
+ + [diff-svc optimized](https://github.com/innnky/diff-svc/)
144
+ + [DiffSinger](https://github.com/openvpi/DiffSinger/)
145
+ + [SpeechSplit](https://github.com/auspicious3000/SpeechSplit)
146
+
147
+ ## Thanks to all contributors for their efforts
148
+
149
+ <a href="https://github.com/fishaudio/fish-diffusion/graphs/contributors" target="_blank">
150
+ <img src="https://contrib.rocks/image?repo=fishaudio/fish-diffusion" />
151
+ </a>
flask_api.py ADDED
@@ -0,0 +1,63 @@
1
+ import io
2
+ import logging
3
+
4
+ import librosa
5
+ import soundfile
6
+ from flask import Flask, request, send_file
7
+ from flask_cors import CORS
8
+
9
+ #from infer_tools.infer_tool import Svc
10
+ from inference_vst import SvcFish
11
+ #from utils.hparams import hparams
12
+
13
+ app = Flask(__name__)
14
+
15
+ CORS(app)
16
+
17
+ logging.getLogger('numba').setLevel(logging.WARNING)
18
+
19
+
20
+ @app.route("/voiceChangeModel", methods=["POST"])
21
+ def voice_change_model():
22
+ request_form = request.form
23
+ wave_file = request.files.get("sample", None)
24
+ # Pitch-shift amount (semitones)
25
+ f_pitch_change = float(request_form.get("fPitchChange", 0))
26
+ # Get the speaker id
27
+ int_speak_Id = int(request_form.get("sSpeakId", 0))
28
+ # Sample rate required by the DAW
29
+ daw_sample = int(float(request_form.get("sampleRate", 0)))
30
+ # Read the wav file received over HTTP
31
+ input_wav_path = io.BytesIO(wave_file.read())
32
+ # Run model inference
33
+ _audio, _model_sr = svc_model.infer(input_wav_path, f_pitch_change, int_speak_Id, daw_sample)
34
+ tar_audio = librosa.resample(_audio, orig_sr=_model_sr, target_sr=daw_sample)
35
+ # Return the audio
36
+ out_wav_path = io.BytesIO()
37
+ soundfile.write(out_wav_path, tar_audio, daw_sample, format="wav")
38
+ out_wav_path.seek(0)
39
+ return send_file(out_wav_path, download_name="temp.wav", as_attachment=True)
40
+
41
+
42
+ if __name__ == '__main__':
43
+ # With fish-diffusion, only the following parameters need to be set
44
+ checkpoint_path = 'logs/DiffSVC/version_0/checkpoints/epoch=123-step=300000-valid_loss=0.17.ckpt'
45
+ config_path = 'configs/svc_cn_hubert_soft_ms.py'
46
+ # Speed-up factor; None uses the value from the config file
47
+ sampler_interval = None
48
+ # Whether to extract vocals, whether to merge non-vocals back in, and the vocal loudness gain
49
+ extract_vocals = True
50
+ merge_non_vocals = False
51
+ vocals_loudness_gain = 0.0
52
+ # Maximum slice duration (seconds)
53
+ max_slice_duration = 30.0
54
+ # Silence threshold (dB)
55
+ silence_threshold = 60
56
+
57
+ svc_model = SvcFish(checkpoint_path, config_path, sampler_interval=sampler_interval,
58
+ extract_vocals=extract_vocals,merge_non_vocals=merge_non_vocals,
59
+ vocals_loudness_gain=vocals_loudness_gain,silence_threshold=silence_threshold,
60
+ max_slice_duration=max_slice_duration)
61
+
62
+ # These settings match the VST plugin; changing them is not recommended
63
+ app.run(port=6842, host="0.0.0.0", debug=False, threaded=False)
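For reference, a minimal client for the `/voiceChangeModel` endpoint defined above might look like the following sketch. The file names and parameter values are purely illustrative; it assumes the server above is running locally on port 6842 and that the `requests` library is available:

```python
# Hypothetical client for the /voiceChangeModel endpoint in flask_api.py.
# File names and parameter values are illustrative only.
import requests

with open("input.wav", "rb") as f:
    response = requests.post(
        "http://127.0.0.1:6842/voiceChangeModel",
        files={"sample": f},          # source audio sent by the DAW/plugin
        data={
            "fPitchChange": 0,        # pitch shift in semitones
            "sSpeakId": 0,            # speaker id
            "sampleRate": 44100,      # sample rate expected back by the DAW
        },
    )

response.raise_for_status()
with open("output.wav", "wb") as f:
    f.write(response.content)
```

The endpoint returns the converted audio as a wav attachment, so the response body can be written to disk directly.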
inference.py ADDED
@@ -0,0 +1,425 @@
1
+ import argparse
2
+ import json
3
+ import os
4
+ from functools import partial
5
+ from typing import Union
6
+
7
+ import gradio as gr
8
+ import librosa
9
+ import numpy as np
10
+ import soundfile as sf
11
+ import torch
12
+ from fish_audio_preprocess.utils import loudness_norm, separate_audio
13
+ from loguru import logger
14
+ from mmengine import Config
15
+
16
+ from fish_diffusion.feature_extractors import FEATURE_EXTRACTORS, PITCH_EXTRACTORS
17
+ from fish_diffusion.utils.audio import get_mel_from_audio, slice_audio
18
+ from fish_diffusion.utils.inference import load_checkpoint
19
+ from fish_diffusion.utils.tensor import repeat_expand
20
+
21
+
22
+ @torch.no_grad()
23
+ def inference(
24
+ config,
25
+ checkpoint,
26
+ input_path,
27
+ output_path,
28
+ speaker_id=0,
29
+ pitch_adjust=0,
30
+ silence_threshold=30,
31
+ max_slice_duration=5,
32
+ extract_vocals=True,
33
+ merge_non_vocals=True,
34
+ vocals_loudness_gain=0.0,
35
+ sampler_interval=None,
36
+ sampler_progress=False,
37
+ device="cuda",
38
+ gradio_progress=None,
39
+ ):
40
+ """Inference
41
+
42
+ Args:
43
+ config: config
44
+ checkpoint: checkpoint path
45
+ input_path: input path
46
+ output_path: output path
47
+ speaker_id: speaker id
48
+ pitch_adjust: pitch adjust
49
+ silence_threshold: silence threshold of librosa.effects.split
50
+ max_slice_duration: maximum duration of each slice
51
+ extract_vocals: extract vocals
52
+ merge_non_vocals: merge non-vocals, only works when extract_vocals is True
53
+ vocals_loudness_gain: loudness gain of vocals (dB)
54
+ sampler_interval: sampler interval, lower value means higher quality
55
+ sampler_progress: show sampler progress
56
+ device: device
57
+ gradio_progress: gradio progress callback
58
+ """
59
+
60
+ if sampler_interval is not None:
61
+ config.model.diffusion.sampler_interval = sampler_interval
62
+
63
+ if os.path.isdir(checkpoint):
64
+ # Find the latest checkpoint
65
+ checkpoints = sorted(os.listdir(checkpoint))
66
+ logger.info(f"Found {len(checkpoints)} checkpoints, using {checkpoints[-1]}")
67
+ checkpoint = os.path.join(checkpoint, checkpoints[-1])
68
+
69
+ audio, sr = librosa.load(input_path, sr=config.sampling_rate, mono=True)
70
+
71
+ # Extract vocals
72
+
73
+ if extract_vocals:
74
+ logger.info("Extracting vocals...")
75
+
76
+ if gradio_progress is not None:
77
+ gradio_progress(0, "Extracting vocals...")
78
+
79
+ model = separate_audio.init_model("htdemucs", device=device)
80
+ audio = librosa.resample(audio, orig_sr=sr, target_sr=model.samplerate)[None]
81
+
82
+ # To two channels
83
+ audio = np.concatenate([audio, audio], axis=0)
84
+ audio = torch.from_numpy(audio).to(device)
85
+ tracks = separate_audio.separate_audio(
86
+ model, audio, shifts=1, num_workers=0, progress=True
87
+ )
88
+ audio = separate_audio.merge_tracks(tracks, filter=["vocals"]).cpu().numpy()
89
+ non_vocals = (
90
+ separate_audio.merge_tracks(tracks, filter=["drums", "bass", "other"])
91
+ .cpu()
92
+ .numpy()
93
+ )
94
+
95
+ audio = librosa.resample(audio[0], orig_sr=model.samplerate, target_sr=sr)
96
+ non_vocals = librosa.resample(
97
+ non_vocals[0], orig_sr=model.samplerate, target_sr=sr
98
+ )
99
+
100
+ # Normalize loudness
101
+ non_vocals = loudness_norm.loudness_norm(non_vocals, sr)
102
+
103
+ # Normalize loudness
104
+ audio = loudness_norm.loudness_norm(audio, sr)
105
+
106
+ # Slice into segments
107
+ segments = list(
108
+ slice_audio(
109
+ audio, sr, max_duration=max_slice_duration, top_db=silence_threshold
110
+ )
111
+ )
112
+ logger.info(f"Sliced into {len(segments)} segments")
113
+
114
+ # Load models
115
+ text_features_extractor = FEATURE_EXTRACTORS.build(
116
+ config.preprocessing.text_features_extractor
117
+ ).to(device)
118
+ text_features_extractor.eval()
119
+
120
+ model = load_checkpoint(config, checkpoint, device=device)
121
+
122
+ pitch_extractor = PITCH_EXTRACTORS.build(config.preprocessing.pitch_extractor)
123
+ assert pitch_extractor is not None, "Pitch extractor not found"
124
+
125
+ generated_audio = np.zeros_like(audio)
126
+ audio_torch = torch.from_numpy(audio).to(device)[None]
127
+
128
+ for idx, (start, end) in enumerate(segments):
129
+ if gradio_progress is not None:
130
+ gradio_progress(idx / len(segments), "Generating audio...")
131
+
132
+ segment = audio_torch[:, start:end]
133
+ logger.info(
134
+ f"Processing segment {idx + 1}/{len(segments)}, duration: {segment.shape[-1] / sr:.2f}s"
135
+ )
136
+
137
+ # Extract mel
138
+ mel = get_mel_from_audio(segment, sr)
139
+
140
+ # Extract pitch (f0)
141
+ pitch = pitch_extractor(segment, sr, pad_to=mel.shape[-1]).float()
142
+ pitch *= 2 ** (pitch_adjust / 12)
143
+
144
+ # Extract text features
145
+ text_features = text_features_extractor(segment, sr)[0]
146
+ text_features = repeat_expand(text_features, mel.shape[-1]).T
147
+
148
+ # Predict
149
+ src_lens = torch.tensor([mel.shape[-1]]).to(device)
150
+
151
+ features = model.model.forward_features(
152
+ speakers=torch.tensor([speaker_id]).long().to(device),
153
+ contents=text_features[None].to(device),
154
+ src_lens=src_lens,
155
+ max_src_len=max(src_lens),
156
+ mel_lens=src_lens,
157
+ max_mel_len=max(src_lens),
158
+ pitches=pitch[None].to(device),
159
+ )
160
+
161
+ result = model.model.diffusion(features["features"], progress=sampler_progress)
162
+ wav = model.vocoder.spec2wav(result[0].T, f0=pitch).cpu().numpy()
163
+ max_wav_len = generated_audio.shape[-1] - start
164
+ generated_audio[start : start + wav.shape[-1]] = wav[:max_wav_len]
165
+
166
+ # Loudness normalization
167
+ generated_audio = loudness_norm.loudness_norm(generated_audio, sr)
168
+
169
+ # Loudness gain
170
+ loudness_float = 10 ** (vocals_loudness_gain / 20)
171
+ generated_audio = generated_audio * loudness_float
172
+
173
+ # Merge non-vocals
174
+ if extract_vocals and merge_non_vocals:
175
+ generated_audio = (generated_audio + non_vocals) / 2
176
+
177
+ logger.info("Done")
178
+
179
+ if output_path is not None:
180
+ sf.write(output_path, generated_audio, sr)
181
+
182
+ return generated_audio, sr
183
+
184
+
185
+ def parse_args():
186
+ parser = argparse.ArgumentParser()
187
+
188
+ parser.add_argument(
189
+ "--config",
190
+ type=str,
191
+ required=True,
192
+ help="Path to the config file",
193
+ )
194
+
195
+ parser.add_argument(
196
+ "--checkpoint",
197
+ type=str,
198
+ required=True,
199
+ help="Path to the checkpoint file",
200
+ )
201
+
202
+ parser.add_argument(
203
+ "--gradio",
204
+ action="store_true",
205
+ help="Run in gradio mode",
206
+ )
207
+
208
+ parser.add_argument(
209
+ "--gradio_share",
210
+ action="store_true",
211
+ help="Share gradio app",
212
+ )
213
+
214
+ parser.add_argument(
215
+ "--input",
216
+ type=str,
217
+ required=False,
218
+ help="Path to the input audio file",
219
+ )
220
+
221
+ parser.add_argument(
222
+ "--output",
223
+ type=str,
224
+ required=False,
225
+ help="Path to the output audio file",
226
+ )
227
+
228
+ parser.add_argument(
229
+ "--speaker_id",
230
+ type=int,
231
+ default=0,
232
+ help="Speaker id",
233
+ )
234
+
235
+ parser.add_argument(
236
+ "--speaker_mapping",
237
+ type=str,
238
+ default=None,
239
+ help="Speaker mapping file (gradio mode only)",
240
+ )
241
+
242
+ parser.add_argument(
243
+ "--pitch_adjust",
244
+ type=int,
245
+ default=0,
246
+ help="Pitch adjustment in semitones",
247
+ )
248
+
249
+ parser.add_argument(
250
+ "--extract_vocals",
251
+ action="store_true",
252
+ help="Extract vocals",
253
+ )
254
+
255
+ parser.add_argument(
256
+ "--merge_non_vocals",
257
+ action="store_true",
258
+ help="Merge non-vocals",
259
+ )
260
+
261
+ parser.add_argument(
262
+ "--vocals_loudness_gain",
263
+ type=float,
264
+ default=0,
265
+ help="Loudness gain for vocals",
266
+ )
267
+
268
+ parser.add_argument(
269
+ "--sampler_interval",
270
+ type=int,
271
+ default=None,
272
+ required=False,
273
+ help="Sampler interval, if not specified, will be taken from config",
274
+ )
275
+
276
+ parser.add_argument(
277
+ "--sampler_progress",
278
+ action="store_true",
279
+ help="Show sampler progress",
280
+ )
281
+
282
+ parser.add_argument(
283
+ "--device",
284
+ type=str,
285
+ default=None,
286
+ required=False,
287
+ help="Device to use",
288
+ )
289
+
290
+ return parser.parse_args()
291
+
292
+
293
+ def run_inference(
294
+ config_path: str,
295
+ model_path: str,
296
+ input_path: str,
297
+ speaker: Union[int, str],
298
+ pitch_adjust: int,
299
+ sampler_interval: int,
300
+ extract_vocals: bool,
301
+ device: str,
302
+ progress=gr.Progress(),
303
+ speaker_mapping: dict = None,
304
+ ):
305
+ if speaker_mapping is not None and isinstance(speaker, str):
306
+ speaker = speaker_mapping[speaker]
307
+
308
+ audio, sr = inference(
309
+ Config.fromfile(config_path),
310
+ model_path,
311
+ input_path=input_path,
312
+ output_path=None,
313
+ speaker_id=speaker,
314
+ pitch_adjust=pitch_adjust,
315
+ sampler_interval=round(sampler_interval),
316
+ extract_vocals=extract_vocals,
317
+ merge_non_vocals=False,
318
+ device=device,
319
+ gradio_progress=progress,
320
+ )
321
+
322
+ return (sr, audio)
323
+
324
+
325
+ def launch_gradio(args):
326
+ with gr.Blocks(title="Fish Diffusion") as app:
327
+ gr.Markdown("# Fish Diffusion SVC Inference")
328
+
329
+ with gr.Row():
330
+ with gr.Column():
331
+ input_audio = gr.Audio(
332
+ label="Input Audio",
333
+ type="filepath",
334
+ value=args.input,
335
+ )
336
+ output_audio = gr.Audio(label="Output Audio")
337
+
338
+ with gr.Column():
339
+ if args.speaker_mapping is not None:
340
+ speaker_mapping = json.load(open(args.speaker_mapping))
341
+
342
+ speaker = gr.Dropdown(
343
+ label="Speaker Name (Used for Multi-Speaker Models)",
344
+ choices=list(speaker_mapping.keys()),
345
+ value=list(speaker_mapping.keys())[0],
346
+ )
347
+ else:
348
+ speaker_mapping = None
349
+ speaker = gr.Number(
350
+ label="Speaker ID (Used for Multi-Speaker Models)",
351
+ value=args.speaker_id,
352
+ )
353
+
354
+ pitch_adjust = gr.Number(
355
+ label="Pitch Adjust (Semitones)", value=args.pitch_adjust
356
+ )
357
+ sampler_interval = gr.Slider(
358
+ label="Sampler Interval (⬆️ Faster Generation, ⬇️ Better Quality)",
359
+ value=args.sampler_interval or 10,
360
+ minimum=1,
361
+ maximum=100,
362
+ )
363
+ extract_vocals = gr.Checkbox(
364
+ label="Extract Vocals (For low quality audio)",
365
+ value=args.extract_vocals,
366
+ )
367
+ device = gr.Radio(
368
+ label="Device", choices=["cuda", "cpu"], value=args.device or "cuda"
369
+ )
370
+
371
+ run_btn = gr.Button(label="Run")
372
+
373
+ run_btn.click(
374
+ partial(
375
+ run_inference,
376
+ args.config,
377
+ args.checkpoint,
378
+ speaker_mapping=speaker_mapping,
379
+ ),
380
+ [
381
+ input_audio,
382
+ speaker,
383
+ pitch_adjust,
384
+ sampler_interval,
385
+ extract_vocals,
386
+ device,
387
+ ],
388
+ output_audio,
389
+ )
390
+
391
+ app.queue(concurrency_count=2).launch(share=args.gradio_share)
392
+
393
+
394
+ if __name__ == "__main__":
395
+ args = parse_args()
396
+
397
+ assert args.gradio or (
398
+ args.input is not None and args.output is not None
399
+ ), "Either --gradio or --input and --output should be specified"
400
+
401
+ if args.device is None:
402
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
403
+ else:
404
+ device = torch.device(args.device)
405
+
406
+ if args.gradio:
407
+ args.device = device
408
+ launch_gradio(args)
409
+
410
+ else:
411
+
412
+ inference(
413
+ Config.fromfile(args.config),
414
+ args.checkpoint,
415
+ args.input,
416
+ args.output,
417
+ speaker_id=args.speaker_id,
418
+ pitch_adjust=args.pitch_adjust,
419
+ extract_vocals=args.extract_vocals,
420
+ merge_non_vocals=args.merge_non_vocals,
421
+ vocals_loudness_gain=args.vocals_loudness_gain,
422
+ sampler_interval=args.sampler_interval,
423
+ sampler_progress=args.sampler_progress,
424
+ device=device,
425
+ )
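Besides the command-line entry point above, `inference()` can be called directly from Python. A minimal sketch follows; the config, checkpoint, and audio paths are placeholders rather than files shipped in this commit:

```python
# Hypothetical programmatic use of inference.py; all paths are placeholders.
import torch
from mmengine import Config

from inference import inference

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

audio, sr = inference(
    Config.fromfile("configs/svc_hubert_soft.py"),  # training config (placeholder)
    "checkpoints/my_model.ckpt",                    # trained checkpoint (placeholder)
    input_path="input.wav",
    output_path="output.wav",
    speaker_id=0,
    pitch_adjust=0,           # semitones
    extract_vocals=True,      # run htdemucs source separation first
    merge_non_vocals=False,
    sampler_interval=10,      # larger = faster sampling, lower quality
    device=device,
)
```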
inference_svs.py ADDED
@@ -0,0 +1,237 @@
1
+ import argparse
2
+ import json
3
+ import math
4
+ import os
5
+
6
+ import numpy as np
7
+ import soundfile as sf
8
+ import torch
9
+ from fish_audio_preprocess.utils import loudness_norm
10
+ from loguru import logger
11
+ from mmengine import Config
12
+
13
+ from fish_diffusion.feature_extractors import FEATURE_EXTRACTORS, PITCH_EXTRACTORS
14
+ from fish_diffusion.utils.tensor import repeat_expand
15
+ from train import FishDiffusion
16
+
17
+
18
+ @torch.no_grad()
19
+ def inference(
20
+ config,
21
+ checkpoint,
22
+ input_path,
23
+ output_path,
24
+ dictionary_path="dictionaries/opencpop-strict.txt",
25
+ speaker_id=0,
26
+ sampler_interval=None,
27
+ sampler_progress=False,
28
+ device="cuda",
29
+ ):
30
+ """Inference
31
+
32
+ Args:
33
+ config: config
34
+ checkpoint: checkpoint path
35
+ input_path: input path
36
+ output_path: output path
37
+ dictionary_path: dictionary path
38
+ speaker_id: speaker id
39
+ sampler_interval: sampler interval, lower value means higher quality
40
+ sampler_progress: show sampler progress
41
+ device: device
42
+ """
43
+
44
+ if sampler_interval is not None:
45
+ config.model.diffusion.sampler_interval = sampler_interval
46
+
47
+ if os.path.isdir(checkpoint):
48
+ # Find the latest checkpoint
49
+ checkpoints = sorted(os.listdir(checkpoint))
50
+ logger.info(f"Found {len(checkpoints)} checkpoints, using {checkpoints[-1]}")
51
+ checkpoint = os.path.join(checkpoint, checkpoints[-1])
52
+
53
+ # Load models
54
+ phoneme_features_extractor = FEATURE_EXTRACTORS.build(
55
+ config.preprocessing.phoneme_features_extractor
56
+ ).to(device)
57
+ phoneme_features_extractor.eval()
58
+
59
+ model = FishDiffusion(config)
60
+ state_dict = torch.load(checkpoint, map_location="cpu")
61
+
62
+ if "state_dict" in state_dict: # Checkpoint is saved by pl
63
+ state_dict = state_dict["state_dict"]
64
+
65
+ model.load_state_dict(state_dict)
66
+ model.to(device)
67
+ model.eval()
68
+
69
+ pitch_extractor = PITCH_EXTRACTORS.build(config.preprocessing.pitch_extractor)
70
+ assert pitch_extractor is not None, "Pitch extractor not found"
71
+
72
+ # Load dictionary
73
+ phones_list = []
74
+ for i in open(dictionary_path):
75
+ _, phones = i.strip().split("\t")
76
+ for j in phones.split():
77
+ if j not in phones_list:
78
+ phones_list.append(j)
79
+
80
+ phones_list = ["<PAD>", "<EOS>", "<UNK>", "AP", "SP"] + sorted(phones_list)
81
+
82
+ # Load ds file
83
+ with open(input_path) as f:
84
+ ds = json.load(f)
85
+
86
+ generated_audio = np.zeros(
87
+ math.ceil(
88
+ (
89
+ float(ds[-1]["offset"])
90
+ + float(ds[-1]["f0_timestep"]) * len(ds[-1]["f0_seq"].split(" "))
91
+ )
92
+ * config.sampling_rate
93
+ )
94
+ )
95
+
96
+ for idx, chunk in enumerate(ds):
97
+ offset = float(chunk["offset"])
98
+
99
+ phones = np.array([phones_list.index(i) for i in chunk["ph_seq"].split(" ")])
100
+ durations = np.array([0] + [float(i) for i in chunk["ph_dur"].split(" ")])
101
+ durations = np.cumsum(durations)
102
+
103
+ f0_timestep = float(chunk["f0_timestep"])
104
+ f0_seq = torch.FloatTensor([float(i) for i in chunk["f0_seq"].split(" ")])
105
+ f0_seq *= 2 ** (6 / 12)  # Transpose the f0 curve up by 6 semitones
106
+
107
+ total_duration = f0_timestep * len(f0_seq)
108
+
109
+ logger.info(
110
+ f"Processing segment {idx + 1}/{len(ds)}, duration: {total_duration:.2f}s"
111
+ )
112
+
113
+ n_mels = round(total_duration * config.sampling_rate / 512)
114
+ f0_seq = repeat_expand(f0_seq, n_mels, mode="linear")
115
+ f0_seq = f0_seq.to(device)
116
+
117
+ # Aligned phoneme sequence, one entry per 20 ms frame
118
+ aligned_phones = torch.zeros(int(total_duration * 50), dtype=torch.long)
119
+ for i, phone in enumerate(phones):
120
+ start = int(durations[i] / f0_timestep / 4)
121
+ end = int(durations[i + 1] / f0_timestep / 4)
122
+ aligned_phones[start:end] = phone
123
+
124
+ # Extract text features
125
+ phoneme_features = phoneme_features_extractor.forward(
126
+ aligned_phones.to(device)
127
+ )[0]
128
+
129
+ phoneme_features = repeat_expand(phoneme_features, n_mels).T
130
+
131
+ # Predict
132
+ src_lens = torch.tensor([phoneme_features.shape[0]]).to(device)
133
+
134
+ features = model.model.forward_features(
135
+ speakers=torch.tensor([speaker_id]).long().to(device),
136
+ contents=phoneme_features[None].to(device),
137
+ src_lens=src_lens,
138
+ max_src_len=max(src_lens),
139
+ mel_lens=src_lens,
140
+ max_mel_len=max(src_lens),
141
+ pitches=f0_seq[None],
142
+ )
143
+
144
+ result = model.model.diffusion(features["features"], progress=sampler_progress)
145
+ wav = model.vocoder.spec2wav(result[0].T, f0=f0_seq).cpu().numpy()
146
+ start = round(offset * config.sampling_rate)
147
+ max_wav_len = generated_audio.shape[-1] - start
148
+ generated_audio[start : start + wav.shape[-1]] = wav[:max_wav_len]
149
+
150
+ # Loudness normalization
151
+ generated_audio = loudness_norm.loudness_norm(generated_audio, config.sampling_rate)
152
+
153
+ sf.write(output_path, generated_audio, config.sampling_rate)
154
+ logger.info("Done")
155
+
156
+
157
+ def parse_args():
158
+ parser = argparse.ArgumentParser()
159
+
160
+ parser.add_argument(
161
+ "--config",
162
+ type=str,
163
+ default="configs/svc_hubert_soft.py",
164
+ help="Path to the config file",
165
+ )
166
+
167
+ parser.add_argument(
168
+ "--checkpoint",
169
+ type=str,
170
+ required=True,
171
+ help="Path to the checkpoint file",
172
+ )
173
+
174
+ parser.add_argument(
175
+ "--input",
176
+ type=str,
177
+ required=True,
178
+ help="Path to the input audio file",
179
+ )
180
+
181
+ parser.add_argument(
182
+ "--output",
183
+ type=str,
184
+ required=True,
185
+ help="Path to the output audio file",
186
+ )
187
+
188
+ parser.add_argument(
189
+ "--speaker_id",
190
+ type=int,
191
+ default=0,
192
+ help="Speaker id",
193
+ )
194
+
195
+ parser.add_argument(
196
+ "--sampler_interval",
197
+ type=int,
198
+ default=None,
199
+ required=False,
200
+ help="Sampler interval, if not specified, will be taken from config",
201
+ )
202
+
203
+ parser.add_argument(
204
+ "--sampler_progress",
205
+ action="store_true",
206
+ help="Show sampler progress",
207
+ )
208
+
209
+ parser.add_argument(
210
+ "--device",
211
+ type=str,
212
+ default=None,
213
+ required=False,
214
+ help="Device to use",
215
+ )
216
+
217
+ return parser.parse_args()
218
+
219
+
220
+ if __name__ == "__main__":
221
+ args = parse_args()
222
+
223
+ if args.device is None:
224
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
225
+ else:
226
+ device = torch.device(args.device)
227
+
228
+ inference(
229
+ Config.fromfile(args.config),
230
+ args.checkpoint,
231
+ args.input,
232
+ args.output,
233
+ speaker_id=args.speaker_id,
234
+ sampler_interval=args.sampler_interval,
235
+ sampler_progress=args.sampler_progress,
236
+ device=device,
237
+ )
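The `--input` file consumed by `inference_svs.py` is a DiffSinger-style `.ds` project: a JSON list of segments, each carrying at least the fields read above (`offset`, `ph_seq`, `ph_dur`, `f0_timestep`, `f0_seq`). The phonemes must appear in the dictionary file (by default `dictionaries/opencpop-strict.txt`) or be one of the special tokens `AP`/`SP`. A sketch that writes a single made-up segment, with illustrative phonemes and pitch values, could look like this:

```python
# Illustrative only: builds a tiny .ds-style input for inference_svs.py.
import json

segment = {
    "offset": "0.0",                        # segment start time in seconds
    "ph_seq": "SP a n AP",                  # space-separated phoneme sequence
    "ph_dur": "0.2 0.4 0.3 0.1",            # per-phoneme durations in seconds
    "f0_timestep": "0.005",                 # spacing of the f0 samples in seconds
    "f0_seq": " ".join(["220.0"] * 200),    # f0 curve in Hz, one value per timestep
}

with open("song.ds", "w") as f:
    json.dump([segment], f)
```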
inference_vst.py ADDED
@@ -0,0 +1,217 @@
1
+ import argparse
2
+ import json
3
+ import os
4
+ from functools import partial
5
+ from typing import Union
6
+
7
+ import gradio as gr
8
+ import librosa
9
+ import numpy as np
10
+ import soundfile as sf
11
+ import torch
12
+ from fish_audio_preprocess.utils import loudness_norm, separate_audio
13
+ from loguru import logger
14
+ from mmengine import Config
15
+
16
+ from fish_diffusion.feature_extractors import FEATURE_EXTRACTORS, PITCH_EXTRACTORS
17
+ from fish_diffusion.utils.audio import get_mel_from_audio, slice_audio
18
+ from fish_diffusion.utils.inference import load_checkpoint
19
+ from fish_diffusion.utils.tensor import repeat_expand
20
+
21
+
22
+ @torch.no_grad()
23
+ def inference(
24
+ in_sample,
25
+ config_path,
26
+ checkpoint,
27
+ input_path,
28
+ output_path,
29
+ speaker_id=0,
30
+ pitch_adjust=0,
31
+ silence_threshold=60,
32
+ max_slice_duration=30.0,
33
+ extract_vocals=True,
34
+ merge_non_vocals=True,
35
+ vocals_loudness_gain=0.0,
36
+ sampler_interval=None,
37
+ sampler_progress=False,
38
+ device="cuda",
39
+ gradio_progress=None,
40
+ ):
41
+ """Inference
42
+
43
+ Args:
44
+ config_path: path to the config file
45
+ checkpoint: checkpoint path
46
+ input_path: input path
47
+ output_path: output path
48
+ speaker_id: speaker id
49
+ pitch_adjust: pitch adjust
50
+ silence_threshold: silence threshold of librosa.effects.split
51
+ max_slice_duration: maximum duration of each slice
52
+ extract_vocals: extract vocals
53
+ merge_non_vocals: merge non-vocals, only works when extract_vocals is True
54
+ vocals_loudness_gain: loudness gain of vocals (dB)
55
+ sampler_interval: sampler interval, lower value means higher quality
56
+ sampler_progress: show sampler progress
57
+ device: device
58
+ gradio_progress: gradio progress callback
59
+ """
60
+ config = Config.fromfile(config_path)
61
+
62
+ if sampler_interval is not None:
63
+ config.model.diffusion.sampler_interval = sampler_interval
64
+
65
+ if os.path.isdir(checkpoint):
66
+ # Find the latest checkpoint
67
+ checkpoints = sorted(os.listdir(checkpoint))
68
+ logger.info(f"Found {len(checkpoints)} checkpoints, using {checkpoints[-1]}")
69
+ checkpoint = os.path.join(checkpoint, checkpoints[-1])
70
+
71
+ audio, sr = librosa.load(input_path, sr=config.sampling_rate, mono=True)
72
+ #sr = in_sample
73
+ #audio = sf.read(input_path)
74
+
75
+ # Extract vocals
76
+
77
+ if extract_vocals:
78
+ logger.info("Extracting vocals...")
79
+
80
+ if gradio_progress is not None:
81
+ gradio_progress(0, "Extracting vocals...")
82
+
83
+ model = separate_audio.init_model("htdemucs", device=device)
84
+ audio = librosa.resample(audio, orig_sr=sr, target_sr=model.samplerate)[None]
85
+
86
+ # To two channels
87
+ audio = np.concatenate([audio, audio], axis=0)
88
+ audio = torch.from_numpy(audio).to(device)
89
+ tracks = separate_audio.separate_audio(
90
+ model, audio, shifts=1, num_workers=0, progress=True
91
+ )
92
+ audio = separate_audio.merge_tracks(tracks, filter=["vocals"]).cpu().numpy()
93
+ non_vocals = (
94
+ separate_audio.merge_tracks(tracks, filter=["drums", "bass", "other"])
95
+ .cpu()
96
+ .numpy()
97
+ )
98
+
99
+ audio = librosa.resample(audio[0], orig_sr=model.samplerate, target_sr=sr)
100
+ non_vocals = librosa.resample(
101
+ non_vocals[0], orig_sr=model.samplerate, target_sr=sr
102
+ )
103
+
104
+ # Normalize loudness
105
+ non_vocals = loudness_norm.loudness_norm(non_vocals, sr)
106
+
107
+ # Normalize loudness
108
+ audio = loudness_norm.loudness_norm(audio, sr)
109
+
110
+ # Slice into segments
111
+ segments = list(
112
+ slice_audio(
113
+ audio, sr, max_duration=max_slice_duration, top_db=silence_threshold
114
+ )
115
+ )
116
+ logger.info(f"Sliced into {len(segments)} segments")
117
+
118
+ # Load models
119
+ text_features_extractor = FEATURE_EXTRACTORS.build(
120
+ config.preprocessing.text_features_extractor
121
+ ).to(device)
122
+ text_features_extractor.eval()
123
+
124
+ model = load_checkpoint(config, checkpoint, device=device)
125
+
126
+ pitch_extractor = PITCH_EXTRACTORS.build(config.preprocessing.pitch_extractor)
127
+ assert pitch_extractor is not None, "Pitch extractor not found"
128
+
129
+ generated_audio = np.zeros_like(audio)
130
+ audio_torch = torch.from_numpy(audio).to(device)[None]
131
+
132
+ for idx, (start, end) in enumerate(segments):
133
+ if gradio_progress is not None:
134
+ gradio_progress(idx / len(segments), "Generating audio...")
135
+
136
+ segment = audio_torch[:, start:end]
137
+ logger.info(
138
+ f"Processing segment {idx + 1}/{len(segments)}, duration: {segment.shape[-1] / sr:.2f}s"
139
+ )
140
+
141
+ # Extract mel
142
+ mel = get_mel_from_audio(segment, sr)
143
+
144
+ # Extract pitch (f0)
145
+ pitch = pitch_extractor(segment, sr, pad_to=mel.shape[-1]).float()
146
+ pitch *= 2 ** (pitch_adjust / 12)
147
+
148
+ # Extract text features
149
+ text_features = text_features_extractor(segment, sr)[0]
150
+ text_features = repeat_expand(text_features, mel.shape[-1]).T
151
+
152
+ # Predict
153
+ src_lens = torch.tensor([mel.shape[-1]]).to(device)
154
+
155
+ features = model.model.forward_features(
156
+ speakers=torch.tensor([speaker_id]).long().to(device),
157
+ contents=text_features[None].to(device),
158
+ src_lens=src_lens,
159
+ max_src_len=max(src_lens),
160
+ mel_lens=src_lens,
161
+ max_mel_len=max(src_lens),
162
+ pitches=pitch[None].to(device),
163
+ )
164
+
165
+ result = model.model.diffusion(features["features"], progress=sampler_progress)
166
+ wav = model.vocoder.spec2wav(result[0].T, f0=pitch).cpu().numpy()
167
+ max_wav_len = generated_audio.shape[-1] - start
168
+ generated_audio[start : start + wav.shape[-1]] = wav[:max_wav_len]
169
+
170
+ # Loudness normalization
171
+ generated_audio = loudness_norm.loudness_norm(generated_audio, sr)
172
+
173
+ # Loudness gain
174
+ loudness_float = 10 ** (vocals_loudness_gain / 20)
175
+ generated_audio = generated_audio * loudness_float
176
+
177
+ # Merge non-vocals
178
+ if extract_vocals and merge_non_vocals:
179
+ generated_audio = (generated_audio + non_vocals) / 2
180
+
181
+ logger.info("Done")
182
+
183
+ if output_path is not None:
184
+ sf.write(output_path, generated_audio, sr)
185
+
186
+ return generated_audio, sr
187
+
188
+ class SvcFish:
189
+ def __init__(self, checkpoint_path, config_path, sampler_interval=None, extract_vocals=True,
190
+ merge_non_vocals=True,vocals_loudness_gain=0.0,silence_threshold=60, max_slice_duration=30.0):
191
+ self.config_path = config_path
192
+ self.checkpoint_path = checkpoint_path
193
+ self.sampler_interval = sampler_interval
194
+ self.silence_threshold = silence_threshold
195
+ self.max_slice_duration = max_slice_duration
196
+ self.extract_vocals = extract_vocals
197
+ self.merge_non_vocals = merge_non_vocals
198
+ self.vocals_loudness_gain = vocals_loudness_gain
199
+ def infer(self, input_path, pitch_adjust, speaker_id, in_sample):
200
+ return inference(
201
+ in_sample=in_sample,
202
+ config_path=self.config_path,
203
+ checkpoint=self.checkpoint_path,
204
+ input_path=input_path,
205
+ output_path=None,
206
+ speaker_id=speaker_id,
207
+ pitch_adjust=pitch_adjust,
208
+ silence_threshold=self.silence_threshold,
209
+ max_slice_duration=self.max_slice_duration,
210
+ extract_vocals=self.extract_vocals,
211
+ merge_non_vocals=self.merge_non_vocals,
212
+ vocals_loudness_gain=self.vocals_loudness_gain,
213
+ sampler_interval=self.sampler_interval,
214
+ sampler_progress=True,
215
+ device="cuda",
216
+ gradio_progress=None,
217
+ )
poetry.lock ADDED
The diff for this file is too large to render. See raw diff
 
poetry.toml ADDED
@@ -0,0 +1,2 @@
1
+ [virtualenvs]
2
+ create = false
pyproject.toml ADDED
@@ -0,0 +1,51 @@
1
+ [tool.poetry]
2
+ name = "fish-diffusion"
3
+ version = "0.1.0"
4
+ description = ""
5
+ authors = ["Lengyue <lengyue@lengyue.me>"]
6
+ license = "Apache"
7
+
8
+ packages = [{ include = "fish_diffusion" }]
9
+
10
+ [tool.poetry.dependencies]
11
+ python = "^3.10"
12
+ praat-parselmouth = "^0.4.3"
13
+ soundfile = "^0.11.0"
14
+ librosa = "^0.9.1"
15
+ pytorch-lightning = "^1.8.6"
16
+ numba = "^0.56.4"
17
+ fish-audio-preprocess = "^0.1.9"
18
+ wandb = "^0.13.9"
19
+ transformers = "^4.25.1"
20
+ torchcrepe = "^0.0.17"
21
+ mmengine = "^0.4.0"
22
+ loguru = "^0.6.0"
23
+ click = "^8.1.3"
24
+ tensorboard = "^2.11.2"
25
+ openai-whisper = "^20230124"
26
+ pypinyin = "^0.48.0"
27
+ TextGrid = "^1.5"
28
+ pyworld = "^0.3.2"
29
+ pykakasi = "^2.2.1"
30
+ gradio = "^3.18.0"
31
+ onnxruntime = "^1.14.0"
32
+
33
+ [tool.poetry.group.dev.dependencies]
34
+ isort = "^5.11.4"
35
+ black = "^22.12.0"
36
+
37
+ [tool.poetry.group.docs]
38
+ optional = true
39
+
40
+ [tool.poetry.group.docs.dependencies]
41
+ furo = "^2022.12.7"
42
+ sphinx-autobuild = "^2021.3.14"
43
+ myst-parser = "^0.18.1"
44
+
45
+ [build-system]
46
+ requires = ["poetry-core>=1.2.0"]
47
+ build-backend = "poetry.core.masonry.api"
48
+
49
+ [tool.isort]
50
+ profile = "black"
51
+ extend_skip = ["dataset", "logs"]
requirements.txt ADDED
@@ -0,0 +1,103 @@
1
+ absl-py==1.3.0
2
+ aiohttp==3.8.3
3
+ aiosignal==1.3.1
4
+ appdirs==1.4.4
5
+ asttokens==2.1.0
6
+ async-timeout==4.0.2
7
+ attrs==22.1.0
8
+ audioread==3.0.0
9
+ backcall==0.2.0
10
+ cachetools==5.2.0
11
+ certifi==2022.9.24
12
+ cffi==1.15.1
13
+ charset-normalizer==2.1.1
14
+ contourpy==1.0.6
15
+ cycler==0.11.0
16
+ debugpy==1.6.3
17
+ decorator==5.1.1
18
+ einops==0.6.0
19
+ entrypoints==0.4
20
+ executing==1.2.0
21
+ fonttools==4.38.0
22
+ frozenlist==1.3.3
23
+ fsspec==2022.11.0
24
+ future==0.18.2
25
+ google-auth==2.14.1
26
+ google-auth-oauthlib==0.4.6
27
+ grpcio==1.50.0
28
+ h5py==3.7.0
29
+ hparams==0.3.0
30
+ idna==3.4
31
+ imageio==2.22.4
32
+ importlib-metadata==5.0.0
33
+ ipykernel==6.17.1
34
+ ipython==8.6.0
35
+ jedi==0.18.1
36
+ joblib==1.2.0
37
+ jupyter_client==7.4.7
38
+ jupyter_core==5.0.0
39
+ kiwisolver==1.4.4
40
+ librosa==0.9.1
41
+ llvmlite==0.39.1
42
+ Markdown==3.4.1
43
+ MarkupSafe==2.1.1
44
+ matplotlib==3.6.2
45
+ matplotlib-inline==0.1.6
46
+ multidict==6.0.2
47
+ nest-asyncio==1.5.6
48
+ networkx==2.8.8
49
+ numba==0.56.4
50
+ numpy==1.23.5
51
+ oauthlib==3.2.2
52
+ packaging==21.3
53
+ parso==0.8.3
54
+ pexpect==4.8.0
55
+ pickleshare==0.7.5
56
+ Pillow==9.3.0
57
+ platformdirs==2.5.4
58
+ pooch==1.6.0
59
+ praat-parselmouth==0.5.0
60
+ prompt-toolkit==3.0.32
61
+ protobuf==3.20.3
62
+ psutil==5.9.4
63
+ ptyprocess==0.7.0
64
+ pure-eval==0.2.2
65
+ pyasn1==0.4.8
66
+ pyasn1-modules==0.2.8
67
+ pycparser==2.21
68
+ pycwt==0.3.0a22
69
+ pyDeprecate==0.3.0
70
+ Pygments==2.13.0
71
+ pyloudnorm==0.2.0
72
+ pyparsing==3.0.9
73
+ python-dateutil==2.8.2
74
+ pytorch-lightning==2.0.0
75
+ PyWavelets==1.4.1
76
+ PyYAML==5.4.1
77
+ pyzmq==24.0.1
78
+ requests==2.28.1
79
+ requests-oauthlib==1.3.1
80
+ resampy==0.4.2
81
+ rsa==4.9
82
+ scikit-image==0.19.3
83
+ scikit-learn==1.1.3
84
+ scipy==1.9.3
85
+ six==1.16.0
86
+ soundfile==0.11.0
87
+ stack-data==0.6.1
88
+ tensorboard==3.0.0
89
+ tensorboard-data-server==0.6.1
90
+ tensorboard-plugin-wit==1.8.1
91
+ threadpoolctl==3.1.0
92
+ tifffile==2022.10.10
93
+ tqdm==4.64.1
94
+ traitlets==5.5.0
95
+ typeguard==2.13.3
96
+ typing_extensions==4.4.0
97
+ urllib3==1.26.12
98
+ utils==1.0.1
99
+ wcwidth==0.2.5
100
+ webrtcvad==2.0.10
101
+ Werkzeug==2.2.2
102
+ yarl==1.8.1
103
+ zipp==3.10.0
train.py ADDED
@@ -0,0 +1,228 @@
1
+ from argparse import ArgumentParser
2
+
3
+ import matplotlib.pyplot as plt
4
+ import pytorch_lightning as pl
5
+ import torch
6
+ import wandb
7
+ from loguru import logger
8
+ from mmengine import Config
9
+ from mmengine.optim import OPTIMIZERS
10
+ from pytorch_lightning.loggers import TensorBoardLogger, WandbLogger
11
+ from torch.utils.data import DataLoader
12
+
13
+ from fish_diffusion.archs.diffsinger import DiffSinger
14
+ from fish_diffusion.datasets import DATASETS
15
+ from fish_diffusion.datasets.repeat import RepeatDataset
16
+ from fish_diffusion.utils.scheduler import LR_SCHEUDLERS
17
+ from fish_diffusion.utils.viz import viz_synth_sample
18
+ from fish_diffusion.vocoders import VOCODERS
19
+
20
+
21
+ class FishDiffusion(pl.LightningModule):
22
+ def __init__(self, config):
23
+ super().__init__()
24
+ self.save_hyperparameters()
25
+
26
+ self.model = DiffSinger(config.model)
27
+ self.config = config
28
+
29
+ # Vocoder: converts the mel spectrogram back to audio
30
+ self.vocoder = VOCODERS.build(config.model.vocoder)
31
+ self.vocoder.freeze()
32
+
33
+ def configure_optimizers(self):
34
+ self.config.optimizer.params = self.parameters()
35
+ optimizer = OPTIMIZERS.build(self.config.optimizer)
36
+
37
+ self.config.scheduler.optimizer = optimizer
38
+ scheduler = LR_SCHEUDLERS.build(self.config.scheduler)
39
+
40
+ return [optimizer], dict(scheduler=scheduler, interval="step")
41
+
42
+ def _step(self, batch, batch_idx, mode):
43
+ assert batch["pitches"].shape[1] == batch["mels"].shape[1]
44
+
45
+ pitches = batch["pitches"].clone()
46
+ batch_size = batch["speakers"].shape[0]
47
+
48
+ output = self.model(
49
+ speakers=batch["speakers"],
50
+ contents=batch["contents"],
51
+ src_lens=batch["content_lens"],
52
+ max_src_len=batch["max_content_len"],
53
+ mels=batch["mels"],
54
+ mel_lens=batch["mel_lens"],
55
+ max_mel_len=batch["max_mel_len"],
56
+ pitches=batch["pitches"],
57
+ )
58
+
59
+ self.log(f"{mode}_loss", output["loss"], batch_size=batch_size, sync_dist=True)
60
+
61
+ if mode != "valid":
62
+ return output["loss"]
63
+
64
+ x = self.model.diffusion(output["features"])
65
+
66
+ for idx, (gt_mel, gt_pitch, predict_mel, predict_mel_len) in enumerate(
67
+ zip(batch["mels"], pitches, x, batch["mel_lens"])
68
+ ):
69
+ image_mels, wav_reconstruction, wav_prediction = viz_synth_sample(
70
+ gt_mel=gt_mel,
71
+ gt_pitch=gt_pitch,
72
+ predict_mel=predict_mel,
73
+ predict_mel_len=predict_mel_len,
74
+ vocoder=self.vocoder,
75
+ return_image=False,
76
+ )
77
+
78
+ wav_reconstruction = wav_reconstruction.to(torch.float32).cpu().numpy()
79
+ wav_prediction = wav_prediction.to(torch.float32).cpu().numpy()
80
+
81
+ # WandB logger
82
+ if isinstance(self.logger, WandbLogger):
83
+ self.logger.experiment.log(
84
+ {
85
+ f"reconstruction_mel": wandb.Image(image_mels, caption="mels"),
86
+ f"wavs": [
87
+ wandb.Audio(
88
+ wav_reconstruction,
89
+ sample_rate=44100,
90
+ caption=f"reconstruction (gt)",
91
+ ),
92
+ wandb.Audio(
93
+ wav_prediction,
94
+ sample_rate=44100,
95
+ caption=f"prediction",
96
+ ),
97
+ ],
98
+ },
99
+ )
100
+
101
+ # TensorBoard logger
102
+ if isinstance(self.logger, TensorBoardLogger):
103
+ self.logger.experiment.add_figure(
104
+ f"sample-{idx}/mels",
105
+ image_mels,
106
+ global_step=self.global_step,
107
+ )
108
+ self.logger.experiment.add_audio(
109
+ f"sample-{idx}/wavs/gt",
110
+ wav_reconstruction,
111
+ self.global_step,
112
+ sample_rate=44100,
113
+ )
114
+ self.logger.experiment.add_audio(
115
+ f"sample-{idx}/wavs/prediction",
116
+ wav_prediction,
117
+ self.global_step,
118
+ sample_rate=44100,
119
+ )
120
+
121
+ if isinstance(image_mels, plt.Figure):
122
+ plt.close(image_mels)
123
+
124
+ return output["loss"]
125
+
126
+ def training_step(self, batch, batch_idx):
127
+ return self._step(batch, batch_idx, mode="train")
128
+
129
+ def validation_step(self, batch, batch_idx):
130
+ return self._step(batch, batch_idx, mode="valid")
131
+
132
+
133
+ if __name__ == "__main__":
134
+ pl.seed_everything(42, workers=True)
135
+
136
+ parser = ArgumentParser()
137
+ parser.add_argument("--config", type=str, required=True)
138
+ parser.add_argument("--resume", type=str, default=None)
139
+ parser.add_argument(
140
+ "--tensorboard",
141
+ action="store_true",
142
+ default=False,
143
+ help="Use tensorboard logger, default is wandb.",
144
+ )
145
+ parser.add_argument("--resume-id", type=str, default=None, help="Wandb run id.")
146
+ parser.add_argument("--entity", type=str, default=None, help="Wandb entity.")
147
+ parser.add_argument("--name", type=str, default=None, help="Wandb run name.")
148
+ parser.add_argument(
149
+ "--pretrained", type=str, default=None, help="Pretrained model."
150
+ )
151
+ parser.add_argument(
152
+ "--only-train-speaker-embeddings",
153
+ action="store_true",
154
+ default=False,
155
+ help="Only train speaker embeddings.",
156
+ )
157
+
158
+ args = parser.parse_args()
159
+
160
+ cfg = Config.fromfile(args.config)
161
+
162
+ model = FishDiffusion(cfg)
163
+
164
+ # We only load the state_dict of the model, not the optimizer.
165
+ if args.pretrained:
166
+ state_dict = torch.load(args.pretrained, map_location="cpu")
167
+ if "state_dict" in state_dict:
168
+ state_dict = state_dict["state_dict"]
169
+
170
+ result = model.load_state_dict(state_dict, strict=False)
171
+
172
+ missing_keys = set(result.missing_keys)
173
+ unexpected_keys = set(result.unexpected_keys)
174
+
175
+ # Make sure the unexpected keys are just noise-predictor keys.
176
+ unexpected_keys = unexpected_keys - set(
177
+ i.replace(".naive_noise_predictor.", ".") for i in missing_keys
178
+ )
179
+
180
+ assert len(unexpected_keys) == 0
181
+
182
+ if args.only_train_speaker_embeddings:
183
+ for name, param in model.named_parameters():
184
+ if "speaker_encoder" not in name:
185
+ param.requires_grad = False
186
+
187
+ logger.info(
188
+ "Only training speaker embeddings; all other parameters are frozen."
189
+ )
190
+
191
+ logger = (
192
+ TensorBoardLogger("logs", name=cfg.model.type)
193
+ if args.tensorboard
194
+ else WandbLogger(
195
+ project=cfg.model.type,
196
+ save_dir="logs",
197
+ log_model=True,
198
+ name=args.name,
199
+ entity=args.entity,
200
+ resume="must" if args.resume_id else False,
201
+ id=args.resume_id,
202
+ )
203
+ )
204
+
205
+ trainer = pl.Trainer(
206
+ logger=logger,
207
+ **cfg.trainer,
208
+ )
209
+
210
+ train_dataset = DATASETS.build(cfg.dataset.train)
211
+ train_loader = DataLoader(
212
+ train_dataset,
213
+ collate_fn=train_dataset.collate_fn,
214
+ **cfg.dataloader.train,
215
+ )
216
+
217
+ valid_dataset = DATASETS.build(cfg.dataset.valid)
218
+ valid_dataset = RepeatDataset(
219
+ valid_dataset, repeat=trainer.num_devices, collate_fn=valid_dataset.collate_fn
220
+ )
221
+
222
+ valid_loader = DataLoader(
223
+ valid_dataset,
224
+ collate_fn=valid_dataset.collate_fn,
225
+ **cfg.dataloader.valid,
226
+ )
227
+
228
+ trainer.fit(model, train_loader, valid_loader, ckpt_path=args.resume)
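train.py builds every component from a single mmengine Config: cfg.model feeds DiffSinger and VOCODERS.build, cfg.optimizer and cfg.scheduler go through the OPTIMIZERS and LR_SCHEUDLERS registries, cfg.dataset and cfg.dataloader drive the DataLoaders, and cfg.trainer is unpacked into pl.Trainer. Below is a minimal sketch of the shape such a config file could take; it is an assumption for illustration only. The registry type strings (NsfHifiGAN, NaiveSVCDataset, StepLR) and all values are placeholders, and the real model section needs more fields (encoders, diffusion settings) than shown here.

# my_config.py: illustrative only; every type string and value is assumed,
# not taken from the project's real configs under configs/.
model = dict(
    type="DiffSinger",                 # also used as the logger project name
    vocoder=dict(type="NsfHifiGAN"),   # consumed by VOCODERS.build(config.model.vocoder)
)

optimizer = dict(type="AdamW", lr=8e-4, weight_decay=1e-2)    # OPTIMIZERS.build
scheduler = dict(type="StepLR", step_size=50000, gamma=0.5)   # LR_SCHEUDLERS.build

dataset = dict(
    train=dict(type="NaiveSVCDataset", path="dataset/train"),
    valid=dict(type="NaiveSVCDataset", path="dataset/valid"),
)

dataloader = dict(
    train=dict(batch_size=16, shuffle=True, num_workers=2),
    valid=dict(batch_size=2, shuffle=False, num_workers=2),
)

trainer = dict(
    accelerator="gpu",
    devices=-1,              # all visible GPUs
    max_steps=200_000,
    val_check_interval=2000,
    precision=16,
)

With a file like this saved under configs/, training is launched the same way 开始处理.bat does: python train.py --config configs/train_my_config.py.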
tst ADDED
@@ -0,0 +1 @@
1
+ python inference.py --config configs\svc_cn_hubert_soft_finetune_crepe.py --checkpoint checkpoints\epoch=909-step=20000-valid_loss=0.23.ckpt --gradio
开始处理.bat ADDED
@@ -0,0 +1,4 @@
1
+ @echo off
2
+ env310\python.exe train.py --config configs/train_my_config.py --pretrained checkpoints\hubert\cn-hubert-soft-600-singers-pretrained-v1.ckpt
3
+
4
+ pause