{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "7c7beac1",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:root:Loaded checkpoint 'logs/48k/G_168000.pth' (iteration 52)\n"
     ]
    }
   ],
   "source": [
    "import io\n",
    "import logging\n",
    "import time\n",
    "from pathlib import Path\n",
    "\n",
    "import librosa\n",
    "import numpy as np\n",
    "import soundfile\n",
    "import IPython.display as ipd\n",
    "from inference import infer_tool\n",
    "from inference import slicer\n",
    "from inference.infer_tool import Svc\n",
    "\n",
    "# Only let numba log records at WARNING level or above through.\n",
    "logging.getLogger('numba').setLevel(logging.WARNING)\n",
    "# Slice cache loaded from disk; the conversion cell looks entries up by audio hash.\n",
    "chunks_dict = infer_tool.read_temp(\"inference/chunks_temp.json\")\n",
    "\n",
    "model_path = \"logs/48k/G_168000.pth\"\n",
    "config_path = \"configs/config.json\"\n",
    "svc_model = Svc(model_path, config_path)\n",
    "infer_tool.mkdir([\"raw\", \"results\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b8a69a25",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Multiple wav files are supported; put them in the raw folder.\n",
    "clean_names = [\"7_1\"]\n",
    "trans = [2]  # pitch shift in semitones, positive or negative\n",
    "spk_list = ['钟离']  # every input is synthesized once per speaker listed here\n",
    "slice_db = -40  # default -40; try -30 for noisy audio, -50 to keep breaths in dry vocals\n",
    "wav_format = 'flac'  # output audio format\n",
    "\n",
    "infer_tool.fill_a_to_b(trans, clean_names)\n",
    "# (result path, waveform) pairs; played back only after every file has been written.\n",
    "rendered = []\n",
    "for clean_name, tran in zip(clean_names, trans):\n",
    "    raw_audio_path = f\"raw/{clean_name}\"\n",
    "    if \".\" not in raw_audio_path:\n",
    "        raw_audio_path += \".wav\"\n",
    "    infer_tool.format_wav(raw_audio_path)\n",
    "    wav_path = Path(raw_audio_path).with_suffix('.wav')\n",
    "    # The decoded samples are used only to derive the cache key.\n",
    "    source_audio, _sr = librosa.load(wav_path, mono=True, sr=None)\n",
    "    wav_hash = infer_tool.get_md5(source_audio)\n",
    "    if wav_hash in chunks_dict:\n",
    "        print(\"load chunks from temp\")\n",
    "        chunks = chunks_dict[wav_hash][\"chunks\"]\n",
    "    else:\n",
    "        chunks = slicer.cut(wav_path, db_thresh=slice_db)\n",
    "    print(chunks)\n",
    "    # The entry is re-stamped with the current time on every use, cache hit or not.\n",
    "    chunks_dict[wav_hash] = {\"chunks\": chunks, \"time\": int(time.time())}\n",
    "    infer_tool.write_temp(\"inference/chunks_temp.json\", chunks_dict)\n",
    "    audio_data, audio_sr = slicer.chunks2audio(wav_path, chunks)\n",
    "\n",
    "    for spk in spk_list:\n",
    "        segments = []\n",
    "        for slice_tag, data in audio_data:\n",
    "            print(f'#=====segment start, {round(len(data) / audio_sr, 3)}s======')\n",
    "            if slice_tag:\n",
    "                # Tagged segments skip inference and are replaced by zeros of the\n",
    "                # same duration, measured at the model's output sample rate.\n",
    "                print('jump empty segment')\n",
    "                length = int(np.ceil(len(data) / audio_sr * svc_model.target_sample))\n",
    "                segments.append(np.zeros(length))\n",
    "            else:\n",
    "                # The model reads a file-like object, so encode the slice as an in-memory wav.\n",
    "                raw_path = io.BytesIO()\n",
    "                soundfile.write(raw_path, data, audio_sr, format=\"wav\")\n",
    "                raw_path.seek(0)\n",
    "                out_audio, _out_sr = svc_model.infer(spk, tran, raw_path)\n",
    "                segments.append(out_audio.cpu().numpy())\n",
    "        # One concatenation instead of growing a Python list sample by sample.\n",
    "        audio = np.concatenate(segments) if segments else np.zeros(0)\n",
    "\n",
    "        res_path = f'./results/{clean_name}_{tran}key_{spk}.{wav_format}'\n",
    "        soundfile.write(res_path, audio, svc_model.target_sample, format=wav_format)\n",
    "        rendered.append((res_path, audio))\n",
    "\n",
    "# Play every rendering at the rate it was written with (the model's output rate,\n",
    "# not the input file's rate).\n",
    "for res_path, audio in rendered:\n",
    "    print(res_path)\n",
    "    ipd.display(ipd.Audio(audio, rate=svc_model.target_sample, normalize=False))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}