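# NeuCoSVC-2 demo app: fetches a source song and a reference singer from
# Bilibili (or an uploaded file), separates vocals from accompaniment with
# UVR5, trims silence with WebRTC VAD, runs NeuCoSVC v2 voice conversion,
# and remixes the converted vocals with the original instrumental via Gradio.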
import re, os
import requests
import json
import torch
import spaces
device = 'cuda' if torch.cuda.is_available() else 'cpu'
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36"
}
# Matches a Bilibili video link inside search-result HTML.
pattern = r'//www\.bilibili\.com/video[^"]*'
def get_bilibili_video_id(url):
    """Extract the video id (a BV id) from a Bilibili video URL."""
    match = re.search(r'/video/([a-zA-Z0-9]+)/', url)
    # Guard against URLs that do not match the expected pattern.
    return match.group(1) if match else None
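# Usage sketch (hypothetical BV id, for illustration):
#   get_bilibili_video_id("https://www.bilibili.com/video/BV1xx411c7XX/") -> "BV1xx411c7XX"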
# Get bilibili audio
def find_first_appearance_with_neighborhood(text, pattern):
    match = re.search(pattern, text)
    if match:
        return match.group()
    else:
        return None
def search_bilibili(keyword):
    # A keyword starting with "BV" is treated as a Bilibili video id;
    # anything else is searched as a music query (tids=3).
    if keyword.startswith("BV"):
        req = requests.get("https://search.bilibili.com/all?keyword={}&duration=1".format(keyword), headers=headers).text
    else:
        req = requests.get("https://search.bilibili.com/all?keyword={}&duration=1&tids=3&page=1".format(keyword), headers=headers).text
    video_link = "https:" + find_first_appearance_with_neighborhood(req, pattern)
    return video_link
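# Usage sketch: the exact link depends on Bilibili's live search results, e.g.
#   search_bilibili("遇见 孙燕姿")    -> "https://www.bilibili.com/video/BV.../"
#   search_bilibili("BV1xx411c7XX")  -> the first result link for that video id
# (the BV id above is hypothetical, for illustration)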
def get_response(html_url):
    headers = {
        "referer": "https://www.bilibili.com/",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36"
    }
    response = requests.get(html_url, headers=headers)
    return response
def get_video_info(html_url):
    response = get_response(html_url)
    html_data = re.findall('<script>window.__playinfo__=(.*?)</script>', response.text)[0]
    json_data = json.loads(html_data)
    audio_info = json_data['data']['dash']['audio'][0]
    # Prefer a backup CDN URL when one is present; fall back to the base URL.
    if audio_info.get('backupUrl'):
        audio_url = audio_info['backupUrl'][0]
    else:
        audio_url = audio_info['baseUrl']
    video_url = json_data['data']['dash']['video'][0]['baseUrl']
    return audio_url, video_url
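# The embedded playinfo JSON is assumed to look roughly like:
#   {"data": {"dash": {"audio": [{"baseUrl": ..., "backupUrl": [...]}, ...],
#                      "video": [{"baseUrl": ...}, ...]}}}
# Only the fields read above are relied on; the real payload carries more.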
def save_audio(title, html_url):
    audio_url = get_video_info(html_url)[0]
    # video_url = get_video_info(html_url)[1]
    audio_content = get_response(audio_url).content
    # video_content = get_response(video_url).content
    with open(title + '.mp3', mode='wb') as f:
        f.write(audio_content)
    print("Audio saved.")
    # with open(title + '.mp4', mode='wb') as f:
    #     f.write(video_content)
    # print("Video saved.")
from uvr5.vr import AudioPre

weight_uvr5_root = "uvr5/uvr_model"
uvr5_names = []
for name in os.listdir(weight_uvr5_root):
    if name.endswith(".pth") or "onnx" in name:
        uvr5_names.append(name.replace(".pth", ""))

pre_fun_hp2 = AudioPre(
    agg=10,
    model_path=os.path.join(weight_uvr5_root, "UVR-HP2.pth"),
    device=device,
    is_half=True,
)
pre_fun_hp5 = AudioPre(
    agg=10,
    model_path=os.path.join(weight_uvr5_root, "UVR-HP5.pth"),
    device=device,
    is_half=True,
)
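# Both separators share the same settings. agg=10 is, as far as the UVR5 code
# is concerned, the aggressiveness of the vocal extraction, and the value is
# also appended to the files AudioPre writes (e.g. vocal_<name>.wav_10.wav),
# which the output paths below depend on.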
import webrtcvad
from pydub import AudioSegment
from pydub.utils import make_chunks
import librosa
import soundfile
import gradio as gr
def vad(audio_name):
    audio = AudioSegment.from_file(audio_name, format="wav")
    # Set the desired sample rate (WebRTC VAD supports only 8000, 16000, 32000, or 48000 Hz)
    audio = audio.set_frame_rate(48000)
    # Set single channel (mono)
    audio = audio.set_channels(1)
    # Initialize VAD
    vad = webrtcvad.Vad()
    # Set aggressiveness mode (an integer between 0 and 3; 3 is the most aggressive)
    vad.set_mode(3)
    # Slice the audio into fixed-size frames for the VAD
    frame_duration = 30  # duration of a frame in ms
    frame_width = int(audio.frame_rate * frame_duration / 1000)  # width of a frame in samples
    frames = make_chunks(audio, frame_duration)
    # Perform voice activity detection
    voiced_frames = []
    for frame in frames:
        if len(frame.raw_data) < frame_width * 2:  # stop at a trailing frame that is too short (16-bit samples)
            break
        is_speech = vad.is_speech(frame.raw_data, audio.frame_rate)
        if is_speech:
            voiced_frames.append(frame)
    # Combine voiced frames back into a single audio segment
    voiced_audio = sum(voiced_frames, AudioSegment.silent(duration=0))
    voiced_audio.export("voiced_audio.wav", format="wav")
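# Usage sketch: vad("audio_ref.wav") writes the voiced-only audio to the fixed
# path "voiced_audio.wav" in the working directory, which convert() below
# passes to inference.py as the reference.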
def youtube_downloader(
    video_identifier,
    filename,
    split_model,
    start_time
):
    """Download a Bilibili video's audio track, keep a 45-second segment
    starting at start_time, and split it into vocals and instrumentals."""
    filename = filename.strip()
    print(video_identifier)
    audio_url = get_video_info(video_identifier)[0]
    print(audio_url)
    audio_content = get_response(audio_url).content
    with open(filename + ".wav", mode="wb") as f:
        f.write(audio_content)
    audio_path = filename + ".wav"
    start_ms = start_time * 1000
    end_ms = start_ms + 45000
    # make dir output
    os.makedirs("output", exist_ok=True)
    if split_model == "UVR-HP2":
        pre_fun = pre_fun_hp2
    else:
        pre_fun = pre_fun_hp5
    audio_orig = AudioSegment.from_file(audio_path)
    # Extract the requested segment (clamped to the end of the track)
    segment = audio_orig[start_ms:min(end_ms, len(audio_orig))]
    segment.export(audio_path, format="wav")
    pre_fun._path_audio_(audio_path, f"./output/{split_model}/{filename}/", f"./output/{split_model}/{filename}/", "wav")
    os.remove(audio_path)
    return f"./output/{split_model}/{filename}/vocal_{filename}.wav_10.wav", f"./output/{split_model}/{filename}/instrument_{filename}.wav_10.wav"
def youtube_downloader_100s(
    video_identifier,
    filename,
    split_model
):
    """Download a Bilibili video's audio track and split it into vocals and
    instrumentals; tracks longer than 3 minutes are first trimmed to the
    150-second segment from 0:30 to 3:00."""
    filename = filename.strip()
    print(video_identifier)
    audio_url = get_video_info(video_identifier)[0]
    print(audio_url)
    audio_content = get_response(audio_url).content
    with open(filename + ".wav", mode="wb") as f:
        f.write(audio_content)
    audio_path = filename + ".wav"
    if split_model == "UVR-HP2":
        pre_fun = pre_fun_hp2
    else:
        pre_fun = pre_fun_hp5
    os.makedirs("output", exist_ok=True)
    audio_orig = AudioSegment.from_file(audio_path)
    if len(audio_orig) > 180000:
        start_ms = 30000
        end_ms = start_ms + 150000
        # Extract the segment
        segment = audio_orig[start_ms:end_ms]
        segment.export(audio_path, format="wav")
    pre_fun._path_audio_(audio_path, f"./output/{split_model}/{filename}/", f"./output/{split_model}/{filename}/", "wav")
    os.remove(audio_path)
    return f"./output/{split_model}/{filename}/vocal_{filename}.wav_10.wav", f"./output/{split_model}/{filename}/instrument_{filename}.wav_10.wav"
@spaces.GPU(duration=300)
def convert(start_time, song_name_src, song_name_ref, ref_audio, check_song, auto_key, key_shift, vocal_vol, inst_vol):
    split_model = "UVR-HP5"
    song_name_src = song_name_src.strip().replace(" ", "")
    video_identifier_src = search_bilibili(song_name_src)
    song_id_src = get_bilibili_video_id(video_identifier_src)
    # Prepare the reference vocals: either fetch and separate a reference song
    # from Bilibili, or use the uploaded reference audio directly.
    if ref_audio is None:
        song_name_ref = song_name_ref.strip().replace(" ", "")
        video_identifier = search_bilibili(song_name_ref)
        song_id = get_bilibili_video_id(video_identifier)
        if not os.path.isdir(f"./output/{split_model}/{song_id}"):
            audio, sr = librosa.load(youtube_downloader_100s(video_identifier, song_id, split_model)[0], sr=24000, mono=True)
            soundfile.write("audio_ref.wav", audio, sr)
        else:
            # Reuse the previously separated vocals for this song id.
            audio, sr = librosa.load(f"./output/{split_model}/{song_id}/vocal_{song_id}.wav_10.wav", sr=24000, mono=True)
            soundfile.write("audio_ref.wav", audio, sr)
        # Keep only the voiced parts of the reference (writes voiced_audio.wav).
        vad("audio_ref.wav")
    else:
        multi_channel_audio = AudioSegment.from_file(ref_audio, format="wav")
        mono_audio = multi_channel_audio.set_channels(1)
        mono_audio.export("voiced_audio.wav", format="wav")
    # Prepare the source vocals.
    audio_src, sr_src = librosa.load(youtube_downloader(video_identifier_src, song_id_src, split_model, start_time)[0], sr=24000, mono=True)
    soundfile.write("audio_src.wav", audio_src, sr_src)
    if os.path.isfile("output_svc/NeuCoSVCv2.wav"):
        os.remove("output_svc/NeuCoSVCv2.wav")
    # Run NeuCoSVC v2 inference.
    if check_song:
        if auto_key:
            os.system("python inference.py --src_wav_path audio_src.wav --ref_wav_path voiced_audio.wav")
        else:
            os.system(f"python inference.py --src_wav_path audio_src.wav --ref_wav_path voiced_audio.wav --key_shift {key_shift}")
    else:
        if auto_key:
            os.system("python inference.py --src_wav_path audio_src.wav --ref_wav_path voiced_audio.wav --speech_enroll")
        else:
            os.system(f"python inference.py --src_wav_path audio_src.wav --ref_wav_path voiced_audio.wav --key_shift {key_shift} --speech_enroll")
    # Mix the converted vocals with the original instrumental.
    audio_vocal = AudioSegment.from_file("output_svc/NeuCoSVCv2.wav", format="wav")
    audio_inst = AudioSegment.from_file(f"output/{split_model}/{song_id_src}/instrument_{song_id_src}.wav_10.wav", format="wav")
    audio_vocal += vocal_vol
    audio_inst += inst_vol
    combined_audio = audio_vocal.overlay(audio_inst)
    output_file = f"{song_name_src}-AI-cover.wav"
    combined_audio.export(output_file, format="wav")
    return output_file
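# Flags passed to inference.py above (as used by this app):
#   --key_shift N     manual semitone shift, omitted when auto-key prediction is on
#   --speech_enroll   the reference recording is speech rather than singing
# The script is expected to write its result to output_svc/NeuCoSVCv2.wav.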
# Gradio App
app = gr.Blocks()
with app:
    gr.Markdown("# <center>🥳💕🎶 NeuCoSVC v2 zero-shot voice cloning</center>")
    gr.Markdown("## <center>🌟 This space is a ZeroGPU implementation and translation of the original space by Kevinwang676.</center>")
    gr.Markdown("### <center>🌊 [NeuCoSVC v2](https://github.com/thuhcsi/NeuCoSVC) early access. Powered by Tencent ARC Lab & Tsinghua University 💕</center>")
    with gr.Row():
        with gr.Column():
            with gr.Row():
                inp1 = gr.Textbox(label="Source audio", placeholder="七里香 周杰伦", info="Enter a song name and artist (or a Bilibili BV id) for the source audio; it will be fetched from Bilibili.")
                inp2 = gr.Textbox(label="Reference audio", placeholder="遇见 孙燕姿", info="The artist whose voice you want to clone.")
            with gr.Row():
                inp0 = gr.Number(value=0, label="Beginning time", info="Inference automatically runs on 45 seconds of the cover song, starting from this time.")
                inp3 = gr.Checkbox(label="Is the reference audio singing?", info="Untick if the reference audio is normal speech.", value=True)
                inp4 = gr.Checkbox(label="Predict vocal pitch shift", info="Untick to set the pitch shift manually.", value=True)
            with gr.Row():
                inp5 = gr.Slider(minimum=-12, maximum=12, value=0, step=1, label="Vocal pitch shift")
                inp6 = gr.Slider(minimum=-3, maximum=3, value=0, step=1, label="Vocals volume, default=0")
                inp7 = gr.Slider(minimum=-3, maximum=3, value=0, step=1, label="Instrumentals volume, default=0")
            btn = gr.Button("Start inference pipeline", variant="primary")
        with gr.Column():
            with gr.Row():
                src_audio = gr.Audio(label="Source audio (do not upload here if you want to fetch the source audio online)", type="filepath", interactive=True)
                ref_audio = gr.Audio(label="Reference audio (60-90 seconds; do not upload here if you want to fetch the reference online)", type="filepath", interactive=True)
            out = gr.Audio(label="Output", type="filepath", interactive=False)
    btn.click(convert, [inp0, inp1, inp2, ref_audio, inp3, inp4, inp5, inp6, inp7], out)
    gr.Markdown("### <center>Do not generate anything illegal or harmful towards organizations and/or individuals.</center>")
    gr.HTML('''
    <div class="footer">
        <p>🌊🏞️🎶 - "The river rushes eastward, its surging voice without end." — Gu Lin (Ming dynasty)
        </p>
    </div>
    ''')
app.launch(show_error=True)