EZ-Voice-Clone-EZ

Runtime error

App Files Files Community

EZ-Voice-Clone-EZ / vc.py

Omnibus

Update vc.py

aa3ff9e verified about 1 year ago

raw

history blame contribute delete

6.91 kB

	import gradio as gr
	import torch
	from pathlib import Path
	from pytube import YouTube
	from pydub import AudioSegment
	from transformers import AutoProcessor, BarkModel

	import numpy as np
	from bark import SAMPLE_RATE, generate_audio, preload_models
	from bark.generation import SUPPORTED_LANGS

	from TTS.api import TTS
	import scipy
	import uuid
	import os

	# Sample clip pre-loaded into the "Audio Source" widget of the UI.
	test_audio = "./shufflin.wav"

	# Process-wide identifier; several helpers below use it to name temp files.
	uid = uuid.uuid4()

	# Prefer the GPU when torch reports one, otherwise stay on the CPU.
	if torch.cuda.is_available():
	    device = "cuda"
	else:
	    device = "cpu"

	# Hugging Face Bark checkpoint: text/voice-preset processor plus the model.
	processor = AutoProcessor.from_pretrained("suno/bark-small")
	model = BarkModel.from_pretrained("suno/bark-small")
	model = model.to(device)

	# Choices for the "Speaker Voice" dropdown: the strings "1" .. "10".
	num_list = [str(number) for number in range(1, 11)]
	# Not referenced by any code visible in this file.
	lang_list = ["en", "de"]
	#SAMPLE_RATE = 24_000
	# When True, gen_tts returns silence instead of calling Bark.
	DEBUG_MODE = False

	# Labels offered in the "Acoustic Prompt" dropdown, and the mapping from each
	# label to the history-prompt value that gen_tts hands to generate_audio.
	AVAILABLE_PROMPTS = ["Unconditional", "Announcer"]
	PROMPT_LOOKUP = {}
	for _lang_name, lang_code in SUPPORTED_LANGS:
	    # Ten numbered speakers (0-9) per supported language.
	    speaker_prompts = {
	        f"Speaker {idx} ({lang_code})": f"{lang_code}_speaker_{idx}"
	        for idx in range(10)
	    }
	    AVAILABLE_PROMPTS.extend(speaker_prompts)
	    PROMPT_LOOKUP.update(speaker_prompts)
	# The two non-speaker entries: no prompt at all, and the announcer preset.
	PROMPT_LOOKUP.update({"Unconditional": None, "Announcer": "announcer"})

	def gen_tts(text, history_prompt):  # , temp_semantic, temp_waveform):
	    """Synthesize ``text`` with Bark and return it as 16-bit PCM audio.

	    Args:
	        text: The text to speak.
	        history_prompt: A label that is a key of ``PROMPT_LOOKUP`` (for
	            example ``"Speaker 1 (en)"``); it is translated to the value
	            passed to ``generate_audio``.

	    Returns:
	        A ``(SAMPLE_RATE, samples)`` tuple where ``samples`` is an ``int16``
	        NumPy array.

	    Raises:
	        KeyError: If ``history_prompt`` is not a key of ``PROMPT_LOOKUP``.
	    """
	    history_prompt = PROMPT_LOOKUP[history_prompt]
	    if DEBUG_MODE:
	        # Debug shortcut: SAMPLE_RATE zero samples instead of running Bark.
	        audio_arr = np.zeros(SAMPLE_RATE)
	    else:
	        # , text_temp=temp_semantic, waveform_temp=temp_waveform)
	        audio_arr = generate_audio(text, history_prompt=history_prompt)
	    # Assumes generate_audio yields float samples nominally in [-1, 1] -- TODO
	    # confirm. Clip before scaling so an overshooting sample saturates instead
	    # of wrapping around when it is cast to int16.
	    audio_arr = np.clip(audio_arr, -1.0, 1.0)
	    audio_arr = (audio_arr * 32767).astype(np.int16)
	    return (SAMPLE_RATE, audio_arr)



	def run_bark(text, n='1', lang='en'):
	    """Generate speech with the Hugging Face Bark model and save it as a wav.

	    Args:
	        text: The text to speak.
	        n: 1-based speaker number (string or int); it is converted to the
	            0-based index used in the voice preset name.
	        lang: Language code used in the voice preset name.

	    Returns:
	        The path of the wav file that was written (``bark_out-<uuid>.wav``).
	    """
	    # A fresh UUID per call keeps output files from different calls apart.
	    out_path = f"bark_out-{uuid.uuid4()}.wav"
	    semantic_prompt = f"v2/{lang}_speaker_{int(n)-1}"

	    inputs = processor(
	        text=text,
	        voice_preset=semantic_prompt,
	        return_tensors="pt",
	    )
	    # The model was moved to `device` at load time; the processor output must
	    # live on the same device or generation fails on a GPU host.
	    inputs = inputs.to(model.device)

	    print("generating")
	    speech_values = model.generate(
	        **inputs, coarse_temperature=0.8, temperature=0.5, do_sample=True
	    )
	    sampling_rate = model.generation_config.sample_rate

	    print("writing")
	    scipy.io.wavfile.write(
	        out_path,
	        rate=sampling_rate,
	        data=speech_values.cpu().numpy().squeeze(),
	    )
	    return out_path


	# Default voice used by bark_ez: the English LJSpeech glow-tts model, kept on
	# the CPU. (Alternative considered: "tts_models/en/ljspeech/fast_pitch".)
	tts = TTS(model_name="tts_models/en/ljspeech/glow-tts", progress_bar=False)
	tts = tts.to("cpu")
	# Print whatever TTS reports as its model list once at start-up.
	print(TTS().list_models())
	def bark_ez(text, n='1', land='en'):
	    """Speak ``text`` with the module-level ``tts`` model.

	    ``n`` and ``land`` are accepted but not used by the body.

	    Returns:
	        The path of the wav file written for this call
	        (``<uuid>-output.wav``).
	    """
	    out_path = f"{uuid.uuid4()}-output.wav"
	    tts.tts_to_file(text, file_path=out_path)
	    return out_path


	def custom_bark(inp, tog, speaker, in_aud=None, trim_aud=None, in_aud_mic=None):
	    """Speak ``inp`` either with the preset voice or by cloning a reference clip.

	    Args:
	        inp: The text to speak.
	        tog: ``"Preset"`` to use ``bark_ez``; ``"Custom"`` to clone the voice
	            of a reference clip with the multilingual ``your_tts`` model.
	        speaker: Speaker selection forwarded to ``bark_ez`` in preset mode.
	        in_aud: Reference audio (uploaded or recorded) for custom mode.
	        trim_aud: Value of the trimmed-audio widget. Only its presence is
	            checked; the clip itself is read from ``<uid>-trim.wav``, the file
	            written by ``trim_clip``.
	        in_aud_mic: Optional separate microphone recording for custom mode.

	    Returns:
	        The path of the generated wav file, or ``None`` when ``tog`` is neither
	        ``"Preset"`` nor ``"Custom"``.

	    Raises:
	        ValueError: In custom mode when no reference audio was supplied.
	    """
	    if tog == "Preset":
	        return bark_ez(inp, speaker)
	    if tog != "Custom":
	        return None

	    # Same precedence as before: trimmed clip, then the audio source, then mic.
	    if trim_aud is not None:
	        speaker_wav = Path(f"{uid}-trim.wav")
	    elif in_aud is not None:
	        speaker_wav = in_aud
	    elif in_aud_mic is not None:
	        speaker_wav = in_aud_mic
	    else:
	        # Previously this fell through to an UnboundLocalError.
	        raise ValueError("Custom mode needs a reference audio clip to clone.")

	    cloner = TTS(
	        model_name="tts_models/multilingual/multi-dataset/your_tts",
	        progress_bar=False,
	    ).to(device)
	    # Name the output per call (as bark_ez does) so concurrent requests do not
	    # overwrite each other's result.
	    out_path = f"{uuid.uuid4()}-output.wav"
	    cloner.tts_to_file(inp, speaker_wav=speaker_wav, language="en", file_path=out_path)
	    return out_path
	def load_video_yt(vid):
	    """Download a YouTube video and its audio track into the working directory.

	    Args:
	        vid: URL of the YouTube video.

	    Returns:
	        A 3-tuple: the value returned by the video download, the value
	        returned by the audio download, and the audio file name
	        (``<uid>-tmp_aud.mp4``).
	    """
	    yt = YouTube(vid)
	    audio_name = f"{uid}-tmp_aud.mp4"

	    # First progressive mp4 stream after ordering by resolution, descending.
	    progressive_mp4 = yt.streams.filter(progressive=True, file_extension='mp4')
	    top_stream = progressive_mp4.order_by('resolution').desc().first()
	    vid = top_stream.download(filename=f"{uid}-tmp.mp4")

	    # First audio-only stream in the listing.
	    audio_stream = yt.streams.filter(only_audio=True)[0]
	    vid_aud = audio_stream.download(filename=audio_name)

	    print (f'Video Length: {yt.length}')
	    return vid, vid_aud, audio_name

	def _timestamp_to_ms(stamp):
	    """Convert ``"SS"``, ``"M:SS"`` or ``"H:MM:SS"`` text to milliseconds."""
	    total_seconds = 0
	    for field in str(stamp).strip().split(":"):
	        # Each further field is a finer unit: shift by 60 and add it.
	        total_seconds = total_seconds * 60 + int(field)
	    return total_seconds * 1000


	def trim_clip(clip, start_t, end_t):
	    """Cut ``clip`` between two timestamps and save the piece as a wav file.

	    Args:
	        clip: Path of the source audio file.
	        start_t: Start position, e.g. ``"0:23"`` (minutes:seconds).
	        end_t: End position, e.g. ``"1:12"``; must be later than ``start_t``.

	    Returns:
	        The name of the written file, ``<uid>-trim.wav``.

	    Raises:
	        ValueError: If a timestamp is not numeric or the end is not after
	            the start.
	    """
	    # Validate the timestamps first so bad input fails before any decoding.
	    start = _timestamp_to_ms(start_t)
	    end = _timestamp_to_ms(end_t)
	    if end <= start:
	        raise ValueError(
	            f"End time {end_t!r} must be later than start time {start_t!r}."
	        )

	    # No explicit format: the source may be the downloaded mp4 audio or an
	    # uploaded/recorded wav, so the container is left to be detected.
	    song = AudioSegment.from_file(str(Path(f"{clip}")))
	    # The segment is sliced with the millisecond offsets computed above.
	    song_clip = song[start:end]

	    out_path = f"{uid}-trim.wav"
	    song_clip.export(out_path, format="wav")
	    print("New Audio file is created and saved")

	    return out_path
	def pre_aud(inp):
	    """Re-export the audio at ``inp`` to ``<uid>-tmp_aud.mp4`` and return ``inp``.

	    The source is opened with the "mp4" format and exported with the same
	    format; the input value is logged and handed back unchanged.
	    """
	    print(inp)
	    tmp_name = f"{uid}-tmp_aud.mp4"
	    source_path = Path(f'{inp}')
	    AudioSegment.from_file(source_path, format="mp4").export(tmp_name, format="mp4")
	    print(f'pre_aud:: {tmp_name}')
	    return inp
	def tog_in(tog):
	    """Return visibility updates for the two input groups.

	    ``"Preset"`` shows the first group and hides the second; ``"Custom"`` does
	    the opposite. Any other value returns ``None``.
	    """
	    if tog not in ("Preset", "Custom"):
	        return None
	    preset_selected = tog == "Preset"
	    return (
	        gr.update(visible=preset_selected),
	        gr.update(visible=not preset_selected),
	    )


	# Gradio UI: text box and output on top, then a Preset/Custom input selector.
	# NOTE(review): the source arrived with its indentation flattened, so the
	# nesting of the layout containers below is reconstructed from statement
	# order -- confirm against the rendered page.
	with gr.Blocks() as app:
	    with gr.Group():
	        with gr.Row():
	            in_text = gr.Textbox(lines=6, max_lines=20)
	            with gr.Column():
	                alt_go_btn = gr.Button()
	                out_audio = gr.Audio(interactive=False)

	        with gr.Row():
	            gr.Markdown('''<H1> Audio Source:''')
	        with gr.Row():
	            tog = gr.Radio(
	                label="Input Type",
	                choices=["Preset", "Custom"],
	                value="Preset",
	            )

	        # Preset controls (visible by default).
	        with gr.Group(visible=True) as group_1:
	            speaker_num = gr.Dropdown(
	                label="Speaker Voice", choices=num_list, value="1"
	            )
	            options = gr.Dropdown(
	                AVAILABLE_PROMPTS,
	                value="Speaker 1 (en)",
	                label="Acoustic Prompt",
	                elem_id="speaker_option",
	            )
	            semantic_btn = gr.Button("Run Semantic")

	        # Custom controls (hidden until "Custom" is selected).
	        with gr.Group(visible=False) as group_2:
	            with gr.Row():
	                with gr.Column():
	                    #in_aud_mic = gr.Audio(source='microphone')
	                    in_aud_file = gr.Audio(
	                        label='Audio Source',
	                        sources=['microphone', 'upload'],
	                        interactive=True,
	                        type='filepath',
	                        value=test_audio,
	                    )
	                    aud_file = gr.File(interactive=False, visible=True)
	                    with gr.Row():
	                        start_time = gr.Textbox(
	                            label="Start", value="0:00", placeholder="0:23"
	                        )
	                        end_time = gr.Textbox(
	                            label="End", value="0:01", placeholder="1:12"
	                        )
	                    trim_clip_btn = gr.Button("Trim Clip")
	                    trim_aud = gr.Audio(
	                        label='Trimmed Audio Source',
	                        sources=['upload'],
	                        interactive=False,
	                    )
	                with gr.Column():
	                    in_aud_yt = gr.Textbox(label="YouTube URL")
	                    load_yt_btn = gr.Button("Load URL")
	                    yt_vid = gr.Video(interactive=False)

	    # Event wiring (registered inside the Blocks context).
	    semantic_btn.click(fn=gen_tts, inputs=[in_text, options], outputs=out_audio)
	    tog.change(fn=tog_in, inputs=tog, outputs=[group_1, group_2])
	    #in_aud_file.change(pre_aud,in_aud_file,aud_file)
	    load_yt_btn.click(
	        fn=load_video_yt,
	        inputs=in_aud_yt,
	        outputs=[yt_vid, in_aud_file, aud_file],
	    )
	    trim_clip_btn.click(
	        fn=trim_clip,
	        inputs=[in_aud_file, start_time, end_time],
	        outputs=trim_aud,
	    )
	    alt_go_btn.click(
	        fn=custom_bark,
	        inputs=[in_text, tog, speaker_num, in_aud_file, trim_aud],
	        outputs=out_audio,
	    )

	app.launch()