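# vc.py - Gradio app for text-to-speech and simple voice cloning.
# Combines the standalone `bark` package, the transformers BarkModel, and Coqui TTS,
# with pytube for YouTube audio download and pydub for clip trimming.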
import gradio as gr
import torch
from pathlib import Path
from pytube import YouTube
from pydub import AudioSegment
from transformers import AutoProcessor, BarkModel
import numpy as np
from bark import SAMPLE_RATE, generate_audio, preload_models
from bark.generation import SUPPORTED_LANGS
from TTS.api import TTS
import scipy.io.wavfile
import uuid
import os
test_audio="./shufflin.wav"
uid = uuid.uuid4()
device = "cuda" if torch.cuda.is_available() else "cpu"
processor = AutoProcessor.from_pretrained("suno/bark-small")
model = BarkModel.from_pretrained("suno/bark-small").to(device)
num_list = ["1","2","3","4","5","6","7","8","9","10"]
lang_list = ["en","de"]
#SAMPLE_RATE = 24_000
DEBUG_MODE=False
AVAILABLE_PROMPTS = ["Unconditional", "Announcer"]
PROMPT_LOOKUP = {}
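# Build the Bark acoustic-prompt choices: ten speakers per supported language,
# plus the two special entries added below.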
for _, lang in SUPPORTED_LANGS:
    for n in range(10):
        label = f"Speaker {n} ({lang})"
        AVAILABLE_PROMPTS.append(label)
        PROMPT_LOOKUP[label] = f"{lang}_speaker_{n}"
PROMPT_LOOKUP["Unconditional"] = None
PROMPT_LOOKUP["Announcer"] = "announcer"
def gen_tts(text, history_prompt): # , temp_semantic, temp_waveform):
    history_prompt = PROMPT_LOOKUP[history_prompt]
    if DEBUG_MODE:
        audio_arr = np.zeros(SAMPLE_RATE)
    else:
        # , text_temp=temp_semantic, waveform_temp=temp_waveform)
        audio_arr = generate_audio(text, history_prompt=history_prompt)
    audio_arr = (audio_arr * 32767).astype(np.int16)
    return (SAMPLE_RATE, audio_arr)
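# Generate speech with the transformers BarkModel using a "v2" voice preset and write it
# to a uniquely named wav file. Note: if `device` is "cuda", the processor outputs may also
# need to be moved to the same device as the model before calling generate().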
def run_bark(text, n='1', lang='en'):
    uid = uuid.uuid4()
    #history_prompt = []
    semantic_prompt = f"v2/{lang}_speaker_{int(n)-1}"
    #text=["Hello, my name is Suno. And, uh — and I like pizza. [laughs] But I also have other interests such as playing tic tac toe."],
    inputs = processor(
        text=text,
        voice_preset=semantic_prompt,
        return_tensors="pt",
    )
    print("generating")
    speech_values = model.generate(
        **inputs, coarse_temperature=0.8, temperature=0.5, do_sample=True
    )
    #speech_values = model.generate(**inputs, do_sample=True)
    sampling_rate = model.generation_config.sample_rate
    #sampling_rate = 24_000
    print("writing")
    scipy.io.wavfile.write(f"bark_out-{uid}.wav", rate=sampling_rate, data=speech_values.cpu().numpy().squeeze())
    return f"bark_out-{uid}.wav"
#tts = TTS(model_name="tts_models/en/ljspeech/fast_pitch", progress_bar=False).to("cpu")
tts = TTS(model_name="tts_models/en/ljspeech/glow-tts", progress_bar=False).to("cpu")
print(TTS().list_models())
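# Quick preset synthesis: run the preloaded GlowTTS model and write the result to a uniquely named wav file.
# The speaker number and language arguments are currently unused by this model.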
def bark_ez(text, n='1', lang='en'):
    uid = uuid.uuid4()
    tts.tts_to_file(text, file_path=f"{uid}-output.wav")
    return f'{uid}-output.wav'
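# Route synthesis based on the UI toggle: "Custom" clones the voice from the supplied
# reference audio with Coqui YourTTS, preferring the trimmed clip if one exists;
# "Preset" falls back to bark_ez().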
def custom_bark(inp, tog, speaker, in_aud=None, trim_aud=None, in_aud_mic=None):
    if tog == "Custom":
        if in_aud_mic is not None:
            speaker_wav = in_aud_mic
        if in_aud is not None and trim_aud is None:
            speaker_wav = in_aud
            #speaker_wav=Path(f"{uid}-tmp_aud.mp4")
        if trim_aud is not None:
            speaker_wav = Path(f"{uid}-trim.wav")
        tts = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts", progress_bar=False).to(device)
        tts.tts_to_file(inp, speaker_wav=speaker_wav, language="en", file_path=f"{uid}-output.wav")
        return f"{uid}-output.wav"
    if tog == "Preset":
        return bark_ez(inp, speaker)
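# Download a YouTube video (highest-resolution progressive mp4) and its audio-only stream with pytube.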
def load_video_yt(vid):
    yt = YouTube(vid)
    vid = yt.streams.filter(progressive=True, file_extension='mp4').order_by('resolution').desc().first().download(filename=f"{uid}-tmp.mp4")
    vid_aud = yt.streams.filter(only_audio=True)[0].download(filename=f"{uid}-tmp_aud.mp4")
    print(f'Video Length: {yt.length}')
    return vid, vid_aud, f"{uid}-tmp_aud.mp4"
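# Cut a clip out of the source audio between start_t and end_t ("m:ss" strings) with pydub
# and export it as a wav file.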
def trim_clip(clip, start_t, end_t):
    clip = Path(f"{clip}")
    song = AudioSegment.from_file(f"{clip}", format="mp4")
    #song = AudioSegment.from_file(Path(f"{clip}"), format="mp4")
    start_min = int(start_t.split(":", 1)[0])
    start_sec = int(start_t.split(":", 1)[1])
    end_min = int(end_t.split(":", 1)[0])
    end_sec = int(end_t.split(":", 1)[1])
    start = ((start_min * 60) + start_sec) * 1000
    end = ((end_min * 60) + end_sec) * 1000
    song_clip = song[start:end]
    song_clip.export(f"{uid}-trim.wav", format="wav")
    print("New audio file created and saved")
    return f"{uid}-trim.wav"
def pre_aud(inp):
    print(inp)
    song = AudioSegment.from_file(Path(f'{inp}'), format="mp4")
    song.export(f"{uid}-tmp_aud.mp4", format="mp4")
    print(f'pre_aud:: {uid}-tmp_aud.mp4')
    return inp
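# Toggle which input group is visible when the "Input Type" radio changes.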
def tog_in(tog):
    if tog == "Preset":
        return gr.update(visible=True), gr.update(visible=False)
    if tog == "Custom":
        return gr.update(visible=False), gr.update(visible=True)
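# Gradio UI: text input plus either preset speaker controls or custom reference-audio
# controls (upload/microphone, trimming, YouTube), wired to the functions above.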
with gr.Blocks() as app:
    with gr.Group():
        with gr.Row():
            in_text = gr.Textbox(lines=6, max_lines=20)
            with gr.Column():
                alt_go_btn = gr.Button()
                out_audio = gr.Audio(interactive=False)
    with gr.Row():
        gr.Markdown('''<H1> Audio Source:''')
    with gr.Row():
        tog = gr.Radio(label="Input Type", choices=["Preset", "Custom"], value="Preset")
    with gr.Group(visible=True) as group_1:
        speaker_num = gr.Dropdown(label="Speaker Voice", choices=num_list, value="1")
        options = gr.Dropdown(AVAILABLE_PROMPTS, value="Speaker 1 (en)", label="Acoustic Prompt", elem_id="speaker_option")
        semantic_btn = gr.Button("Run Semantic")
    with gr.Group(visible=False) as group_2:
        with gr.Row():
            with gr.Column():
                #in_aud_mic = gr.Audio(source='microphone')
                in_aud_file = gr.Audio(label='Audio Source', sources=['microphone', 'upload'], interactive=True, type='filepath', value=test_audio)
                aud_file = gr.File(interactive=False, visible=True)
                with gr.Row():
                    start_time = gr.Textbox(label="Start", value="0:00", placeholder="0:23")
                    end_time = gr.Textbox(label="End", value="0:01", placeholder="1:12")
                trim_clip_btn = gr.Button("Trim Clip")
                trim_aud = gr.Audio(label='Trimmed Audio Source', sources=['upload'], interactive=False)
            with gr.Column():
                in_aud_yt = gr.Textbox(label="YouTube URL")
                load_yt_btn = gr.Button("Load URL")
                yt_vid = gr.Video(interactive=False)

    semantic_btn.click(gen_tts, [in_text, options], out_audio)
    tog.change(tog_in, tog, [group_1, group_2])
    #in_aud_file.change(pre_aud, in_aud_file, aud_file)
    load_yt_btn.click(load_video_yt, in_aud_yt, [yt_vid, in_aud_file, aud_file])
    trim_clip_btn.click(trim_clip, [in_aud_file, start_time, end_time], trim_aud)
    alt_go_btn.click(custom_bark, [in_text, tog, speaker_num, in_aud_file, trim_aud], out_audio)
app.launch()