# Spaces: Runtime error
import gradio as gr | |
import torch | |
from pathlib import Path | |
from pytube import YouTube | |
from pydub import AudioSegment | |
from transformers import AutoProcessor, BarkModel | |
import numpy as np | |
from bark import SAMPLE_RATE, generate_audio, preload_models | |
from bark.generation import SUPPORTED_LANGS | |
from TTS.api import TTS | |
import scipy | |
import uuid | |
import os | |
# Sample clip pre-loaded into the "Audio Source" widget of the UI.
test_audio = "./shufflin.wav"

# Process-wide id used to name the temporary download / trim files.
uid = uuid.uuid4()

# Run on the GPU when torch reports one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Hugging Face Bark checkpoint: text/voice-preset processor plus the model.
processor = AutoProcessor.from_pretrained("suno/bark-small")
model = BarkModel.from_pretrained("suno/bark-small")
model = model.to(device)

# Choices offered by the "Speaker Voice" dropdown ("1" .. "10").
num_list = [str(index) for index in range(1, 11)]
lang_list = ["en", "de"]

# SAMPLE_RATE is taken from the `bark` import rather than hard-coded (24_000).
# When True, gen_tts returns silence instead of running the model.
DEBUG_MODE = False
# (label, preset id) pairs: ten numbered speakers for every language code
# listed in bark's SUPPORTED_LANGS, in the order they are listed.
_speaker_presets = [
    (f"Speaker {n} ({lang})", f"{lang}_speaker_{n}")
    for _, lang in SUPPORTED_LANGS
    for n in range(10)
]

# Labels shown in the "Acoustic Prompt" dropdown; the two generic entries
# come first, followed by the per-language speakers.
AVAILABLE_PROMPTS = ["Unconditional", "Announcer"]
AVAILABLE_PROMPTS.extend(label for label, _preset in _speaker_presets)

# Maps a dropdown label to the history prompt handed to generate_audio.
PROMPT_LOOKUP = dict(_speaker_presets)
PROMPT_LOOKUP["Unconditional"] = None
PROMPT_LOOKUP["Announcer"] = "announcer"
def gen_tts(text, history_prompt):
    """Synthesize *text* with Bark and return it as 16-bit PCM.

    Args:
        text: The prompt to speak.
        history_prompt: A label from ``AVAILABLE_PROMPTS``; it is translated
            to the actual preset through ``PROMPT_LOOKUP``.

    Returns:
        A ``(SAMPLE_RATE, samples)`` tuple where ``samples`` is an ``int16``
        NumPy array.

    Raises:
        KeyError: If ``history_prompt`` is not a key of ``PROMPT_LOOKUP``.
    """
    preset = PROMPT_LOOKUP[history_prompt]
    if DEBUG_MODE:
        # One second of silence; avoids running the model while debugging.
        audio_arr = np.zeros(SAMPLE_RATE)
    else:
        audio_arr = generate_audio(text, history_prompt=preset)
    # Assumes generate_audio yields floats nominally in [-1, 1] (TODO confirm).
    # Clamp first so an occasional overshoot saturates instead of overflowing
    # the int16 cast below.
    audio_arr = np.clip(audio_arr, -1.0, 1.0)
    pcm = (audio_arr * 32767).astype(np.int16)
    return (SAMPLE_RATE, pcm)
def run_bark(text, n='1', lang='en'):
    """Generate speech with the Hugging Face Bark model and save it as WAV.

    Args:
        text: The prompt to speak.
        n: 1-based speaker number (string or int); converted to the 0-based
            index used in the voice preset name.
        lang: Language code used in the voice preset name.

    Returns:
        The path of the WAV file that was written (``bark_out-<uuid>.wav``).

    Raises:
        ValueError: If ``n`` cannot be converted to an integer.
    """
    out_path = f"bark_out-{uuid.uuid4()}.wav"
    voice_preset = f"v2/{lang}_speaker_{int(n) - 1}"
    inputs = processor(
        text=text,
        voice_preset=voice_preset,
        return_tensors="pt",
    )
    # The model lives on `device`; the encoded inputs must be moved there too,
    # otherwise generation fails with a device mismatch when CUDA is in use.
    inputs = inputs.to(device)
    print("generating")
    speech_values = model.generate(
        **inputs, coarse_temperature=0.8, temperature=0.5, do_sample=True
    )
    sampling_rate = model.generation_config.sample_rate
    print("writing")
    scipy.io.wavfile.write(
        out_path,
        rate=sampling_rate,
        data=speech_values.cpu().numpy().squeeze(),
    )
    return out_path
# Voice used by bark_ez (the "Preset" path); it is kept on the CPU.
# Alternative model that was tried: "tts_models/en/ljspeech/fast_pitch".
tts = TTS(
    model_name="tts_models/en/ljspeech/glow-tts",
    progress_bar=False,
).to("cpu")

# Print whatever TTS().list_models() reports, at start-up.
print(TTS().list_models())
def bark_ez(text, n='1', land='en'):
    """Speak *text* with the module-level ``tts`` model.

    ``n`` and ``land`` are accepted for call compatibility but are not used.

    Returns:
        The name of the WAV file written for this call
        (``<uuid>-output.wav``).
    """
    request_id = uuid.uuid4()
    out_file = f"{request_id}-output.wav"
    tts.tts_to_file(text, file_path=out_file)
    return out_file
def custom_bark(inp, tog, speaker, in_aud=None, trim_aud=None, in_aud_mic=None):
    """Synthesize *inp* with either the preset voice or a cloned voice.

    Args:
        inp: Text to speak.
        tog: ``"Preset"`` to use ``bark_ez`` or ``"Custom"`` to clone the
            voice of a reference recording.
        speaker: Forwarded to ``bark_ez`` in preset mode.
        in_aud: Reference audio; used when no trimmed clip exists.
        trim_aud: Trimmed clip. Only its presence is checked: when it is set,
            the file ``{uid}-trim.wav`` is used as the reference.
        in_aud_mic: Reference audio used when neither of the above is given.

    Returns:
        The name of the generated WAV file, or ``None`` when ``tog`` is
        neither ``"Preset"`` nor ``"Custom"``.

    Raises:
        ValueError: In custom mode when no reference audio was supplied
            (previously this surfaced as an ``UnboundLocalError``).
    """
    if tog == "Preset":
        return bark_ez(inp, speaker)
    if tog != "Custom":
        return None

    # Reference priority: trimmed clip, then the loaded audio, then the mic.
    if trim_aud is not None:
        speaker_wav = Path(f"{uid}-trim.wav")
    elif in_aud is not None:
        speaker_wav = in_aud
    elif in_aud_mic is not None:
        speaker_wav = in_aud_mic
    else:
        raise ValueError(
            "Custom voice selected but no reference audio was provided."
        )

    # A fresh name per call (as bark_ez does) so that successive or concurrent
    # requests do not overwrite each other's output.
    out_file = f"{uuid.uuid4()}-output.wav"
    # NOTE: the YourTTS model is constructed on every call.
    cloner = TTS(
        model_name="tts_models/multilingual/multi-dataset/your_tts",
        progress_bar=False,
    ).to(device)
    cloner.tts_to_file(
        inp, speaker_wav=speaker_wav, language="en", file_path=out_file
    )
    return out_file
def load_video_yt(vid):
    """Download a YouTube video and its audio track to local files.

    Args:
        vid: URL handed to ``YouTube``.

    Returns:
        A 3-tuple: the result of downloading the highest-resolution
        progressive MP4 stream, the result of downloading the first
        audio-only stream, and the audio file name ``{uid}-tmp_aud.mp4``.
    """
    yt = YouTube(vid)
    audio_name = f"{uid}-tmp_aud.mp4"

    # Progressive MP4 streams, best resolution first.
    progressive_mp4 = yt.streams.filter(progressive=True, file_extension='mp4')
    best_video = progressive_mp4.order_by('resolution').desc().first()
    video_file = best_video.download(filename=f"{uid}-tmp.mp4")

    audio_only = yt.streams.filter(only_audio=True)
    audio_file = audio_only[0].download(filename=audio_name)

    print (f'Video Length: {yt.length}')
    return video_file, audio_file, audio_name
def _clock_to_ms(stamp):
    """Convert ``"ss"``, ``"mm:ss"`` or ``"hh:mm:ss"`` text to milliseconds."""
    total_seconds = 0
    for field in str(stamp).split(":"):
        total_seconds = total_seconds * 60 + int(field)
    return total_seconds * 1000


def trim_clip(clip, start_t, end_t):
    """Cut ``[start_t, end_t)`` out of an audio file and save it as WAV.

    Args:
        clip: Path of the audio file to trim.
        start_t: Start position as ``"mm:ss"`` (``"ss"`` and ``"hh:mm:ss"``
            are accepted as well).
        end_t: End position, same format as ``start_t``.

    Returns:
        The name of the written file, ``{uid}-trim.wav``.

    Raises:
        ValueError: If ``clip`` is ``None`` or a timestamp is not numeric.
    """
    if clip is None:
        raise ValueError("No audio has been loaded to trim.")
    # Validate the timestamps before doing any decoding work.
    start = _clock_to_ms(start_t)
    end = _clock_to_ms(end_t)

    # The source may be the bundled WAV, an upload/recording or the YouTube
    # download, so the container format is not forced to "mp4"; pydub is left
    # to work it out from the file.
    song = AudioSegment.from_file(str(Path(f"{clip}")))
    song_clip = song[start:end]

    out_file = f"{uid}-trim.wav"
    song_clip.export(out_file, format="wav")
    print("New Audio file is created and saved")
    return out_file
def pre_aud(inp):
    """Decode *inp* as MP4 audio and re-export it to ``{uid}-tmp_aud.mp4``.

    The input path is printed, and returned unchanged.
    """
    print(inp)
    target = f"{uid}-tmp_aud.mp4"
    decoded = AudioSegment.from_file(Path(f'{inp}'), format="mp4")
    decoded.export(target, format="mp4")
    print(f'pre_aud:: {target}')
    return inp
def tog_in(tog):
    """Return visibility updates for the preset and custom input groups.

    Args:
        tog: The selected input type, ``"Preset"`` or ``"Custom"``.

    Returns:
        A pair of ``gr.update`` objects (preset group, custom group) showing
        exactly the group that matches ``tog``; ``None`` for any other value.
    """
    if tog not in ("Preset", "Custom"):
        return None
    show_preset = tog == "Preset"
    return (
        gr.update(visible=show_preset),
        gr.update(visible=not show_preset),
    )
# NOTE(review): the nesting below is reconstructed from a copy of the file
# that had lost its indentation -- confirm the layout against the original.
with gr.Blocks() as app:
    with gr.Group():
        # Prompt on the left, "go" button and result player on the right.
        with gr.Row():
            in_text = gr.Textbox(lines=6, max_lines=20)
            with gr.Column():
                alt_go_btn = gr.Button()
                out_audio = gr.Audio(interactive=False)
        with gr.Row():
            gr.Markdown('''<H1> Audio Source:''')
        with gr.Row():
            tog = gr.Radio(
                label="Input Type",
                choices=["Preset", "Custom"],
                value="Preset",
            )

        # Preset voices (visible by default).
        with gr.Group(visible=True) as group_1:
            speaker_num = gr.Dropdown(
                label="Speaker Voice", choices=num_list, value="1"
            )
            options = gr.Dropdown(
                AVAILABLE_PROMPTS,
                value="Speaker 1 (en)",
                label="Acoustic Prompt",
                elem_id="speaker_option",
            )
            semantic_btn = gr.Button("Run Semantic")

        # Custom reference audio (hidden until "Custom" is selected).
        with gr.Group(visible=False) as group_2:
            with gr.Row():
                with gr.Column():
                    # in_aud_mic = gr.Audio(source='microphone')
                    in_aud_file = gr.Audio(
                        label='Audio Source',
                        sources=['microphone', 'upload'],
                        interactive=True,
                        type='filepath',
                        value=test_audio,
                    )
                    aud_file = gr.File(interactive=False, visible=True)
                    with gr.Row():
                        start_time = gr.Textbox(
                            label="Start", value="0:00", placeholder="0:23"
                        )
                        end_time = gr.Textbox(
                            label="End", value="0:01", placeholder="1:12"
                        )
                    trim_clip_btn = gr.Button("Trim Clip")
                    trim_aud = gr.Audio(
                        label='Trimmed Audio Source',
                        sources=['upload'],
                        interactive=False,
                    )
                with gr.Column():
                    in_aud_yt = gr.Textbox(label="YouTube URL")
                    load_yt_btn = gr.Button("Load URL")
                    yt_vid = gr.Video(interactive=False)

    # Event wiring.
    semantic_btn.click(fn=gen_tts, inputs=[in_text, options], outputs=out_audio)
    tog.change(fn=tog_in, inputs=tog, outputs=[group_1, group_2])
    # in_aud_file.change(pre_aud, in_aud_file, aud_file)
    load_yt_btn.click(
        fn=load_video_yt,
        inputs=in_aud_yt,
        outputs=[yt_vid, in_aud_file, aud_file],
    )
    trim_clip_btn.click(
        fn=trim_clip,
        inputs=[in_aud_file, start_time, end_time],
        outputs=trim_aud,
    )
    alt_go_btn.click(
        fn=custom_bark,
        inputs=[in_text, tog, speaker_num, in_aud_file, trim_aud],
        outputs=out_audio,
    )

app.launch()