import gradio as gr
import torch
from pathlib import Path
from pytube import YouTube
from pydub import AudioSegment
from transformers import AutoProcessor, BarkModel
import numpy as np
from bark import SAMPLE_RATE, generate_audio, preload_models
from bark.generation import SUPPORTED_LANGS
from TTS.api import TTS
import scipy.io.wavfile
import uuid
import os

test_audio = "./shufflin.wav"
uid = uuid.uuid4()
device = "cuda" if torch.cuda.is_available() else "cpu"

# Bark via transformers (used by run_bark); the suno-ai bark package backs gen_tts
processor = AutoProcessor.from_pretrained("suno/bark-small")
model = BarkModel.from_pretrained("suno/bark-small").to(device)
# preload_models()  # optionally warm the suno-ai bark weights at startup

num_list = ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10"]
lang_list = ["en", "de"]
DEBUG_MODE = False

# Build the prompt dropdown: one entry per (speaker, language) pair plus two specials
AVAILABLE_PROMPTS = ["Unconditional", "Announcer"]
PROMPT_LOOKUP = {}
for _, lang in SUPPORTED_LANGS:
    for n in range(10):
        label = f"Speaker {n} ({lang})"
        AVAILABLE_PROMPTS.append(label)
        PROMPT_LOOKUP[label] = f"{lang}_speaker_{n}"
PROMPT_LOOKUP["Unconditional"] = None
PROMPT_LOOKUP["Announcer"] = "announcer"


def gen_tts(text, history_prompt):
    """Generate speech with the suno-ai bark package for a preset acoustic prompt."""
    history_prompt = PROMPT_LOOKUP[history_prompt]
    if DEBUG_MODE:
        audio_arr = np.zeros(SAMPLE_RATE)
    else:
        audio_arr = generate_audio(text, history_prompt=history_prompt)
    # Convert float audio in [-1, 1] to 16-bit PCM for the gr.Audio component
    audio_arr = (audio_arr * 32767).astype(np.int16)
    return (SAMPLE_RATE, audio_arr)


def run_bark(text, n='1', lang='en'):
    """Generate speech with the transformers Bark checkpoint and write it to a wav file."""
    uid = uuid.uuid4()
    semantic_prompt = f"v2/{lang}_speaker_{int(n) - 1}"
    # Move the processor tensors to the model device, matching the HF Bark examples
    inputs = processor(text=text, voice_preset=semantic_prompt, return_tensors="pt").to(device)
    print("generating")
    speech_values = model.generate(
        **inputs,
        coarse_temperature=0.8,
        temperature=0.5,
        do_sample=True,
    )
    sampling_rate = model.generation_config.sample_rate
    print("writing")
    scipy.io.wavfile.write(
        f"bark_out-{uid}.wav",
        rate=sampling_rate,
        data=speech_values.cpu().numpy().squeeze(),
    )
    return f"bark_out-{uid}.wav"


# Coqui TTS model for the "Preset" path; fast_pitch is a drop-in alternative:
# tts = TTS(model_name="tts_models/en/ljspeech/fast_pitch", progress_bar=False).to("cpu")
tts = TTS(model_name="tts_models/en/ljspeech/glow-tts", progress_bar=False).to("cpu")
print(TTS().list_models())


def bark_ez(text, n='1', lang='en'):
    """Synthesize text with the preloaded Coqui glow-tts model (n and lang are currently unused)."""
    uid = uuid.uuid4()
    tts.tts_to_file(text, file_path=f"{uid}-output.wav")
    return f"{uid}-output.wav"


def custom_bark(inp, tog, speaker, in_aud=None, trim_aud=None, in_aud_mic=None):
    """Route generation: clone a custom voice with your_tts, or fall back to the preset model."""
    if tog == "Custom":
        speaker_wav = None  # expects at least one reference audio below
        if in_aud_mic is not None:
            speaker_wav = in_aud_mic
        if in_aud is not None and trim_aud is None:
            speaker_wav = in_aud
        if trim_aud is not None:
            speaker_wav = Path(f"{uid}-trim.wav")
        tts = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts", progress_bar=False).to(device)
        tts.tts_to_file(inp, speaker_wav=speaker_wav, language="en", file_path=f"{uid}-output.wav")
        return f"{uid}-output.wav"
    if tog == "Preset":
        return bark_ez(inp, speaker)


def load_video_yt(vid):
    """Download a YouTube video and its audio track to temporary files."""
    yt = YouTube(vid)
    vid = yt.streams.filter(progressive=True, file_extension='mp4').order_by('resolution').desc().first().download(filename=f"{uid}-tmp.mp4")
    vid_aud = yt.streams.filter(only_audio=True)[0].download(filename=f"{uid}-tmp_aud.mp4")
    print(f'Video Length: {yt.length}')
    return vid, vid_aud, f"{uid}-tmp_aud.mp4"


def trim_clip(clip, start_t, end_t):
    """Cut the source audio between "m:ss" timestamps and export the clip as wav."""
    clip = Path(f"{clip}")
    # Let pydub/ffmpeg infer the container from the extension: the source may be
    # the bundled .wav sample or the downloaded .mp4 audio
    song = AudioSegment.from_file(clip)
    start_min, start_sec = (int(p) for p in start_t.split(":", 1))
    end_min, end_sec = (int(p) for p in end_t.split(":", 1))
    start = ((start_min * 60) + start_sec) * 1000  # pydub slices in milliseconds
    end = ((end_min * 60) + end_sec) * 1000
    song_clip = song[start:end]
    song_clip.export(f"{uid}-trim.wav", format="wav")
    print("New audio file created and saved")
    return f"{uid}-trim.wav"


def pre_aud(inp):
    """Re-export uploaded audio as mp4 (currently unwired; see the commented .change below)."""
    print(inp)
    song = AudioSegment.from_file(Path(f'{inp}'), format="mp4")
    song.export(f"{uid}-tmp_aud.mp4", format="mp4")
    print(f'pre_aud:: {uid}-tmp_aud.mp4')
    return inp


def tog_in(tog):
    """Show the preset controls or the custom-voice controls depending on the radio value."""
    if tog == "Preset":
        return (gr.update(visible=True), gr.update(visible=False))
    if tog == "Custom":
        return (gr.update(visible=False), gr.update(visible=True))
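# Optional CLI smoke test, gated off by default. The SMOKE_TEST variable and the
# output filename are illustrative, not part of the app; this assumes the Bark
# weights are cached locally or can be downloaded on first use.
if os.environ.get("SMOKE_TEST"):
    sr, arr = gen_tts("Hello from Bark.", "Speaker 1 (en)")
    scipy.io.wavfile.write("smoke_test.wav", rate=sr, data=arr)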
with gr.Blocks() as app:
    with gr.Group():
        with gr.Row():
            in_text = gr.Textbox(label="Text", lines=6, max_lines=20)
            with gr.Column():
                alt_go_btn = gr.Button("Generate")
                out_audio = gr.Audio(interactive=False)
    with gr.Row():
        gr.Markdown('''Audio Source:''')
    with gr.Row():
        tog = gr.Radio(label="Input Type", choices=["Preset", "Custom"], value="Preset")

    # Preset controls
    with gr.Group(visible=True) as group_1:
        speaker_num = gr.Dropdown(label="Speaker Voice", choices=num_list, value="1")
        options = gr.Dropdown(AVAILABLE_PROMPTS, value="Speaker 1 (en)", label="Acoustic Prompt", elem_id="speaker_option")
        semantic_btn = gr.Button("Run Semantic")

    # Custom-voice controls: record/upload a reference clip, trim it, or pull audio from YouTube
    with gr.Group(visible=False) as group_2:
        with gr.Row():
            with gr.Column():
                in_aud_file = gr.Audio(label='Audio Source', sources=['microphone', 'upload'], interactive=True, type='filepath', value=test_audio)
                aud_file = gr.File(interactive=False, visible=True)
                with gr.Row():
                    start_time = gr.Textbox(label="Start", value="0:00", placeholder="0:23")
                    end_time = gr.Textbox(label="End", value="0:01", placeholder="1:12")
                trim_clip_btn = gr.Button("Trim Clip")
                trim_aud = gr.Audio(label='Trimmed Audio Source', sources=['upload'], interactive=False)
            with gr.Column():
                in_aud_yt = gr.Textbox(label="YouTube URL")
                load_yt_btn = gr.Button("Load URL")
                yt_vid = gr.Video(interactive=False)

    semantic_btn.click(gen_tts, [in_text, options], out_audio)
    tog.change(tog_in, tog, [group_1, group_2])
    #in_aud_file.change(pre_aud, in_aud_file, aud_file)
    load_yt_btn.click(load_video_yt, in_aud_yt, [yt_vid, in_aud_file, aud_file])
    trim_clip_btn.click(trim_clip, [in_aud_file, start_time, end_time], trim_aud)
    alt_go_btn.click(custom_bark, [in_text, tog, speaker_num, in_aud_file, trim_aud], out_audio)

app.launch()
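# Hosting note: these are standard Gradio options, mentioned as hints rather
# than requirements. app.queue().launch() serializes long generation requests,
# and app.launch(share=True) exposes a temporary public URL when running remotely.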