EZ-Voice-Clone-EZ

Runtime error

File size: 6,910 Bytes

d7dae2d
 
 
 
 
8e652d6
5c8292d
aa3ff9e
5c8292d
 
 
d7dae2d
75582c2
d7dae2d
9ffc810
d7dae2d
75fba1e
 
d7dae2d
 
 
 
3846d12
 
 
 
 
5c8292d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3846d12
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4853900
 
7cd15e7
7cb7a32
 
20d75a9
7cb7a32
 
3846d12
6d2a0cf
3846d12
 
 
 
 
 
 
 
 
 
 
 
6d2a0cf
d7dae2d
 
 
 
 
 
 
 
237e3b5
5bb4e7f
b1a2210
d7dae2d
 
 
 
 
 
 
 
 
 
 
194d529
b1a2210
d10cfe4
 
 
29cf60d
6d2a0cf
 
 
 
 
 
 
d7dae2d
64e99e6
d7dae2d
 
 
 
 
6d2a0cf
060a393
67d6685
3846d12
6d2a0cf
 
 
5c8292d
 
6d2a0cf
d7dae2d
 
 
7c0224a
75fba1e
2a4098b
d7dae2d
 
 
 
29cf60d
d7dae2d
 
 
fb7aeae
d7dae2d
5c8292d
6d2a0cf
ce5d7b2
d7dae2d
237e3b5
6d2a0cf
d7dae2d

import gradio as gr
import torch
from pathlib import Path
from pytube import YouTube
from pydub import AudioSegment
from transformers import AutoProcessor, BarkModel

import numpy as np
from bark import SAMPLE_RATE, generate_audio, preload_models
from bark.generation import SUPPORTED_LANGS

from TTS.api import TTS
import scipy
import uuid
import os

# Sample clip used as the initial value of the custom "Audio Source" input.
test_audio = "./shufflin.wav"

# One identifier for the whole process; the helper functions below embed it
# in the names of the temporary files they write.
uid = uuid.uuid4()

device = "cuda" if torch.cuda.is_available() else "cpu"

# Hugging Face Bark processor and model, loaded once at import time.
processor = AutoProcessor.from_pretrained("suno/bark-small")
model = BarkModel.from_pretrained("suno/bark-small").to(device)

# Choices offered by the "Speaker Voice" dropdown: "1" .. "10".
num_list = [str(number) for number in range(1, 11)]
lang_list = ["en", "de"]
# When True, gen_tts returns zeros instead of calling the Bark generator.
DEBUG_MODE = False

# (dropdown label, Bark preset name) for ten speakers per supported language.
_speaker_prompts = [
    (f"Speaker {n} ({lang})", f"{lang}_speaker_{n}")
    for _, lang in SUPPORTED_LANGS
    for n in range(10)
]

# Labels shown in the "Acoustic Prompt" dropdown, in display order.
AVAILABLE_PROMPTS = ["Unconditional", "Announcer"] + [
    label for label, _ in _speaker_prompts
]

# Maps each dropdown label to the history prompt handed to generate_audio.
PROMPT_LOOKUP = dict(_speaker_prompts)
PROMPT_LOOKUP["Unconditional"] = None
PROMPT_LOOKUP["Announcer"] = "announcer"

def gen_tts(text, history_prompt):
    """Synthesize ``text`` with Bark and return it as 16-bit PCM samples.

    Args:
        text: Prompt text passed to ``generate_audio``.
        history_prompt: A key of ``PROMPT_LOOKUP`` (one of the "Acoustic
            Prompt" labels). It is translated to the looked-up value before
            generation.

    Returns:
        A ``(SAMPLE_RATE, samples)`` tuple where ``samples`` is an ``int16``
        NumPy array.

    Raises:
        KeyError: If ``history_prompt`` is not a key of ``PROMPT_LOOKUP``.
    """
    preset = PROMPT_LOOKUP[history_prompt]
    # In debug mode the model is skipped and SAMPLE_RATE zero samples are used.
    waveform = (
        np.zeros(SAMPLE_RATE)
        if DEBUG_MODE
        else generate_audio(text, history_prompt=preset)
    )
    # Scale to the int16 range before converting the dtype.
    pcm = (waveform * 32767).astype(np.int16)
    return (SAMPLE_RATE, pcm)



def run_bark(text, n='1', lang='en'):
    """Generate speech with the module-level Bark model and save it as a WAV.

    Args:
        text: Text to synthesize.
        n: 1-based speaker number (anything ``int()`` accepts); it is shifted
            down by one to build the preset name.
        lang: Language code inserted into the preset name.

    Returns:
        The name of the written file, ``bark_out-<uuid>.wav``.
    """
    # A fresh UUID per call keeps output files from overwriting each other.
    out_path = f"bark_out-{uuid.uuid4()}.wav"
    voice_preset = f"v2/{lang}_speaker_{int(n)-1}"

    # NOTE(review): the processor output is not moved to `device` here;
    # confirm this works when the model has been placed on CUDA.
    model_inputs = processor(
        text=text,
        voice_preset=voice_preset,
        return_tensors="pt",
    )

    print("generating")
    generated = model.generate(
        **model_inputs,
        coarse_temperature=0.8,
        temperature=0.5,
        do_sample=True,
    )
    rate = model.generation_config.sample_rate

    print("writing")
    samples = generated.cpu().numpy().squeeze()
    scipy.io.wavfile.write(out_path, rate=rate, data=samples)
    return out_path


# Module-level TTS instance used by bark_ez for the "Preset" path. It is
# created from the ljspeech glow-tts model name and placed on the CPU.
# The commented-out line below is the same call with the fast_pitch model name.
#tts = TTS(model_name=f"tts_models/en/ljspeech/fast_pitch", progress_bar=False).to("cpu")
tts = TTS(model_name=f"tts_models/en/ljspeech/glow-tts", progress_bar=False).to("cpu")
# Print whatever list_models() returns at start-up.
# NOTE(review): this constructs a second TTS object just for the listing;
# confirm the call form is valid for the installed TTS version.
print(TTS().list_models())
def bark_ez(text, n='1', land='en'):
    """Speak ``text`` with the module-level preset ``tts`` model.

    Args:
        text: Text to synthesize.
        n: Accepted for signature compatibility; not used by this function.
        land: Accepted for signature compatibility; not used by this function.

    Returns:
        The name of the written file, ``<uuid>-output.wav``.
    """
    # A fresh UUID per call gives every request its own output file.
    out_path = f"{uuid.uuid4()}-output.wav"
    tts.tts_to_file(text, file_path=out_path)
    return out_path


def custom_bark(inp, tog, speaker,in_aud=None, trim_aud=None, in_aud_mic=None):
    """Synthesize ``inp`` with either the preset voice or a cloned voice.

    Args:
        inp: Text to synthesize.
        tog: ``"Preset"`` to use ``bark_ez``; ``"Custom"`` to clone the voice
            of a reference recording with the YourTTS model.
        speaker: Speaker selection forwarded to ``bark_ez`` in preset mode.
        in_aud: Reference audio (as received from the "Audio Source" input).
        trim_aud: Trimmed reference audio. Only its presence is checked: when
            it is not ``None`` the file ``<uid>-trim.wav`` is used as the
            reference, whatever value was passed.
        in_aud_mic: Microphone reference audio; lowest priority.

    Returns:
        The name of the generated WAV file, or ``None`` when ``tog`` is
        neither ``"Preset"`` nor ``"Custom"``.

    Raises:
        ValueError: In custom mode when no reference audio was supplied.
            Previously this surfaced as an ``UnboundLocalError``.
    """
    if tog == "Preset":
        return bark_ez(inp, speaker)
    if tog != "Custom":
        return None

    # Reference priority: trimmed clip, then the audio source, then the mic.
    # Identity checks are used so array-like values cannot break the test.
    if trim_aud is not None:
        speaker_wav = Path(f"{uid}-trim.wav")
    elif in_aud is not None:
        speaker_wav = in_aud
    elif in_aud_mic is not None:
        speaker_wav = in_aud_mic
    else:
        raise ValueError(
            "Custom mode needs a reference audio clip: upload, record or "
            "trim an audio source first."
        )

    # Named differently from the module-level `tts` to avoid shadowing it.
    cloning_tts = TTS(
        model_name="tts_models/multilingual/multi-dataset/your_tts",
        progress_bar=False,
    ).to(device)
    out_path = f"{uid}-output.wav"
    cloning_tts.tts_to_file(
        inp, speaker_wav=speaker_wav, language="en", file_path=out_path
    )
    return out_path
def load_video_yt(vid):
    """Download a YouTube video and an audio-only stream for the UI.

    Args:
        vid: URL of the YouTube video.

    Returns:
        A ``(video_path, audio_path, audio_name)`` tuple: the values returned
        by the two ``download`` calls, followed by the audio file name
        ``<uid>-tmp_aud.mp4``.

    Raises:
        ValueError: If the URL is empty, or if the video offers no
            progressive MP4 stream or no audio-only stream.
    """
    if vid is None or not str(vid).strip():
        raise ValueError("A YouTube URL is required.")

    yt = YouTube(vid)

    # first() returns None when the query matches nothing, so check it
    # explicitly before calling download on the result.
    video_stream = (
        yt.streams.filter(progressive=True, file_extension='mp4')
        .order_by('resolution')
        .desc()
        .first()
    )
    if video_stream is None:
        raise ValueError(f"No progressive MP4 stream available for {vid!r}.")

    # first() instead of [0]: same element, but no IndexError on an empty query.
    audio_stream = yt.streams.filter(only_audio=True).first()
    if audio_stream is None:
        raise ValueError(f"No audio-only stream available for {vid!r}.")

    video_path = video_stream.download(filename=f"{uid}-tmp.mp4")
    audio_name = f"{uid}-tmp_aud.mp4"
    audio_path = audio_stream.download(filename=audio_name)
    print (f'Video Length: {yt.length}')
    return video_path, audio_path, audio_name

def _timestamp_to_ms(stamp):
    """Convert ``"SS"``, ``"MM:SS"`` or ``"HH:MM:SS"`` text to milliseconds."""
    fields = str(stamp).strip().split(":")
    if not 1 <= len(fields) <= 3:
        raise ValueError(f"Invalid timestamp {stamp!r}; expected e.g. '1:12'.")
    try:
        values = [int(field) for field in fields]
    except ValueError:
        raise ValueError(
            f"Invalid timestamp {stamp!r}; expected e.g. '1:12'."
        ) from None
    if any(value < 0 for value in values):
        raise ValueError(f"Invalid timestamp {stamp!r}; values must be >= 0.")
    seconds = 0
    for value in values:
        seconds = seconds * 60 + value
    return seconds * 1000


def trim_clip(clip, start_t, end_t):
    """Cut ``clip`` to the span ``start_t``..``end_t`` and save it as a WAV.

    Args:
        clip: Path of the source audio file.
        start_t: Start position as ``"SS"``, ``"MM:SS"`` or ``"HH:MM:SS"``.
        end_t: End position in the same format; must be after ``start_t``.

    Returns:
        The name of the written file, ``<uid>-trim.wav``.

    Raises:
        ValueError: If no clip is given, a timestamp cannot be parsed, or the
            end is not after the start.
    """
    if clip is None:
        raise ValueError("No audio source to trim; load or record one first.")

    # Validate the range before doing any decoding work.
    start = _timestamp_to_ms(start_t)
    end = _timestamp_to_ms(end_t)
    if end <= start:
        raise ValueError(
            f"End time {end_t!r} must be after start time {start_t!r}."
        )

    # No explicit format: the source may be WAV (the default sample, uploads,
    # microphone recordings) or MP4 (YouTube audio), so let it be detected
    # instead of forcing "mp4".
    song = AudioSegment.from_file(str(clip))
    song_clip = song[start:end]
    out_path = f"{uid}-trim.wav"
    song_clip.export(out_path, format="wav")
    print("New Audio file is created and saved")

    return out_path
def pre_aud(inp):
    """Re-export the audio at ``inp`` to ``<uid>-tmp_aud.mp4``.

    Args:
        inp: Path of the source audio; it is read with the ``mp4`` format.

    Returns:
        ``inp`` unchanged.
    """
    print(inp)
    target = f"{uid}-tmp_aud.mp4"
    source = AudioSegment.from_file(Path(f'{inp}'), format="mp4")
    source.export(target, format="mp4")
    print(f'pre_aud:: {target}')
    return inp
def tog_in(tog):
    """Return visibility updates for the preset and custom groups.

    Args:
        tog: Selected input type, ``"Preset"`` or ``"Custom"``.

    Returns:
        A pair of ``gr.update`` objects (first for the preset group, second
        for the custom group) showing exactly the selected one, or ``None``
        for any other value.
    """
    if tog not in ("Preset", "Custom"):
        return None
    preset_selected = tog == "Preset"
    return (
        gr.update(visible=preset_selected),
        gr.update(visible=not preset_selected),
    )


# Gradio UI: layout first, then event wiring, then launch.
with gr.Blocks() as app:
    # Top panel: text to speak, the main run button and the audio result.
    with gr.Group():
        with gr.Row():
            in_text = gr.Textbox(lines = 6, max_lines = 20)
            with gr.Column():
                alt_go_btn = gr.Button()
                out_audio = gr.Audio(interactive=False)

        with gr.Row():
            gr.Markdown('''<H1> Audio Source:''')
        with gr.Row():
            # Switches between the preset group and the custom group below.
            tog = gr.Radio(label="Input Type", choices=["Preset","Custom"], value="Preset")    
    # Preset controls; visible by default to match the radio's initial value.
    with gr.Group(visible=True) as group_1:
        speaker_num = gr.Dropdown(label="Speaker Voice", choices=num_list,value="1")
        options = gr.Dropdown(AVAILABLE_PROMPTS, value="Speaker 1 (en)", label="Acoustic Prompt", elem_id="speaker_option") 
        semantic_btn=gr.Button("Run Semantic")
    # Custom (voice-cloning) controls; hidden until "Custom" is selected.
    with gr.Group(visible=False) as group_2:      
        with gr.Row():
            # Left column: reference audio and the trimming controls.
            with gr.Column():

                #in_aud_mic = gr.Audio(source='microphone')
                # Delivered to callbacks as a file path; pre-filled with the sample clip.
                in_aud_file = gr.Audio(label = 'Audio Source', sources=['microphone','upload'], interactive = True,type='filepath', value=test_audio)
                aud_file = gr.File(interactive=False,visible=True)
                with gr.Row():
                    start_time = gr.Textbox(label = "Start", value = "0:00", placeholder = "0:23")
                    end_time = gr.Textbox(label = "End", value = "0:01", placeholder = "1:12")
                trim_clip_btn = gr.Button("Trim Clip")
                # NOTE(review): no `type` is given here, so callbacks receive
                # Gradio's default audio value rather than a path - confirm.
                trim_aud = gr.Audio(label = 'Trimmed Audio Source', sources=['upload'], interactive = False)              
            # Right column: load a reference from a YouTube URL.
            with gr.Column():
                in_aud_yt = gr.Textbox(label="YouTube URL")
                load_yt_btn = gr.Button("Load URL")    
                yt_vid = gr.Video(interactive=False)
                
    # Event wiring.
    # "Run Semantic" synthesizes with the Bark generator and the chosen prompt.
    semantic_btn.click(gen_tts,[in_text,options],out_audio)
    # The radio toggles which of the two groups is visible.
    tog.change(tog_in,tog,[group_1,group_2])
    #in_aud_file.change(pre_aud,in_aud_file,aud_file)
    # The three values returned by load_video_yt fill the video player, the
    # audio source and the file component, in that order.
    load_yt_btn.click(load_video_yt, in_aud_yt, [yt_vid,in_aud_file,aud_file])
    trim_clip_btn.click(trim_clip,[in_aud_file, start_time, end_time],trim_aud)
    # Only five inputs are passed, so custom_bark's in_aud_mic keeps its default.
    alt_go_btn.click(custom_bark, [in_text,tog,speaker_num,in_aud_file,trim_aud], out_audio)

# Start the Gradio app.
app.launch()