Bark-simple / app.py
Omnibus's picture
Update app.py
dce80ca
raw
history blame
4.55 kB
import gradio as gr
import torch
from pathlib import Path
from transformers import AutoProcessor, BarkModel
import scipy
from pytube import YouTube
from pydub import AudioSegment
from TTS.api import TTS
#import ffmpeg
# device = "cuda" if torch.cuda.is_available() else "cpu"
# model = BarkModel.from_pretrained("suno/bark-small", torch_dtype=torch.float16).to(device)
# model.enable_cpu_offload()
device = "cpu"
processor = AutoProcessor.from_pretrained("suno/bark-small")
model = BarkModel.from_pretrained("suno/bark-small").to(device)
num_list = ["1","2","3","4","5","6","7","8","9","10"]
lang_list = ["en","de"]
def run_bark(text, n, lang):
#history_prompt = []
semantic_prompt=f"v2/{lang}_speaker_{int(n)-1}"
#text=["Hello, my name is Suno. And, uh — and I like pizza. [laughs] But I also have other interests such as playing tic tac toe."],
inputs = processor(text=text,
voice_preset = semantic_prompt,
return_tensors="pt",
)
speech_values = model.generate(**inputs, do_sample=True)
sampling_rate = model.generation_config.sample_rate
#sampling_rate = model.config.sample_rate
#sampling_rate = 24000
scipy.io.wavfile.write("bark_out.wav", rate=sampling_rate, data=speech_values.cpu().numpy().squeeze())
return ("bark_out.wav")
def custom_bark(inp):
speaker_wav=Path("Mid.mp3")
tts = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts", progress_bar=False).to(device)
tts.tts_to_file("This is voice cloning.", speaker_wav=speaker_wav, language="en", file_path="output.wav")
return ("output.wav")
def load_video_yt(vid):
yt = YouTube(vid)
vid = yt.streams.filter(progressive=True, file_extension='mp4').order_by('resolution').desc().first().download(filename="tmp.mp4")
vid_aud = yt.streams.filter(only_audio=True)[0].download(filename="tmp_aud.mp4")
print (yt.length)
return vid, vid_aud, "tmp_aud.mp4"
def trim_clip(clip, start_t, end_t):
clip = Path("tmp_aud.mp4")
#clip = "tmp_aud.mp3"
# Open an mp3 file
song = AudioSegment.from_file("tmp_aud.mp4",
format="mp4")
# start and end time
#start_min = 0
#start_sec = 10
#end_min = 0
#end_sec = 55
start_min = int(start_t.split(":",1)[0])
start_sec = int(start_t.split(":",1)[1])
end_min = int(end_t.split(":",1)[0])
end_sec = int(end_t.split(":",1)[1])
# pydub does things in milliseconds, so convert time
start = ((start_min*60)+start_sec)*1000
end = ((end_min*60)+end_sec)*1000
#start = 0
#end = 15*1000
# song clip of 10 seconds from starting
first_10_seconds = song[start: end]
# save file
first_10_seconds.export("Mid.mp3", format="mp3")
print("New Audio file is created and saved")
return "Mid.mp3"
with gr.Blocks() as app:
with gr.Column():
in_text = gr.Textbox()
with gr.Tab("Default"):
with gr.Row():
speaker_num = gr.Dropdown(label="Speaker Voice", choices=num_list,value="1")
speaker_lang = gr.Dropdown(label="Speaker Language", choices=lang_list,value="en")
go_btn = gr.Button()
with gr.Tab("Upload"):
with gr.Row():
with gr.Column():
in_aud_mic = gr.Audio(source='microphone')
in_aud_file = gr.Audio(source='upload', interactive = True)
aud_file = gr.File()
with gr.Column():
in_aud_yt = gr.Textbox(label="YouTube URL")
load_yt_btn = gr.Button("Load URL")
with gr.Column():
with gr.Row():
start_time = gr.Textbox(label = "Start", value = "0:00", placeholder = "0:23")
end_time = gr.Textbox(label = "End", value = "0:01", placeholder = "1:12")
trim_clip_btn = gr.Button("Trim Clip")
trim_aud = gr.Audio(source='upload', interactive = False)
alt_go_btn = gr.Button()
yt_vid = gr.Video(type = 'filepath')
#speaker_num = gr.Number(value=0)
with gr.Column():
out_audio = gr.Audio()
go_btn.click(run_bark,[in_text, speaker_num, speaker_lang],out_audio)
load_yt_btn.click(load_video_yt, in_aud_yt, [yt_vid,in_aud_file,aud_file])
trim_clip_btn.click(trim_clip,[aud_file, start_time, end_time],trim_aud)
alt_go_btn.click(custom_bark, trim_aud, out_audio)
app.launch()