import gradio as gr
import torch
import scipy.io.wavfile
from pathlib import Path
from pytube import YouTube
from pydub import AudioSegment
from transformers import AutoProcessor, BarkModel
from TTS.api import TTS
import uuid
import os
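# Note: pydub and the YouTube audio path below rely on an ffmpeg binary being available
# on PATH (it decodes/encodes the mp4 audio); scipy is used to write Bark's output WAV.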
# Default reference clip (./shufflin.wav) and a per-session id used in output filenames
test_audio = "./shufflin.wav"
uid = uuid.uuid4()

device = "cuda" if torch.cuda.is_available() else "cpu"

# Bark (small checkpoint) used for the "Preset" voices
processor = AutoProcessor.from_pretrained("suno/bark-small")
model = BarkModel.from_pretrained("suno/bark-small").to(device)

# Speaker numbers and languages available as Bark presets
num_list = ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10"]
lang_list = ["en", "de"]
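# Bark ships speaker presets named "v2/<lang>_speaker_0" through "_9"; run_bark() below
# maps the 1-based UI speaker number onto that scheme, e.g. n="1", lang="en" -> "v2/en_speaker_0".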
def run_bark(text, n='1', lang='en'):
    uid = uuid.uuid4()
    # Map the 1-based UI speaker number to Bark's 0-based preset name
    semantic_prompt = f"v2/{lang}_speaker_{int(n)-1}"
    # Tokenize and move the inputs (including the voice-preset history prompt) to the model's device
    inputs = processor(
        text=text,
        voice_preset=semantic_prompt,
        return_tensors="pt",
    ).to(device)
    print("generating")
    speech_values = model.generate(
        **inputs, coarse_temperature=0.8, temperature=0.5, do_sample=True
    )
    sampling_rate = model.generation_config.sample_rate
    print("writing")
    scipy.io.wavfile.write(
        f"bark_out-{uid}.wav",
        rate=sampling_rate,
        data=speech_values.cpu().numpy().squeeze(),
    )
    return f"bark_out-{uid}.wav"
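# Illustrative call (not part of the app's event wiring):
#   run_bark("Hello there.", n="3", lang="de")  # -> "bark_out-<uuid>.wav" using preset v2/de_speaker_2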
def custom_bark(inp, tog, in_aud=None, trim_aud=None, in_aud_mic=None):
    if tog == "Custom":
        # Choose the reference clip: a trimmed clip takes priority over the uploaded/recorded audio
        if in_aud_mic is not None:
            speaker_wav = in_aud_mic
        if in_aud is not None and trim_aud is None:
            speaker_wav = in_aud
        if trim_aud is not None:
            speaker_wav = Path(f"{uid}-trim.wav")
        # YourTTS voice cloning conditioned on the reference clip
        tts = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts", progress_bar=False).to(device)
        tts.tts_to_file(inp, speaker_wav=str(speaker_wav), language="en", file_path=f"{uid}-output.wav")
        return f"{uid}-output.wav"
    if tog == "Preset":
        return run_bark(inp)
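# Illustrative call (not part of the app's event wiring): clone the bundled reference voice.
# YourTTS only needs a short, clean reference clip passed as speaker_wav.
#   custom_bark("Testing one two three.", "Custom", in_aud=test_audio)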
def load_video_yt(vid):
    # Download the highest-resolution progressive mp4 plus an audio-only stream
    yt = YouTube(vid)
    vid = yt.streams.filter(progressive=True, file_extension='mp4').order_by('resolution').desc().first().download(filename=f"{uid}-tmp.mp4")
    vid_aud = yt.streams.filter(only_audio=True)[0].download(filename=f"{uid}-tmp_aud.mp4")
    print(f'Video Length: {yt.length}')
    return vid, vid_aud, f"{uid}-tmp_aud.mp4"
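# pytube's download() returns the local file path, so yt_vid receives the mp4 video while
# in_aud_file and aud_file both receive the audio-only "<uuid>-tmp_aud.mp4" download.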
def trim_clip(clip, start_t, end_t):
    # Cut the source audio between "m:ss" timestamps and export the slice as WAV
    clip = Path(f"{clip}")
    # Let ffmpeg detect the container (wav from uploads/mic, mp4 from the YouTube download)
    song = AudioSegment.from_file(clip)
    start_min = int(start_t.split(":", 1)[0])
    start_sec = int(start_t.split(":", 1)[1])
    end_min = int(end_t.split(":", 1)[0])
    end_sec = int(end_t.split(":", 1)[1])
    # pydub slices in milliseconds
    start = ((start_min * 60) + start_sec) * 1000
    end = ((end_min * 60) + end_sec) * 1000
    song_clip = song[start:end]
    song_clip.export(f"{uid}-trim.wav", format="wav")
    print("New audio file is created and saved")
    return f"{uid}-trim.wav"
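# Illustrative call (not part of the app's event wiring): keep 0:05-0:20 of the downloaded audio.
#   trim_clip(f"{uid}-tmp_aud.mp4", "0:05", "0:20")  # -> "<uuid>-trim.wav", 15 seconds long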
def pre_aud(inp):
    # Re-export the selected audio as mp4 under this session's uid
    print(inp)
    song = AudioSegment.from_file(Path(f'{inp}'), format="mp4")
    song.export(f"{uid}-tmp_aud.mp4", format="mp4")
    print(f'pre_aud:: {uid}-tmp_aud.mp4')
    return inp
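# Note: pre_aud is currently unused; the in_aud_file.change(...) hook that called it is
# left commented out in the UI wiring below.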
with gr.Blocks() as app:
    with gr.Group():
        with gr.Row():
            in_text = gr.Textbox(lines=6, max_lines=20)
            with gr.Column():
                alt_go_btn = gr.Button()
                out_audio = gr.Audio(interactive=False)
    with gr.Group():
        with gr.Row():
            gr.Markdown('''<H1> Audio Source:''')
        with gr.Row():
            tog = gr.Radio(label="Input Type", choices=["Preset", "Custom"], value="Preset")
        with gr.Row():
            with gr.Column():
                in_aud_file = gr.Audio(label='Audio Source', sources=['microphone', 'upload'], interactive=True, type='filepath', value=test_audio)
                aud_file = gr.File(interactive=False, visible=True)
                with gr.Row():
                    start_time = gr.Textbox(label="Start", value="0:00", placeholder="0:23")
                    end_time = gr.Textbox(label="End", value="0:01", placeholder="1:12")
                trim_clip_btn = gr.Button("Trim Clip")
                trim_aud = gr.Audio(label='Trimmed Audio Source', sources=['upload'], interactive=False)
            with gr.Column():
                in_aud_yt = gr.Textbox(label="YouTube URL")
                load_yt_btn = gr.Button("Load URL")
                yt_vid = gr.Video(interactive=False)

    # Event wiring
    #in_aud_file.change(pre_aud, in_aud_file, aud_file)
    load_yt_btn.click(load_video_yt, in_aud_yt, [yt_vid, in_aud_file, aud_file])
    trim_clip_btn.click(trim_clip, [in_aud_file, start_time, end_time], trim_aud)
    alt_go_btn.click(custom_bark, [in_text, tog, in_aud_file, trim_aud], out_audio)

app.launch()
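# If running outside Spaces, app.launch(share=True) can be used to get a temporary public
# link; the default launch options are assumed to be sufficient for the hosted Space.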