Spaces:
Runtime error
Runtime error
File size: 5,782 Bytes
d7dae2d 8e652d6 d7dae2d 75582c2 d7dae2d 9ffc810 d7dae2d 75fba1e d7dae2d 3846d12 4853900 7cd15e7 7cb7a32 20d75a9 7cb7a32 3846d12 6d2a0cf 3846d12 6d2a0cf d7dae2d 237e3b5 5bb4e7f b1a2210 d7dae2d 194d529 b1a2210 d10cfe4 29cf60d 6d2a0cf d7dae2d 64e99e6 d7dae2d 6d2a0cf 060a393 67d6685 3846d12 6d2a0cf d7dae2d 7c0224a 75fba1e 2a4098b d7dae2d 29cf60d d7dae2d fb7aeae d7dae2d 6d2a0cf ce5d7b2 d7dae2d 237e3b5 6d2a0cf d7dae2d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 |
import gradio as gr
import torch
from pathlib import Path
from pytube import YouTube
from pydub import AudioSegment
from transformers import AutoProcessor, BarkModel
from TTS.api import TTS
import scipy
import uuid
import os
# Reference clip pre-loaded into the "Custom" audio input of the UI.
test_audio = "./shufflin.wav"

# Process-wide id used to name the temporary download / trim files that the
# helper functions in this module write to the working directory.
uid = uuid.uuid4()

# Run on the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Bark checkpoint and its matching processor; the model is placed on `device`.
processor = AutoProcessor.from_pretrained("suno/bark-small")
model = BarkModel.from_pretrained("suno/bark-small")
model = model.to(device)

# Speaker numbers offered to the user: "1" through "10".
num_list = [str(number) for number in range(1, 11)]
# Language codes usable in a Bark voice preset name.
lang_list = ["en", "de"]
def run_bark(text, n='1', lang='en'):
    """Synthesize ``text`` with the Bark model and write it to a WAV file.

    Args:
        text: Text to be spoken.
        n: 1-based speaker number (string or int). It is converted to the
            0-based index used in the Bark voice preset name.
        lang: Language code used in the voice preset name, e.g. ``'en'``.

    Returns:
        Name of the WAV file written to the current working directory.

    Raises:
        ValueError: If ``n`` cannot be converted to an integer.
    """
    # Imported explicitly: a bare ``import scipy`` does not guarantee that
    # the ``scipy.io.wavfile`` submodule has been loaded.
    from scipy.io import wavfile

    # A fresh id per call keeps concurrent requests from sharing a file.
    out_path = f"bark_out-{uuid.uuid4()}.wav"
    voice_preset = f"v2/{lang}_speaker_{int(n) - 1}"

    inputs = processor(
        text=text,
        voice_preset=voice_preset,
        return_tensors="pt",
    )
    # The model was moved to `device`; its inputs must live there as well,
    # otherwise generation fails with a device mismatch on CUDA.
    inputs = inputs.to(device)

    print("generating")
    speech_values = model.generate(
        **inputs, coarse_temperature=0.8, temperature=0.5, do_sample=True
    )
    sampling_rate = model.generation_config.sample_rate

    print("writing")
    wavfile.write(
        out_path,
        rate=sampling_rate,
        data=speech_values.cpu().numpy().squeeze(),
    )
    return out_path
# Alternative model, currently disabled:
#tts = TTS(model_name=f"tts_models/en/ljspeech/fast_pitch", progress_bar=False).to("cpu")
# Module-level TTS instance (glow-tts model, kept on the CPU); bark_ez uses it
# to synthesize speech for the "Preset" input type.
tts = TTS(model_name=f"tts_models/en/ljspeech/glow-tts", progress_bar=False).to("cpu")
# Print what the TTS library reports from list_models() at startup.
print(TTS().list_models())
def bark_ez(text, n='1', land='en'):
    """Speak ``text`` with the module-level ``tts`` model.

    ``n`` and ``land`` are accepted for call compatibility but are not used.

    Returns:
        Name of the WAV file written to the current working directory.
    """
    out_path = "{}-output.wav".format(uuid.uuid4())
    tts.tts_to_file(text, file_path=out_path)
    return out_path
def custom_bark(inp, tog, speaker, in_aud=None, trim_aud=None, in_aud_mic=None):
    """Synthesize ``inp`` with either the preset voice or a reference recording.

    Args:
        inp: Text to be spoken.
        tog: Input type selected in the UI, ``"Preset"`` or ``"Custom"``.
        speaker: Speaker choice forwarded to ``bark_ez`` in preset mode.
        in_aud: Path of an uploaded / recorded reference clip, or ``None``.
        trim_aud: Value of the trimmed-audio component, or ``None``. Only its
            presence is checked; the audio itself is read from the
            ``{uid}-trim.wav`` file written by ``trim_clip``.
        in_aud_mic: Path of a microphone recording, or ``None``.

    Returns:
        Name of the generated WAV file, or ``None`` when ``tog`` is neither
        ``"Preset"`` nor ``"Custom"``.

    Raises:
        gr.Error: In custom mode when no reference audio was provided.
    """
    if tog == "Preset":
        return bark_ez(inp, speaker)
    if tog != "Custom":
        return None

    # Precedence: trimmed clip, then uploaded clip, then microphone clip.
    if trim_aud is not None:
        speaker_wav = Path(f"{uid}-trim.wav")
    elif in_aud is not None:
        speaker_wav = in_aud
    elif in_aud_mic is not None:
        speaker_wav = in_aud_mic
    else:
        # Without this guard the synthesis call below would fail on an
        # unbound variable instead of telling the user what is missing.
        raise gr.Error("Custom mode needs a reference audio clip.")

    # A fresh id per call keeps concurrent requests from overwriting each
    # other's output file.
    out_path = f"{uuid.uuid4()}-output.wav"

    # NOTE(review): the model is re-loaded on every request, which is slow;
    # consider loading it once if memory allows.
    voice_model = TTS(
        model_name="tts_models/multilingual/multi-dataset/your_tts",
        progress_bar=False,
    ).to(device)
    voice_model.tts_to_file(
        inp, speaker_wav=speaker_wav, language="en", file_path=out_path
    )
    return out_path
def load_video_yt(vid):
    """Download the video and an audio-only stream for a YouTube URL.

    Args:
        vid: URL passed to ``YouTube``.

    Returns:
        A 3-tuple: the value returned by the video download, the value
        returned by the audio download, and the audio file name.
    """
    yt = YouTube(vid)
    video_name = f"{uid}-tmp.mp4"
    audio_name = f"{uid}-tmp_aud.mp4"

    # Highest-resolution progressive mp4 stream.
    progressive_mp4 = yt.streams.filter(progressive=True, file_extension='mp4')
    best_video = progressive_mp4.order_by('resolution').desc().first()
    video_path = best_video.download(filename=video_name)

    # First audio-only stream in the listing.
    audio_streams = yt.streams.filter(only_audio=True)
    audio_path = audio_streams[0].download(filename=audio_name)

    print (f'Video Length: {yt.length}')
    return video_path, audio_path, audio_name
def _timestamp_to_ms(stamp):
    """Convert ``"m:ss"`` (also ``"h:mm:ss"`` or plain seconds) to milliseconds."""
    seconds = 0
    for part in str(stamp).strip().split(":"):
        seconds = seconds * 60 + int(part)
    return seconds * 1000


def trim_clip(clip, start_t, end_t):
    """Cut ``clip`` to the ``start_t``..``end_t`` range and save it as WAV.

    Args:
        clip: Path of the audio file to trim.
        start_t: Start timestamp, ``"m:ss"`` (``"h:mm:ss"`` and plain
            seconds are accepted too).
        end_t: End timestamp in the same format.

    Returns:
        Name of the trimmed WAV file, ``{uid}-trim.wav``.

    Raises:
        gr.Error: If no clip was supplied.
        ValueError: If a timestamp contains a non-integer field.
    """
    if clip is None:
        raise gr.Error("Load or record an audio clip before trimming.")

    # No explicit format: the source may be wav (upload / microphone /
    # bundled sample) or mp4 (YouTube download), so let pydub detect it.
    song = AudioSegment.from_file(str(Path(f"{clip}")))

    start = _timestamp_to_ms(start_t)
    end = _timestamp_to_ms(end_t)

    out_path = f"{uid}-trim.wav"
    song[start:end].export(out_path, format="wav")
    print("New Audio file is created and saved")
    return out_path
def pre_aud(inp):
    """Re-export the audio at ``inp`` to ``{uid}-tmp_aud.mp4``.

    Args:
        inp: Path of the source audio file (read as mp4).

    Returns:
        ``inp`` unchanged.
    """
    print(inp)
    target = f"{uid}-tmp_aud.mp4"
    source = Path(f'{inp}')
    AudioSegment.from_file(source, format="mp4").export(target, format="mp4")
    print(f'pre_aud:: {target}')
    return inp
def tog_in(tog):
    """Build visibility updates for the preset and custom input groups.

    Args:
        tog: Selected input type, ``"Preset"`` or ``"Custom"``.

    Returns:
        A pair of ``gr.update`` objects (first group, second group) with
        exactly one of them visible, or ``None`` for any other value.
    """
    if tog not in ("Preset", "Custom"):
        return None
    show_preset = tog == "Preset"
    return (gr.update(visible=show_preset), gr.update(visible=not show_preset))
# UI definition. NOTE(review): the nesting below is reconstructed from the
# order of the widgets; confirm the grouping against the intended layout.
with gr.Blocks() as app:
    with gr.Group():
        # Text to speak, the trigger button and the resulting audio.
        with gr.Row():
            in_text = gr.Textbox(lines=6, max_lines=20)
            with gr.Column():
                alt_go_btn = gr.Button()
                out_audio = gr.Audio(interactive=False)
        with gr.Row():
            gr.Markdown('''<H1> Audio Source:''')
        with gr.Row():
            tog = gr.Radio(
                label="Input Type",
                choices=["Preset", "Custom"],
                value="Preset",
            )
        # Shown while "Preset" is selected.
        with gr.Group(visible=True) as group_1:
            speaker_num = gr.Dropdown(
                label="Speaker Voice",
                choices=num_list,
                value="1",
            )
        # Shown while "Custom" is selected.
        with gr.Group(visible=False) as group_2:
            with gr.Row():
                with gr.Column():
                    #in_aud_mic = gr.Audio(source='microphone')
                    in_aud_file = gr.Audio(
                        label='Audio Source',
                        sources=['microphone', 'upload'],
                        interactive=True,
                        type='filepath',
                        value=test_audio,
                    )
                    aud_file = gr.File(interactive=False, visible=True)
                    with gr.Row():
                        start_time = gr.Textbox(
                            label="Start", value="0:00", placeholder="0:23"
                        )
                        end_time = gr.Textbox(
                            label="End", value="0:01", placeholder="1:12"
                        )
                    trim_clip_btn = gr.Button("Trim Clip")
                    trim_aud = gr.Audio(
                        label='Trimmed Audio Source',
                        sources=['upload'],
                        interactive=False,
                    )
                with gr.Column():
                    in_aud_yt = gr.Textbox(label="YouTube URL")
                    load_yt_btn = gr.Button("Load URL")
                    yt_vid = gr.Video(interactive=False)

    # Event wiring.
    tog.change(tog_in, tog, [group_1, group_2])
    #in_aud_file.change(pre_aud,in_aud_file,aud_file)
    load_yt_btn.click(load_video_yt, in_aud_yt, [yt_vid, in_aud_file, aud_file])
    trim_clip_btn.click(trim_clip, [in_aud_file, start_time, end_time], trim_aud)
    alt_go_btn.click(
        custom_bark,
        [in_text, tog, speaker_num, in_aud_file, trim_aud],
        out_audio,
    )

app.launch()