Spaces:

Omnibus
/

Bark-simple

Running

File size: 4,665 Bytes

cfd58c5
2a5abe3
69b1728
88cd06b
cfd58c5
d10d42c
1567aff
dce80ca
82f1825
cfd58c5
 
88cd06b
 
 
 
 
 
 
cfd58c5
88cd06b
b026f7e
af8075f
b82d659
b026f7e
f4e63b7
99cf07f
a2597a8
 
 
f3aa612
cfd58c5
 
b82d659
 
 
 
 
eb278b4
cfd58c5
4ec2aff
b82d659
cfd58c5
 
 
dce80ca
 
 
6113dfa
dce80ca
 
d10d42c
 
 
1cb3525
a8e72fd
1cb3525
0f61089
99954b1
1cb3525
a9f1358
1567aff
1cb3525
 
1567aff
 
dce80ca
 
 
 
99954b1
 
 
 
1567aff
99954b1
 
 
 
1567aff
 
 
 
 
 
82f1825
1567aff
d10d42c
cfd58c5
107d5cd
 
3f1c26e
d10d42c
 
 
fad8edf
3f1c26e
071b368
99954b1
 
 
 
 
 
 
 
 
 
 
 
 
 
fad8edf
dce80ca
b026f7e
cfd58c5
107d5cd
 
 
b026f7e
402d823
99954b1
6113dfa
cfd58c5

import gradio as gr
import torch
from pathlib import Path
from transformers import AutoProcessor, BarkModel
import scipy
from pytube import YouTube
from pydub import AudioSegment
from TTS.api import TTS
#import ffmpeg


# Inference is pinned to the CPU. A CUDA variant (float16 weights combined with
# model.enable_cpu_offload()) is not enabled in this file.
device = "cpu"

# Bark processor and model are loaded once, when the module is imported.
_BARK_CHECKPOINT = "suno/bark-small"
processor = AutoProcessor.from_pretrained(_BARK_CHECKPOINT)
model = BarkModel.from_pretrained(_BARK_CHECKPOINT).to(device)

# Dropdown choices: 1-based speaker numbers ("1".."10") and language codes.
num_list = [str(number) for number in range(1, 11)]
lang_list = ["en", "de"]
#SAMPLE_RATE = 24_000
def run_bark(text, n, lang):
    """Synthesize *text* with the Bark model and write it to a WAV file.

    Args:
        text: Prompt to be spoken.
        n: 1-based speaker number (any value ``int()`` accepts); it is shifted
            down by one to form the index used in the voice preset name.
        lang: Language code inserted into the voice preset name, e.g. ``"en"``.

    Returns:
        The string ``"bark_out.wav"``, i.e. the file the audio was written to.
    """
    speaker_index = int(n) - 1
    voice_preset = f"v2/{lang}_speaker_{speaker_index}"
    encoded = processor(text=text, voice_preset=voice_preset, return_tensors="pt")

    print("generating")
    sampling_options = {
        "coarse_temperature": 0.8,
        "temperature": 0.5,
        "do_sample": True,
    }
    waveform = model.generate(**encoded, **sampling_options)
    # The output rate comes from the model's own generation config.
    rate = model.generation_config.sample_rate

    print("writing")
    out_path = "bark_out.wav"
    samples = waveform.cpu().numpy().squeeze()
    scipy.io.wavfile.write(out_path, rate=rate, data=samples)
    return out_path

def custom_bark(inp):
    """Speak *inp* with the ``your_tts`` model, using ``Mid.mp3`` as speaker reference.

    The TTS model is instantiated on every call and moved to ``device``.

    Args:
        inp: Text to synthesize (English is requested from the model).

    Returns:
        The string ``"output.wav"``, i.e. the file the audio was written to.
    """
    reference_clip = Path("Mid.mp3")
    out_path = "output.wav"

    voice_cloner = TTS(
        model_name="tts_models/multilingual/multi-dataset/your_tts",
        progress_bar=False,
    )
    voice_cloner = voice_cloner.to(device)
    voice_cloner.tts_to_file(
        inp,
        speaker_wav=reference_clip,
        language="en",
        file_path=out_path,
    )
    return out_path
    
def load_video_yt(vid):
    """Download a YouTube video and an audio-only stream to fixed local files.

    Args:
        vid: URL of the YouTube video.

    Returns:
        A 3-tuple ``(video_path, audio_path, "tmp_aud.mp4")``; the first two are
        whatever pytube's ``download`` returns for ``tmp.mp4`` and
        ``tmp_aud.mp4`` respectively.
    """
    yt = YouTube(vid)
    audio_name = "tmp_aud.mp4"

    # Progressive MP4 streams, sorted by resolution descending; take the first.
    progressive_mp4 = yt.streams.filter(progressive=True, file_extension='mp4')
    best_video = progressive_mp4.order_by('resolution').desc().first()
    video_path = best_video.download(filename="tmp.mp4")

    # First audio-only stream in the order pytube lists them.
    audio_streams = yt.streams.filter(only_audio=True)
    audio_path = audio_streams[0].download(filename=audio_name)

    print (yt.length)
    return video_path, audio_path, audio_name

def _timestamp_to_ms(stamp):
    """Convert a ``"SS"``, ``"M:SS"`` or ``"H:MM:SS"`` string to milliseconds."""
    fields = [int(field) for field in str(stamp).strip().split(":")]
    if len(fields) > 3:
        raise ValueError(f"Unrecognised timestamp {stamp!r}; expected SS, M:SS or H:MM:SS")
    if any(value < 0 for value in fields):
        raise ValueError(f"Timestamp {stamp!r} must not contain negative values")
    seconds = 0
    for value in fields:
        # Each further field is worth 1/60 of the previous one.
        seconds = seconds * 60 + value
    return seconds * 1000


def trim_clip(clip, start_t, end_t):
    """Cut ``[start_t, end_t)`` out of ``tmp_aud.mp4`` and save it as ``Mid.mp3``.

    Args:
        clip: Accepted so the UI can pass its file component, but not read;
            the source audio is always the local file ``tmp_aud.mp4``.
        start_t: Start of the excerpt as ``"SS"``, ``"M:SS"`` or ``"H:MM:SS"``.
        end_t: End of the excerpt, in the same format as ``start_t``.

    Returns:
        The string ``"Mid.mp3"``, i.e. the file the excerpt was exported to.

    Raises:
        ValueError: If a timestamp cannot be parsed, contains a negative
            field, or ``end_t`` is not strictly after ``start_t``.
    """
    # Validate the requested range before decoding the source audio.
    start_ms = _timestamp_to_ms(start_t)
    end_ms = _timestamp_to_ms(end_t)
    if end_ms <= start_ms:
        raise ValueError(
            f"End time {end_t!r} must be later than start time {start_t!r}"
        )

    song = AudioSegment.from_file("tmp_aud.mp4", format="mp4")

    # pydub slices audio by milliseconds.
    excerpt = song[start_ms:end_ms]

    excerpt.export("Mid.mp3", format="mp3")
    print("New Audio file is created and saved")

    return "Mid.mp3"
    
# Gradio UI: one shared text prompt, a "Default" tab that uses Bark's preset
# voices, and an "Upload" tab that prepares a reference clip for custom_bark.
with gr.Blocks() as app:
    with gr.Column():
        # Text prompt shared by both generate buttons (go_btn and alt_go_btn).
        in_text = gr.Textbox()
        with gr.Tab("Default"):
            with gr.Row():
                # Both dropdown values are forwarded to run_bark.
                speaker_num = gr.Dropdown(label="Speaker Voice", choices=num_list,value="1")
                speaker_lang = gr.Dropdown(label="Speaker Language", choices=lang_list,value="en")
            go_btn = gr.Button()
        with gr.Tab("Upload"):
            with gr.Row():
                with gr.Column():
                    # NOTE: in_aud_mic is not wired to any event handler in this block.
                    in_aud_mic = gr.Audio(source='microphone')
                    # in_aud_file and aud_file are filled by load_video_yt.
                    in_aud_file = gr.Audio(source='upload', interactive = True)
                    aud_file = gr.File()
                with gr.Column():
                    in_aud_yt = gr.Textbox(label="YouTube URL")
                    load_yt_btn = gr.Button("Load URL")
                with gr.Column():
                    with gr.Row():
                        # Clip boundaries, passed to trim_clip as text (e.g. "0:23").
                        start_time = gr.Textbox(label = "Start", value = "0:00", placeholder = "0:23")
                        end_time = gr.Textbox(label = "End", value = "0:01", placeholder = "1:12")

                    trim_clip_btn = gr.Button("Trim Clip")
                    # Read-only player that receives trim_clip's return value.
                    trim_aud = gr.Audio(source='upload', interactive = False)
            alt_go_btn = gr.Button()
            # Receives the first value returned by load_video_yt.
            yt_vid = gr.Video(type = 'filepath')
        #speaker_num = gr.Number(value=0)

    with gr.Column():
        # Shared output player for both run_bark and custom_bark.
        out_audio = gr.Audio()

    # Event wiring: handler, input component(s), output component(s).
    go_btn.click(run_bark,[in_text, speaker_num, speaker_lang],out_audio)
    load_yt_btn.click(load_video_yt, in_aud_yt, [yt_vid,in_aud_file,aud_file])
    # aud_file is passed as the first argument, but trim_clip reads tmp_aud.mp4 directly.
    trim_clip_btn.click(trim_clip,[aud_file, start_time, end_time],trim_aud)
    alt_go_btn.click(custom_bark, in_text, out_audio)

# Start the Gradio server; this runs at import time (no __main__ guard).
app.launch()