File size: 3,003 Bytes
d7dae2d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import gradio as gr
import torch
from pathlib import Path
import scipy
from pytube import YouTube
from pydub import AudioSegment
from TTS.api import TTS
import uuid

uid = uuid.uuid4()

device = "cuda" if torch.cuda.is_available() else "cpu"

def custom_bark(inp, in_aud, trim_aud=None):
    speaker_wav=Path(f"{uid}-tmp_aud.mp4")
    if trim_aud != None:
        speaker_wav=Path(f"{uid}-trim.wav")
    tts = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts", progress_bar=False).to(device)
    tts.tts_to_file(inp, speaker_wav=speaker_wav, language="en", file_path=f"{uid}-output.wav")
    return (f"{uid}-output.wav")
    
def load_video_yt(vid):
    yt = YouTube(vid)
    vid = yt.streams.filter(progressive=True, file_extension='mp4').order_by('resolution').desc().first().download(filename=f"{uid}-tmp.mp4")
    vid_aud = yt.streams.filter(only_audio=True)[0].download(filename=f"{uid}-tmp_aud.mp4")
    print (f'Video Length: {yt.length}')
    return vid, vid_aud, f"{uid}-tmp_aud.mp4"

def trim_clip(clip, start_t, end_t):
    clip = Path(f"{uid}-tmp_aud.mp4")
    song = AudioSegment.from_file(f"{uid}-tmp_aud.mp4", format="mp4")
    start_min = int(start_t.split(":",1)[0])
    start_sec = int(start_t.split(":",1)[1])
    end_min = int(end_t.split(":",1)[0])
    end_sec = int(end_t.split(":",1)[1])
    start = ((start_min*60)+start_sec)*1000
    end = ((end_min*60)+end_sec)*1000
    song_clip = song[start: end]
    song_clip.export(f"{uid}-trim.wav", format="wav")
    print("New Audio file is created and saved")

    return f"{uid}-trim.wav"
    
with gr.Blocks() as app:
    with gr.Box():
        with gr.Row():
            in_text = gr.Textbox(lines = 6, max_lines = 20)
            with gr.Column():
                alt_go_btn = gr.Button()
                out_audio = gr.Audio(interactive=False)
    with gr.Box():       
        with gr.Row():
            with gr.Column():

                in_aud_mic = gr.Audio(source='microphone')
                in_aud_file = gr.Audio(label = 'Audio Source', source='upload', interactive = True)
                aud_file = gr.File(interactive=False,visible=False)
                with gr.Row():
                    start_time = gr.Textbox(label = "Start", value = "0:00", placeholder = "0:23")
                    end_time = gr.Textbox(label = "End", value = "0:01", placeholder = "1:12")
                trim_clip_btn = gr.Button("Trim Clip")
                trim_aud = gr.Audio(label = 'Trimmed Audio Source', source='upload', interactive = False)              
            with gr.Column():
                in_aud_yt = gr.Textbox(label="YouTube URL")
                load_yt_btn = gr.Button("Load URL")    
                yt_vid = gr.Video(interactiv=False)
                



    load_yt_btn.click(load_video_yt, in_aud_yt, [yt_vid,in_aud_file,aud_file])
    trim_clip_btn.click(trim_clip,[aud_file, start_time, end_time],trim_aud)
    alt_go_btn.click(custom_bark, [in_text,in_aud_file,trim_aud], out_audio)

app.launch()