File size: 4,130 Bytes
d0f02a4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cbdcec3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47520da
cbdcec3
d0f02a4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import os
import gradio as gr
from transformers import pipeline
from pytube import YouTube
from datasets import Dataset, Audio
from moviepy.editor import AudioFileClip
import googletrans
from googletrans import Translator

# Transformers pipeline built once at import time from the
# "rafat0421/whisper-small-hi" checkpoint; `transcribe` uses it to turn audio
# into text.
# NOTE(review): the checkpoint name ends in "-hi" while the Gradio UI in this
# file advertises Swedish — confirm which language the model was trained on.
pipe = pipeline(model="rafat0421/whisper-small-hi")

def download_from_youtube(url):
    """Download the audio track of a YouTube video and return its local path.

    Args:
        url: URL of the YouTube video.

    Returns:
        The path of the downloaded file, as returned by pytube's
        ``Stream.download()``.

    Raises:
        ValueError: If the video exposes no audio-only mp4 stream.
    """
    streams = YouTube(url).streams.filter(only_audio=True, file_extension='mp4')
    stream = streams.first()
    # ``first()`` yields None when the filtered query is empty; fail with a
    # clear message instead of an AttributeError on ``None.download``.
    if stream is None:
        raise ValueError(f"No audio-only mp4 stream found for {url!r}")
    return stream.download()

def get_timestamp(seconds):
    """Format a duration in seconds as a zero-padded ``MM:SS`` string.

    Fractions of a second are dropped. Minutes are not wrapped at 60, so
    3661 seconds is rendered as ``61:01``.
    """
    whole_minutes = int(seconds / 60)
    leftover_seconds = int(seconds % 60)
    return "{:02d}:{:02d}".format(whole_minutes, leftover_seconds)

def create_segments(audio_fpath, seconds_max, segment_seconds=30, min_last_seconds=2):
    """Split an audio file into consecutive fixed-length WAV segments.

    Segments are written to ``segmented_audios/segment_<i>.wav`` (relative to
    the current working directory). File names are reused between calls, so a
    later call overwrites the segments of an earlier one.

    Args:
        audio_fpath: Path of the audio file to split.
        seconds_max: Upper bound on the amount of audio to export, in seconds.
            Only whole multiples of ``segment_seconds`` are used.
        segment_seconds: Length of each full segment in seconds (default 30,
            the value previously hard-coded).
        min_last_seconds: A trailing partial segment is exported only if it is
            strictly longer than this many seconds (default 2).

    Returns:
        A tuple ``(segment_paths, segment_start_times)`` of equal length:
        the paths of the written files and each segment's start offset in
        seconds within the source audio.

    Raises:
        ValueError: If ``segment_seconds`` is not positive.
    """
    if segment_seconds <= 0:
        raise ValueError("segment_seconds must be positive")

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs("segmented_audios", exist_ok=True)

    segment_paths = []
    segment_start_times = []

    sound = AudioFileClip(audio_fpath)
    try:
        n_full_segments = int(sound.duration / segment_seconds)
        len_last_segment = sound.duration % segment_seconds

        max_segments = int(seconds_max / segment_seconds)
        if n_full_segments > max_segments:
            # The cap cuts into the full segments, so there is no partial tail.
            n_full_segments = max_segments
            len_last_segment = 0

        # One extra candidate for the partial tail, still bounded by the cap.
        for i in range(min(n_full_segments + 1, max_segments)):
            start = i * segment_seconds
            if i < n_full_segments:
                end = start + segment_seconds
            elif len_last_segment > min_last_seconds:
                end = start + len_last_segment
            else:
                # Only the final candidate can get here: the tail is too short.
                break

            segment_path = os.path.join("segmented_audios", f"segment_{i}.wav")
            sound.subclip(start, end).write_audiofile(segment_path)
            segment_paths.append(segment_path)
            segment_start_times.append(start)
    finally:
        # Release the underlying reader even if exporting a segment fails.
        sound.close()

    return segment_paths, segment_start_times

_translate_client = None


def _get_translate_client():
    """Return the shared Google Cloud Translation (v2) client, creating it on first use."""
    global _translate_client
    if _translate_client is None:
        # Imported lazily so the module can be loaded without the package
        # being touched until a translation is actually requested.
        from google.cloud import translate_v2 as translate

        _translate_client = translate.Client()
    return _translate_client


def get_translation(text):
    """Translate text into English with the Google Cloud Translation v2 API.

    Args:
        text: A ``str``, UTF-8 encoded ``bytes``, or a sequence of strings.

    Returns:
        The translated string for a single input, or a list of translated
        strings when a sequence was passed. Empty input is returned as-is
        (``bytes`` are decoded first) without contacting the API.
    """
    if isinstance(text, bytes):
        text = text.decode("utf-8")

    # Nothing to translate: skip the network round-trip.
    if not text:
        return text

    # format_="text" asks for plain text; the API's default HTML format returns
    # entities such as "&#39;" in the translated output.
    result = _get_translate_client().translate(
        text, target_language='en', format_="text"
    )

    # A sequence input yields one result dict per element.
    if isinstance(result, list):
        return [item["translatedText"] for item in result]
    return result["translatedText"]

def transcribe(audio, url, seconds_max):
    """Transcribe a microphone recording or the audio of a YouTube video.

    When ``url`` is given it takes precedence over ``audio``: the video's
    audio is downloaded, split into segments, and each segment is transcribed
    and translated. Otherwise the recording at ``audio`` is transcribed as a
    whole, without translation.

    Args:
        audio: Path of a recorded audio file, or None when nothing was recorded.
        url: YouTube URL, or an empty/blank string (or None) when unused.
        seconds_max: Maximum number of seconds of the video to transcribe.

    Returns:
        For a URL: one block per segment containing a header with the segment
        index and start time, the transcription, and its translation.
        For a recording: the transcribed text.

    Raises:
        ValueError: If neither a URL nor a recording was provided.
    """
    # Treat a whitespace-only URL like an empty one instead of trying to
    # download it.
    url = url.strip() if url else ""

    if not url:
        if not audio:
            raise ValueError("Provide a microphone recording or a YouTube URL.")
        return pipe(audio)["text"]

    fpath = download_from_youtube(url)
    segment_paths, segment_start_times = create_segments(fpath, seconds_max)
    if not segment_paths:
        # Nothing was long enough to export, so there is nothing to transcribe.
        return ""

    audio_dataset = Dataset.from_dict({"audio": segment_paths}).cast_column("audio", Audio(sampling_rate=16000))
    pred = pipe(audio_dataset["audio"])

    n_segments = len(segment_start_times)
    parts = []
    for i, (seconds, output) in enumerate(zip(segment_start_times, pred), start=1):
        parts.append(
            f"[Segment {i}/{n_segments}, start time {get_timestamp(seconds)}]\n"
            f"{output['text']}\n"
            f"[Translation]\n{get_translation(output['text'])}\n\n"
        )
    return "".join(parts)

# Input widgets, listed in the same order as the parameters of
# ``transcribe(audio, url, seconds_max)``.
input_components = [
    gr.Audio(label="Transcribe from Microphone", source="microphone", type="filepath"),
    gr.Text(label="Transcribe from YouTube URL", placeholder="YouTube Link", max_lines=1),
    gr.Slider(label="Number of seconds to transcribe", minimum=30, maximum=600, step=30, value=30),
]

# Single-function Gradio app: the three inputs go to ``transcribe`` and its
# return value is shown as plain text.
iface = gr.Interface(
    title="Whisper: transcribe Swedish language audio to text",
    description="Swedish Text Transcription using Transformers.",
    fn=transcribe,
    inputs=input_components,
    outputs="text",
)

iface.launch()