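"""Gradio demo: chunked speech transcription with distil-whisper.

Transcribes an uploaded audio file or a YouTube link in 15-second chunks and
returns, for each chunk, the recognized text plus start/end timestamps and
the wall-clock time the model spent on it.
"""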
import os
import timeit

import gradio as gr
import torch
from pydub import AudioSegment
from pytube import YouTube
from transformers import pipeline

# Load the ASR pipeline once at startup; run on the GPU when available.
device = "cuda" if torch.cuda.is_available() else "cpu"
pipe = pipeline("automatic-speech-recognition", model="distil-whisper/distil-medium.en", device=device)
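
# Note: Whisper-family models work on up to 30 seconds of audio per call, so
# the 15-second chunks used below stay well inside that window. The pipeline
# can also chunk long audio itself (pipeline(..., chunk_length_s=15)), but the
# manual split below is what yields the per-chunk timing reported in the UI.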

def _transcribe_in_chunks(audio):
    # Split the audio into 15-second chunks.
    chunk_length_ms = 15000  # 15 seconds in milliseconds
    chunks = [audio[i:i + chunk_length_ms] for i in range(0, len(audio), chunk_length_ms)]

    aligned_chunks = []

    # Transcribe each chunk and measure how long the model call takes.
    for chunk_id, chunk in enumerate(chunks):
        start_time = timeit.default_timer()
        chunk.export("temp_chunk.wav", format="wav")
        output = pipe("temp_chunk.wav")
        transcription_time = timeit.default_timer() - start_time

        # Locate the chunk within the original audio, in seconds.
        start_time_sec = chunk_id * (chunk_length_ms / 1000.0)
        end_time_sec = start_time_sec + len(chunk) / 1000.0

        aligned_chunks.append({
            "chunk_id": chunk_id,
            "chunk_length": len(chunk) / 1000.0,
            "text": output["text"],
            "start_time": start_time_sec,
            "end_time": end_time_sec,
            "transcription_time": transcription_time
        })

    # Clean up the temporary chunk file.
    if os.path.exists("temp_chunk.wav"):
        os.remove("temp_chunk.wav")

    return aligned_chunks

def transcribe_speech_local(filepath):
    if filepath is None:
        return [{"error": "No audio found, please retry."}]

    audio = AudioSegment.from_file(filepath)
    return _transcribe_in_chunks(audio)

def download_audio_from_youtube(youtube_url):
    # Download the audio-only stream. pytube typically delivers an MP4/WebM
    # container; renaming it to .mp3 would not transcode it, and pydub (via
    # ffmpeg) detects the actual format from the file contents anyway, so the
    # downloaded file is returned as-is.
    yt = YouTube(youtube_url)
    stream = yt.streams.filter(only_audio=True).first()
    return stream.download()
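
# pytube tends to break when YouTube changes its player internals. If the
# download above starts failing, yt-dlp is a common substitute; a minimal
# sketch (assumes `pip install yt-dlp`; the output filename is arbitrary):
#
#   import yt_dlp
#
#   def download_audio_with_ytdlp(youtube_url, out="yt_audio.m4a"):
#       opts = {"format": "bestaudio", "outtmpl": out}
#       with yt_dlp.YoutubeDL(opts) as ydl:
#           ydl.download([youtube_url])
#       return out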

def transcribe_speech_from_youtube(youtube_url):
    audio_filepath = download_audio_from_youtube(youtube_url)

    # Convert to mono 16 kHz WAV, the sampling setup Whisper models expect.
    audio = AudioSegment.from_file(audio_filepath)
    audio = audio.set_frame_rate(16000).set_channels(1)
    audio.export("converted_audio.wav", format="wav")
    audio = AudioSegment.from_file("converted_audio.wav")

    aligned_chunks = _transcribe_in_chunks(audio)

    # Clean up temporary files.
    if os.path.exists("converted_audio.wav"):
        os.remove("converted_audio.wav")
    if os.path.exists(audio_filepath):
        os.remove(audio_filepath)

    return aligned_chunks

file_transcribe = gr.Interface(
    fn=transcribe_speech_local,
    inputs=gr.Audio(sources="upload", type="filepath"),
    outputs=gr.JSON(label="Transcription with Time Alignment"),
    allow_flagging="never"
)

link_transcribe = gr.Interface(
    fn=transcribe_speech_from_youtube,
    inputs=gr.Textbox(lines=1, placeholder="Enter YouTube URL here...", label="YouTube URL"),
    outputs=gr.JSON(label="Transcription with Time Alignment"),
    allow_flagging="never"
)

demo = gr.TabbedInterface(
    [file_transcribe, link_transcribe],
    ["Local files (mp3/mp4/wav)", "Links"]
)

if __name__ == "__main__":
    demo.launch(share=True)
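
# A quick non-UI smoke test, as a hedged sketch: assumes a local file named
# "sample.wav" exists (any format ffmpeg can read works). Run it in place of
# demo.launch above:
#
#   for chunk in transcribe_speech_local("sample.wav"):
#       print(f'[{chunk["start_time"]:.0f}s-{chunk["end_time"]:.0f}s] {chunk["text"]}')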