import os
import timeit

import torch
import gradio as gr
from pydub import AudioSegment
from pytube import YouTube
from transformers import pipeline

# Load the Distil-Whisper ASR pipeline on GPU if available, otherwise CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
pipe = pipeline(
    "automatic-speech-recognition",
    model="distil-whisper/distil-medium.en",
    device=device,
)


def transcribe_speech_local(filepath):
    if filepath is None:
        return [{"error": "No audio found, please retry."}]

    # Split audio into 15-second chunks
    audio = AudioSegment.from_file(filepath)
    chunk_length_ms = 15000  # 15 seconds in milliseconds
    chunks = [audio[i:i + chunk_length_ms] for i in range(0, len(audio), chunk_length_ms)]

    aligned_chunks = []
    transcription_time_total = 0

    # Transcribe each chunk and measure time
    for chunk_id, chunk in enumerate(chunks):
        start_time = timeit.default_timer()
        chunk.export("temp_chunk.wav", format="wav")
        output = pipe("temp_chunk.wav")
        transcription_time = timeit.default_timer() - start_time
        transcription_time_total += transcription_time

        # Calculate start and end times in seconds
        start_time_sec = chunk_id * 15
        end_time_sec = start_time_sec + len(chunk) / 1000.0

        aligned_chunks.append({
            "chunk_id": chunk_id,
            "chunk_length": len(chunk) / 1000.0,
            "text": output["text"],
            "start_time": start_time_sec,
            "end_time": end_time_sec,
            "transcription_time": transcription_time,
        })

    # Clean up the temporary chunk file
    if os.path.exists("temp_chunk.wav"):
        os.remove("temp_chunk.wav")

    return aligned_chunks


def download_audio_from_youtube(youtube_url):
    yt = YouTube(youtube_url)
    stream = yt.streams.filter(only_audio=True).first()
    output_path = stream.download()
    # Rename for convenience; pydub/ffmpeg detects the actual codec from the
    # file contents, so the extension is cosmetic
    base, _ = os.path.splitext(output_path)
    audio_file = base + ".mp3"
    os.rename(output_path, audio_file)
    return audio_file


def transcribe_speech_from_youtube(youtube_url):
    audio_filepath = download_audio_from_youtube(youtube_url)

    # Convert to mono WAV with a 16 kHz sample rate if necessary
    audio = AudioSegment.from_file(audio_filepath)
    audio = audio.set_frame_rate(16000).set_channels(1)
    audio.export("converted_audio.wav", format="wav")
    audio = AudioSegment.from_file("converted_audio.wav")

    # Split audio into 15-second chunks
    chunk_length_ms = 15000  # 15 seconds in milliseconds
    chunks = [audio[i:i + chunk_length_ms] for i in range(0, len(audio), chunk_length_ms)]

    aligned_chunks = []
    transcription_time_total = 0

    # Transcribe each chunk and measure time
    for chunk_id, chunk in enumerate(chunks):
        start_time = timeit.default_timer()
        chunk.export("temp_chunk.wav", format="wav")
        output = pipe("temp_chunk.wav")
        transcription_time = timeit.default_timer() - start_time
        transcription_time_total += transcription_time

        # Calculate start and end times in seconds
        start_time_sec = chunk_id * 15
        end_time_sec = start_time_sec + len(chunk) / 1000.0

        aligned_chunks.append({
            "chunk_id": chunk_id,
            "chunk_length": len(chunk) / 1000.0,
            "text": output["text"],
            "start_time": start_time_sec,
            "end_time": end_time_sec,
            "transcription_time": transcription_time,
        })

    # Clean up temporary files
    if os.path.exists("temp_chunk.wav"):
        os.remove("temp_chunk.wav")
    if os.path.exists("converted_audio.wav"):
        os.remove("converted_audio.wav")
    if os.path.exists(audio_filepath):
        os.remove(audio_filepath)

    return aligned_chunks


file_transcribe = gr.Interface(
    fn=transcribe_speech_local,
    inputs=gr.Audio(sources="upload", type="filepath"),
    outputs=gr.JSON(label="Transcription with Time Alignment"),
    allow_flagging="never",
)

link_transcribe = gr.Interface(
    fn=transcribe_speech_from_youtube,
    inputs=gr.Textbox(lines=1, placeholder="Enter YouTube URL here...", label="YouTube URL"),
    outputs=gr.JSON(label="Transcription with Time Alignment"),
    allow_flagging="never",
)

demo = gr.TabbedInterface(
    [file_transcribe, link_transcribe],
    ["Local files (mp3/mp4/wav)", "Links"],
)

demo.launch(share=True)