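"""Video_to_text app: transcribes uploaded audio files or YouTube links into
time-aligned 15-second chunks with a distil-whisper ASR pipeline, served
through a tabbed Gradio interface."""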
import os
import timeit

import gradio as gr
import torch
from pydub import AudioSegment
from pytube import YouTube
from transformers import pipeline

# Use the GPU when available; the model also runs on CPU, just more slowly.
device = "cuda" if torch.cuda.is_available() else "cpu"
pipe = pipeline(
    "automatic-speech-recognition",
    model="distil-whisper/distil-medium.en",
    device=device,
)
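# Both transcription functions below split the audio into fixed 15-second
# chunks and run the ASR pipeline on each chunk separately, so that rough
# start/end timestamps can be attached to every piece of transcribed text.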
def transcribe_speech_local(filepath):
    """Transcribe a local audio file in 15-second chunks with time alignment."""
    if filepath is None:
        return [{"error": "No audio found, please retry."}]

    # Split audio into 15-second chunks
    audio = AudioSegment.from_file(filepath)
    chunk_length_ms = 15000  # 15 seconds in milliseconds
    chunks = [audio[i:i + chunk_length_ms] for i in range(0, len(audio), chunk_length_ms)]
    print(chunks)

    aligned_chunks = []
    transcription_time_total = 0
    # Transcribe each chunk and measure how long it takes
    for chunk_id, chunk in enumerate(chunks):
        start_time = timeit.default_timer()
        chunk.export("temp_chunk.wav", format="wav")
        output = pipe("temp_chunk.wav")
        transcription_time = timeit.default_timer() - start_time
        transcription_time_total += transcription_time

        # Calculate start and end times in seconds
        start_time_sec = chunk_id * 15
        end_time_sec = start_time_sec + len(chunk) / 1000.0
        aligned_chunks.append({
            "chunk_id": chunk_id,
            "chunk_length": len(chunk) / 1000.0,
            "text": output["text"],
            "start_time": start_time_sec,
            "end_time": end_time_sec,
            "transcription_time": transcription_time,
        })

    # Clean up the temporary chunk file
    if os.path.exists("temp_chunk.wav"):
        os.remove("temp_chunk.wav")
    return aligned_chunks
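# Illustrative example (hypothetical file name): transcribe_speech_local("speech.wav")
# returns a list such as
# [{"chunk_id": 0, "chunk_length": 15.0, "text": "...", "start_time": 0,
#   "end_time": 15.0, "transcription_time": 1.2}, ...]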
def download_audio_from_youtube(youtube_url):
    """Download the audio-only stream of a YouTube video and return its path."""
    yt = YouTube(youtube_url)
    stream = yt.streams.filter(only_audio=True).first()
    output_path = stream.download()
    # Rename the downloaded file with an .mp3 extension; the audio is not
    # re-encoded here, pydub/ffmpeg reads the original container regardless.
    base, ext = os.path.splitext(output_path)
    audio_file = base + '.mp3'
    os.rename(output_path, audio_file)
    return audio_file
def transcribe_speech_from_youtube(youtube_url):
    """Download a YouTube video's audio and transcribe it in 15-second chunks."""
    audio_filepath = download_audio_from_youtube(youtube_url)

    # Convert to mono 16 kHz WAV, the sample rate Whisper-style models expect
    audio = AudioSegment.from_file(audio_filepath)
    audio = audio.set_frame_rate(16000).set_channels(1)
    audio.export("converted_audio.wav", format="wav")
    audio = AudioSegment.from_file("converted_audio.wav")

    # Split audio into 15-second chunks
    chunk_length_ms = 15000  # 15 seconds in milliseconds
    chunks = [audio[i:i + chunk_length_ms] for i in range(0, len(audio), chunk_length_ms)]

    aligned_chunks = []
    transcription_time_total = 0
    # Transcribe each chunk and measure how long it takes
    for chunk_id, chunk in enumerate(chunks):
        start_time = timeit.default_timer()
        chunk.export("temp_chunk.wav", format="wav")
        output = pipe("temp_chunk.wav")
        transcription_time = timeit.default_timer() - start_time
        transcription_time_total += transcription_time

        # Calculate start and end times in seconds
        start_time_sec = chunk_id * 15
        end_time_sec = start_time_sec + len(chunk) / 1000.0
        aligned_chunks.append({
            "chunk_id": chunk_id,
            "chunk_length": len(chunk) / 1000.0,
            "text": output["text"],
            "start_time": start_time_sec,
            "end_time": end_time_sec,
            "transcription_time": transcription_time,
        })

    # Clean up temporary files
    if os.path.exists("temp_chunk.wav"):
        os.remove("temp_chunk.wav")
    if os.path.exists("converted_audio.wav"):
        os.remove("converted_audio.wav")
    if os.path.exists(audio_filepath):
        os.remove(audio_filepath)
    return aligned_chunks
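# Illustrative example: transcribe_speech_from_youtube("https://www.youtube.com/watch?v=...")
# downloads the audio, converts it to 16 kHz mono WAV, and returns the same
# chunk structure as transcribe_speech_local.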
# Gradio UI: one tab for local file uploads, one for YouTube links
file_transcribe = gr.Interface(
    fn=transcribe_speech_local,
    inputs=gr.Audio(sources="upload", type="filepath"),
    outputs=gr.JSON(label="Transcription with Time Alignment"),
    allow_flagging="never",
)

link_transcribe = gr.Interface(
    fn=transcribe_speech_from_youtube,
    inputs=gr.Textbox(lines=1, placeholder="Enter YouTube URL here...", label="YouTube URL"),
    outputs=gr.JSON(label="Transcription with Time Alignment"),
    allow_flagging="never",
)

demo = gr.TabbedInterface(
    [file_transcribe, link_transcribe],
    ["Local files (mp3/mp4/wav)", "Links"],
)

demo.launch(share=True)