# Imports
import subprocess  # used to shell out to ffmpeg for the mp4 -> wav conversion

import torch
import gradio as gr
from pytube import YouTube
from huggingsound import SpeechRecognitionModel
from transformers import pipeline
# Constants
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
MODEL = SpeechRecognitionModel("jonatasgrosman/wav2vec2-large-xlsr-53-english", device=DEVICE)
summarizationPipeline = pipeline('summarization')  # Hugging Face's default summarization pipeline
SAMPLING_RATE = 16000  # 16 kHz, the rate both the VAD and the ASR model expect
torch.set_num_threads(1)
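# Notes on the setup above (assumptions, not guarantees):
# - torch.set_num_threads(1) mirrors the Silero VAD usage examples, which
#   recommend a single thread for their small model.
# - With no model argument, pipeline('summarization') pulls a default
#   checkpoint (a distilled BART fine-tuned on CNN/DailyMail at the time of
#   writing) whose input is capped at roughly 1024 tokens, so very long
#   transcripts may get truncated before summarization.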
def transcribeVideo(VIDEO_URL):
    # Download the audio-only '.mp4' stream, then convert it to a '.wav' file
    ytVideo = YouTube(VIDEO_URL)
    ytVideo.streams \
        .filter(only_audio=True, file_extension='mp4') \
        .first() \
        .download(filename='ytaudio.mp4')
    subprocess.run(['ffmpeg', '-y', '-i', 'ytaudio.mp4',
                    '-acodec', 'pcm_s16le', '-ar', '16000', 'ytaudio.wav'],
                   check=True)
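    # '-acodec pcm_s16le' writes plain 16-bit PCM and '-ar 16000' resamples to
    # 16 kHz; '-y' overwrites any leftover ytaudio.wav from a previous run.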
    # Audio chunking with Silero VAD
    model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
                                  model='silero_vad',
                                  force_reload=True)
    (get_speech_timestamps,
     save_audio,
     read_audio,
     VADIterator,
     collect_chunks) = utils
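    # The hub entry ships helper functions alongside the model:
    # get_speech_timestamps finds voiced regions, read_audio/save_audio handle
    # wav I/O, VADIterator supports streaming, and collect_chunks merges
    # chunks (the last two are unused here).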
    # Read the '.wav' audio file
    audioFile = read_audio('ytaudio.wav', sampling_rate=SAMPLING_RATE)
    # Get speech timestamps for the full audio file
    speechTimestamps = get_speech_timestamps(audioFile, model, sampling_rate=SAMPLING_RATE)
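    # Each timestamp is a dict of sample offsets into the audio tensor, e.g.
    # {'start': 0, 'end': 48000} (illustrative values) -- samples, not seconds.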
    # Save the speech chunks as separate audio files
    for index, timestamp in enumerate(speechTimestamps):
        startTime = timestamp['start']
        endTime = timestamp['end']
        save_audio(f'speech-{index}.wav', audioFile[startTime:endTime], sampling_rate=SAMPLING_RATE)
    # Collect the paths of the separated audio chunks (they were saved to the
    # working directory above, so relative paths are used instead of a
    # hardcoded Colab-style '/content/' prefix)
    audioChunksPath = [f'speech-{i}.wav' for i in range(len(speechTimestamps))]
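    # Chunking keeps each ASR input short: transcribing a long recording in
    # one pass can exhaust GPU memory, and VAD-based splits fall on silences
    # rather than mid-word.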
    # Generate individual transcriptions and concatenate them.
    # MODEL.transcribe returns one result dict per file; only the
    # 'transcription' text is used here (the timestamp and probability
    # fields are ignored).
    transcriptions = MODEL.transcribe(audioChunksPath)
    fullTranscript = ''
    for transcript in transcriptions:
        fullTranscript += transcript['transcription'] + ' '
    return fullTranscript
def summarizeTranscription(VIDEO_URL):
    fullTranscript = transcribeVideo(VIDEO_URL)
    # Generate a summary from the full transcript; do_sample=False keeps the
    # output deterministic
    summarizedText = summarizationPipeline(fullTranscript, max_length=300, min_length=75, do_sample=False)
    return summarizedText[0]['summary_text']
iface = gr.Interface(fn=summarizeTranscription, inputs=["text"], outputs=["textbox"],
                     title='YouTube Video Summarizer')
iface.launch(inline=False)
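# Example usage (assumed flow): run this script, open the local URL that
# launch() prints, and paste a YouTube link such as
# https://www.youtube.com/watch?v=<video-id> to get back a summary.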