# Imports
from pytube import YouTube
from huggingsound import SpeechRecognitionModel
import torch
from transformers import pipeline
from IPython.display import Audio
from pprint import pprint
import os
import gradio as gr
import subprocess                          # OS process handling
from subprocess import STDOUT, check_call  # OS process manipulation

# Constants
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
MODEL = SpeechRecognitionModel("jonatasgrosman/wav2vec2-large-xlsr-53-english", device=DEVICE)
summarizationPipeline = pipeline('summarization')  # Hugging Face's default summarization pipeline
SAMPLING_RATE = 16000
USE_ONNX = False
torch.set_num_threads(1)


def transcribeVideo(VIDEO_URL):
    # Download the video's audio stream as '.mp4', then convert it to a 16 kHz '.wav'
    ytVideo = YouTube(VIDEO_URL)
    ytVideo.streams \
        .filter(only_audio=True, file_extension='mp4') \
        .first() \
        .download(filename='ytaudio.mp4')
    # '-y' overwrites any existing output file so repeated runs don't stall on a prompt
    os.system("ffmpeg -y -i ytaudio.mp4 -acodec pcm_s16le -ar 16000 ytaudio.wav")

    # Audio chunking with Silero VAD
    model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
                                  model='silero_vad',
                                  force_reload=True,
                                  onnx=USE_ONNX)
    (get_speech_timestamps, save_audio, read_audio, VADIterator, collect_chunks) = utils

    # Read the '.wav' audio file
    audioFile = read_audio('ytaudio.wav', sampling_rate=SAMPLING_RATE)

    # Get speech timestamps from the full audio file
    speechTimestamps = get_speech_timestamps(audioFile, model, sampling_rate=SAMPLING_RATE)

    # Save the speech chunks as separate audio files
    for index, timestamp in enumerate(speechTimestamps):
        startTime = timestamp['start']
        endTime = timestamp['end']
        save_audio(f'speech-{index}.wav', audioFile[startTime:endTime], sampling_rate=SAMPLING_RATE)

    # Collect the paths of these separated audio chunks
    audioChunksPath = [f'speech-{i}.wav' for i in range(len(speechTimestamps))]

    # Generate individual transcriptions & concatenate them
    transcriptions = MODEL.transcribe(audioChunksPath)
    fullTranscript = ''
    for transcript in transcriptions:
        fullTranscript += transcript['transcription'] + ' '

    return fullTranscript


def summarizeTranscription(VIDEO_URL):
    fullTranscript = transcribeVideo(VIDEO_URL)

    # Generate a summary from the full transcript
    summarizedText = summarizationPipeline(fullTranscript, max_length=300, min_length=75, do_sample=False)
    return summarizedText[0]['summary_text']


iface = gr.Interface(fn=summarizeTranscription,
                     inputs=["text"],
                     outputs=["textbox"],
                     title='YouTube Video Summarizer').launch(inline=False)
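
# A minimal sketch of calling the summarizer directly, bypassing the Gradio UI;
# the URL below is a hypothetical placeholder, not a tested video. Note that the
# default summarization model has a fixed input limit, so the transcript of a
# very long video may be truncated by the tokenizer before summarization.
#
#   summary = summarizeTranscription("https://www.youtube.com/watch?v=<VIDEO_ID>")
#   print(summary)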