# Imports
from pytube import YouTube
import os
import subprocess                      # run OS-level commands
from subprocess import STDOUT          # merge stderr into stdout

# Install libsndfile (needed for reading/writing '.wav' files) on the Linux machine
proc = subprocess.Popen('apt-get install -y libsndfile1', shell=True, stdin=None,
                        stdout=open(os.devnull, 'wb'), stderr=STDOUT, executable='/bin/bash')
proc.wait()
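# Note: on Hugging Face Spaces, listing libsndfile1 in a packages.txt file is an alternative
# way to install this apt dependency without shelling out at runtime.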
from huggingsound import SpeechRecognitionModel
import torch
from transformers import pipeline
import gradio as gr
# Constants
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
MODEL = SpeechRecognitionModel("jonatasgrosman/wav2vec2-large-xlsr-53-english", device = DEVICE)
summarizationPipeline = pipeline('summarization') # Hugging Face's default summarization pipeline
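# At the time of writing, pipeline('summarization') defaults to a DistilBART checkpoint
# (sshleifer/distilbart-cnn-12-6) fine-tuned on CNN/DailyMail summaries.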
SAMPLING_RATE = 16000    # Silero VAD and the wav2vec2 model both expect 16 kHz audio
USE_ONNX = False         # load the PyTorch Silero VAD model rather than its ONNX export
torch.set_num_threads(1)
def transcribeVideo(VIDEO_URL):
    # Download the video's audio-only stream ('.mp4') and convert it to a '.wav' file
    ytVideo = YouTube(VIDEO_URL)
    ytVideo.streams \
        .filter(only_audio=True, file_extension='mp4') \
        .first() \
        .download(filename='ytaudio.mp4')
    os.system("ffmpeg -i ytaudio.mp4 -acodec pcm_s16le -ar 16000 ytaudio.wav")
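    # ffmpeg flags: '-acodec pcm_s16le' writes 16-bit PCM and '-ar 16000' resamples to 16 kHz,
    # matching the SAMPLING_RATE that Silero VAD and the wav2vec2 model expect.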
    # Audio chunking with Silero VAD: load the model and its helper utilities from torch.hub
    model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
                                  model='silero_vad',
                                  force_reload=True,
                                  onnx=USE_ONNX)
    (get_speech_timestamps,
     save_audio,
     read_audio,
     VADIterator,
     collect_chunks) = utils
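    # Only get_speech_timestamps, save_audio and read_audio are used below; VADIterator
    # (streaming VAD) and collect_chunks ship in the same utils tuple.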
    # Read the '.wav' audio file
    audioFile = read_audio('ytaudio.wav', sampling_rate=SAMPLING_RATE)
    # Get speech timestamps for the full audio file
    speechTimestamps = get_speech_timestamps(audioFile, model, sampling_rate=SAMPLING_RATE)
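    # speechTimestamps is a list of dicts of the form {'start': <sample index>, 'end': <sample index>},
    # so slicing audioFile with these indices yields the individual speech segments.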
    # Save the audio chunks as separate audio files
    index = 0
    for timestamp in speechTimestamps:
        startTime = timestamp['start']
        endTime = timestamp['end']
        save_audio(f'speech-{index}.wav', audioFile[startTime:endTime], sampling_rate=SAMPLING_RATE)
        index += 1
    # Collect the paths of the separated audio chunks
    audioChunksPath = []
    for i in range(len(speechTimestamps)):
        audioChunksPath.append(f'speech-{i}.wav')
    # Generate individual transcriptions & concatenate them
    transcriptions = MODEL.transcribe(audioChunksPath)
    fullTranscript = ''
    for transcript in transcriptions:
        fullTranscript += transcript['transcription'] + ' '
    return fullTranscript
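
# A quick standalone check of the transcription step (the URL below is a placeholder, not part of the app):
#   transcript = transcribeVideo('https://www.youtube.com/watch?v=<VIDEO_ID>')
#   print(transcript[:300])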
def summarizeTranscription(VIDEO_URL):
    fullTranscript = transcribeVideo(VIDEO_URL)
    # Generate a summary from the full transcript
    summarizedText = summarizationPipeline(fullTranscript, max_length=300, min_length=75, do_sample=False)
    return summarizedText[0]['summary_text']
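# Note: the default summarization checkpoint has a fixed maximum input length, so very long
# transcripts may be truncated (or rejected) by the pipeline; splitting the transcript and
# summarizing it in chunks would be a possible extension, not implemented here.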
iface = gr.Interface(fn=summarizeTranscription,
                     inputs=["text"],
                     outputs=["textbox"],
                     title='YouTube Video Summarizer')
iface.launch(inline=False)