# Imports
from pytube import YouTube
from huggingsound import SpeechRecognitionModel
import torch
from transformers import pipeline
from IPython.display import Audio
from pprint import pprint
import os
import gradio as gr
import subprocess  # OS process handling
from subprocess import STDOUT, check_call  # OS process manipulation

# Constants
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
MODEL = SpeechRecognitionModel("jonatasgrosman/wav2vec2-large-xlsr-53-english", device=DEVICE)
summarizationPipeline = pipeline('summarization')  # Hugging Face's default summarization pipeline
SAMPLING_RATE = 16000
USE_ONNX = False  # Silero VAD backend flag (referenced below); False = use the PyTorch model
torch.set_num_threads(1)

def transcribeVideo(VIDEO_URL):
  # Download the video's audio stream ('.mp4') and convert it to a 16 kHz '.wav' file
  ytVideo = YouTube(VIDEO_URL)
  ytVideo.streams \
    .filter(only_audio=True, file_extension='mp4') \
    .first() \
    .download(filename='ytaudio.mp4')

  os.system("ffmpeg -y -i ytaudio.mp4 -acodec pcm_s16le -ar 16000 ytaudio.wav")

  # Audio Chunking with Silero VAD
  model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
                                model='silero_vad',
                                force_reload=True,
                                onnx=USE_ONNX)
  (get_speech_timestamps,
  save_audio,
  read_audio,
  VADIterator,
  collect_chunks) = utils

  # Read '.wav' audio file
  audioFile = read_audio('ytaudio.wav', sampling_rate=SAMPLING_RATE)
  # get speech timestamps from full audio file
  speechTimestamps = get_speech_timestamps(audioFile, model, sampling_rate=SAMPLING_RATE)

  # Save the audio chunks as separate audio files
  index = 0
  for timestamp in speechTimestamps:
    startTime = timestamp['start']
    endTime = timestamp['end']
    save_audio(f'speech-{index}.wav', audioFile[startTime:endTime], sampling_rate=SAMPLING_RATE)
    index += 1

  # Collect the paths of the separated audio chunks
  audioChunksPath = []
  for i in range(len(speechTimestamps)):
    audioChunksPath.append(f'speech-{i}.wav')  # same relative paths written by save_audio above

  # Generate individual transcriptions & concatenate them
  transcriptions = MODEL.transcribe(audioChunksPath)

  fullTranscript = ''
  for transcript in transcriptions:
    fullTranscript += transcript['transcription'] + ' '

  return fullTranscript

def summarizeTranscription(VIDEO_URL):
  fullTranscript = transcribeVideo(VIDEO_URL)

  # Generate summary from the full transcript
  summarizedText = summarizationPipeline(fullTranscript, max_length=300, min_length=75, do_sample=False)
  return summarizedText[0]['summary_text']

iface = gr.Interface(fn=summarizeTranscription, inputs=["text"], outputs=["textbox"], title='YouTube Video Summarizer').launch(inline=False)
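
# Example usage (illustrative sketch): besides the Gradio UI launched above, the
# helper functions can be called directly. The URL below is a placeholder, not a
# real video from the original project.
#
#   transcript = transcribeVideo("https://www.youtube.com/watch?v=VIDEO_ID")
#   summary = summarizeTranscription("https://www.youtube.com/watch?v=VIDEO_ID")
#   print(summary)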