import os

save_dir = os.path.join(os.getcwd(), 'docs')
if not os.path.exists(save_dir):
    os.mkdir(save_dir)

transcription_model_id = "openai/whisper-large"
llm_model_id = "tiiuae/falcon-7b-instruct"
HF_TOKEN = os.environ.get("HF_TOKEN", None)
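# A minimal sketch of supplying the token before launching the app (assuming a
# Unix shell; the value shown is a placeholder, real tokens start with "hf_"):
#   export HF_TOKEN="hf_..."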
from youtube_transcript_api import YouTubeTranscriptApi
import pytube

# get the transcript from YouTube
def get_yt_transcript(url):
    text = ''
    vid_id = pytube.extract.video_id(url)
    temp = YouTubeTranscriptApi.get_transcript(vid_id)
    for t in temp:
        text += t['text'] + ' '
    return text
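# A minimal usage sketch of the helper above; the video id is a hypothetical
# placeholder, not a real example from this project:
#   transcript_text = get_yt_transcript("https://www.youtube.com/watch?v=<video_id>")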
from pytube import YouTube
import transformers
import torch

# transcribes the video locally using the Whisper model
def transcribe_yt_vid(url):
    # download the YouTube video's audio
    yt = YouTube(str(url))
    audio = yt.streams.filter(only_audio=True).first()
    out_file = audio.download(filename="audio.mp3",
                              output_path=save_dir)
    # define an automatic-speech-recognition pipeline
    asr = transformers.pipeline(
        "automatic-speech-recognition",
        model=transcription_model_id,
        device_map='auto',
    )
    # set model config parameters to force transcription in English
    asr.model.config.forced_decoder_ids = (
        asr.tokenizer.get_decoder_prompt_ids(
            language="en",
            task="transcribe"
        )
    )
    # invoke the Whisper model; chunk_length_s=20 makes the pipeline process the
    # audio in 20-second windows and stitch the results, so clips longer than
    # Whisper's 30-second input window can be transcribed
    temp = asr(out_file, chunk_length_s=20)
    text = temp['text']
    # delete the pipeline at the end to release GPU memory
    del asr
    torch.cuda.empty_cache()
    return text
from pytube import YouTube
from huggingface_hub import InferenceClient

# transcribes the video using the Hugging Face Hub API
def transcribe_yt_vid_api(url, api_token):
    # download the YouTube video's audio
    yt = YouTube(str(url))
    audio = yt.streams.filter(only_audio=True).first()
    out_file = audio.download(filename="audio.wav",
                              output_path=save_dir)
    # initialize a client for the Whisper model
    client = InferenceClient(model=transcription_model_id,
                             token=api_token)
    import librosa
    import soundfile as sf
    text = ''
    t = 25  # audio chunk length in seconds
    x, sr = librosa.load(out_file, sr=None)
    # x is the audio as a numpy array and sr is the original sampling rate
    # the audio needs to be split into 25-second chunks since the API call truncates the response
    for i in range(len(x) // (t * sr) + 1):
        y = x[t * sr * i: t * sr * (i + 1)]
        split_path = os.path.join(save_dir, "audio_split.wav")
        sf.write(split_path, y, sr)
        text += client.automatic_speech_recognition(split_path)
    return text
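# A worked sketch of the chunking arithmetic above, under assumed numbers:
# for a 60-second clip at sr=16000 and t=25, len(x) // (t * sr) + 1 == 3, so the
# loop covers samples [0 s, 25 s), [25 s, 50 s), and the 10-second tail.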
def transcribe_youtube_video(url, force_transcribe=False, use_api=False, api_token=None):
    yt = YouTube(str(url))
    text = ''
    # get the transcript from YouTube if available
    try:
        text = get_yt_transcript(url)
    except Exception:
        pass
    # transcribe the video if YouTube did not provide a transcript,
    # or if force_transcribe is requested
    if text == '' or force_transcribe:
        if use_api:
            text = transcribe_yt_vid_api(url, api_token=api_token)
            transcript_source = 'The transcript was generated using {} via the Hugging Face Hub API.'.format(transcription_model_id)
        else:
            text = transcribe_yt_vid(url)
            transcript_source = 'The transcript was generated using {} hosted locally.'.format(transcription_model_id)
    else:
        transcript_source = 'The transcript was downloaded from YouTube.'
    return yt.title, text, transcript_source
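# A minimal usage sketch; the URL and token below are hypothetical placeholders:
#   title, text, source = transcribe_youtube_video(
#       "https://www.youtube.com/watch?v=<video_id>",
#       force_transcribe=False, use_api=True, api_token="hf_...")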
def summarize_text(title, text, temperature, words, use_api=False, api_token=None, do_sample=False):
    from langchain.chains.llm import LLMChain
    from langchain.prompts import PromptTemplate
    from langchain.chains import ReduceDocumentsChain, MapReduceDocumentsChain
    from langchain.chains.combine_documents.stuff import StuffDocumentsChain
    import torch
    import transformers
    from transformers import BitsAndBytesConfig
    from transformers import AutoTokenizer, AutoModelForCausalLM
    from langchain import HuggingFacePipeline

    # generation parameters for the map and collapse steps (partial summaries)
    model_kwargs1 = {
        "temperature": temperature,
        "do_sample": do_sample,
        "min_new_tokens": 200 - 25,
        "max_new_tokens": 200 + 25,
        'repetition_penalty': 20.0,
    }
    # generation parameters for the final combine step (executive summary)
    model_kwargs2 = {
        "temperature": temperature,
        "do_sample": do_sample,
        "min_new_tokens": words,
        "max_new_tokens": words + 100,
        'repetition_penalty': 20.0,
    }
    # temperature is only meaningful when sampling
    if not do_sample:
        del model_kwargs1["temperature"]
        del model_kwargs2["temperature"]

    if use_api:
        from langchain import HuggingFaceHub
        # os.environ["HUGGINGFACEHUB_API_TOKEN"] = api_token
        llm = HuggingFaceHub(
            repo_id=llm_model_id, model_kwargs=model_kwargs1,
            huggingfacehub_api_token=api_token,
        )
        llm2 = HuggingFaceHub(
            repo_id=llm_model_id, model_kwargs=model_kwargs2,
            huggingfacehub_api_token=api_token,
        )
        summary_source = 'The summary was generated using {} via the Hugging Face Hub API.'.format(llm_model_id)
    else:
        quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_use_double_quant=True,
        )
        tokenizer = AutoTokenizer.from_pretrained(llm_model_id)
        # note: the quantization config is defined above but left disabled here
        model = AutoModelForCausalLM.from_pretrained(llm_model_id,
                                                     # quantization_config=quantization_config
                                                     )
        model.to_bettertransformer()
        pipeline = transformers.pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            torch_dtype=torch.bfloat16,
            device_map="auto",
            pad_token_id=tokenizer.eos_token_id,
            **model_kwargs1,
        )
        pipeline2 = transformers.pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            torch_dtype=torch.bfloat16,
            device_map="auto",
            pad_token_id=tokenizer.eos_token_id,
            **model_kwargs2,
        )
        llm = HuggingFacePipeline(pipeline=pipeline)
        llm2 = HuggingFacePipeline(pipeline=pipeline2)
        summary_source = 'The summary was generated using {} hosted locally.'.format(llm_model_id)
    # Map step: summarize each transcript chunk
    map_template = """
    You are an intelligent AI assistant that is tasked to review the content of a video and provide a concise and accurate summary.\n
    You do not provide information that is not mentioned in the video. You only provide information that you are absolutely sure about.\n
    Summarize the following text in a clear and concise way:\n
    ----------------------- \n
    TITLE: `{title}`\n
    TEXT:\n
    `{docs}`\n
    ----------------------- \n
    BRIEF SUMMARY:\n
    """
    map_prompt = PromptTemplate(
        template=map_template,
        input_variables=['title', 'docs'],
    )
    map_chain = LLMChain(llm=llm, prompt=map_prompt)

    # Reduce - Collapse step: merge partial summaries that exceed the token limit
    collapse_template = """
    You are an intelligent AI assistant that is tasked to review the content of a video and provide a concise and accurate summary.\n
    You do not provide information that is not mentioned in the video. You only provide information that you are absolutely sure about.\n
    The following is a set of partial summaries of a video:\n
    ----------------------- \n
    TITLE: `{title}`\n
    PARTIAL SUMMARIES:\n
    `{doc_summaries}`\n
    ----------------------- \n
    Take these and distill them into a consolidated summary.\n
    SUMMARY:\n
    """
    collapse_prompt = PromptTemplate(
        template=collapse_template,
        input_variables=['title', 'doc_summaries'],
    )
    collapse_chain = LLMChain(llm=llm, prompt=collapse_prompt)
    # takes a list of documents, combines them into a single string, and passes this to an LLMChain
    collapse_documents_chain = StuffDocumentsChain(
        llm_chain=collapse_chain, document_variable_name="doc_summaries",
    )
    # Final Reduce - Combine step: produce the executive summary
    combine_template = """\n
    You are an intelligent AI assistant that is tasked to review the content of a video and provide a concise and accurate summary.\n
    You do not provide information that is not mentioned in the video. You only provide information that you are absolutely sure about.\n
    The following is a set of partial summaries of a video:\n
    ----------------------- \n
    TITLE: `{title}`\n
    PARTIAL SUMMARIES:\n
    `{doc_summaries}`\n
    ----------------------- \n
    Generate an executive summary of the whole text in maximum {words} words that contains the main messages, points, and arguments presented in the video as bullet points. Avoid duplications or redundant information.\n
    EXECUTIVE SUMMARY:\n
    """
    combine_prompt = PromptTemplate(
        template=combine_template,
        input_variables=['title', 'doc_summaries', 'words'],
    )
    combine_chain = LLMChain(llm=llm2, prompt=combine_prompt)
    # takes a list of documents, combines them into a single string, and passes this to an LLMChain
    combine_documents_chain = StuffDocumentsChain(
        llm_chain=combine_chain, document_variable_name="doc_summaries",
    )
    # combines and iteratively reduces the mapped documents
    reduce_documents_chain = ReduceDocumentsChain(
        # this is the final chain that is called
        combine_documents_chain=combine_documents_chain,
        # used if the documents exceed the context for `StuffDocumentsChain`
        collapse_documents_chain=collapse_documents_chain,
        # the maximum number of tokens to group documents into
        token_max=800,
    )
    # combines documents by mapping a chain over them, then combining the results
    map_reduce_chain = MapReduceDocumentsChain(
        # map chain
        llm_chain=map_chain,
        # reduce chain
        reduce_documents_chain=reduce_documents_chain,
        # the variable name in the llm_chain to put the documents in
        document_variable_name="docs",
        # do not return the results of the map steps in the output
        return_intermediate_steps=False,
    )
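    # How the chains wired up above fit together:
    # 1. map_chain summarizes each transcript chunk independently;
    # 2. if the partial summaries together exceed token_max, collapse_documents_chain
    #    merges them in rounds until they fit into a single prompt;
    # 3. combine_documents_chain then produces the final executive summary using llm2.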
    from langchain.document_loaders import TextLoader
    from langchain.text_splitter import TokenTextSplitter

    # write the transcript to disk and load it back as a LangChain document
    with open(os.path.join(save_dir, 'transcript.txt'), 'w') as f:
        f.write(text)
    loader = TextLoader(os.path.join(save_dir, "transcript.txt"))
    doc = loader.load()
    # split the transcript into overlapping 800-token chunks
    text_splitter = TokenTextSplitter(chunk_size=800, chunk_overlap=100)
    docs = text_splitter.split_documents(doc)
    summary = map_reduce_chain.run({'input_documents': docs, 'title': title, 'words': words})

    # release GPU memory (these locals only exist in the local-inference branch)
    try:
        del map_reduce_chain, reduce_documents_chain, combine_chain, collapse_documents_chain, map_chain, collapse_chain, llm, llm2, pipeline, pipeline2, model, tokenizer
    except Exception:
        pass
    torch.cuda.empty_cache()
    return summary, summary_source
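# A minimal usage sketch of summarize_text; the title, text, and token are
# hypothetical placeholders (they would normally come from transcribe_youtube_video):
#   summary, source = summarize_text("Some video title", "Some long transcript ...",
#                                    temperature=0.25, words=200,
#                                    use_api=True, api_token="hf_...", do_sample=False)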
import gradio as gr
import pytube
from pytube import YouTube

def get_youtube_title(url):
    yt = YouTube(str(url))
    return yt.title

def get_video(url):
    vid_id = pytube.extract.video_id(url)
    embed_html = '<iframe width="100%" height="315" src="https://www.youtube.com/embed/{}" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>'.format(vid_id)
    return embed_html

def summarize_youtube_video(url, force_transcribe, api_token="",
                            temperature=1.0, words=150, do_sample=True):
    print("URL:", url)
    # fall back to the Space's own token if the user did not supply one
    if api_token == "":
        api_token = HF_TOKEN
    title, text, transcript_source = transcribe_youtube_video(url, force_transcribe, True, api_token)
    print("Transcript:", text[:500])
    summary, summary_source = summarize_text(title, text, temperature, words, True, api_token, do_sample)
    print("Summary:", summary)
    return summary, text, transcript_source, summary_source

html = '<iframe width="100%" height="315" src="https://www.youtube.com/embed/" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>'
# def change_transcribe_api(vis):
#     return gr.Checkbox(value=False, visible=vis)
# def change_api_token(vis):
#     return gr.Textbox(visible=vis)

def update_source(source):
    return gr.Textbox(info=source)

def show_temp(vis):
    return gr.Slider(visible=vis)
# Defining the structure of the UI
with gr.Blocks() as demo:
    with gr.Row():
        gr.Markdown("# Summarize a YouTube Video")
    with gr.Row():
        with gr.Column(scale=4):
            url = gr.Textbox(label="Enter YouTube video URL here:", placeholder="https://www.youtube.com/watch?v=", info="The video must not be age-restricted. Otherwise, the transcription will fail. The demo supports videos in the English language only.")
        with gr.Column(scale=1):
            api_token = gr.Textbox(label="Paste your Hugging Face API token here (Optional):", placeholder="hf_...", visible=True, show_label=True, info='The API token passed via this field is not stored. It is only passed through to the Hugging Face Hub API for inference.')
        with gr.Column(scale=1):
            sum_btn = gr.Button("Summarize!")
    gr.Markdown("Please like the repo if you find this helpful. Detailed instructions for recreating this tool are provided [here](https://pub.towardsai.net/a-complete-guide-for-creating-an-ai-assistant-for-summarizing-youtube-videos-part-1-32fbadabc2cc?sk=34269402931178039c4c3589df4a6ec5) and [here](https://pub.towardsai.net/a-complete-guide-for-creating-an-ai-assistant-for-summarizing-youtube-videos-part-2-a008ee18f341?sk=d59046b36a52c74dfa8befa99183e5b6).")
    with gr.Accordion("Transcription Settings", open=False):
        with gr.Row():
            force_transcribe = gr.Checkbox(label="Transcribe even if a transcript is available.", info='If unchecked, the app attempts to download the transcript from YouTube first. Check this if the transcript does not seem accurate.')
            # use_transcribe_api = gr.Checkbox(label="Transcribe using the HuggingFaceHub API.", visible=False)
    with gr.Accordion("Summarization Settings", open=False):
        with gr.Row():
            # use_llm_api = gr.Checkbox(label="Summarize using the HuggingFaceHub API.", visible=True)
            do_sample = gr.Checkbox(label="Set the Temperature", value=False, visible=True)
            temperature = gr.Slider(minimum=0.01, maximum=1.0, value=0.25, label="Generation temperature", visible=False)
            words = gr.Slider(minimum=100, maximum=500, value=200, label="Length of the summary")
    gr.Markdown("# Results")
    title = gr.Textbox(label="Video Title", placeholder="title...")
    with gr.Row():
        video = gr.HTML(html, scale=1)
        summary_source = gr.Textbox(visible=False, scale=0)
        summary = gr.Textbox(label="Summary", placeholder="summary...", scale=1)
    with gr.Row():
        with gr.Group():
            transcript = gr.Textbox(label="Full Transcript", placeholder="transcript...", show_label=True)
            transcript_source = gr.Textbox(visible=False)
with gr.Accordion("Notes",open=False): | |
gr.Markdown(""" | |
1. This app attempts to download the transcript from Youtube first. If the transcript is not available, or the prompts require, the video will be transcribed.\n | |
2. The app performs best on videos in which the number of speakers is limited or when the YouTube transcript includes annotations of the speakers.\n | |
3. The trascription does not annotate the speakers which may downgrade the quality of the summary if there are more than one speaker.\n | |
""") | |
    # Defining the interactivity of the UI elements
    # force_transcribe.change(fn=change_transcribe_api, inputs=force_transcribe, outputs=use_transcribe_api)
    # use_transcribe_api.change(fn=change_api_token, inputs=use_transcribe_api, outputs=api_token)
    # use_llm_api.change(fn=change_api_token, inputs=use_llm_api, outputs=api_token)
    transcript_source.change(fn=update_source, inputs=transcript_source, outputs=transcript)
    summary_source.change(fn=update_source, inputs=summary_source, outputs=summary)
    do_sample.change(fn=show_temp, inputs=do_sample, outputs=temperature)

    # Defining the functions to call on clicking the button
    sum_btn.click(fn=get_youtube_title, inputs=url, outputs=title, api_name="get_youtube_title", queue=False)
    sum_btn.click(fn=summarize_youtube_video, inputs=[url, force_transcribe, api_token, temperature, words, do_sample],
                  outputs=[summary, transcript, transcript_source, summary_source], api_name="summarize_youtube_video", queue=True)
    sum_btn.click(fn=get_video, inputs=url, outputs=video, api_name="get_youtube_video", queue=False)

demo.queue()
demo.launch(share=False)