import os

save_dir = os.path.join(os.getcwd(), 'docs')
if not os.path.exists(save_dir):
    os.mkdir(save_dir)

transcription_model_id = "openai/whisper-large"
llm_model_id = "tiiuae/falcon-7b-instruct"
HF_TOKEN = os.environ.get("HF_TOKEN", None)

from youtube_transcript_api import YouTubeTranscriptApi
import pytube

# get the transcript from YouTube
def get_yt_transcript(url):
    text = ''
    vid_id = pytube.extract.video_id(url)
    temp = YouTubeTranscriptApi.get_transcript(vid_id)
    for t in temp:
        text += t['text'] + ' '
    return text
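
# A minimal sanity check, runnable on its own (the URL below is a placeholder,
# not part of the original app). YouTubeTranscriptApi raises an exception when
# no transcript exists, which transcribe_youtube_video below catches:
#
#   print(get_yt_transcript("https://www.youtube.com/watch?v=<video_id>")[:300])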

from pytube import YouTube
import transformers
import torch

# transcribes the video locally with Whisper
def transcribe_yt_vid(url):
    # download the YouTube video's audio
    yt = YouTube(str(url))
    audio = yt.streams.filter(only_audio=True).first()
    out_file = audio.download(filename="audio.mp3",
                              output_path=save_dir)

    # define an automatic-speech-recognition pipeline
    asr = transformers.pipeline(
        "automatic-speech-recognition",
        model=transcription_model_id,
        device_map='auto',
    )

    # set model config parameters to transcribe in English
    asr.model.config.forced_decoder_ids = (
        asr.tokenizer.get_decoder_prompt_ids(
            language="en",
            task="transcribe"
        )
    )

    # invoke the Whisper model on 20-second chunks
    temp = asr(out_file, chunk_length_s=20)
    text = temp['text']

    # delete the pipeline at the end to release GPU memory
    del asr
    torch.cuda.empty_cache()
    return text
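
# Usage sketch (kept commented so the Space does not download anything at
# import time; the URL is a placeholder). Running whisper-large locally
# assumes a GPU with enough memory for the checkpoint:
#
#   text = transcribe_yt_vid("https://www.youtube.com/watch?v=<video_id>")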

from pytube import YouTube
from huggingface_hub import InferenceClient

# transcribes the video using the Hugging Face Hub API
def transcribe_yt_vid_api(url, api_token):
    import librosa
    import soundfile as sf

    # download the YouTube video's audio
    yt = YouTube(str(url))
    audio = yt.streams.filter(only_audio=True).first()
    out_file = audio.download(filename="audio.wav",
                              output_path=save_dir)

    # initialize a client for the Whisper model
    client = InferenceClient(model=transcription_model_id,
                             token=api_token)

    text = ''
    t = 25  # audio chunk length in seconds
    x, sr = librosa.load(out_file, sr=None)
    # librosa returns the audio as a numpy array x with its original sampling rate sr.
    # The audio needs to be split into t-second chunks, since the API call truncates the response.
    for i in range(len(x) // (t * sr) + 1):
        y = x[t * sr * i: t * sr * (i + 1)]
        split_path = os.path.join(save_dir, "audio_split.wav")
        sf.write(split_path, y, sr)
        text += client.automatic_speech_recognition(split_path)
    return text
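
# Usage sketch (commented out; the URL and token are placeholders, not from
# the original code). The API variant needs no local GPU, only a valid token:
#
#   text = transcribe_yt_vid_api("https://www.youtube.com/watch?v=<video_id>",
#                                api_token=os.environ["HF_TOKEN"])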

def transcribe_youtube_video(url, force_transcribe=False, use_api=False, api_token=None):
    yt = YouTube(str(url))
    text = ''
    # get the transcript from YouTube if one is available
    try:
        text = get_yt_transcript(url)
    except Exception:
        pass

    # transcribe the video if YouTube did not provide a transcript,
    # or if the caller wants to force_transcribe anyway
    if text == '' or force_transcribe:
        if use_api:
            text = transcribe_yt_vid_api(url, api_token=api_token)
            transcript_source = 'The transcript was generated using {} via the Hugging Face Hub API.'.format(transcription_model_id)
        else:
            text = transcribe_yt_vid(url)
            transcript_source = 'The transcript was generated using {} hosted locally.'.format(transcription_model_id)
    else:
        transcript_source = 'The transcript was downloaded from YouTube.'
    return yt.title, text, transcript_source
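
# The wrapper has three outcomes, sketched below with a placeholder URL:
# by default it returns the YouTube transcript if one exists; otherwise, or
# when force_transcribe=True, it runs Whisper, via the Hub API if use_api=True.
#
#   title, text, source = transcribe_youtube_video(
#       "https://www.youtube.com/watch?v=<video_id>",
#       force_transcribe=False, use_api=True, api_token=HF_TOKEN)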

def summarize_text(title, text, temperature, words, use_api=False, api_token=None, do_sample=False):
    from langchain.chains.llm import LLMChain
    from langchain.prompts import PromptTemplate
    from langchain.chains import ReduceDocumentsChain, MapReduceDocumentsChain
    from langchain.chains.combine_documents.stuff import StuffDocumentsChain
    from langchain import HuggingFacePipeline
    import torch
    import transformers
    from transformers import BitsAndBytesConfig
    from transformers import AutoTokenizer, AutoModelForCausalLM

    # generation settings for the map/collapse steps (partial summaries)
    model_kwargs1 = {
        "temperature": temperature,
        "do_sample": do_sample,
        "min_new_tokens": 200 - 25,
        "max_new_tokens": 200 + 25,
        "repetition_penalty": 20.0,
    }
    # generation settings for the final combine step (executive summary)
    model_kwargs2 = {
        "temperature": temperature,
        "do_sample": do_sample,
        "min_new_tokens": words,
        "max_new_tokens": words + 100,
        "repetition_penalty": 20.0,
    }
    # temperature is only meaningful when sampling
    if not do_sample:
        del model_kwargs1["temperature"]
        del model_kwargs2["temperature"]

    if use_api:
        from langchain import HuggingFaceHub
        # os.environ["HUGGINGFACEHUB_API_TOKEN"] = api_token
        llm = HuggingFaceHub(
            repo_id=llm_model_id, model_kwargs=model_kwargs1,
            huggingfacehub_api_token=api_token
        )
        llm2 = HuggingFaceHub(
            repo_id=llm_model_id, model_kwargs=model_kwargs2,
            huggingfacehub_api_token=api_token
        )
        summary_source = 'The summary was generated using {} via the Hugging Face Hub API.'.format(llm_model_id)
    else:
        quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_use_double_quant=True,
        )
        tokenizer = AutoTokenizer.from_pretrained(llm_model_id)
        model = AutoModelForCausalLM.from_pretrained(
            llm_model_id,
            # quantization_config=quantization_config
        )
        model.to_bettertransformer()
        pipeline = transformers.pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            torch_dtype=torch.bfloat16,
            device_map="auto",
            pad_token_id=tokenizer.eos_token_id,
            **model_kwargs1,
        )
        pipeline2 = transformers.pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            torch_dtype=torch.bfloat16,
            device_map="auto",
            pad_token_id=tokenizer.eos_token_id,
            **model_kwargs2,
        )
        llm = HuggingFacePipeline(pipeline=pipeline)
        llm2 = HuggingFacePipeline(pipeline=pipeline2)
        summary_source = 'The summary was generated using {} hosted locally.'.format(llm_model_id)

    # Map step: produce a brief summary of each transcript chunk
    map_template = """
    You are an intelligent AI assistant that is tasked to review the content of a video and provide a concise and accurate summary.\n
    You do not provide information that is not mentioned in the video. You only provide information that you are absolutely sure about.\n
    Summarize the following text in a clear and concise way:\n
    ----------------------- \n
    TITLE: `{title}`\n
    TEXT:\n
    `{docs}`\n
    ----------------------- \n
    BRIEF SUMMARY:\n
    """
    map_prompt = PromptTemplate(
        template=map_template,
        input_variables=['title', 'docs']
    )
    map_chain = LLMChain(llm=llm, prompt=map_prompt)

    # Reduce - Collapse step: merge partial summaries that exceed the token limit
    collapse_template = """
    You are an intelligent AI assistant that is tasked to review the content of a video and provide a concise and accurate summary.\n
    You do not provide information that is not mentioned in the video. You only provide information that you are absolutely sure about.\n
    The following is a set of partial summaries of a video:\n
    ----------------------- \n
    TITLE: `{title}`\n
    PARTIAL SUMMARIES:\n
    `{doc_summaries}`\n
    ----------------------- \n
    Take these and distill them into a consolidated summary.\n
    SUMMARY:\n
    """
    collapse_prompt = PromptTemplate(
        template=collapse_template,
        input_variables=['title', 'doc_summaries']
    )
    collapse_chain = LLMChain(llm=llm, prompt=collapse_prompt)

    # Takes a list of documents, combines them into a single string, and passes this to an LLMChain
    collapse_documents_chain = StuffDocumentsChain(
        llm_chain=collapse_chain, document_variable_name="doc_summaries"
    )

    # Final Reduce - Combine step: produce the executive summary
    combine_template = """\n
    You are an intelligent AI assistant that is tasked to review the content of a video and provide a concise and accurate summary.\n
    You do not provide information that is not mentioned in the video. You only provide information that you are absolutely sure about.\n
    The following is a set of partial summaries of a video:\n
    ----------------------- \n
    TITLE: `{title}`\n
    PARTIAL SUMMARIES:\n
    `{doc_summaries}`\n
    ----------------------- \n
    Generate an executive summary of the whole text in maximum {words} words that contains the main messages, points, and arguments presented in the video as bullet points. Avoid duplications or redundant information.\n
    EXECUTIVE SUMMARY:\n
    """
    combine_prompt = PromptTemplate(
        template=combine_template,
        input_variables=['title', 'doc_summaries', 'words']
    )
    combine_chain = LLMChain(llm=llm2, prompt=combine_prompt)

    # Takes a list of documents, combines them into a single string, and passes this to an LLMChain
    combine_documents_chain = StuffDocumentsChain(
        llm_chain=combine_chain, document_variable_name="doc_summaries"
    )

    # Combines and iteratively reduces the mapped documents
    reduce_documents_chain = ReduceDocumentsChain(
        # This is the final chain that is called
        combine_documents_chain=combine_documents_chain,
        # This is called if the documents exceed the context for `StuffDocumentsChain`
        collapse_documents_chain=collapse_documents_chain,
        # The maximum number of tokens to group documents into
        token_max=800,
    )

    # Combines documents by mapping a chain over them, then combining the results
    map_reduce_chain = MapReduceDocumentsChain(
        # Map chain
        llm_chain=map_chain,
        # Reduce chain
        reduce_documents_chain=reduce_documents_chain,
        # The variable name in the llm_chain to put the documents in
        document_variable_name="docs",
        # Do not return the results of the map steps in the output
        return_intermediate_steps=False,
    )

    # split the transcript into token-sized chunks for the map step
    from langchain.document_loaders import TextLoader
    from langchain.text_splitter import TokenTextSplitter

    transcript_path = os.path.join(save_dir, 'transcript.txt')
    with open(transcript_path, 'w') as f:
        f.write(text)
    loader = TextLoader(transcript_path)
    doc = loader.load()
    text_splitter = TokenTextSplitter(chunk_size=800, chunk_overlap=100)
    docs = text_splitter.split_documents(doc)

    summary = map_reduce_chain.run({'input_documents': docs, 'title': title, 'words': words})

    # release GPU memory; the pipelines only exist when running locally
    try:
        del map_reduce_chain, reduce_documents_chain, combine_chain, collapse_documents_chain, map_chain, collapse_chain, llm, llm2, pipeline, pipeline2, model, tokenizer
    except Exception:
        pass
    torch.cuda.empty_cache()
    return summary, summary_source
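
# Usage sketch (commented out; it assumes `title` and `text` were obtained
# from transcribe_youtube_video above). With do_sample=False the temperature
# argument is ignored, as the kwargs handling at the top of the function shows:
#
#   summary, source = summarize_text(title, text, temperature=0.25, words=200,
#                                    use_api=True, api_token=HF_TOKEN,
#                                    do_sample=False)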

import gradio as gr
import pytube
from pytube import YouTube

def get_youtube_title(url):
    yt = YouTube(str(url))
    return yt.title

def get_video(url):
    vid_id = pytube.extract.video_id(url)
    embed_html = '<iframe width="100%" height="315" src="https://www.youtube.com/embed/{}" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>'.format(vid_id)
    return embed_html

def summarize_youtube_video(url, force_transcribe, api_token="",
                            temperature=1.0, words=150, do_sample=True):
    print("URL:", url)
    # fall back to the Space's own token if the user did not provide one
    if api_token == "":
        api_token = HF_TOKEN
    title, text, transcript_source = transcribe_youtube_video(url, force_transcribe, True, api_token)
    print("Transcript:", text[:500])
    summary, summary_source = summarize_text(title, text, temperature, words, True, api_token, do_sample)
    print("Summary:", summary)
    return summary, text, transcript_source, summary_source
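
# End-to-end sketch of what the "Summarize!" button triggers below (commented
# out; the URL is a placeholder):
#
#   summary, transcript, t_src, s_src = summarize_youtube_video(
#       "https://www.youtube.com/watch?v=<video_id>", force_transcribe=False)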

html = '<iframe width="100%" height="315" src="https://www.youtube.com/embed/" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>'

# def change_transcribe_api(vis):
#     return gr.Checkbox(value=False, visible=vis)

# def change_api_token(vis):
#     return gr.Textbox(visible=vis)

def update_source(source):
    return gr.Textbox(info=source)

def show_temp(vis):
    return gr.Slider(visible=vis)

# Defining the structure of the UI
with gr.Blocks() as demo:
    with gr.Row():
        gr.Markdown("# Summarize a YouTube Video")
    with gr.Row():
        with gr.Column(scale=4):
            url = gr.Textbox(label="Enter YouTube video URL here:", placeholder="https://www.youtube.com/watch?v=", info="The video must not be age-restricted; otherwise, the transcription will fail. The demo supports English-language videos only.")
        with gr.Column(scale=1):
            api_token = gr.Textbox(label="Paste your Hugging Face API token here (Optional):", placeholder="hf_...", visible=True, show_label=True, info='The API token passed via this field is not stored. It is only passed to the Hugging Face Hub API for inference.')
        with gr.Column(scale=1):
            sum_btn = gr.Button("Summarize!")
    gr.Markdown("Please like the repo if you find this helpful. Detailed instructions for recreating this tool are provided [here](https://pub.towardsai.net/a-complete-guide-for-creating-an-ai-assistant-for-summarizing-youtube-videos-part-1-32fbadabc2cc?sk=34269402931178039c4c3589df4a6ec5) and [here](https://pub.towardsai.net/a-complete-guide-for-creating-an-ai-assistant-for-summarizing-youtube-videos-part-2-a008ee18f341?sk=d59046b36a52c74dfa8befa99183e5b6).")
    with gr.Accordion("Transcription Settings", open=False):
        with gr.Row():
            force_transcribe = gr.Checkbox(label="Transcribe even if a transcript is available.", info='If unchecked, the app attempts to download the transcript from YouTube first. Check this if the transcript does not seem accurate.')
            # use_transcribe_api = gr.Checkbox(label="Transcribe using the HuggingFaceHub API.", visible=False)
    with gr.Accordion("Summarization Settings", open=False):
        with gr.Row():
            # use_llm_api = gr.Checkbox(label="Summarize using the HuggingFaceHub API.", visible=True)
            do_sample = gr.Checkbox(label="Set the Temperature", value=False, visible=True)
            temperature = gr.Slider(minimum=0.01, maximum=1.0, value=0.25, label="Generation temperature", visible=False)
            words = gr.Slider(minimum=100, maximum=500, value=200, label="Length of the summary")
    gr.Markdown("# Results")
    title = gr.Textbox(label="Video Title", placeholder="title...")
    with gr.Row():
        video = gr.HTML(html, scale=1)
        summary_source = gr.Textbox(visible=False, scale=0)
        summary = gr.Textbox(label="Summary", placeholder="summary...", scale=1)
    with gr.Row():
        with gr.Group():
            transcript = gr.Textbox(label="Full Transcript", placeholder="transcript...", show_label=True)
            transcript_source = gr.Textbox(visible=False)
    with gr.Accordion("Notes", open=False):
        gr.Markdown("""
        1. The app attempts to download the transcript from YouTube first. If a transcript is not available, or if transcription is forced in the settings, the video is transcribed with Whisper.\n
        2. The app performs best on videos with a limited number of speakers, or when the YouTube transcript annotates the speakers.\n
        3. The transcription does not annotate the speakers, which may degrade the quality of the summary when there is more than one speaker.\n
        """)

    # Defining the interactivity of the UI elements
    # force_transcribe.change(fn=change_transcribe_api, inputs=force_transcribe, outputs=use_transcribe_api)
    # use_transcribe_api.change(fn=change_api_token, inputs=use_transcribe_api, outputs=api_token)
    # use_llm_api.change(fn=change_api_token, inputs=use_llm_api, outputs=api_token)
    transcript_source.change(fn=update_source, inputs=transcript_source, outputs=transcript)
    summary_source.change(fn=update_source, inputs=summary_source, outputs=summary)
    do_sample.change(fn=show_temp, inputs=do_sample, outputs=temperature)

    # Defining the functions to call on clicking the button
    sum_btn.click(fn=get_youtube_title, inputs=url, outputs=title, api_name="get_youtube_title", queue=False)
    sum_btn.click(fn=summarize_youtube_video, inputs=[url, force_transcribe, api_token, temperature, words, do_sample],
                  outputs=[summary, transcript, transcript_source, summary_source], api_name="summarize_youtube_video", queue=True)
    sum_btn.click(fn=get_video, inputs=url, outputs=video, api_name="get_youtube_video", queue=False)

demo.queue()
demo.launch(share=False)