ww0 committed on
Commit 059dde7
Parent: 7fea2c0

Update app.py

Files changed (1):
  1. app.py +357 -225
app.py CHANGED
@@ -7,43 +7,47 @@ from langchain_community.vectorstores import Chroma
  from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
  from langchain_core.runnables import RunnablePassthrough, RunnableLambda
  from langchain_core.messages import AIMessage, HumanMessage
- from langchain.chains import LLMChain, StuffDocumentsChain, MapReduceDocumentsChain, ReduceDocumentsChain
  from langchain.memory.buffer_window import ConversationBufferWindowMemory
  from langchain_community.llms import HuggingFaceHub

  import yt_dlp
  import json
  import gc
- import gradio as gr
- from gradio_client import Client
  import datetime
  import os



-
- whisper_jax_api = 'https://sanchit-gandhi-whisper-jax.hf.space/'
  whisper_jax = Client(whisper_jax_api)

- def transcribe_audio(audio_path,
-                      task='transcribe',
-                      return_timestamps=True) -> str:

      text, runtime = whisper_jax.predict(
          audio_path,
          task,
          return_timestamps,
-         api_name='/predict_1',
      )
      return text


-
-
- def format_whisper_jax_output(whisper_jax_output: str,
-                               max_duration: int = 60) -> list[dict]:
-
-     """Returns a list of dict with keys 'start', 'end', 'text'
      The segments from whisper jax output are merged to form paragraphs.

      `max_duration` controls how many seconds of the audio's transcripts are merged
@@ -54,68 +58,65 @@ def format_whisper_jax_output(whisper_jax_output: str,

      final_output = []
      max_duration = datetime.timedelta(seconds=max_duration)
-     segments = whisper_jax_output.split('\n')
-     current_start = datetime.datetime.strptime('00:00', '%M:%S')
-     current_text = ''

      for i, seg in enumerate(segments):
-
-         text = seg.split(']')[-1].strip()

          # Sometimes whisper jax returns None for timestamp
          try:
-             end = datetime.datetime.strptime(seg[14:19], '%M:%S')
          except ValueError:
              end = current_start + max_duration

-         if (end-current_start >= max_duration) or (i == len(segments)-1):
-             # If we have exceeded max duration or at the last segment,
-             # stop merging and append to final_output.
-
-             current_text += text
-             final_output.append({
-                 'start': current_start.strftime('%H:%M:%S'),
-                 'end': end.strftime('%H:%M:%S'),
-                 'text': current_text
-             })
-
-             # Update current start and text
-             current_start = end
-             current_text = ''

          else:
-             # If we have not exceeded max duration, keep merging.
-             current_text += text
-
-     return final_output
-
-



- audio_file_number = 1
- def yt_audio_to_text(url: str,
-                      max_duration: int = 60
-                      ):

-     global audio_file_number
-
      progress = gr.Progress()
      progress(0.1)

-     with yt_dlp.YoutubeDL({'extract_audio': True,
-                            'format': 'bestaudio',
-                            'outtmpl': f'{audio_file_number}.mp3'
-                            }) as video:
-
          info_dict = video.extract_info(url, download=False)
          global video_title
-         video_title = info_dict['title']
          video.download(url)

      progress(0.4)
-     audio_file = f'{audio_file_number}.mp3'
-     audio_file_number += 1

      result = transcribe_audio(audio_file, return_timestamps=True)
      progress(0.7)
@@ -123,61 +124,83 @@ def yt_audio_to_text(url: str,
      result = format_whisper_jax_output(result, max_duration=max_duration)
      progress(0.9)

-     with open('audio.json', 'w') as f:
          json.dump(result, f)




- def metadata_func(record: dict, metadata: dict) -> dict:

-     metadata['start'] = record.get('start')
-     metadata['end'] = record.get('end')
-     metadata['source'] = metadata['start'] + ' -> ' + metadata['end']

      return metadata


  def load_data():
      loader = JSONLoader(
-         file_path='audio.json',
-         jq_schema='.[]',
-         content_key='text',
-         metadata_func=metadata_func
      )

      data = loader.load()

      return data




- embedding_model_name = 'sentence-transformers/all-mpnet-base-v2'
- device = 'cuda' if torch.cuda.is_available() else 'cpu'
- embedding_model_kwargs = {'device': device}

- embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name,
-                                    model_kwargs=embedding_model_kwargs)

- def create_vectordb(data, k: int):
-     """Returns a vector database, and its retriever
-     `k` is the number of retrieved documents
      """

-     vectordb = Chroma.from_documents(documents=data, embedding=embeddings)
-     retriever = vectordb.as_retriever(search_type='similarity',
-                                       search_kwargs={'k': k})

-     return vectordb, retriever




- repo_id = 'mistralai/Mistral-7B-Instruct-v0.1'
- llm = HuggingFaceHub(repo_id=repo_id, model_kwargs={'max_new_tokens': 1000})



  # Map
  map_template = """Summarise the following text:
@@ -187,8 +210,6 @@ Answer:"""
  map_prompt = PromptTemplate.from_template(map_template)
  map_chain = LLMChain(llm=llm, prompt=map_prompt)

-
-
  # Reduce
  reduce_template = """The following is a set of summaries:
  {docs}
@@ -214,7 +235,7 @@ reduce_documents_chain = ReduceDocumentsChain(
      # If documents exceed context for `StuffDocumentsChain`
      collapse_documents_chain=combine_documents_chain,
      # The maximum number of tokens to group documents into.
-     token_max=4000
  )

@@ -227,21 +248,24 @@ map_reduce_chain = MapReduceDocumentsChain(
      # The variable name in the llm_chain to put the documents in
      document_variable_name="docs",
      # Return the results of the map steps in the output
-     return_intermediate_steps=False
  )

  def get_summary(documents) -> str:
      summary = map_reduce_chain.invoke(documents, return_only_outputs=True)
-     return summary['output_text'].strip()




  contextualise_q_prompt = PromptTemplate.from_template(
      """Given a chat history and the latest user question \
-     which might reference the chat history, formulate a standalone question \
-     that can be understood without the chat history. Do NOT answer the question, \
-     just reformulate it if needed and otherwise return it as is.

      Chat history: {chat_history}
@@ -255,12 +279,15 @@ contextualise_q_chain = contextualise_q_prompt | llm



  standalone_prompt = PromptTemplate.from_template(
      """Given a chat history and the latest user question, \
-     identify whether the question is a standalone question or the question \
-     references the chat history. Answer 'yes' if the question is a standalone \
-     question, and 'no' if the question references the chat history. Do not \
-     answer anything other than 'yes' or 'no'.

      Chat history:
      {chat_history}
@@ -272,14 +299,19 @@ standalone_prompt = PromptTemplate.from_template(
      """
  )

  def format_output(answer: str) -> str:
-     # All lower case and remove all whitespace
-     return ''.join(answer.lower().split())

  standalone_chain = standalone_prompt | llm | format_output




  qa_prompt = PromptTemplate.from_template(
      """You are an assistant for question-answering tasks. \
@@ -298,226 +330,326 @@ qa_prompt = PromptTemplate.from_template(
      """
  )

-
-
-
  class YouTubeChatbot:
-
-     def __init__(self,
-                  n_sources: int,
-                  k: int,
-                  timestamp_interval: datetime.timedelta,
-                  memory: int,
-                  ):
          self.n_sources = n_sources
-         self.k = k
          self.timestamp_interval = timestamp_interval
          self.chat_history = ConversationBufferWindowMemory(k=memory)


      def format_docs(self, docs: list) -> str:
-         """Combine documents
          """

-         self.sources = [doc.metadata['start'] for doc in docs]
-
-         return '\n\n'.join(doc.page_content for doc in docs)
-


      def standalone_question(self, input_: dict) -> str:
-         """If the question is a not a standalone question,
          run contextualise_q_chain.
          """
-         if input_['standalone']=='yes':
              return contextualise_q_chain
          else:
-             return input_['question']


      def format_answer(self, answer: str) -> str:
-
-         if 'cannot find the answer' in answer:
              return answer.strip()
          else:
              timestamps = self.filter_timestamps()
              answer_with_sources = (
-                 answer.strip()
-                 + ' You can find more information '\
-                 'at these timestamps: {}.'.format(', '.join(timestamps))
-             )
              return answer_with_sources


      def filter_timestamps(self) -> list[str]:
-         """Returns a list of timestamps with length `n_sources`.
-         The timestamps are at least an `timestamp_interval` apart.
-         This prevents returning a list of timestamps that are too
-         close together.
          """

-         sorted_timestamps = sorted(self.sources)
-         filtered_timestamps = [sorted_timestamps[0]]
-         i=1
-         while len(filtered_timestamps) < self.n_sources:
-             timestamp1 = datetime.datetime.strptime(filtered_timestamps[-1],
-                                                     '%H:%M:%S')

              try:
-                 timestamp2 = datetime.datetime.strptime(sorted_timestamps[i],
-                                                         '%H:%M:%S')
              except IndexError:
                  break

-             time_diff = timestamp2 - timestamp1

-             if time_diff>=self.timestamp_interval:
-                 filtered_timestamps.append(str(timestamp2.time()))

              i += 1

          return filtered_timestamps


-     def setup_chatbot(self, url: str) -> str:
-         """Given a YouTube url, set up the chatbot.
          """
-
          yt_audio_to_text(url)

-         self.data = load_data()

-         _, self.retriever = create_vectordb(self.data, self.k)


-         self.qa_chain = (
              RunnablePassthrough.assign(standalone=standalone_chain)
-             | {'question':self.standalone_question,
-                'context':self.standalone_question|self.retriever|self.format_docs}
              | qa_prompt
-             | llm)

-         return url



-     def get_answer(self, question: str) -> str:

          try:
-             ai_msg = self.qa_chain.invoke({'question': question,
-                                            'chat_history': self.chat_history})
          except AttributeError:
-             raise AttributeError("You haven't setup the chatbot yet. "
-                                  "Setup the chatbot by calling the "
-                                  "instance method `setup_chatbot`.")

-         answer = self.format_answer(ai_msg)

-         self.chat_history.save_context({'question':question},
-                                        {'answer':answer})

          return answer
-




  class YouTubeChatbotApp(YouTubeChatbot):

-     def __init__(self,
-                  n_sources: int,
-                  k: int,
-                  timestamp_interval: datetime.timedelta,
-                  memory: int,
-                  default_youtube_url: str
-                  ):
-         super().__init__(n_sources, k, timestamp_interval, memory)
          self.default_youtube_url = default_youtube_url
-         self.gradio_chat_history = []


-     def greet(self) -> list[tuple[str|None, str|None]]:
-         summary = get_summary(self.data)
-         summary_message = f'Here is a summary of the video "{video_title}":'
-         self.gradio_chat_history.append((None, summary_message))
-         self.gradio_chat_history.append((None, summary))
-         greeting_message = ('You can ask me anything about the video. '
-                             'I will do my best to answer!')
-         self.gradio_chat_history.append((None, greeting_message))
-         return self.gradio_chat_history


-     def question(self, user_message: str) -> list[tuple[str|None, str|None]]:
-         self.gradio_chat_history.append((user_message, None))
-         return '', self.gradio_chat_history


-     def respond(self) -> tuple[str, list[tuple[str|None, str|None]]]:
          try:
-             ai_message = self.get_answer(self.gradio_chat_history[-1][0])
          except AttributeError:
-             raise gr.Error('You need to process the video '
-                            'first by pressing the `Go` button.')


-         self.gradio_chat_history.append((None, ai_message))
-         return self.gradio_chat_history


-     def clear_chat_history(self) -> list:
-         self.chat_history.clear()
-         self.gradio_chat_history = []
-         return self.gradio_chat_history


      def launch(self, **kwargs):
-
          with gr.Blocks() as demo:

-             # Structure
              with gr.Row():
-                 url_input = gr.Textbox(value=self.default_youtube_url,
-                                        label='YouTube URL',
-                                        scale=5)
-                 button = gr.Button(value='Go', scale=1)
-
-             chatbot = gr.Chatbot()
-             user_message = gr.Textbox(label='Ask a question:')
-             clear = gr.ClearButton([user_message, chatbot])
-
-
-             # Actions
-             button.click(self.clear_chat_history,
-                          inputs=[],
-                          outputs=[chatbot],
-                          trigger_mode='once'
-                          ).then(self.setup_chatbot,
-                                 inputs=[url_input],
-                                 outputs=[url_input]
-                                 ).then(self.greet,
-                                        inputs=[],
-                                        outputs=[chatbot])
-
-             user_message.submit(self.question,
-                                 inputs=[user_message],
-                                 outputs=[user_message, chatbot]
-                                 ).then(self.respond,
-                                        inputs=[],
-                                        outputs=[chatbot])
-
-             clear.click(self.clear_chat_history, inputs=[], outputs=[chatbot])
-

          demo.launch(**kwargs)


-
  if __name__ == "__main__":
-     app = YouTubeChatbotApp(n_sources=3,
-                             k=5,
-                             timestamp_interval=datetime.timedelta(minutes=2),
-                             memory=5,
-                             default_youtube_url='https://www.youtube.com/watch?v=4Bdc55j80l8'
-                             )
-
-     app.launch()

app.py (updated file):

  from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
  from langchain_core.runnables import RunnablePassthrough, RunnableLambda
  from langchain_core.messages import AIMessage, HumanMessage

  from langchain.memory.buffer_window import ConversationBufferWindowMemory
  from langchain_community.llms import HuggingFaceHub
+ from langchain.chains import (
+     LLMChain,
+     StuffDocumentsChain,
+     MapReduceDocumentsChain,
+     ReduceDocumentsChain,
+ )

+ from gradio_client import Client
+ import gradio as gr
  import yt_dlp
  import json
  import gc
  import datetime
  import os
+ import numpy as np


+ """Prepare data"""

+ whisper_jax_api = "https://sanchit-gandhi-whisper-jax.hf.space/"
  whisper_jax = Client(whisper_jax_api)


+ def transcribe_audio(audio_path, task="transcribe", return_timestamps=True) -> str:
      text, runtime = whisper_jax.predict(
          audio_path,
          task,
          return_timestamps,
+         api_name="/predict_1",
      )
      return text

+ def format_whisper_jax_output(
+     whisper_jax_output: str, max_duration: int = 60
+ ) -> list[dict]:
+     """Whisper JAX outputs are in the format
+     '[00:00.000 -> 00:00.000] text\n[00:00.000 -> 00:00.000] text'.
+
+     Returns a list of dict with keys 'start', 'end', 'text'
      The segments from whisper jax output are merged to form paragraphs.

      `max_duration` controls how many seconds of the audio's transcripts are merged

      final_output = []
      max_duration = datetime.timedelta(seconds=max_duration)
+     segments = whisper_jax_output.split("\n")
+     current_start = datetime.datetime.strptime("00:00", "%M:%S")
+     current_text = ""

      for i, seg in enumerate(segments):
+         text = seg.split("]")[-1].strip()
+         current_text += " " + text

          # Sometimes whisper jax returns None for timestamp
          try:
+             end = datetime.datetime.strptime(seg[14:19], "%M:%S")
          except ValueError:
              end = current_start + max_duration

+         if i == len(segments) - 1:
+             final_output.append(
+                 {
+                     "start": current_start.strftime("%H:%M:%S"),
+                     "end": end.strftime("%H:%M:%S"),
+                     "text": current_text.strip(),
+                 }
+             )

          else:
+             if end - current_start >= max_duration and current_text[-1] == ".":
+                 # If we have exceeded max duration, check whether we have
+                 # reached the end of a sentence. If not, keep merging.
+                 final_output.append(
+                     {
+                         "start": current_start.strftime("%H:%M:%S"),
+                         "end": end.strftime("%H:%M:%S"),
+                         "text": current_text.strip(),
+                     }
+                 )

+                 # Update current start and text
+                 current_start = end
+                 current_text = ""

+     return final_output

+ def yt_audio_to_text(url: str, max_duration: int = 60):
+     """Given a YouTube url, download audio and transcribe it to text. Reformat
+     the output from Whisper JAX and save the final result in a json file.
+     """

      progress = gr.Progress()
      progress(0.1)

+     with yt_dlp.YoutubeDL(
+         {"extract_audio": True, "format": "bestaudio", "outtmpl": "audio.mp3"}
+     ) as video:
          info_dict = video.extract_info(url, download=False)
          global video_title
+         video_title = info_dict["title"]
          video.download(url)

      progress(0.4)
+     audio_file = "audio.mp3"

      result = transcribe_audio(audio_file, return_timestamps=True)
      progress(0.7)

      result = format_whisper_jax_output(result, max_duration=max_duration)
      progress(0.9)

+     with open("audio.json", "w") as f:
          json.dump(result, f)

131
 
132
 
133
 
 
134
 
135
+ """Load data"""
136
+
137
+ def metadata_func(record: dict, metadata: dict) -> dict:
138
+ """This function is used to tell the Langchain loader the keys that
139
+ contain metadata and extract them.
140
+ """
141
+ metadata["start"] = record.get("start")
142
+ metadata["end"] = record.get("end")
143
+ metadata["source"] = metadata["start"] + " -> " + metadata["end"]
144
 
145
  return metadata
146
 
147
 
148
  def load_data():
149
  loader = JSONLoader(
150
+ file_path="audio.json",
151
+ jq_schema=".[]",
152
+ content_key="text",
153
+ metadata_func=metadata_func,
154
  )
155
 
156
  data = loader.load()
157
+ os.remove("audio.json")
158
 
159
  return data
160
 
161
 
162
 
163
 
164
+ """Create embeddings and vector store"""
 
 
165
 
166
+ embedding_model_name = "sentence-transformers/all-mpnet-base-v2"
167
+ device = "cuda" if torch.cuda.is_available() else "cpu"
168
+ embedding_model_kwargs = {"device": device}
169
 
170
+ embeddings = HuggingFaceEmbeddings(
171
+ model_name=embedding_model_name, model_kwargs=embedding_model_kwargs
172
+ )
173
+
174
+
175
+ def create_vectordb(data, n_retrieved_docs: int, collection_name="YouTube"):
176
+ """Returns a retriever which is used to fetch relevant documents from
177
+ the vector database.
178
+
179
+ `n_retrieved_docs` is the number of retrieved documents.
180
  """
181
 
182
+ vectordb = Chroma.from_documents(
183
+ documents=data, embedding=embeddings, collection_name=collection_name
184
+ )
185
+ n_docs = len(vectordb.get()["ids"])
186
+ retriever = vectordb.as_retriever(
187
+ search_type="mmr", search_kwargs={"k": n_retrieved_docs, "fetch_k": n_docs}
188
+ )
189
+
190
+ return retriever
191
+
192
+
193
 
 
194
 
195
+ """Load LLM"""
196
 
197
+ repo_id = "mistralai/Mistral-7B-Instruct-v0.1"
198
+ llm = HuggingFaceHub(repo_id=repo_id, model_kwargs={"max_new_tokens": 1000})
199
 
200
 
 
 
201
 
202
 
203
+ """Summarisation"""
204
 
205
  # Map
206
  map_template = """Summarise the following text:
 
210
  map_prompt = PromptTemplate.from_template(map_template)
211
  map_chain = LLMChain(llm=llm, prompt=map_prompt)
212
 
 
 
213
  # Reduce
214
  reduce_template = """The following is a set of summaries:
215
  {docs}
 
235
  # If documents exceed context for `StuffDocumentsChain`
236
  collapse_documents_chain=combine_documents_chain,
237
  # The maximum number of tokens to group documents into.
238
+ token_max=4000,
239
  )
240
 
241
 
 
248
  # The variable name in the llm_chain to put the documents in
249
  document_variable_name="docs",
250
  # Return the results of the map steps in the output
251
+ return_intermediate_steps=False,
252
  )
253
 
254
+
255
  def get_summary(documents) -> str:
256
  summary = map_reduce_chain.invoke(documents, return_only_outputs=True)
257
+ return summary["output_text"].strip()
258
 
259
 
260
 
261
+ """Contextualising the question"""
262
 
263
  contextualise_q_prompt = PromptTemplate.from_template(
264
  """Given a chat history and the latest user question \
265
+ which might reference the chat history, formulate a \
266
+ standalone question that can be understood without \
267
+ the chat history. Do NOT answer the question, just \
268
+ reformulate it if needed and otherwise return it as is.
269
 
270
  Chat history: {chat_history}
271
 
 
279
 
280
 
281
 
282
+ """Standalone question chain"""
283
+
284
  standalone_prompt = PromptTemplate.from_template(
285
  """Given a chat history and the latest user question, \
286
+ identify whether the question is a standalone question \
287
+ or the question references the chat history. Answer 'yes' \
288
+ if the question is a standalone question, and 'no' if the \
289
+ question references the chat history. Do not answer \
290
+ anything other than 'yes' or 'no'.
291
 
292
  Chat history:
293
  {chat_history}
 
299
  """
300
  )
301
 
302
+
303
  def format_output(answer: str) -> str:
304
+ """All lower case and remove all whitespace to ensure
305
+ that the answer given by the LLM is either 'yes' or 'no'.
306
+ """
307
+ return "".join(answer.lower().split())
308
+
309
 
310
  standalone_chain = standalone_prompt | llm | format_output
311
 
312
 
313
 
314
+ """Q&A chain"""
315
 
316
  qa_prompt = PromptTemplate.from_template(
317
  """You are an assistant for question-answering tasks. \
 
330
  """
331
  )
332
 
 
 
 
333
  class YouTubeChatbot:
+     instance_count = 0
+
+     def __init__(
+         self,
+         n_sources: int = 3,
+         n_retrieved_docs: int = 5,
+         timestamp_interval: datetime.timedelta = datetime.timedelta(minutes=2),
+         memory: int = 5,
+     ):
+         YouTubeChatbot.instance_count += 1
+         self.chatbot_id = YouTubeChatbot.instance_count
          self.n_sources = n_sources
+         self.n_retrieved_docs = n_retrieved_docs
          self.timestamp_interval = timestamp_interval
          self.chat_history = ConversationBufferWindowMemory(k=memory)
+         self.retriever = None
+         self.qa_chain = None


      def format_docs(self, docs: list) -> str:
+         """Combine documents into a single string which will be included
+         in the prompt given to the LLM.
          """
+         self.sources = [doc.metadata["start"] for doc in docs]

+         return "\n\n".join(doc.page_content for doc in docs)


      def standalone_question(self, input_: dict) -> str:
+         """If the question is a not a standalone question,
          run contextualise_q_chain.
          """
+         if input_["standalone"] == "yes":
              return contextualise_q_chain
          else:
+             return input_["question"]


      def format_answer(self, answer: str) -> str:
+         """Add timestamps to answers.
+         """
+         if "cannot find the answer" in answer:
              return answer.strip()
          else:
              timestamps = self.filter_timestamps()
              answer_with_sources = (
+                 answer.strip() + " You can find more information "
+                 "at these timestamps: {}.".format(", ".join(timestamps))
+             )
              return answer_with_sources


      def filter_timestamps(self) -> list[str]:
+         """Returns a list of timestamps with length less or
+         equal to `n_sources`. The timestamps are at least an
+         `timestamp_interval` apart. This prevents returning
+         a list of timestamps that are too close together.
          """

+         filtered_timestamps = np.array(
+             [datetime.datetime.strptime(self.sources[0], "%H:%M:%S")]
+         )

+         i = 1
+
+         while len(filtered_timestamps) < self.n_sources:
              try:
+                 new_timestamp = datetime.datetime.strptime(self.sources[i], "%H:%M:%S")
              except IndexError:
                  break

+             absolute_time_difference = abs(new_timestamp - filtered_timestamps)

+             if all(absolute_time_difference >= self.timestamp_interval):
+                 filtered_timestamps = np.append(filtered_timestamps, new_timestamp)

              i += 1

+         filtered_timestamps = [
+             timestamp.strftime("%H:%M:%S") for timestamp in filtered_timestamps
+         ]
+         filtered_timestamps.sort()
+
          return filtered_timestamps


+     def process_video(self, url: str, data=None, retriever=None):
+         """Given a YouTube URL, transcribe YouTube audio to text.
+         Then set up the vector database.
          """
          yt_audio_to_text(url)
+         data = load_data()

+         if retriever is not None:
+             # If we already have documents in the vector store, delete them.
+             ids = retriever.vectorstore.get()["ids"]
+             retriever.vectorstore.delete(ids)

+         retriever = create_vectordb(
+             data, self.n_retrieved_docs,
+             collection_name=f"Chatbot{self.chatbot_id}"
+         )

+         return url, data, retriever
+
+     def setup_qa_chain(self, retriever, qa_chain=None):
+         qa_chain = (
              RunnablePassthrough.assign(standalone=standalone_chain)
+             | {
+                 "question": self.standalone_question,
+                 "context": self.standalone_question | retriever | self.format_docs,
+             }
              | qa_prompt
+             | llm
+         )

+         return retriever, qa_chain


+     def setup_chatbot(self, url: str):
+         _, _, self.retriever = self.process_video(url=url, retriever=self.retriever)
+         _, self.qa_chain = self.setup_qa_chain(retriever=self.retriever)


+     def get_answer(self, question: str) -> str:
          try:
+             ai_msg = self.qa_chain.invoke(
+                 {"question": question, "chat_history": self.chat_history}
+             )
          except AttributeError:
+             raise AttributeError(
+                 "You haven't setup the chatbot yet. "
+                 "Setup the chatbot by calling the "
+                 "instance method `setup_chatbot`."
+             )

+         self.chat_history.save_context({"question": question}, {"answer": ai_msg})

+         answer = self.format_answer(ai_msg)

          return answer



+ """Web app"""

  class YouTubeChatbotApp(YouTubeChatbot):
+     def __init__(
+         self,
+         n_sources: int,
+         n_retrieved_docs: int,
+         timestamp_interval: datetime.timedelta,
+         memory: int,
+         default_youtube_url: str,
+     ):
+         super().__init__(n_sources, n_retrieved_docs, timestamp_interval, memory)

          self.default_youtube_url = default_youtube_url
+         self.memory = memory
+         self.chat_history = None
+         self.data = None
+         self.retriever = None
+         self.qa_chain = None
+
+         # Gradio components
+         self.url_input = None
+         self.url_button = None
+         self.app_chat_history = None
+         self.chatbot = None
+         self.user_input = None
+         self.clear_button = None
+
+     def greet(self, data, app_chat_history) -> dict:
+         """Summarise the video and greet the user.
+         """
+         summary_message = f'Here is a summary of the video "{video_title}":'
+         app_chat_history.append((None, summary_message))

+         summary = get_summary(data)
+         self.data = gr.State(None)
+         app_chat_history.append((None, summary))

+         greeting_message = (
+             "You can ask me anything about the video. " "I will do my best to answer!"
+         )
+         app_chat_history.append((None, greeting_message))

+         return {self.app_chat_history: app_chat_history, self.chatbot: app_chat_history}

+     def question(self, user_question: str, app_chat_history) -> dict:
+         """Display the question asked by the user in the chat window,
+         and delete from the input textbox.
+         """
+         app_chat_history.append((user_question, None))
+         return {
+             self.user_input: "",
+             self.app_chat_history: app_chat_history,
+             self.chatbot: app_chat_history,
+         }

+     def respond(self, qa_chain, chat_history, app_chat_history) -> dict:
+         """Respond to user's latest question"""
+         question = app_chat_history[-1][0]

          try:
+             ai_msg = qa_chain.invoke(
+                 {"question": question, "chat_history": chat_history}
+             )
          except AttributeError:
+             raise gr.Error(
+                 "You need to process the video " "first by pressing the `Go` button."
+             )

+         chat_history.save_context({"question": question}, {"answer": ai_msg})

+         answer = self.format_answer(ai_msg)

+         app_chat_history.append((None, answer))

+         return {
+             self.qa_chain: qa_chain,
+             self.chat_history: chat_history,
+             self.app_chat_history: app_chat_history,
+             self.chatbot: app_chat_history,
+         }

+     def clear_chat_history(self, chat_history, app_chat_history):
+         chat_history.clear()
+         app_chat_history = []
+         return {
+             self.chat_history: chat_history,
+             self.app_chat_history: app_chat_history,
+             self.chatbot: app_chat_history,
+         }

      def launch(self, **kwargs):
          with gr.Blocks() as demo:
+             self.chat_history = gr.State(ConversationBufferWindowMemory(k=self.memory))
+             self.app_chat_history = gr.State([])
+             self.data = gr.State()
+             self.retriever = gr.State()
+             self.qa_chain = gr.State()

+             # App structure
              with gr.Row():
+                 self.url_input = gr.Textbox(
+                     value=self.default_youtube_url, label="YouTube URL", scale=5
+                 )
+                 self.url_button = gr.Button(value="Go", scale=1)
+
+             self.chatbot = gr.Chatbot()
+             self.user_input = gr.Textbox(label="Ask a question:")
+             self.clear_button = gr.Button(value="Clear")
+
+
+             # App actions
+
+             # When a new url is given, clear past chat history and process
+             # the new video. Set up the Q&A chain with the new video's data.
+             # Provide a summary of the new video.
+             self.url_button.click(
+                 self.clear_chat_history,
+                 inputs=[self.chat_history, self.app_chat_history],
+                 outputs=[self.chat_history, self.app_chat_history, self.chatbot],
+                 trigger_mode="once",
+             ).then(
+                 self.process_video,
+                 inputs=[self.url_input, self.data, self.retriever],
+                 outputs=[self.url_input, self.data, self.retriever],
+             ).then(
+                 self.setup_qa_chain,
+                 inputs=[self.retriever, self.qa_chain],
+                 outputs=[self.retriever, self.qa_chain],
+             ).then(
+                 self.greet,
+                 inputs=[self.data, self.app_chat_history],
+                 outputs=[self.app_chat_history, self.chatbot],
+             )
+
+             # When a user asks a question, display the question in the chat
+             # window and remove it from the text input area. Then respond
+             # with the Q&A chain.
+             self.user_input.submit(
+                 self.question,
+                 inputs=[self.user_input, self.app_chat_history],
+                 outputs=[self.user_input, self.app_chat_history, self.chatbot],
+                 queue=False,
+             ).then(
+                 self.respond,
+                 inputs=[self.qa_chain, self.chat_history, self.app_chat_history],
+                 outputs=[
+                     self.qa_chain,
+                     self.chat_history,
+                     self.app_chat_history,
+                     self.chatbot,
+                 ],
+             )
+
+             # When the `Clear` button is clicked, clear the chat history from
+             # the chat window.
+             self.clear_button.click(
+                 self.clear_chat_history,
+                 inputs=[self.chat_history, self.app_chat_history],
+                 outputs=[self.chat_history, self.app_chat_history, self.chatbot],
+                 queue=False,
+             )

          demo.launch(**kwargs)


  if __name__ == "__main__":
+     app = YouTubeChatbotApp(
+         n_sources=3,
+         n_retrieved_docs=5,
+         timestamp_interval=datetime.timedelta(minutes=2),
+         memory=5,
+         default_youtube_url="https://www.youtube.com/watch?v=4Bdc55j80l8",
+     )
+
+     app.launch()
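
Side note for reviewers: the new format_whisper_jax_output docstring describes the raw Whisper JAX segment format. The snippet below is a minimal, self-contained sketch (not part of the commit; the two-segment transcript string is made up) of how each segment's end timestamp and text are pulled out before the merging step.

# Illustrative sketch only, not part of app.py.
import datetime

sample = (
    "[00:00.000 -> 00:04.000]  Hello and welcome to the video.\n"
    "[00:04.000 -> 00:09.000]  Today we will build a chatbot."
)

for seg in sample.split("\n"):
    text = seg.split("]")[-1].strip()                       # transcript text after the timestamps
    end = datetime.datetime.strptime(seg[14:19], "%M:%S")   # end timestamp, the MM:SS slice
    print(end.strftime("%H:%M:%S"), text)

Running this prints "00:00:04 Hello and welcome to the video." and "00:00:09 Today we will build a chatbot.", which is the per-segment information the updated merging logic accumulates into paragraphs keyed by 'start', 'end', and 'text'.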