File size: 3,234 Bytes
65b73f1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bd6f76f
65b73f1
bd6f76f
65b73f1
 
6e9136f
65b73f1
 
 
 
 
 
bd6f76f
65b73f1
 
 
bd6f76f
65b73f1
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
# Q&A Chatbot
from langchain.llms import OpenAI
from langchain.document_loaders import YoutubeLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.chat_models import ChatOpenAI
from langchain.chains import LLMChain
from dotenv import find_dotenv, load_dotenv
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
import textwrap

load_dotenv(find_dotenv())
embeddings = OpenAIEmbeddings()

#load_dotenv()  # take environment variables from .env.

import streamlit as st
import os

def create_db_from_youtube_video_url(video_url):
    """Build a FAISS vector store from the transcript of a YouTube video.

    The transcript is fetched with YoutubeLoader, split into overlapping
    character chunks, and embedded using the module-level OpenAI embeddings.

    Args:
        video_url: URL of the YouTube video to index.

    Returns:
        A FAISS vector store over the transcript chunks.
    """
    # Fetch the full transcript as LangChain documents.
    transcript = YoutubeLoader.from_youtube_url(video_url).load()

    # ~2000-character chunks with 100-character overlap keep each chunk
    # well within the chat model's context window.
    splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=100)
    chunks = splitter.split_documents(transcript)

    # Embed every chunk and index the vectors for similarity search.
    return FAISS.from_documents(chunks, embeddings)

def get_response_from_query(db, query, k=4):
    """Answer a question about a video using its transcript vector store.

    Args:
        db: FAISS vector store built from the video transcript.
        query: The user's question.
        k: Number of transcript chunks to retrieve. Default is 4 because
           the model handles up to 16,385 tokens and chunks are ~2000
           characters, so k=4 maximizes the context used per query.

    Returns:
        Tuple of (response text, list of retrieved documents).
    """
    # Retrieve the k transcript chunks most similar to the question.
    docs = db.similarity_search(query, k=k)
    docs_page_content = " ".join([d.page_content for d in docs])

    chat = ChatOpenAI(model_name="gpt-3.5-turbo-16k", temperature=0.2)

    # Template to use for the system message prompt.
    # FIX: removed duplicated word ("that that") from the original prompt.
    template = """
        You are a helpful assistant that can answer questions about youtube videos 
        based on the video's transcript: {docs}
        
        Only use the factual information from the transcript to answer the question.
        
        If you feel like you don't have enough information to answer the question, say "I don't know".
        
        """

    system_message_prompt = SystemMessagePromptTemplate.from_template(template)

    # Human question prompt
    human_template = "Answer the following question: {question}"
    human_message_prompt = HumanMessagePromptTemplate.from_template(human_template)

    # Combine system and human prompts into a single chat prompt.
    chat_prompt = ChatPromptTemplate.from_messages(
        [system_message_prompt, human_message_prompt]
    )

    chain = LLMChain(llm=chat, prompt=chat_prompt)

    response = chain.run(question=query, docs=docs_page_content)
    # FIX: replace newlines with spaces (not empty string) so words at
    # line breaks are not fused together in the flattened answer.
    response = response.replace("\n", " ")
    return response, docs


# Webpage with Streamlit

st.set_page_config(page_title="Youtube Video Chatbot with Langchain")

st.header("Youtube Video Chatbot with Langchain")

youtube_input=st.text_input("Youtube Link: ",key="youtube_input")
query=st.text_input("Your Question Here: ",key="query")


if youtube_input != "":
    db = create_db_from_youtube_video_url(youtube_input)
    response, docs = get_response_from_query(db, query)

submit=st.button("Ask a Question")

## If ask button is clicked
if submit:
    st.subheader("Answer:")
    st.write(response)