File size: 7,675 Bytes
428c7f7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250

from flask import Flask, request, jsonify, send_from_directory
# from flask_session import Session
from flask_cors import CORS  # <-- New import here
from flask_cors import cross_origin
import openai
import os
from pytube import YouTube
import re
from langchain_openai.chat_models import ChatOpenAI
from langchain.chains import ConversationalRetrievalChain
from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import Chroma
from youtube_transcript_api import YouTubeTranscriptApi
from dotenv import load_dotenv

# Load environment variables from a local .env file before the configuration
# below reads them.
load_dotenv()

# Flask application; files under ./dist are served as static assets by `serve`.
app = Flask(__name__, static_folder="./dist") # requests in the dist folder are being sent to http://localhost:5000/<endpoint> 
# Enable CORS on every route for every origin.
CORS(app, resources={r"/*": {"origins": "*"}}) 
# Required setting: a missing OPENAI_API_KEY raises KeyError at import time.
openai.api_key = os.environ["OPENAI_API_KEY"]
# NOTE(review): not referenced by any function visible in this file.
llm_name = "gpt-3.5-turbo"
# Process-wide QnA chain shared by all requests: assigned in `initialize`,
# consumed in `response`. None means "not initialized yet".
qna_chain = None


@app.route('/', defaults={'path': ''})
@app.route('/<path:path>')
def serve(path):
    """
    Serve the front-end bundle from the static folder.

    A non-empty path that exists under the static folder is returned as is;
    every other path gets 'index.html'.
    """
    static_root = app.static_folder
    asset_exists = path != "" and os.path.exists(static_root + '/' + path)
    target = path if asset_exists else 'index.html'
    return send_from_directory(static_root, target)

def load_db(file, chain_type, k):
    """
    Central Function that:
        - Loads the transcript text file
        - Splits it into overlapping chunks and embeds them into a Chroma store
        - Creates the retriever
        - Creates the chatbot chain
        - Returns the chatbot chain

    Calling the returned chain yields a dictionary containing
        -- question
        -- llm answer
        -- chat history
        -- source_documents
        -- generated_question

    Args:
        file: path of the transcript text file to index.
        chain_type: chain type forwarded to ConversationalRetrievalChain.from_llm.
        k: number of chunks the similarity retriever returns per query.

    Usage: question_answer_chain = load_db(file, chain_type, k)
           response = question_answer_chain({"question": query, "chat_history": chat_history})
    """
    import uuid

    transcript = TextLoader(file).load()

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=70)
    docs = text_splitter.split_documents(transcript)

    embeddings = OpenAIEmbeddings()

    # Give every build its own collection. Without an explicit name each call
    # targets Chroma's default collection, and the in-memory client can be
    # shared between calls in one process, so chunks from a previously loaded
    # video could be retrieved for the new one.
    db = Chroma.from_documents(
        docs,
        embeddings,
        collection_name=f"transcript-{uuid.uuid4().hex}",
    )

    retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": k})

    # create a chatbot chain. Memory is managed externally.
    qa = ConversationalRetrievalChain.from_llm(
        llm=ChatOpenAI(temperature=0),                      #### Prompt Template is yet to be created
        chain_type=chain_type,
        retriever=retriever,
        return_source_documents=True,
        return_generated_question=True,
        # memory=memory
    )

    return qa


def buffer(history, buff):
    """
    Buffer the history.
    Keeps only the `buff` most recent chats in the history.

    Args:
        history: sequence of chat entries, oldest first.
        buff: maximum number of entries to keep.

    Returns:
        `history` itself when it already fits, otherwise a slice holding the
        last `buff` entries. A `buff` of zero or less yields an empty slice.

    Usage: history = buffer(history, buff)
    """
    if buff <= 0:
        # history[-0:] is the whole sequence, so "keep nothing" needs its own branch.
        return history[:0]
    if len(history) > buff:
        return history[-buff:]
    return history
    

# Accepted link shapes (scheme and www./m./music. prefix are optional):
#   youtube.com/watch?v=<id>, youtube.com/watch?...&v=<id>,
#   youtube.com/shorts/<id>, youtube.com/embed/<id>, youtu.be/<id>
# The negative lookahead stops an over-long ID from being truncated to 11 chars.
_YT_LINK_RE = re.compile(
    r'^(?:https?://)?(?:(?:www|m|music)\.)?'
    r'(?:youtube\.com/(?:watch\?(?:[^#\s]*?&)?v=|shorts/|embed/)|youtu\.be/)'
    r'([A-Za-z0-9_-]{11})'
    r'(?![A-Za-z0-9_-])\S*$'
)


def is_valid_yt(link):
    """
    Check if a link is a valid YouTube link.

    Args:
        link: candidate URL. Surrounding whitespace is ignored; any value
            that is not a string is treated as invalid.

    Returns:
        (True, video_id) when the link matches a supported YouTube URL shape,
        where video_id is the 11-character ID; (False, None) otherwise.

    Usage: boolean, video_id = is_valid_yt(youtube_string)
    """
    if not isinstance(link, str):
        return False, None

    match = _YT_LINK_RE.match(link.strip())
    if match:
        return True, match.group(1)
    return False, None


def get_metadata(video_id) -> dict:
    """Collect descriptive information about a YouTube video.

    The returned dictionary has the keys:
        - title
        - description
        - view_count
        - thumbnail_url
        - publish_date (formatted as "%Y-%m-%d %H:%M:%S")
        - length
        - author

    Text fields that come back empty are reported as "Unknown" and numeric
    fields as 0.

    Raises:
        ImportError: if the pytube package cannot be imported.

    Usage: get_metadata(id)->dict
    """
    try:
        from pytube import YouTube
    except ImportError:
        raise ImportError(
            "Could not import pytube python package. "
            "Please install it with `pip install pytube`."
        )

    video = YouTube(f"https://www.youtube.com/watch?v={video_id}")

    # Attributes are read in the same order as the dictionary keys below.
    title = video.title or "Unknown"
    description = video.description or "Unknown"
    views = video.views or 0
    thumbnail = video.thumbnail_url or "Unknown"

    published = video.publish_date
    if published:
        published_text = published.strftime("%Y-%m-%d %H:%M:%S")
    else:
        published_text = "Unknown"

    duration = video.length or 0
    channel = video.author or "Unknown"

    return {
        "title": title,
        "description": description,
        "view_count": views,
        "thumbnail_url": thumbnail,
        "publish_date": published_text,
        "length": duration,
        "author": channel,
    }


def save_transcript(video_id):
    """
    Fetch the transcript of a video and write it to 'transcript.txt'.

    Each transcript entry is written as "~<start>~<text> ", where <start> is
    the entry's start value truncated to an integer. If fetching fails the
    error is printed and nothing is written; an empty transcript also writes
    nothing. Always returns None.
    """
    try:
        segments = YouTubeTranscriptApi.get_transcript(video_id)
    except Exception as e:
        print(f"Error fetching transcript for video {video_id}: {e}")
        return None

    if not segments:
        return None

    with open('transcript.txt', 'w') as out:
        out.writelines(
            f"~{int(seg['start'])}~{seg['text']} " for seg in segments
        )
    print(f"Transcript saved to: transcript.txt")
    return None

@app.route('/init', methods=['POST'])
@cross_origin()
def initialize():
    """
    Initialize the qna_chain for a user.

    Expects a JSON body with a 'yt_link' string. On success the video's
    transcript is saved to './transcript.txt', the global qna_chain is rebuilt
    from it, and the video metadata is returned.

    Responses:
        - {"status": "success", "message": ..., "metadata": ...} on success.
        - {"status": "error", "message": "Invalid YouTube link."} (HTTP 200,
          unchanged) when the link is missing, not a string, or not a
          YouTube link.
        - {"status": "error", ...} with HTTP 404 when no transcript could be
          saved for the video.
    """
    global qna_chain

    # Reset to the "not initialized" sentinel so a failed init cannot leave a
    # stale or unusable chain behind for /response.
    qna_chain = None

    # NEED to authenticate the user here
    payload = request.get_json(silent=True)
    yt_link = payload.get('yt_link', '') if isinstance(payload, dict) else ''

    # Only strings are handed to the validator; anything else is an invalid link.
    if isinstance(yt_link, str):
        valid, video_id = is_valid_yt(yt_link)
    else:
        valid, video_id = False, None

    if not valid:
        return jsonify({"status": "error", "message": "Invalid YouTube link."})

    metadata = get_metadata(video_id)

    # Remove the previous video's transcript so the existence check below
    # reflects this request only.
    try:
        os.remove('./transcript.txt')
    except FileNotFoundError:
        print("No transcript file to remove.")
    except OSError as err:
        print(f"Could not remove old transcript file: {err}")

    save_transcript(video_id)

    # save_transcript writes nothing when the transcript is unavailable or
    # empty; building the chain would then fail on the missing file.
    if not os.path.isfile('./transcript.txt'):
        return jsonify({"status": "error",
                        "message": "Transcript not available for this video."}), 404

    # Initialize qna_chain for the user
    qna_chain = load_db("./transcript.txt", 'stuff', 5)

    return jsonify({"status": "success",
                    "message": "qna_chain initialized.",
                    "metadata": metadata,
                    })


@app.route('/response', methods=['POST'])
def response():
    """
    Answer a question about the currently loaded video.

    - Expects a JSON body with 'query' (string) and optionally 'chat_history'
      (list of [question, answer] pairs).
    - Returns {"timestamps": [...], "answer": ...} when the chain reports
      source documents, otherwise the bare answer.
    - Returns HTTP 400 when the chain has not been initialized or when
      'query' is missing or blank.

    Each timestamp is the integer inside the first "~<n>~" marker of a source
    chunk, or None when the chunk contains no marker.
    """
    global qna_chain

    req = request.get_json(silent=True)
    if not isinstance(req, dict):
        req = {}

    # raw is a list of lists containing two strings; convert it to a list of
    # tuples. A missing or null history is treated as empty.
    raw = req.get('chat_history') or []
    chat_history = [tuple(pair) for pair in raw]

    query = req.get('query', '')

    # Anything that cannot be called is treated as "not initialized", not only
    # None, so a half-finished /init cannot cause a TypeError here.
    if not callable(qna_chain):
        return jsonify({"status": "error", "message": "qna_chain not initialized."}), 400

    if not isinstance(query, str) or not query.strip():
        return jsonify({"status": "error", "message": "query is required."}), 400

    result = qna_chain({'question': query, 'chat_history': buffer(chat_history, 7)})

    sources = result['source_documents']
    if sources:
        marker = re.compile(r'~(\d+)~')
        timestamps = []
        for doc in sources:
            found = marker.search(doc.page_content)
            timestamps.append(int(found.group(1)) if found else None)

        return jsonify(dict(timestamps=timestamps, answer=result['answer']))

    # NOTE(review): this branch returns a bare JSON string while the branch
    # above returns an object; kept as is for client compatibility.
    return jsonify(result['answer'])

@app.route('/transcript', methods=['POST'])
@cross_origin()
def send_transcript():
    """
    Send the transcript of the video.

    Reads 'transcript.txt' and returns its contents as
    {"status": "success", "transcript": ...}. If the file cannot be opened or
    decoded, returns {"status": "error", "message": "Transcript not found."}.
    """
    try:
        with open('transcript.txt', 'r') as file:
            transcript = file.read()
    except (OSError, UnicodeDecodeError):
        # Only failures of the file read map to "not found"; unrelated errors
        # are no longer swallowed.
        return jsonify({"status": "error", "message": "Transcript not found."})

    return jsonify({"status": "success", "transcript": transcript})
    

# Start Flask's built-in server only when this file is executed directly;
# debug mode is explicitly disabled.
if __name__ == '__main__':
    app.run(debug=False)