tokensandcharms committed on
Commit
353e791
·
0 Parent(s):
Files changed (3) hide show
  1. .gitignore +1 -0
  2. app.py +242 -0
  3. requirements.txt +7 -0
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ .env
app.py ADDED
@@ -0,0 +1,242 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import streamlit as st
3
+ from openai import OpenAI
4
+ from PyPDF2 import PdfReader
5
+ import requests
6
+ from youtube_transcript_api import YouTubeTranscriptApi
7
+ from urllib.parse import urlparse, parse_qs
8
+ from pinecone import Pinecone
9
+ import uuid
10
+ from dotenv import load_dotenv
11
+ import time
12
+ from concurrent.futures import ThreadPoolExecutor, as_completed
13
+
14
+ load_dotenv()
15
+
16
+ # Set up OpenAI client
17
+ client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
18
+
19
+ # Set up Pinecone
20
+ pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
21
+
22
+ index_name = "main" # Your index name
23
+ index = pc.Index(index_name)
24
+
25
+ def get_embedding(text):
26
+ response = client.embeddings.create(input=text, model="text-embedding-ada-002")
27
+ return response.data[0].embedding
28
+
29
+ def process_pdf(file):
30
+ reader = PdfReader(file)
31
+ text = ""
32
+ for page in reader.pages:
33
+ text += page.extract_text() + "\n"
34
+ return text
35
+
36
+ def process_web_link(url):
37
+ response = requests.get(url)
38
+ return response.text
39
+
40
+ def process_youtube_link(url):
41
+ video_id = extract_video_id(url)
42
+ transcript = YouTubeTranscriptApi.get_transcript(video_id)
43
+ return " ".join([entry['text'] for entry in transcript])
44
+
45
+ def extract_video_id(url):
46
+ parsed_url = urlparse(url)
47
+ if parsed_url.hostname == 'youtu.be':
48
+ return parsed_url.path[1:]
49
+ if parsed_url.hostname in ('www.youtube.com', 'youtube.com'):
50
+ if parsed_url.path == '/watch':
51
+ return parse_qs(parsed_url.query)['v'][0]
52
+ if parsed_url.path[:7] == '/embed/':
53
+ return parsed_url.path.split('/')[2]
54
+ if parsed_url.path[:3] == '/v/':
55
+ return parsed_url.path.split('/')[2]
56
+ return None
57
+
58
+ def process_upload(upload_type, file_or_link, file_name=None):
59
+ print(f"Starting process_upload for {upload_type}")
60
+ doc_id = str(uuid.uuid4())
61
+ print(f"Generated doc_id: {doc_id}")
62
+
63
+ if upload_type == "PDF":
64
+ content = process_pdf(file_or_link)
65
+ doc_name = file_name or "Uploaded PDF"
66
+ elif upload_type == "Web Link":
67
+ content = process_web_link(file_or_link)
68
+ doc_name = file_or_link
69
+ elif upload_type == "YouTube Link":
70
+ content = process_youtube_link(file_or_link)
71
+ doc_name = f"YouTube: {file_or_link}"
72
+ else:
73
+ print("Invalid upload type")
74
+ return "Invalid upload type"
75
+
76
+ content_length = len(content)
77
+ print(f"Content extracted, length: {content_length}")
78
+
79
+ # Dynamically adjust chunk size based on content length
80
+ if content_length < 10000:
81
+ chunk_size = 1000
82
+ elif content_length < 100000:
83
+ chunk_size = 2000
84
+ else:
85
+ chunk_size = 4000
86
+ print(f"Using chunk size: {chunk_size}")
87
+
88
+ chunks = [content[i:i+chunk_size] for i in range(0, content_length, chunk_size)]
89
+
90
+ vectors = []
91
+ with ThreadPoolExecutor() as executor:
92
+ futures = [executor.submit(process_chunk, chunk, doc_id, i, upload_type, doc_name) for i, chunk in enumerate(chunks)]
93
+
94
+ for future in as_completed(futures):
95
+ vectors.append(future.result())
96
+ # Update progress
97
+ progress = len(vectors) / len(chunks)
98
+ st.session_state.upload_progress.progress(progress)
99
+
100
+ print(f"Generated {len(vectors)} vectors")
101
+
102
+ index.upsert(vectors=vectors)
103
+ print("Vectors upserted to Pinecone")
104
+
105
+ return f"Processing complete for {upload_type}. Document Name: {doc_name}"
106
+
107
+ def process_chunk(chunk, doc_id, i, upload_type, doc_name):
108
+ embedding = get_embedding(chunk)
109
+ truncated_embedding = embedding[:200]
110
+ return (f"{doc_id}_{i}", truncated_embedding, {
111
+ "text": chunk,
112
+ "type": upload_type,
113
+ "doc_id": doc_id,
114
+ "doc_name": doc_name,
115
+ "chunk_index": i
116
+ })
117
+
118
+ def get_relevant_context(query, top_k=5):
119
+ print(f"Getting relevant context for query: {query}")
120
+ query_embedding = get_embedding(query)
121
+ truncated_query_embedding = query_embedding[:200]
122
+
123
+ search_results = index.query(vector=truncated_query_embedding, top_k=top_k, include_metadata=True)
124
+ print(f"Found {len(search_results['matches'])} relevant results")
125
+
126
+ # Sort results by doc_id and chunk_index to maintain document structure
127
+ sorted_results = sorted(search_results['matches'], key=lambda x: (x['metadata']['doc_id'], x['metadata']['chunk_index']))
128
+
129
+ context = "\n".join([result['metadata']['text'] for result in sorted_results])
130
+ return context, sorted_results
131
+
132
+ def chat_with_ai(message):
133
+ print(f"Chatting with AI, message: {message}")
134
+ context, results = get_relevant_context(message)
135
+ print(f"Retrieved context, length: {len(context)}")
136
+
137
+ messages = [
138
+ {"role": "system", "content": "You are a helpful assistant. Use the following information to answer the user's question, but don't mention the context directly in your response. If the information isn't in the context, say you don't know."},
139
+ {"role": "system", "content": f"Context: {context}"},
140
+ {"role": "user", "content": message}
141
+ ]
142
+
143
+ response = client.chat.completions.create(
144
+ model="gpt-4o-mini",
145
+ messages=messages
146
+ )
147
+ print("Received response from OpenAI")
148
+
149
+ ai_response = response.choices[0].message.content
150
+
151
+ # Prepare source information
152
+ sources = [
153
+ {
154
+ "doc_id": result['metadata']['doc_id'],
155
+ "doc_name": result['metadata']['doc_name'],
156
+ "chunk_index": result['metadata']['chunk_index'],
157
+ "text": result['metadata']['text'],
158
+ "type": result['metadata']['type']
159
+ }
160
+ for result in results
161
+ ]
162
+
163
+ return ai_response, sources
164
+
165
+ def clear_database():
166
+ print("Clearing database...")
167
+ index.delete(delete_all=True)
168
+ print("Database cleared")
169
+ return "Database cleared successfully."
170
+
171
+ # Streamlit UI
172
+ st.set_page_config(layout="wide")
173
+ st.title("Upload and Chat with PDFs, Web Links, and YouTube Videos")
174
+
175
+ # Create three columns
176
+ col1, col2, col3 = st.columns([1, 1, 1])
177
+
178
+ with col1:
179
+ st.header("Upload")
180
+
181
+ # PDF upload
182
+ uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
183
+
184
+ # Web Link input
185
+ web_link = st.text_input("Enter a Web Link")
186
+
187
+ # YouTube Link input
188
+ youtube_link = st.text_input("Enter a YouTube Link")
189
+
190
+ if st.button("Process All"):
191
+ st.session_state.upload_progress = st.progress(0)
192
+ with st.spinner("Processing uploads..."):
193
+ results = []
194
+ if uploaded_file:
195
+ pdf_result = process_upload("PDF", uploaded_file, uploaded_file.name)
196
+ results.append(pdf_result)
197
+ if web_link:
198
+ web_result = process_upload("Web Link", web_link)
199
+ results.append(web_result)
200
+ if youtube_link:
201
+ youtube_result = process_upload("YouTube Link", youtube_link)
202
+ results.append(youtube_result)
203
+
204
+ if results:
205
+ for result in results:
206
+ st.success(result)
207
+ else:
208
+ st.warning("No content uploaded. Please provide at least one input.")
209
+ st.session_state.upload_progress.empty()
210
+
211
+ if st.button("Clear Database"):
212
+ result = clear_database()
213
+ st.success(result)
214
+
215
+ with col2:
216
+ st.header("Chat")
217
+ user_input = st.text_input("Ask a question about the uploaded content:")
218
+ if st.button("Send"):
219
+ if user_input:
220
+ print(f"Sending user input: {user_input}")
221
+ st.session_state.chat_progress = st.progress(0)
222
+ response, sources = chat_with_ai(user_input)
223
+ st.session_state.chat_progress.progress(1.0)
224
+ st.markdown("**You:** " + user_input)
225
+ st.markdown("**AI:** " + response)
226
+
227
+ # Store sources in session state for display in col3
228
+ st.session_state.sources = sources
229
+ st.session_state.chat_progress.empty()
230
+ else:
231
+ print("Empty user input")
232
+ st.warning("Please enter a question.")
233
+
234
+ with col3:
235
+ st.header("Source Chunks")
236
+ if 'sources' in st.session_state and st.session_state.sources:
237
+ for i, source in enumerate(st.session_state.sources, 1):
238
+ with st.expander(f"Source {i} - {source['type']} ({source['doc_name']})"):
239
+ st.markdown(f"**Chunk Index:** {source['chunk_index']}")
240
+ st.text(source['text'])
241
+ else:
242
+ st.info("Ask a question to see source chunks here.")
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ streamlit
2
+ openai
3
+ PyPDF2
4
+ requests
5
+ youtube_transcript_api
6
+ pinecone-client
7
+ python-dotenv