Spaces:
Runtime error
poemsforaphrodite
committed on
Update app.py
app.py
CHANGED
@@ -11,8 +11,260 @@ from dotenv import load_dotenv
 import time
 from concurrent.futures import ThreadPoolExecutor, as_completed
 
+import os
+import gradio as gr
+from openai import OpenAI
+from PyPDF2 import PdfReader
+import requests
+from youtube_transcript_api import YouTubeTranscriptApi
+from urllib.parse import urlparse, parse_qs
+from pinecone import Pinecone
+import uuid
+from dotenv import load_dotenv
+import time
+from concurrent.futures import ThreadPoolExecutor, as_completed
+
 load_dotenv()
 
+# Set up OpenAI client
+client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
+
+# Set up Pinecone
+pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
+
+index_name = "main"  # Your index name
+index = pc.Index(index_name)
+
+def get_embedding(text):
+    response = client.embeddings.create(input=text, model="text-embedding-3-large")
+    return response.data[0].embedding
+
+def process_pdf(file):
+    reader = PdfReader(file)
+    text = ""
+    for page in reader.pages:
+        text += page.extract_text() + "\n"
+    return text
+
+def process_web_link(url):
+    response = requests.get(url)
+    return response.text
+
+def process_youtube_link(url):
+    video_id = extract_video_id(url)
+    transcript = YouTubeTranscriptApi.get_transcript(video_id)
+    return " ".join([entry['text'] for entry in transcript])
+
+def extract_video_id(url):
+    parsed_url = urlparse(url)
+    if parsed_url.hostname == 'youtu.be':
+        return parsed_url.path[1:]
+    if parsed_url.hostname in ('www.youtube.com', 'youtube.com'):
+        if parsed_url.path == '/watch':
+            return parse_qs(parsed_url.query)['v'][0]
+        if parsed_url.path[:7] == '/embed/':
+            return parsed_url.path.split('/')[2]
+        if parsed_url.path[:3] == '/v/':
+            return parsed_url.path.split('/')[2]
+    return None
+
+def process_upload(upload_type, file_or_link, file_name=None):
+    print(f"Starting process_upload for {upload_type}")
+    doc_id = str(uuid.uuid4())
+    print(f"Generated doc_id: {doc_id}")
+
+    if upload_type == "PDF":
+        content = process_pdf(file_or_link)
+        doc_name = file_name or "Uploaded PDF"
+    elif upload_type == "Web Link":
+        content = process_web_link(file_or_link)
+        doc_name = file_or_link
+    elif upload_type == "YouTube Link":
+        content = process_youtube_link(file_or_link)
+        doc_name = f"YouTube: {file_or_link}"
+    else:
+        print("Invalid upload type")
+        return "Invalid upload type"
+
+    content_length = len(content)
+    print(f"Content extracted, length: {content_length}")
+
+    # Dynamically adjust chunk size based on content length
+    if content_length < 10000:
+        chunk_size = 1000
+    elif content_length < 100000:
+        chunk_size = 2000
+    else:
+        chunk_size = 4000
+    print(f"Using chunk size: {chunk_size}")
+
+    chunks = [content[i:i+chunk_size] for i in range(0, content_length, chunk_size)]
+
+    vectors = []
+    with ThreadPoolExecutor() as executor:
+        futures = [executor.submit(process_chunk, chunk, doc_id, i, upload_type, doc_name) for i, chunk in enumerate(chunks)]
+        for future in as_completed(futures):
+            vectors.append(future.result())
+            # Progress can be handled via logging or status messages
+
+    print(f"Generated {len(vectors)} vectors")
+
+    index.upsert(vectors=vectors)
+    print("Vectors upserted to Pinecone")
+
+    return f"Processing complete for {upload_type}. Document Name: {doc_name}"
+
+def process_chunk(chunk, doc_id, i, upload_type, doc_name):
+    embedding = get_embedding(chunk)
+    return (f"{doc_id}_{i}", embedding, {
+        "text": chunk,
+        "type": upload_type,
+        "doc_id": doc_id,
+        "doc_name": doc_name,
+        "chunk_index": i
+    })
+
+def get_relevant_context(query, top_k=5):
+    print(f"Getting relevant context for query: {query}")
+    query_embedding = get_embedding(query)
+
+    search_results = index.query(vector=query_embedding, top_k=top_k, include_metadata=True)
+    print(f"Found {len(search_results['matches'])} relevant results")
+
+    # Sort results by doc_id and chunk_index to maintain document structure
+    sorted_results = sorted(search_results['matches'], key=lambda x: (x['metadata']['doc_id'], x['metadata']['chunk_index']))
+
+    context = "\n".join([result['metadata']['text'] for result in sorted_results])
+    return context, sorted_results
+
+def chat_with_ai(message):
+    print(f"Chatting with AI, message: {message}")
+    context, results = get_relevant_context(message)
+    print(f"Retrieved context, length: {len(context)}")
+
+    messages = [
+        {"role": "system", "content": "You are a helpful assistant. Use the following information to answer the user's question, but don't mention the context directly in your response. If the information isn't in the context, say you don't know."},
+        {"role": "system", "content": f"Context: {context}"},
+        {"role": "user", "content": message}
+    ]
+
+    response = client.chat.completions.create(
+        model="gpt-4o-mini",
+        messages=messages
+    )
+    print("Received response from OpenAI")
+
+    ai_response = response.choices[0].message.content
+
+    # Prepare source information
+    sources = [
+        {
+            "doc_id": result['metadata']['doc_id'],
+            "doc_name": result['metadata']['doc_name'],
+            "chunk_index": result['metadata']['chunk_index'],
+            "text": result['metadata']['text'],
+            "type": result['metadata']['type']
+        }
+        for result in results
+    ]
+
+    return ai_response, sources
+
+def clear_database():
+    print("Clearing database...")
+    index.delete(delete_all=True)
+    print("Database cleared")
+    return "Database cleared successfully."
+
+# Gradio Interface Components
+
+def handle_uploads(pdf, web_link, youtube_link):
+    results = []
+    if pdf:
+        pdf_result = process_upload("PDF", pdf, pdf.name)
+        results.append(pdf_result)
+    if web_link:
+        web_result = process_upload("Web Link", web_link)
+        results.append(web_result)
+    if youtube_link:
+        youtube_result = process_upload("YouTube Link", youtube_link)
+        results.append(youtube_result)
+
+    if results:
+        return "\n".join([f"✅ {res}" for res in results])
+    else:
+        return "⚠️ No content uploaded. Please provide at least one input."
+
+def handle_chat(user_input, state):
+    if not user_input:
+        return "⚠️ Please enter a question.", state
+
+    response, sources = chat_with_ai(user_input)
+    state = sources  # Update state with sources
+    return f"**You:** {user_input}\n\n**AI:** {response}", state
+
+def handle_clear_database():
+    result = clear_database()
+    return result
+
+def display_sources(sources):
+    if not sources:
+        return "ℹ️ Ask a question to see source chunks here."
+
+    source_texts = []
+    for i, source in enumerate(sources, 1):
+        source_text = f"**Source {i} - {source['type']} ({source['doc_name']})**\n\n" \
+                      f"**Chunk Index:** {source['chunk_index']}\n" \
+                      f"{source['text']}\n\n---\n"
+        source_texts.append(source_text)
+    return "\n".join(source_texts)
+
+with gr.Blocks() as demo:
+    gr.Markdown("# 📄 Upload and Chat with PDFs, Web Links, and YouTube Videos")
+
+    with gr.Row():
+        with gr.Column(scale=1):
+            gr.Markdown("## 📤 Upload")
+
+            pdf_input = gr.File(label="Choose a PDF file", file_types=[".pdf"])
+            web_link_input = gr.Textbox(label="Enter a Web Link", placeholder="https://example.com")
+            youtube_link_input = gr.Textbox(label="Enter a YouTube Link", placeholder="https://youtube.com/watch?v=...")
+
+            upload_button = gr.Button("Process All")
+            clear_db_button = gr.Button("Clear Database")
+            upload_output = gr.Markdown()
+
+        with gr.Column(scale=1):
+            gr.Markdown("## 💬 Chat")
+            user_input = gr.Textbox(label="Ask a question about the uploaded content:", placeholder="Your question here...")
+            chat_button = gr.Button("Send")
+            chat_output = gr.Markdown()
+
+        with gr.Column(scale=1):
+            gr.Markdown("## 📚 Source Chunks")
+            sources_display = gr.Markdown()
+
+    # Hidden state to store sources
+    state = gr.State([])
+
+    # Define interactions
+    upload_button.click(handle_uploads, inputs=[pdf_input, web_link_input, youtube_link_input], outputs=upload_output)
+    clear_db_button.click(handle_clear_database, inputs=None, outputs=upload_output)
+    chat_button.click(handle_chat, inputs=[user_input, state], outputs=[chat_output, state])
+    state.change(display_sources, inputs=state, outputs=sources_display)
+
+    # Alternatively, use an event to update sources_display when state changes
+    def update_sources(sources):
+        return display_sources(sources)
+
+    chat_button.click(update_sources, inputs=state, outputs=sources_display)
+
+# Launch the Gradio app
+if __name__ == "__main__":
+    demo.launch()
+
+
 # Set up OpenAI client
 client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
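
The code upserts into a Pinecone index named "main" but never creates it, so the index must already exist with a dimension matching the embedding model. A minimal one-time setup sketch, assuming a serverless index (the cloud and region values here are placeholders, not taken from this commit):

# Hypothetical one-time setup for the "main" index the app assumes.
# text-embedding-3-large returns 3072-dimensional vectors, so the
# index dimension must match.
import os
from pinecone import Pinecone, ServerlessSpec

pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
pc.create_index(
    name="main",
    dimension=3072,
    metric="cosine",
    spec=ServerlessSpec(cloud="aws", region="us-east-1"),
)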
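Because the client and index are set up at module level, the upload and chat functions can also be exercised without the Gradio UI. A minimal smoke-test sketch, assuming the file is saved as app.py and both OPENAI_API_KEY and PINECONE_API_KEY are set in .env:

# Hypothetical smoke test; importing app runs load_dotenv() and the
# client/index setup, so both keys must be valid.
from app import process_upload, chat_with_ai

print(process_upload("Web Link", "https://example.com"))

answer, sources = chat_with_ai("What is this page about?")
print(answer)
for s in sources:
    print(s["doc_name"], s["chunk_index"])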