poemsforaphrodite committed on
Commit
96cc439
·
verified ·
1 Parent(s): 8ae9422

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +252 -0
app.py CHANGED
@@ -11,8 +11,260 @@ from dotenv import load_dotenv
11
  import time
12
  from concurrent.futures import ThreadPoolExecutor, as_completed
13
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  load_dotenv()
15
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  # Set up OpenAI client
17
  client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
18
 
 
11
  import time
12
  from concurrent.futures import ThreadPoolExecutor, as_completed
13
 
14
+ load_dotenv()import os
15
+ import gradio as gr
16
+ from openai import OpenAI
17
+ from PyPDF2 import PdfReader
18
+ import requests
19
+ from youtube_transcript_api import YouTubeTranscriptApi
20
+ from urllib.parse import urlparse, parse_qs
21
+ from pinecone import Pinecone
22
+ import uuid
23
+ from dotenv import load_dotenv
24
+ import time
25
+ from concurrent.futures import ThreadPoolExecutor, as_completed
26
+
27
  load_dotenv()
28
 
29
+ # Set up OpenAI client
30
+ client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
31
+
32
+ # Set up Pinecone
33
+ pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
34
+
35
+ index_name = "main" # Your index name
36
+ index = pc.Index(index_name)
37
+
38
+ def get_embedding(text):
39
+ response = client.embeddings.create(input=text, model="text-embedding-3-large")
40
+ return response.data[0].embedding
41
+
42
+ def process_pdf(file):
43
+ reader = PdfReader(file)
44
+ text = ""
45
+ for page in reader.pages:
46
+ text += page.extract_text() + "\n"
47
+ return text
48
+
49
+ def process_web_link(url):
50
+ response = requests.get(url)
51
+ return response.text
52
+
53
+ def process_youtube_link(url):
54
+ video_id = extract_video_id(url)
55
+ transcript = YouTubeTranscriptApi.get_transcript(video_id)
56
+ return " ".join([entry['text'] for entry in transcript])
57
+
58
+ def extract_video_id(url):
59
+ parsed_url = urlparse(url)
60
+ if parsed_url.hostname == 'youtu.be':
61
+ return parsed_url.path[1:]
62
+ if parsed_url.hostname in ('www.youtube.com', 'youtube.com'):
63
+ if parsed_url.path == '/watch':
64
+ return parse_qs(parsed_url.query)['v'][0]
65
+ if parsed_url.path[:7] == '/embed/':
66
+ return parsed_url.path.split('/')[2]
67
+ if parsed_url.path[:3] == '/v/':
68
+ return parsed_url.path.split('/')[2]
69
+ return None
70
+
71
+ def process_upload(upload_type, file_or_link, file_name=None):
72
+ print(f"Starting process_upload for {upload_type}")
73
+ doc_id = str(uuid.uuid4())
74
+ print(f"Generated doc_id: {doc_id}")
75
+
76
+ if upload_type == "PDF":
77
+ content = process_pdf(file_or_link)
78
+ doc_name = file_name or "Uploaded PDF"
79
+ elif upload_type == "Web Link":
80
+ content = process_web_link(file_or_link)
81
+ doc_name = file_or_link
82
+ elif upload_type == "YouTube Link":
83
+ content = process_youtube_link(file_or_link)
84
+ doc_name = f"YouTube: {file_or_link}"
85
+ else:
86
+ print("Invalid upload type")
87
+ return "Invalid upload type"
88
+
89
+ content_length = len(content)
90
+ print(f"Content extracted, length: {content_length}")
91
+
92
+ # Dynamically adjust chunk size based on content length
93
+ if content_length < 10000:
94
+ chunk_size = 1000
95
+ elif content_length < 100000:
96
+ chunk_size = 2000
97
+ else:
98
+ chunk_size = 4000
99
+ print(f"Using chunk size: {chunk_size}")
100
+
101
+ chunks = [content[i:i+chunk_size] for i in range(0, content_length, chunk_size)]
102
+
103
+ vectors = []
104
+ with ThreadPoolExecutor() as executor:
105
+ futures = [executor.submit(process_chunk, chunk, doc_id, i, upload_type, doc_name) for i, chunk in enumerate(chunks)]
106
+
107
+ for future in as_completed(futures):
108
+ vectors.append(future.result())
109
+ # Progress can be handled via logging or status messages
110
+
111
+ print(f"Generated {len(vectors)} vectors")
112
+
113
+ index.upsert(vectors=vectors)
114
+ print("Vectors upserted to Pinecone")
115
+
116
+ return f"Processing complete for {upload_type}. Document Name: {doc_name}"
117
+
118
+ def process_chunk(chunk, doc_id, i, upload_type, doc_name):
119
+ embedding = get_embedding(chunk)
120
+ return (f"{doc_id}_{i}", embedding, {
121
+ "text": chunk,
122
+ "type": upload_type,
123
+ "doc_id": doc_id,
124
+ "doc_name": doc_name,
125
+ "chunk_index": i
126
+ })
127
+
128
+ def get_relevant_context(query, top_k=5):
129
+ print(f"Getting relevant context for query: {query}")
130
+ query_embedding = get_embedding(query)
131
+
132
+ search_results = index.query(vector=query_embedding, top_k=top_k, include_metadata=True)
133
+ print(f"Found {len(search_results['matches'])} relevant results")
134
+
135
+ # Sort results by doc_id and chunk_index to maintain document structure
136
+ sorted_results = sorted(search_results['matches'], key=lambda x: (x['metadata']['doc_id'], x['metadata']['chunk_index']))
137
+
138
+ context = "\n".join([result['metadata']['text'] for result in sorted_results])
139
+ return context, sorted_results
140
+
141
+ def chat_with_ai(message):
142
+ print(f"Chatting with AI, message: {message}")
143
+ context, results = get_relevant_context(message)
144
+ print(f"Retrieved context, length: {len(context)}")
145
+
146
+ messages = [
147
+ {"role": "system", "content": "You are a helpful assistant. Use the following information to answer the user's question, but don't mention the context directly in your response. If the information isn't in the context, say you don't know."},
148
+ {"role": "system", "content": f"Context: {context}"},
149
+ {"role": "user", "content": message}
150
+ ]
151
+
152
+ response = client.chat.completions.create(
153
+ model="gpt-4o-mini",
154
+ messages=messages
155
+ )
156
+ print("Received response from OpenAI")
157
+
158
+ ai_response = response.choices[0].message.content
159
+
160
+ # Prepare source information
161
+ sources = [
162
+ {
163
+ "doc_id": result['metadata']['doc_id'],
164
+ "doc_name": result['metadata']['doc_name'],
165
+ "chunk_index": result['metadata']['chunk_index'],
166
+ "text": result['metadata']['text'],
167
+ "type": result['metadata']['type']
168
+ }
169
+ for result in results
170
+ ]
171
+
172
+ return ai_response, sources
173
+
174
+ def clear_database():
175
+ print("Clearing database...")
176
+ index.delete(delete_all=True)
177
+ print("Database cleared")
178
+ return "Database cleared successfully."
179
+
180
+ # Gradio Interface Components
181
+
182
+ def handle_uploads(pdf, web_link, youtube_link):
183
+ results = []
184
+ if pdf:
185
+ pdf_result = process_upload("PDF", pdf, pdf.name)
186
+ results.append(pdf_result)
187
+ if web_link:
188
+ web_result = process_upload("Web Link", web_link)
189
+ results.append(web_result)
190
+ if youtube_link:
191
+ youtube_result = process_upload("YouTube Link", youtube_link)
192
+ results.append(youtube_result)
193
+
194
+ if results:
195
+ return "\n".join([f"✅ {res}" for res in results])
196
+ else:
197
+ return "⚠️ No content uploaded. Please provide at least one input."
198
+
199
+ def handle_chat(user_input, state):
200
+ if not user_input:
201
+ return "⚠️ Please enter a question.", state
202
+
203
+ response, sources = chat_with_ai(user_input)
204
+ state = sources # Update state with sources
205
+ return f"**You:** {user_input}\n\n**AI:** {response}", state
206
+
207
+ def handle_clear_database():
208
+ result = clear_database()
209
+ return result
210
+
211
+ def display_sources(sources):
212
+ if not sources:
213
+ return "ℹ️ Ask a question to see source chunks here."
214
+
215
+ source_texts = []
216
+ for i, source in enumerate(sources, 1):
217
+ source_text = f"**Source {i} - {source['type']} ({source['doc_name']})**\n\n" \
218
+ f"**Chunk Index:** {source['chunk_index']}\n" \
219
+ f"{source['text']}\n\n---\n"
220
+ source_texts.append(source_text)
221
+ return "\n".join(source_texts)
222
+
223
+ with gr.Blocks() as demo:
224
+ gr.Markdown("# 📄 Upload and Chat with PDFs, Web Links, and YouTube Videos")
225
+
226
+ with gr.Row():
227
+ with gr.Column(scale=1):
228
+ gr.Markdown("## 📤 Upload")
229
+
230
+ pdf_input = gr.File(label="Choose a PDF file", file_types=[".pdf"])
231
+ web_link_input = gr.Textbox(label="Enter a Web Link", placeholder="https://example.com")
232
+ youtube_link_input = gr.Textbox(label="Enter a YouTube Link", placeholder="https://youtube.com/watch?v=...")
233
+
234
+ upload_button = gr.Button("Process All")
235
+ clear_db_button = gr.Button("Clear Database")
236
+ upload_output = gr.Markdown()
237
+
238
+ with gr.Column(scale=1):
239
+ gr.Markdown("## 💬 Chat")
240
+ user_input = gr.Textbox(label="Ask a question about the uploaded content:", placeholder="Your question here...")
241
+ chat_button = gr.Button("Send")
242
+ chat_output = gr.Markdown()
243
+
244
+ with gr.Column(scale=1):
245
+ gr.Markdown("## 📚 Source Chunks")
246
+ sources_display = gr.Markdown()
247
+
248
+ # Hidden state to store sources
249
+ state = gr.State([])
250
+
251
+ # Define interactions
252
+ upload_button.click(handle_uploads, inputs=[pdf_input, web_link_input, youtube_link_input], outputs=upload_output)
253
+ clear_db_button.click(handle_clear_database, inputs=None, outputs=upload_output)
254
+ chat_button.click(handle_chat, inputs=[user_input, state], outputs=[chat_output, state])
255
+ state.change(display_sources, inputs=state, outputs=sources_display)
256
+
257
+ # Alternatively, use an event to update sources_display when state changes
258
+ def update_sources(sources):
259
+ return display_sources(sources)
260
+
261
+ chat_button.click(update_sources, inputs=state, outputs=sources_display)
262
+
263
+ # Launch the Gradio app
264
+ if __name__ == "__main__":
265
+ demo.launch()
266
+
267
+
268
  # Set up OpenAI client
269
  client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
270