Spaces:
Running
Running
bbb
Browse files
app.py
CHANGED
@@ -1,6 +1,6 @@
|
|
1 |
import gradio as gr
|
2 |
import os
|
3 |
-
|
4 |
from langchain.document_loaders import PyPDFLoader
|
5 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
6 |
from langchain.vectorstores import Chroma
|
@@ -16,10 +16,37 @@ import transformers
|
|
16 |
import torch
|
17 |
import tqdm
|
18 |
import accelerate
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
19 |
|
20 |
|
21 |
default_persist_directory = './chroma_HF/'
|
22 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
23 |
llm_name1 = "mistralai/Mistral-7B-Instruct-v0.2"
|
24 |
llm_name2 = "mistralai/Mistral-7B-Instruct-v0.1"
|
25 |
llm_name3 = "meta-llama/Llama-2-7b-chat-hf"
|
@@ -30,6 +57,12 @@ llm_name7 = "google/flan-t5-xxl"
|
|
30 |
list_llm = [llm_name1, llm_name2, llm_name3, llm_name4, llm_name5, llm_name6, llm_name7]
|
31 |
list_llm_simple = [os.path.basename(llm) for llm in list_llm]
|
32 |
|
|
|
|
|
|
|
|
|
|
|
|
|
33 |
# Load PDF document and create doc splits
|
34 |
def load_doc(list_file_path, chunk_size, chunk_overlap):
|
35 |
# Processing for one document only
|
@@ -47,6 +80,12 @@ def load_doc(list_file_path, chunk_size, chunk_overlap):
|
|
47 |
return doc_splits
|
48 |
|
49 |
|
|
|
|
|
|
|
|
|
|
|
|
|
50 |
# Create vector database
|
51 |
def create_db(splits):
|
52 |
embedding = HuggingFaceEmbeddings()
|
@@ -186,23 +225,161 @@ def upload_file(file_obj):
|
|
186 |
return list_file_path
|
187 |
|
188 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
189 |
def demo():
|
|
|
190 |
with gr.Blocks(theme="base") as demo:
|
191 |
vector_db = gr.State()
|
192 |
qa_chain = gr.State()
|
193 |
-
|
|
|
|
|
|
|
194 |
gr.Markdown(
|
195 |
"""<center><h2>PDF-based chatbot (powered by LangChain and open-source LLMs)</center></h2>
|
196 |
<h3>Ask any questions about your PDF documents, along with follow-ups</h3>
|
197 |
-
|
198 |
-
When generating answers, it takes past questions into account (via conversational memory), and includes document references for clarity purposes.</i>
|
199 |
-
<br><b>Warning:</b> This space uses the free CPU Basic hardware from Hugging Face. Some steps and LLM models used below (free inference endpoints) can take some time to generate an output.<br>
|
200 |
""")
|
201 |
with gr.Tab("Step 1 - Document pre-processing"):
|
|
|
202 |
with gr.Row():
|
203 |
document = gr.Files(height=100, file_count="multiple", file_types=["pdf"], interactive=True, label="Upload your PDF documents (single or multiple)")
|
204 |
# upload_btn = gr.UploadButton("Loading document...", height=100, file_count="multiple", file_types=["pdf"], scale=1)
|
205 |
with gr.Row():
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
206 |
db_btn = gr.Radio(["ChromaDB"], label="Vector database type", value = "ChromaDB", type="index", info="Choose your vector database")
|
207 |
with gr.Accordion("Advanced options - Document text splitter", open=False):
|
208 |
with gr.Row():
|
@@ -244,6 +421,11 @@ def demo():
|
|
244 |
|
245 |
# Preprocessing events
|
246 |
#upload_btn.upload(upload_file, inputs=[upload_btn], outputs=[document])
|
|
|
|
|
|
|
|
|
|
|
247 |
db_btn.click(initialize_database, \
|
248 |
inputs=[document, slider_chunk_size, slider_chunk_overlap], \
|
249 |
outputs=[vector_db, db_progress])
|
@@ -267,8 +449,20 @@ def demo():
|
|
267 |
inputs=None, \
|
268 |
outputs=[chatbot, doc_source1, source1_page, doc_source2, source2_page], \
|
269 |
queue=False)
|
270 |
-
demo.queue().launch(debug=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
271 |
|
272 |
|
273 |
if __name__ == "__main__":
|
|
|
274 |
demo()
|
|
|
1 |
import gradio as gr
|
2 |
import os
|
3 |
+
from dotenv import load_dotenv
|
4 |
from langchain.document_loaders import PyPDFLoader
|
5 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
6 |
from langchain.vectorstores import Chroma
|
|
|
16 |
import torch
|
17 |
import tqdm
|
18 |
import accelerate
|
19 |
+
import requests
|
20 |
+
import shutil
|
21 |
+
import os
|
22 |
+
|
23 |
+
import sys
|
24 |
+
|
25 |
+
import sys
|
26 |
+
import subprocess
|
27 |
+
|
28 |
+
|
29 |
|
30 |
|
31 |
default_persist_directory = './chroma_HF/'

# Wipe whatever was persisted by a previous run before anything else happens.
# presumably this is the Chroma persistence directory — verify against create_db.
# A missing directory is reported through the same "Error:" message and is
# otherwise ignored, so start-up continues either way.
try:
    shutil.rmtree(default_persist_directory)
except OSError as e:
    print(f"Error: {e.filename} - {e.strerror}")
else:
    print(f"Successfully deleted the directory: {default_persist_directory}")
|
46 |
+
|
47 |
+
|
48 |
+
|
49 |
+
|
50 |
llm_name1 = "mistralai/Mistral-7B-Instruct-v0.2"
|
51 |
llm_name2 = "mistralai/Mistral-7B-Instruct-v0.1"
|
52 |
llm_name3 = "meta-llama/Llama-2-7b-chat-hf"
|
|
|
57 |
list_llm = [llm_name1, llm_name2, llm_name3, llm_name4, llm_name5, llm_name6, llm_name7]
|
58 |
list_llm_simple = [os.path.basename(llm) for llm in list_llm]
|
59 |
|
60 |
+
|
61 |
+
|
62 |
+
import os
|
63 |
+
|
64 |
+
|
65 |
+
|
66 |
# Load PDF document and create doc splits
|
67 |
def load_doc(list_file_path, chunk_size, chunk_overlap):
|
68 |
# Processing for one document only
|
|
|
80 |
return doc_splits
|
81 |
|
82 |
|
83 |
+
|
84 |
+
|
85 |
+
def restart_program():
    """Re-execute the current script in place of the running process.

    The interpreter path comes from ``sys.executable`` and the arguments from
    ``sys.argv``. On success this call does not return, because the current
    process image is replaced.
    """
    interpreter = sys.executable
    # argv[0] of the new process is the interpreter itself, followed by the
    # original command-line arguments.
    os.execv(interpreter, [interpreter, *sys.argv])
|
88 |
+
|
89 |
# Create vector database
|
90 |
def create_db(splits):
|
91 |
embedding = HuggingFaceEmbeddings()
|
|
|
225 |
return list_file_path
|
226 |
|
227 |
|
228 |
+
|
229 |
+
|
230 |
+
|
231 |
+
# ... other code ...
|
232 |
+
|
233 |
+
|
234 |
+
|
235 |
+
|
236 |
+
import random
|
237 |
+
import time
|
238 |
+
|
239 |
+
def authenticate(username, password):
    """Check a username/password pair for the app's login prompt.

    This function is passed as the ``auth`` callback to ``launch``.

    The expected credentials are read from the ``APP_AUTH_USERNAME`` and
    ``APP_AUTH_PASSWORD`` environment variables at call time. When a variable
    is unset, the previous built-in value is used, so existing deployments
    keep working unchanged.

    Args:
        username: Username submitted by the user.
        password: Password submitted by the user.

    Returns:
        True when both values match the expected credentials, otherwise False.
        Non-string input is rejected with False.
    """
    import hmac

    # NOTE(review): the fallback values are still committed in source; set the
    # environment variables in any deployment that needs real protection.
    expected_username = os.environ.get("APP_AUTH_USERNAME", "fiver")
    expected_password = os.environ.get("APP_AUTH_PASSWORD", "fiver")

    if not isinstance(username, str) or not isinstance(password, str):
        return False

    # Compare as bytes: compare_digest only accepts ASCII-only str, and the
    # comparison should not short-circuit on the first differing character.
    username_ok = hmac.compare_digest(
        username.encode("utf-8"), expected_username.encode("utf-8")
    )
    password_ok = hmac.compare_digest(
        password.encode("utf-8"), expected_password.encode("utf-8")
    )
    return username_ok and password_ok
|
245 |
+
|
246 |
+
def logout(request: gr.Request):
    """Click handler for the "Logout" button.

    It only writes a message to stdout. The ``request`` argument is accepted
    but not used, and nothing here ends or invalidates the user's session.
    """
    print("logged out")
|
249 |
+
|
250 |
+
|
251 |
+
|
252 |
+
|
253 |
+
|
254 |
+
def restart():
    """Handle a reset request by clearing the persisted directory.

    Prints a notice, then removes ``default_persist_directory`` and everything
    in it. Despite the name, the process itself is not relaunched: an earlier
    attempt to re-run the script through ``subprocess.run`` is disabled.
    """
    print('Restarting script...')

    # Any failure to delete (including the directory not existing) is reported
    # on stdout and otherwise ignored.
    try:
        shutil.rmtree(default_persist_directory)
    except OSError as e:
        print(f"Error: {e.filename} - {e.strerror}")
    else:
        print(f"Successfully deleted the directory: {default_persist_directory}")
|
275 |
+
|
276 |
+
|
277 |
+
|
278 |
+
|
279 |
+
|
280 |
+
# def restart_and_clear():
|
281 |
+
# print('Restarting script and clearing cookies/session...')
|
282 |
+
|
283 |
+
# # JavaScript code to clear cookies and session
|
284 |
+
# js_code = """
|
285 |
+
# // Clear cookies
|
286 |
+
# document.cookie.split(";").forEach(function(c) {
|
287 |
+
# document.cookie = c.replace(/^\\s+/,"").replace(/=.*/, "=;expires=" + new Date().toUTCString() + ";path=/");
|
288 |
+
# });
|
289 |
+
|
290 |
+
# // Clear session storage
|
291 |
+
# window.sessionStorage.clear();
|
292 |
+
|
293 |
+
# // Clear local storage
|
294 |
+
# window.localStorage.clear();
|
295 |
+
# """
|
296 |
+
|
297 |
+
# # Display JavaScript code in Gradio interface
|
298 |
+
# return gr.Text(js_code, type="code", label="JavaScript Code")
|
299 |
+
|
300 |
+
|
301 |
+
|
302 |
+
# Small stand-alone chat UI: a chatbot pane, a text box, a clear button and a
# logout button.
# NOTE(review): a function also named `demo` is defined later in this file,
# which rebinds the name — confirm whether this sample UI is still needed.
with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    msg = gr.Textbox()
    clear = gr.ClearButton([msg, chatbot])
    logout_button = gr.Button(value="Logout")
    logout_button.click(logout)
    # Alternative considered: gr.LogoutButton()

    def respond(message, chat_history):
        """Append a canned reply to the chat history.

        Picks one of two fixed replies at random, records the
        ``(message, reply)`` pair on ``chat_history`` in place, waits two
        seconds, and returns an empty string (to clear the text box) together
        with the updated history.
        """
        canned_replies = ["How are you?", "I'm very hungry"]
        reply = random.choice(canned_replies)
        chat_history.append((message, reply))
        time.sleep(2)
        return "", chat_history

    msg.submit(respond, [msg, chatbot], [msg, chatbot])
|
316 |
+
|
317 |
+
|
318 |
+
|
319 |
+
def download_and_update_list(url):  # Function to handle download and list update
    """Download a PDF from ``url`` and return it as a one-item file list.

    The response body is written to ``downloaded_pdf.pdf`` in the current
    working directory, overwriting any previous download.

    Args:
        url: Address of the document to fetch. Surrounding whitespace is
            ignored.

    Returns:
        ``["downloaded_pdf.pdf"]`` on success. An empty list is returned when
        the URL is blank or not a string, the request fails or times out, the
        server answers with an error status, or the file cannot be written.
        The reason is printed to stdout in each case.
    """
    # Reject blank input up front instead of letting the HTTP client fail on it.
    if not isinstance(url, str) or not url.strip():
        print("Download error: no URL provided")
        return []

    # Without a timeout a stalled server would block this handler forever.
    timeout_seconds = 30

    # NOTE(review): the URL is user-supplied and fetched server-side; consider
    # restricting schemes/hosts if this app is exposed publicly.
    try:
        response = requests.get(url.strip(), timeout=timeout_seconds)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print("Download error:", e)
        return []  # Return an empty list in case of errors

    # NOTE(review): the fixed file name is shared by every caller, so
    # concurrent downloads overwrite each other.
    try:
        with open("downloaded_pdf.pdf", "wb") as f:
            f.write(response.content)
    except OSError as e:
        print("Download error:", e)
        return []

    return ["downloaded_pdf.pdf"]  # Return the path of the downloaded file
|
331 |
+
|
332 |
+
|
333 |
+
|
334 |
+
|
335 |
+
# def download_and_update_list(urls):
|
336 |
+
# filenames = []
|
337 |
+
# for url in urls:
|
338 |
+
# response = requests.get(url) # Download the PDF from the provided URL
|
339 |
+
# filename = f"downloaded_{len(filenames)+1}.pdf" # Generate unique filename
|
340 |
+
# with open(filename, "wb") as fh: # Save it to a file
|
341 |
+
# fh.write(response.content)
|
342 |
+
# filenames.append(filename)
|
343 |
+
# return filenames # Return a list of filenames
|
344 |
+
|
345 |
+
|
346 |
+
|
347 |
+
|
348 |
+
|
349 |
def demo():
|
350 |
+
load_dotenv()
|
351 |
with gr.Blocks(theme="base") as demo:
|
352 |
vector_db = gr.State()
|
353 |
qa_chain = gr.State()
|
354 |
+
logout_btn = gr.Button("RESET MODEL")
|
355 |
+
|
356 |
+
|
357 |
+
|
358 |
gr.Markdown(
|
359 |
"""<center><h2>PDF-based chatbot (powered by LangChain and open-source LLMs)</center></h2>
|
360 |
<h3>Ask any questions about your PDF documents, along with follow-ups</h3>
|
361 |
+
|
|
|
|
|
362 |
""")
|
363 |
with gr.Tab("Step 1 - Document pre-processing"):
|
364 |
+
uploaded_documents = []
|
365 |
with gr.Row():
|
366 |
document = gr.Files(height=100, file_count="multiple", file_types=["pdf"], interactive=True, label="Upload your PDF documents (single or multiple)")
|
367 |
# upload_btn = gr.UploadButton("Loading document...", height=100, file_count="multiple", file_types=["pdf"], scale=1)
|
368 |
with gr.Row():
|
369 |
+
document_url = gr.Textbox(label="Or enter a PDF document URL:") # Add URL field
|
370 |
+
|
371 |
+
download_btn = gr.Button("Download PDF") # Add download button
|
372 |
+
|
373 |
+
|
374 |
+
# ... (rest of your code)
|
375 |
+
# ... (rest of your code)
|
376 |
+
|
377 |
+
|
378 |
+
|
379 |
+
# Error handling
|
380 |
+
|
381 |
+
|
382 |
+
with gr.Row():
|
383 |
db_btn = gr.Radio(["ChromaDB"], label="Vector database type", value = "ChromaDB", type="index", info="Choose your vector database")
|
384 |
with gr.Accordion("Advanced options - Document text splitter", open=False):
|
385 |
with gr.Row():
|
|
|
421 |
|
422 |
# Preprocessing events
|
423 |
#upload_btn.upload(upload_file, inputs=[upload_btn], outputs=[document])
|
424 |
+
|
425 |
+
download_btn.click(download_and_update_list, inputs=[document_url], outputs=[document])
|
426 |
+
|
427 |
+
|
428 |
+
logout_btn.click(restart)
|
429 |
db_btn.click(initialize_database, \
|
430 |
inputs=[document, slider_chunk_size, slider_chunk_overlap], \
|
431 |
outputs=[vector_db, db_progress])
|
|
|
449 |
inputs=None, \
|
450 |
outputs=[chatbot, doc_source1, source1_page, doc_source2, source2_page], \
|
451 |
queue=False)
|
452 |
+
demo.queue().launch(auth=authenticate,debug=True)
|
453 |
+
|
454 |
+
|
455 |
+
|
456 |
+
|
457 |
+
|
458 |
+
|
459 |
+
|
460 |
+
|
461 |
+
|
462 |
+
|
463 |
+
|
464 |
|
465 |
|
466 |
if __name__ == "__main__":
|
467 |
+
|
468 |
demo()
|