Upload folder using huggingface_hub
Browse files- advanced_rag.py +47 -3
advanced_rag.py
CHANGED
|
@@ -30,6 +30,9 @@ from langchain.llms.base import LLM
|
|
| 30 |
from typing import Any, Optional, List
|
| 31 |
import typing
|
| 32 |
import time
|
|
|
|
|
|
|
|
|
|
| 33 |
|
| 34 |
print("Pydantic Version: ")
|
| 35 |
print(pydantic.__version__)
|
|
@@ -366,7 +369,6 @@ default_prompt = """\
|
|
| 366 |
{conversation_history}
|
| 367 |
Use the following context to provide a detailed technical answer to the user's question.
|
| 368 |
Do not include an introduction like "Based on the provided documents, ...". Just answer the question.
|
| 369 |
-
If you don't know the answer, please respond with "I don't know".
|
| 370 |
|
| 371 |
Context:
|
| 372 |
{context}
|
|
@@ -375,6 +377,8 @@ User's question:
|
|
| 375 |
{question}
|
| 376 |
"""
|
| 377 |
|
|
|
|
|
|
|
| 378 |
def load_txt_from_url(url: str) -> Document:
|
| 379 |
response = requests.get(url)
|
| 380 |
if response.status_code == 200:
|
|
@@ -384,6 +388,30 @@ def load_txt_from_url(url: str) -> Document:
|
|
| 384 |
return Document(page_content=text, metadata={"source": url})
|
| 385 |
else:
|
| 386 |
raise Exception(f"Failed to load {url} with status {response.status_code}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 387 |
|
| 388 |
class ElevatedRagChain:
|
| 389 |
def __init__(self, llm_choice: str = "Meta-Llama-3", prompt_template: str = default_prompt,
|
|
@@ -603,6 +631,7 @@ class ElevatedRagChain:
|
|
| 603 |
self.elevated_rag_chain = base_runnable | self.rag_prompt | self.llm | format_response
|
| 604 |
debug_print("Chain updated successfully with new LLM pipeline.")
|
| 605 |
|
|
|
|
| 606 |
def add_pdfs_to_vectore_store(self, file_links: List[str]) -> None:
|
| 607 |
debug_print(f"Processing files using {self.llm_choice}")
|
| 608 |
self.raw_data = []
|
|
@@ -620,10 +649,25 @@ class ElevatedRagChain:
|
|
| 620 |
self.raw_data.append(load_txt_from_url(link))
|
| 621 |
except Exception as e:
|
| 622 |
debug_print(f"Error loading TXT file {link}: {e}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 623 |
else:
|
| 624 |
debug_print(f"File type not supported for URL: {link}")
|
| 625 |
-
|
| 626 |
-
|
| 627 |
debug_print("Files loaded successfully.")
|
| 628 |
debug_print("Starting text splitting...")
|
| 629 |
self.text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=100)
|
|
|
|
| 30 |
from typing import Any, Optional, List
|
| 31 |
import typing
|
| 32 |
import time
|
| 33 |
+
import requests
|
| 34 |
+
import re
|
| 35 |
+
|
| 36 |
|
| 37 |
print("Pydantic Version: ")
|
| 38 |
print(pydantic.__version__)
|
|
|
|
| 369 |
{conversation_history}
|
| 370 |
Use the following context to provide a detailed technical answer to the user's question.
|
| 371 |
Do not include an introduction like "Based on the provided documents, ...". Just answer the question.
|
|
|
|
| 372 |
|
| 373 |
Context:
|
| 374 |
{context}
|
|
|
|
| 377 |
{question}
|
| 378 |
"""
|
| 379 |
|
| 380 |
+
# #If you don't know the answer, please respond with "I don't know".
|
| 381 |
+
|
| 382 |
def load_txt_from_url(url: str) -> Document:
|
| 383 |
response = requests.get(url)
|
| 384 |
if response.status_code == 200:
|
|
|
|
| 388 |
return Document(page_content=text, metadata={"source": url})
|
| 389 |
else:
|
| 390 |
raise Exception(f"Failed to load {url} with status {response.status_code}")
|
| 391 |
+
|
| 392 |
+
def load_txt_from_google_drive(link: str) -> Document:
    """Load a text file shared via a Google Drive link and wrap it in a Document.

    Supports both common share-link formats:
      * https://drive.google.com/file/d/<FILE_ID>/view?...
      * https://drive.google.com/open?id=<FILE_ID>

    Args:
        link: A Google Drive share URL pointing at a text file.

    Returns:
        Document with the downloaded text as ``page_content`` and the original
        share link recorded under ``metadata["source"]``.

    Raises:
        ValueError: If no file ID can be extracted from the link, or if the
            download request does not return HTTP 200.
    """
    # Try the "/d/<id>/view" path form first, then the "?id=<id>" query form.
    # The caller admits both shapes of share link ("file/d" and "open?id="),
    # so both must be handled here.
    file_id_match = (
        re.search(r'/d/([^/]+)/view', link)
        or re.search(r'[?&]id=([^&]+)', link)
    )
    if not file_id_match:
        raise ValueError(f"Could not extract file ID from Google Drive link: {link}")

    file_id = file_id_match.group(1)

    # The share page itself returns HTML; the uc?export=download endpoint
    # serves the raw file content instead.
    download_url = f"https://drive.google.com/uc?export=download&id={file_id}"

    # Request the file content.
    response = requests.get(download_url)
    if response.status_code != 200:
        raise ValueError(f"Failed to download file from Google Drive. Status code: {response.status_code}")

    # Create a Document object, keeping the original link for provenance.
    content = response.text
    metadata = {"source": link}
    return Document(page_content=content, metadata=metadata)
|
| 415 |
|
| 416 |
class ElevatedRagChain:
|
| 417 |
def __init__(self, llm_choice: str = "Meta-Llama-3", prompt_template: str = default_prompt,
|
|
|
|
| 631 |
self.elevated_rag_chain = base_runnable | self.rag_prompt | self.llm | format_response
|
| 632 |
debug_print("Chain updated successfully with new LLM pipeline.")
|
| 633 |
|
| 634 |
+
|
| 635 |
def add_pdfs_to_vectore_store(self, file_links: List[str]) -> None:
|
| 636 |
debug_print(f"Processing files using {self.llm_choice}")
|
| 637 |
self.raw_data = []
|
|
|
|
| 649 |
self.raw_data.append(load_txt_from_url(link))
|
| 650 |
except Exception as e:
|
| 651 |
debug_print(f"Error loading TXT file {link}: {e}")
|
| 652 |
+
elif "drive.google.com" in link and ("file/d" in link or "open?id=" in link):
|
| 653 |
+
debug_print(f"Loading Google Drive file: {link}")
|
| 654 |
+
try:
|
| 655 |
+
if ".pdf" in link.lower():
|
| 656 |
+
# Google Drive PDF handling
|
| 657 |
+
file_id = re.search(r'\/d\/(.*?)\/view', link).group(1)
|
| 658 |
+
direct_pdf_url = f"https://drive.google.com/uc?export=download&id={file_id}"
|
| 659 |
+
loaded_docs = OnlinePDFLoader(direct_pdf_url).load()
|
| 660 |
+
if loaded_docs:
|
| 661 |
+
self.raw_data.append(loaded_docs[0])
|
| 662 |
+
else:
|
| 663 |
+
# Assuming it's a text file
|
| 664 |
+
self.raw_data.append(load_txt_from_google_drive(link))
|
| 665 |
+
except Exception as e:
|
| 666 |
+
debug_print(f"Error loading Google Drive file {link}: {e}")
|
| 667 |
else:
|
| 668 |
debug_print(f"File type not supported for URL: {link}")
|
| 669 |
+
|
| 670 |
+
|
| 671 |
debug_print("Files loaded successfully.")
|
| 672 |
debug_print("Starting text splitting...")
|
| 673 |
self.text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=100)
|