Upload folder using huggingface_hub
Browse files- advanced_rag.py +47 -3
advanced_rag.py
CHANGED
|
@@ -30,6 +30,9 @@ from langchain.llms.base import LLM
|
|
| 30 |
from typing import Any, Optional, List
|
| 31 |
import typing
|
| 32 |
import time
|
|
|
|
|
|
|
|
|
|
| 33 |
|
| 34 |
print("Pydantic Version: ")
|
| 35 |
print(pydantic.__version__)
|
|
@@ -366,7 +369,6 @@ default_prompt = """\
|
|
| 366 |
{conversation_history}
|
| 367 |
Use the following context to provide a detailed technical answer to the user's question.
|
| 368 |
Do not include an introduction like "Based on the provided documents, ...". Just answer the question.
|
| 369 |
-
If you don't know the answer, please respond with "I don't know".
|
| 370 |
|
| 371 |
Context:
|
| 372 |
{context}
|
|
@@ -375,6 +377,8 @@ User's question:
|
|
| 375 |
{question}
|
| 376 |
"""
|
| 377 |
|
|
|
|
|
|
|
| 378 |
def load_txt_from_url(url: str) -> Document:
|
| 379 |
response = requests.get(url)
|
| 380 |
if response.status_code == 200:
|
|
@@ -384,6 +388,30 @@ def load_txt_from_url(url: str) -> Document:
|
|
| 384 |
return Document(page_content=text, metadata={"source": url})
|
| 385 |
else:
|
| 386 |
raise Exception(f"Failed to load {url} with status {response.status_code}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 387 |
|
| 388 |
class ElevatedRagChain:
|
| 389 |
def __init__(self, llm_choice: str = "Meta-Llama-3", prompt_template: str = default_prompt,
|
|
@@ -603,6 +631,7 @@ class ElevatedRagChain:
|
|
| 603 |
self.elevated_rag_chain = base_runnable | self.rag_prompt | self.llm | format_response
|
| 604 |
debug_print("Chain updated successfully with new LLM pipeline.")
|
| 605 |
|
|
|
|
| 606 |
def add_pdfs_to_vectore_store(self, file_links: List[str]) -> None:
|
| 607 |
debug_print(f"Processing files using {self.llm_choice}")
|
| 608 |
self.raw_data = []
|
|
@@ -620,10 +649,25 @@ class ElevatedRagChain:
|
|
| 620 |
self.raw_data.append(load_txt_from_url(link))
|
| 621 |
except Exception as e:
|
| 622 |
debug_print(f"Error loading TXT file {link}: {e}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 623 |
else:
|
| 624 |
debug_print(f"File type not supported for URL: {link}")
|
| 625 |
-
|
| 626 |
-
|
| 627 |
debug_print("Files loaded successfully.")
|
| 628 |
debug_print("Starting text splitting...")
|
| 629 |
self.text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=100)
|
|
|
|
| 30 |
from typing import Any, Optional, List
|
| 31 |
import typing
|
| 32 |
import time
|
| 33 |
+
import requests
|
| 34 |
+
import re
|
| 35 |
+
|
| 36 |
|
| 37 |
print("Pydantic Version: ")
|
| 38 |
print(pydantic.__version__)
|
|
|
|
| 369 |
{conversation_history}
|
| 370 |
Use the following context to provide a detailed technical answer to the user's question.
|
| 371 |
Do not include an introduction like "Based on the provided documents, ...". Just answer the question.
|
|
|
|
| 372 |
|
| 373 |
Context:
|
| 374 |
{context}
|
|
|
|
| 377 |
{question}
|
| 378 |
"""
|
| 379 |
|
| 380 |
+
# #If you don't know the answer, please respond with "I don't know".
|
| 381 |
+
|
| 382 |
def load_txt_from_url(url: str) -> Document:
|
| 383 |
response = requests.get(url)
|
| 384 |
if response.status_code == 200:
|
|
|
|
| 388 |
return Document(page_content=text, metadata={"source": url})
|
| 389 |
else:
|
| 390 |
raise Exception(f"Failed to load {url} with status {response.status_code}")
|
| 391 |
+
|
| 392 |
+
def load_txt_from_google_drive(link: str) -> Document:
    """Load a text file shared via a Google Drive link and wrap it in a Document.

    Supports both common share-link formats:
      * https://drive.google.com/file/d/<FILE_ID>/view?...
      * https://drive.google.com/open?id=<FILE_ID>

    Args:
        link: A Google Drive share URL pointing at a text file.

    Returns:
        Document with the downloaded text as ``page_content`` and the original
        share link recorded under ``metadata["source"]``.

    Raises:
        ValueError: If no file ID can be extracted from the link, or if the
            download request does not return HTTP 200.
    """
    # Try the "/d/<id>/view" path form first, then the "?id=<id>" query form.
    # The caller admits both shapes of share link ("file/d" and "open?id="),
    # so both must be handled here.
    file_id_match = (
        re.search(r'/d/([^/]+)/view', link)
        or re.search(r'[?&]id=([^&]+)', link)
    )
    if not file_id_match:
        raise ValueError(f"Could not extract file ID from Google Drive link: {link}")

    file_id = file_id_match.group(1)

    # The share page itself returns HTML; the uc?export=download endpoint
    # serves the raw file content instead.
    download_url = f"https://drive.google.com/uc?export=download&id={file_id}"

    # Request the file content.
    response = requests.get(download_url)
    if response.status_code != 200:
        raise ValueError(f"Failed to download file from Google Drive. Status code: {response.status_code}")

    # Create a Document object, keeping the original link for provenance.
    content = response.text
    metadata = {"source": link}
    return Document(page_content=content, metadata=metadata)
|
| 415 |
|
| 416 |
class ElevatedRagChain:
|
| 417 |
def __init__(self, llm_choice: str = "Meta-Llama-3", prompt_template: str = default_prompt,
|
|
|
|
| 631 |
self.elevated_rag_chain = base_runnable | self.rag_prompt | self.llm | format_response
|
| 632 |
debug_print("Chain updated successfully with new LLM pipeline.")
|
| 633 |
|
| 634 |
+
|
| 635 |
def add_pdfs_to_vectore_store(self, file_links: List[str]) -> None:
|
| 636 |
debug_print(f"Processing files using {self.llm_choice}")
|
| 637 |
self.raw_data = []
|
|
|
|
| 649 |
self.raw_data.append(load_txt_from_url(link))
|
| 650 |
except Exception as e:
|
| 651 |
debug_print(f"Error loading TXT file {link}: {e}")
|
| 652 |
+
elif "drive.google.com" in link and ("file/d" in link or "open?id=" in link):
|
| 653 |
+
debug_print(f"Loading Google Drive file: {link}")
|
| 654 |
+
try:
|
| 655 |
+
if ".pdf" in link.lower():
|
| 656 |
+
# Google Drive PDF handling
|
| 657 |
+
file_id = re.search(r'\/d\/(.*?)\/view', link).group(1)
|
| 658 |
+
direct_pdf_url = f"https://drive.google.com/uc?export=download&id={file_id}"
|
| 659 |
+
loaded_docs = OnlinePDFLoader(direct_pdf_url).load()
|
| 660 |
+
if loaded_docs:
|
| 661 |
+
self.raw_data.append(loaded_docs[0])
|
| 662 |
+
else:
|
| 663 |
+
# Assuming it's a text file
|
| 664 |
+
self.raw_data.append(load_txt_from_google_drive(link))
|
| 665 |
+
except Exception as e:
|
| 666 |
+
debug_print(f"Error loading Google Drive file {link}: {e}")
|
| 667 |
else:
|
| 668 |
debug_print(f"File type not supported for URL: {link}")
|
| 669 |
+
|
| 670 |
+
|
| 671 |
debug_print("Files loaded successfully.")
|
| 672 |
debug_print("Starting text splitting...")
|
| 673 |
self.text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=100)
|