alx-d committed on
Commit
106fe41
·
verified ·
1 Parent(s): d48ad9f

Upload folder using huggingface_hub

Browse files
Files changed (1) hide show
  1. advanced_rag.py +47 -3
advanced_rag.py CHANGED
@@ -30,6 +30,9 @@ from langchain.llms.base import LLM
30
  from typing import Any, Optional, List
31
  import typing
32
  import time
 
 
 
33
 
34
  print("Pydantic Version: ")
35
  print(pydantic.__version__)
@@ -366,7 +369,6 @@ default_prompt = """\
366
  {conversation_history}
367
  Use the following context to provide a detailed technical answer to the user's question.
368
  Do not include an introduction like "Based on the provided documents, ...". Just answer the question.
369
- If you don't know the answer, please respond with "I don't know".
370
 
371
  Context:
372
  {context}
@@ -375,6 +377,8 @@ User's question:
375
  {question}
376
  """
377
 
 
 
378
  def load_txt_from_url(url: str) -> Document:
379
  response = requests.get(url)
380
  if response.status_code == 200:
@@ -384,6 +388,30 @@ def load_txt_from_url(url: str) -> Document:
384
  return Document(page_content=text, metadata={"source": url})
385
  else:
386
  raise Exception(f"Failed to load {url} with status {response.status_code}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
387
 
388
  class ElevatedRagChain:
389
  def __init__(self, llm_choice: str = "Meta-Llama-3", prompt_template: str = default_prompt,
@@ -603,6 +631,7 @@ class ElevatedRagChain:
603
  self.elevated_rag_chain = base_runnable | self.rag_prompt | self.llm | format_response
604
  debug_print("Chain updated successfully with new LLM pipeline.")
605
 
 
606
  def add_pdfs_to_vectore_store(self, file_links: List[str]) -> None:
607
  debug_print(f"Processing files using {self.llm_choice}")
608
  self.raw_data = []
@@ -620,10 +649,25 @@ class ElevatedRagChain:
620
  self.raw_data.append(load_txt_from_url(link))
621
  except Exception as e:
622
  debug_print(f"Error loading TXT file {link}: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
623
  else:
624
  debug_print(f"File type not supported for URL: {link}")
625
- if not self.raw_data:
626
- raise ValueError("No files were successfully loaded. Please check the URLs and file formats.")
627
  debug_print("Files loaded successfully.")
628
  debug_print("Starting text splitting...")
629
  self.text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=100)
 
30
  from typing import Any, Optional, List
31
  import typing
32
  import time
33
+ import requests
34
+ import re
35
+
36
 
37
  print("Pydantic Version: ")
38
  print(pydantic.__version__)
 
369
  {conversation_history}
370
  Use the following context to provide a detailed technical answer to the user's question.
371
  Do not include an introduction like "Based on the provided documents, ...". Just answer the question.
 
372
 
373
  Context:
374
  {context}
 
377
  {question}
378
  """
379
 
380
+ # #If you don't know the answer, please respond with "I don't know".
381
+
382
  def load_txt_from_url(url: str) -> Document:
383
  response = requests.get(url)
384
  if response.status_code == 200:
 
388
  return Document(page_content=text, metadata={"source": url})
389
  else:
390
  raise Exception(f"Failed to load {url} with status {response.status_code}")
391
+
392
def load_txt_from_google_drive(link: str) -> Document:
    """Download a text file from a Google Drive share link.

    Two link shapes are recognised:

    * ``.../file/d/<id>/...`` -- with or without a trailing ``/view``,
      ``/edit`` or ``/preview`` segment;
    * ``...?id=<id>`` -- for example ``https://drive.google.com/open?id=<id>``.

    Args:
        link: The Google Drive share link.

    Returns:
        A Document whose ``page_content`` is the response body as text and
        whose ``metadata["source"]`` is the original ``link``.

    Raises:
        ValueError: If no file ID can be extracted from ``link`` or the
            download request does not return HTTP 200.
    """
    # Try the path form first, then fall back to the query-string form so
    # that "open?id=..." links (which callers also route here) are supported.
    file_id_match = (
        re.search(r"/d/([^/?#]+)", link)
        or re.search(r"[?&]id=([^&#]+)", link)
    )
    if not file_id_match:
        raise ValueError(f"Could not extract file ID from Google Drive link: {link}")

    file_id = file_id_match.group(1)

    # Create direct download link
    download_url = f"https://drive.google.com/uc?export=download&id={file_id}"

    # Bound the request so a stalled connection cannot hang ingestion forever.
    response = requests.get(download_url, timeout=30)
    if response.status_code != 200:
        raise ValueError(f"Failed to download file from Google Drive. Status code: {response.status_code}")

    # Keep the user-supplied link (not the rewritten download URL) as the source.
    return Document(page_content=response.text, metadata={"source": link})
415
 
416
  class ElevatedRagChain:
417
  def __init__(self, llm_choice: str = "Meta-Llama-3", prompt_template: str = default_prompt,
 
631
  self.elevated_rag_chain = base_runnable | self.rag_prompt | self.llm | format_response
632
  debug_print("Chain updated successfully with new LLM pipeline.")
633
 
634
+
635
  def add_pdfs_to_vectore_store(self, file_links: List[str]) -> None:
636
  debug_print(f"Processing files using {self.llm_choice}")
637
  self.raw_data = []
 
649
  self.raw_data.append(load_txt_from_url(link))
650
  except Exception as e:
651
  debug_print(f"Error loading TXT file {link}: {e}")
652
+ elif "drive.google.com" in link and ("file/d" in link or "open?id=" in link):
653
+ debug_print(f"Loading Google Drive file: {link}")
654
+ try:
655
+ if ".pdf" in link.lower():
656
+ # Google Drive PDF handling
657
+ file_id = re.search(r'\/d\/(.*?)\/view', link).group(1)
658
+ direct_pdf_url = f"https://drive.google.com/uc?export=download&id={file_id}"
659
+ loaded_docs = OnlinePDFLoader(direct_pdf_url).load()
660
+ if loaded_docs:
661
+ self.raw_data.append(loaded_docs[0])
662
+ else:
663
+ # Assuming it's a text file
664
+ self.raw_data.append(load_txt_from_google_drive(link))
665
+ except Exception as e:
666
+ debug_print(f"Error loading Google Drive file {link}: {e}")
667
  else:
668
  debug_print(f"File type not supported for URL: {link}")
669
+
670
+
671
  debug_print("Files loaded successfully.")
672
  debug_print("Starting text splitting...")
673
  self.text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=100)