Asaad Almutareb committed on
Commit fb95c43 · 1 Parent(s): 7acac3e

added ReAct agent with its tools

added a wget-based website scraper as an alternative to the recursive URL tool
fixed some LangChain deprecation warnings

.devcontainer/Dockerfile CHANGED
@@ -23,7 +23,10 @@ RUN DEBIAN_FRONTEND=noninteractive \
     libxml2-dev \
     libxmlsec1-dev \
     libffi-dev \
-    liblzma-dev
+    liblzma-dev \
+    ffmpeg \
+    libsm6 \
+    libxext6
 
 # Python and poetry installation
 USER $USER
.gitignore CHANGED
@@ -161,3 +161,6 @@ cython_debug/
 *.zip
 *.pkl
 *.faiss
+*.sqlite3
+*.bin
+*.pickle
README.md CHANGED
@@ -1,33 +1,21 @@
-# Anatomy of Advanced Enterprise Rag Systems
-
-This repository accompanies the blog series "The Anatomy of Advanced Enterprise Rag Systems" and provides a hands-on learning experience for building sophisticated Rag systems. Dive deep into each component, from setup and evaluation to security and multi-agent interactions.
-
-Explore these key topics:
-
-- Test Setup and Evaluation Metrics: Learn how to assess the performance and effectiveness of your Rag system.
-- Data Preparation and Management: Discover techniques for organizing and optimizing your knowledge base.
-- User Input Processing: Understand how to handle diverse user queries and extract relevant information.
-- Retrieval System: Unleash the power of retrieving relevant passages from your knowledge base.
-- Information Processing and Generation: Craft accurate and informative responses using state-of-the-art techniques.
-- Feedback and Continuous Improvement: Enhance your Rag system over time using user feedback and data analysis.
-- Multi-agents and Agent-services: Explore advanced architectures for distributed and collaborative Rag systems.
-- Monitoring and Security: Ensure the robustness and trustworthiness of your Rag system with proper monitoring and security practices.
-
-What you'll find here:
-
-- Code examples: Implementations of key concepts from each topic, ready to use and adapt.
-- Data samples: Pre-prepared data sets for experimentation and testing.
-- Additional resources: Links to relevant articles, libraries, and tools to deepen your understanding.
-
-Getting started:
-
-- Clone this repository: git clone https://github.com/<username>/advanced-enterprise-rag-systems.git
-- Follow the instructions in each topic directory.
-
-Contributing:
-
-We welcome your contributions! Share your expertise, improve existing code examples, or add new ones. Submit a pull request to share your valuable additions.
-
-License:
-
-This project is licensed under the MIT License: LICENSE.
+# Insurance Advisor Agent
+
+1. Improve Vectorstore search
+   Isayah:
+   - multi index (chunks+summaries)
+   - metadata filtering
+   Karan:
+   - re-ranking
+   - hybrid search (bm25+vectorstore)
+
+
+   - HyDE/HyQE
+2. Use ReAct agent instead of chain [DONE]
+   - add tools (Google search engine) [DONE]
+   - query re-writing
+   - ...
+3. Use Corrective RAG
+4. Workflow for the agents
+   - determine funnel stage of customer
+   - different prompts per step
+   - follow a "script"
 
app_gui.py CHANGED
@@ -1,5 +1,6 @@
 # Import Gradio for UI, along with other necessary libraries
 import gradio as gr
+from rag_app.react_agent import agent_executor
 # need to import the qa!
 
 # Function to add a new input to the chat history
@@ -12,20 +13,27 @@ def add_text(history, text):
 def bot(history):
     # Obtain the response from the 'infer' function using the latest input
     response = infer(history[-1][0], history)
-    sources = [doc.metadata.get("source") for doc in response['source_documents']]
-    src_list = '\n'.join(sources)
-    print_this = response['result'] + "\n\n\n Sources: \n\n\n" + src_list
+    #sources = [doc.metadata.get("source") for doc in response['source_documents']]
+    #src_list = '\n'.join(sources)
+    #print_this = response['result'] + "\n\n\n Sources: \n\n\n" + src_list
 
 
-    history[-1][1] = print_this #response['answer']
+    #history[-1][1] = print_this #response['answer']
     # Update the history with the bot's response
-    #history[-1][1] = response['result']
+    print(*response)
+    history[-1][1] = response['output']
     return history
 
 # Function to infer the response using the RAG model
 def infer(question, history):
     # Use the question and history to query the RAG model
-    result = qa({"query": question, "history": history, "question": question})
+    #result = qa({"query": question, "history": history, "question": question})
+    result = agent_executor.invoke(
+        {
+            "input": question,
+            "chat_history": history
+        }
+    )
     return result
 
 # CSS styling for the Gradio interface
core-langchain-rag.py CHANGED
@@ -274,7 +274,7 @@ def infer(question, history):
     # Use the question and history to query the RAG model
     #result = generate_qa_retriever(history, question)
     result = qa({"query": question, "history": history, "question": question})
-    print(*result)
+    #print(*result)
     return result
 
 # CSS styling for the Gradio interface
example.env CHANGED
@@ -10,7 +10,15 @@ CHROMA_VS_NAME=""
 FAISS_INDEX_PATH = "./vectorstore/faiss-insurance-agent-500"
 CHROMA_DIRECTORY = "./vectorstore/chroma-insurance-agent-500"
 
+# for chromadb
+VECTOR_DATABASE_LOCATION="./vectorstore/chroma-insurance-agent-1500"
+
+# Name for the Conversation Memory Collection
+CONVERSATION_COLLECTION_NAME="ConversationMemory"
+
 # llm and embedding models
-EMBEDDING_MODEL="sentence-transformers/multi-qa-mpnet-base-dot-v1"
+#EMBEDDING_MODEL="sentence-transformers/multi-qa-mpnet-base-dot-v1"
+EMBEDDING_MODEL="microsoft/Multilingual-MiniLM-L12-H384"
 LLM_MODEL="mistralai/Mixtral-8x7B-Instruct-v0.1"
-LLM_MODEL_ARGS=
+LLM_MODEL_ARGS=
+
rag_app/{handle_vector_store.py → build_vector_store.py} RENAMED
@@ -1,7 +1,7 @@
 # vectorization functions
 from langchain_community.vectorstores import FAISS
 from langchain_community.vectorstores import Chroma
-from langchain_community.document_loaders import ReadTheDocsLoader
+from langchain_community.document_loaders import DirectoryLoader
 from langchain_text_splitters import RecursiveCharacterTextSplitter
 from langchain_huggingface import HuggingFaceEmbeddings
 from langchain_community.retrievers import BM25Retriever
rag_app/database/db_handler.py ADDED
@@ -0,0 +1,114 @@
+from sqlmodel import SQLModel, create_engine, Session, select
+from rag_app.database.schema import Sources
+from rag_app.utils.logger import get_console_logger
+import os
+from dotenv import load_dotenv
+
+load_dotenv()
+
+sqlite_file_name = os.getenv('SOURCES_CACHE')
+
+sqlite_url = f"sqlite:///{sqlite_file_name}"
+engine = create_engine(sqlite_url, echo=False)
+
+logger = get_console_logger("db_handler")
+
+SQLModel.metadata.create_all(engine)
+
+
+def read_one(hash_id: str):
+    with Session(engine) as session:
+        statement = select(Sources).where(Sources.hash_id == hash_id)
+        sources = session.exec(statement).first()
+        return sources
+
+
+def add_one(data: dict):
+    with Session(engine) as session:
+        if session.exec(
+            select(Sources).where(Sources.hash_id == data.get("hash_id"))
+        ).first():
+            logger.warning(f"Item with hash_id {data.get('hash_id')} already exists")
+            return None  # or raise an exception, or handle as needed
+        sources = Sources(**data)
+        session.add(sources)
+        session.commit()
+        session.refresh(sources)
+        logger.info(f"Item with hash_id {data.get('hash_id')} added to the database")
+        return sources
+
+
+def update_one(hash_id: str, data: dict):
+    with Session(engine) as session:
+        # Check if the item with the given hash_id exists
+        sources = session.exec(
+            select(Sources).where(Sources.hash_id == hash_id)
+        ).first()
+        if not sources:
+            logger.warning(f"No item with hash_id {hash_id} found for update")
+            return None  # or raise an exception, or handle as needed
+        for key, value in data.items():
+            setattr(sources, key, value)
+        session.commit()
+        logger.info(f"Item with hash_id {hash_id} updated in the database")
+        return sources
+
+
+def delete_one(id: str):
+    with Session(engine) as session:
+        # Check if the item with the given hash_id exists
+        sources = session.exec(
+            select(Sources).where(Sources.hash_id == id)
+        ).first()
+        if not sources:
+            logger.warning(f"No item with hash_id {id} found for deletion")
+            return None  # or raise an exception, or handle as needed
+        session.delete(sources)
+        session.commit()
+        logger.info(f"Item with hash_id {id} deleted from the database")
+
+
+def add_many(data: list):
+    with Session(engine) as session:
+        for info in data:
+            # Reuse add_one function for each item
+            result = add_one(info)
+            if result is None:
+                logger.warning(
+                    f"Item with hash_id {info.get('hash_id')} could not be added"
+                )
+            else:
+                logger.info(
+                    f"Item with hash_id {info.get('hash_id')} added to the database"
+                )
+        session.commit()  # Commit at the end of the loop
+
+
+def delete_many(ids: list):
+    with Session(engine) as session:
+        for id in ids:
+            # Reuse delete_one function for each item
+            result = delete_one(id)
+            if result is None:
+                logger.warning(f"No item with hash_id {id} found for deletion")
+            else:
+                logger.info(f"Item with hash_id {id} deleted from the database")
+        session.commit()  # Commit at the end of the loop
+
+
+def read_all(query: dict = None):
+    with Session(engine) as session:
+        statement = select(Sources)
+        if query:
+            statement = statement.where(
+                *[getattr(Sources, key) == value for key, value in query.items()]
+            )
+        sources = session.exec(statement).all()
+        return sources
+
+
+def delete_all():
+    with Session(engine) as session:
+        session.query(Sources).delete()  # bulk delete; exec(Sources) is not a valid statement
+        session.commit()
+        logger.info("All items deleted from the database")
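
A small usage sketch for the new db_handler helpers, assuming SOURCES_CACHE points at a writable SQLite file; the sample record is illustrative and reuses hash_text from rag_app/utils/utils.py:

```python
from rag_app.database.db_handler import add_one, read_one, update_one
from rag_app.utils.utils import hash_text

url = "https://www.example.com/insurance"  # illustrative URL
record = {
    "url": url,
    "title": "Example insurance page",
    "hash_id": hash_text(url),  # md5 of the URL, as in utils.hash_text
    "summary": "",
}
added = add_one(record)                         # returns None if the hash_id already exists
fetched = read_one(hash_text(url))              # look the row up again by its hash
update_one(hash_text(url), {"embedded": True})  # mark it as embedded
```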
rag_app/database/schema.py ADDED
@@ -0,0 +1,15 @@
+from sqlmodel import SQLModel, Field
+from typing import Optional
+
+import datetime
+
+class Sources(SQLModel, table=True):
+    id: Optional[int] = Field(default=None, primary_key=True)
+    url: str = Field()
+    title: Optional[str] = Field(default="NA", unique=False)
+    hash_id: str = Field(unique=True)
+    created_at: float = Field(default=datetime.datetime.now().timestamp())
+    summary: str = Field(default="")
+    embedded: bool = Field(default=False)
+
+    __table_args__ = {"extend_existing": True}
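
One thing to watch in this model: `default=datetime.datetime.now().timestamp()` is evaluated once at import time, so every row created in the same process gets the same created_at value. If per-row timestamps are wanted, a default_factory avoids that; the sketch below is a hedged alternative, not part of the commit:

```python
import datetime
from typing import Optional
from sqlmodel import SQLModel, Field

class SourcesExample(SQLModel, table=True):  # illustrative variant, not the committed model
    id: Optional[int] = Field(default=None, primary_key=True)
    url: str
    hash_id: str = Field(unique=True)
    # default_factory is evaluated per row, not once at class-definition time
    created_at: float = Field(
        default_factory=lambda: datetime.datetime.now().timestamp()
    )
```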
rag_app/get_db_retriever.py CHANGED
@@ -2,9 +2,9 @@
 
 # HF libraries
 from langchain.llms import HuggingFaceHub
-from langchain.embeddings import HuggingFaceHubEmbeddings
+from langchain_huggingface import HuggingFaceHubEmbeddings
 # vectorestore
-from langchain.vectorstores import FAISS
+from langchain_community.vectorstores import FAISS
 # retrieval chain
 from langchain.chains import RetrievalQA
 # prompt template
rag_app/{load_data_from_urls.py → loading_data/load_urls_recurisvely.py} RENAMED
File without changes
rag_app/loading_data/scrap_website.py ADDED
@@ -0,0 +1,36 @@
+# scrape a given url recursively
+
+import subprocess
+import os
+from urllib.parse import urlparse
+from langchain_community.document_loaders import DirectoryLoader
+
+def runcmd(cmd, verbose = False, *args, **kwargs):
+
+    process = subprocess.Popen(
+        cmd,
+        stdout = subprocess.PIPE,
+        stderr = subprocess.PIPE,
+        text = True,
+        shell = True
+    )
+    std_out, std_err = process.communicate()
+    if verbose:
+        print(std_out.strip(), std_err)
+        pass
+    return process.returncode
+
+def scrap_website(target_url:str, depth:int=5):
+    target_domain = urlparse(target_url).netloc
+    target_directory = './downloads/'
+    # To download the files locally for processing, here's the command line
+    command_this = f'wget -e robots=off --recursive -l {depth} --no-clobber --page-requisites --html-extension \
+        --convert-links --restrict-file-names=windows --force-directories --directory-prefix={target_directory} \
+        --domains {target_domain} --no-parent {target_url}'
+    cmd_status = runcmd(command_this, verbose=True)
+    if cmd_status==0:
+        documents_path = os.path.dirname(os.path.realpath(f'{target_directory}/{target_domain}'))
+        loader = DirectoryLoader(documents_path, silent_errors=True, show_progress=True)
+        docs = loader.load()
+        return docs
+    return []
rag_app/react_agent.py CHANGED
@@ -0,0 +1,77 @@
+# HF libraries
+from langchain_huggingface import HuggingFaceEndpoint
+from langchain.agents import AgentExecutor
+from langchain.agents.format_scratchpad import format_log_to_str
+from langchain.agents.output_parsers import ReActJsonSingleInputOutputParser
+# Import things that are needed generically
+from langchain.tools.render import render_text_description
+import os
+from dotenv import load_dotenv
+from rag_app.structured_tools.structured_tools import (
+    google_search, knowledgeBase_search
+)
+
+from langchain.prompts import PromptTemplate
+from rag_app.templates.react_json_with_memory import template_system
+# from innovation_pathfinder_ai.utils import logger
+# from langchain.globals import set_llm_cache
+# from langchain.cache import SQLiteCache
+
+# set_llm_cache(SQLiteCache(database_path=".cache.db"))
+# logger = logger.get_console_logger("hf_mixtral_agent")
+
+config = load_dotenv(".env")
+HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN')
+GOOGLE_CSE_ID = os.getenv('GOOGLE_CSE_ID')
+GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY')
+# LANGCHAIN_TRACING_V2 = "true"
+# LANGCHAIN_ENDPOINT = "https://api.smith.langchain.com"
+# LANGCHAIN_API_KEY = os.getenv('LANGCHAIN_API_KEY')
+# LANGCHAIN_PROJECT = os.getenv('LANGCHAIN_PROJECT')
+
+# Load the model from the Hugging Face Hub
+llm = HuggingFaceEndpoint(repo_id="mistralai/Mixtral-8x7B-Instruct-v0.1",
+                          temperature=0.1,
+                          max_new_tokens=1024,
+                          repetition_penalty=1.2,
+                          return_full_text=False
+                          )
+
+
+tools = [
+    knowledgeBase_search,
+    google_search,
+]
+
+prompt = PromptTemplate.from_template(
+    template=template_system
+)
+prompt = prompt.partial(
+    tools=render_text_description(tools),
+    tool_names=", ".join([t.name for t in tools]),
+)
+
+
+# define the agent
+chat_model_with_stop = llm.bind(stop=["\nObservation"])
+agent = (
+    {
+        "input": lambda x: x["input"],
+        "agent_scratchpad": lambda x: format_log_to_str(x["intermediate_steps"]),
+        "chat_history": lambda x: x["chat_history"],
+    }
+    | prompt
+    | chat_model_with_stop
+    | ReActJsonSingleInputOutputParser()
+)
+
+# instantiate AgentExecutor
+agent_executor = AgentExecutor(
+    agent=agent,
+    tools=tools,
+    verbose=True,
+    max_iterations=10,       # cap the number of iterations
+    #max_execution_time=60,  # timeout at 60 sec
+    return_intermediate_steps=True,
+    handle_parsing_errors=True,
+)
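
Because the executor is built with `return_intermediate_steps=True`, `invoke` returns the final answer alongside the tool trace. A minimal smoke test, assuming the `.env` variables above (HF token, Google keys, vector store path, embedding model) are set; the question itself is only an example:

```python
from rag_app.react_agent import agent_executor

result = agent_executor.invoke(
    {
        "input": "Which insurance products cover bicycle theft?",  # illustrative question
        "chat_history": "",  # the prompt template interpolates this as plain text
    }
)
print(result["output"])                    # final answer string
for action, observation in result["intermediate_steps"]:
    print(action.tool, action.tool_input)  # which tool the agent called, and with what
```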
rag_app/structured_tools/__init__.py ADDED
File without changes
rag_app/structured_tools/structured_tools.py ADDED
@@ -0,0 +1,98 @@
+from langchain.tools import BaseTool, StructuredTool, tool
+from langchain_community.tools import WikipediaQueryRun
+from langchain_community.utilities import WikipediaAPIWrapper
+#from langchain.tools import Tool
+from langchain_google_community import GoogleSearchAPIWrapper
+from langchain_community.embeddings.sentence_transformer import (
+    SentenceTransformerEmbeddings,
+)
+from langchain_community.vectorstores import Chroma
+import ast
+
+import chromadb
+
+from rag_app.utils.utils import (
+    parse_list_to_dicts, format_search_results
+)
+from rag_app.database.db_handler import (
+    add_many
+)
+
+import os
+# from innovation_pathfinder_ai.utils import create_wikipedia_urls_from_text
+
+persist_directory = os.getenv('VECTOR_DATABASE_LOCATION')
+
+@tool
+def memory_search(query:str) -> str:
+    """Search the memory vector store for existing knowledge and relevant previous research. \
+    This is your primary source: check what you have already learned in the past before going online."""
+    # Since we have more than one collection we should change the name of this tool
+    client = chromadb.PersistentClient(
+        path=persist_directory,
+    )
+
+    collection_name = os.getenv('CONVERSATION_COLLECTION_NAME')
+    #store using envar
+
+    embedding_function = SentenceTransformerEmbeddings(
+        model_name=os.getenv("EMBEDDING_MODEL"),
+    )
+
+    vector_db = Chroma(
+        client=client,  # client for Chroma
+        collection_name=collection_name,
+        embedding_function=embedding_function,
+    )
+
+    retriever = vector_db.as_retriever()
+    docs = retriever.invoke(query)
+
+    return docs.__str__()
+
+@tool
+def knowledgeBase_search(query:str) -> str:
+    """Search the internal knowledge base for relevant documents and chunks"""
+    # Since we have more than one collection we should change the name of this tool
+    client = chromadb.PersistentClient(
+        path=persist_directory,
+    )
+
+    #collection_name="ArxivPapers"
+    #store using envar
+
+    embedding_function = SentenceTransformerEmbeddings(
+        model_name=os.getenv("EMBEDDING_MODEL"),
+    )
+
+    vector_db = Chroma(
+        client=client,  # client for Chroma
+        #collection_name=collection_name,
+        embedding_function=embedding_function,
+    )
+
+    retriever = vector_db.as_retriever()
+    # This is deprecated, changed to invoke
+    # LangChainDeprecationWarning: The method `BaseRetriever.get_relevant_documents` was deprecated in langchain-core 0.1.46 and will be removed in 0.3.0. Use invoke instead.
+    docs = retriever.invoke(query)
+    for doc in docs:
+        print(doc)
+
+    return docs.__str__()
+
+@tool
+def google_search(query: str) -> str:
+    """Search Google for additional results when you can't answer the question using the internal knowledge base."""
+    global all_sources
+
+    websearch = GoogleSearchAPIWrapper()
+    search_results:dict = websearch.results(query, 3)
+    print(search_results)
+    if len(search_results)>1:
+        cleaner_sources = format_search_results(search_results)
+        parsed_csources = parse_list_to_dicts(cleaner_sources)
+        add_many(parsed_csources)
+    else:
+        cleaner_sources = search_results
+
+    return cleaner_sources.__str__()
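
The `@tool`-decorated functions are ordinary LangChain tools, so they can be exercised outside the agent as well; a quick check, assuming VECTOR_DATABASE_LOCATION, EMBEDDING_MODEL, GOOGLE_API_KEY, and GOOGLE_CSE_ID are set in the environment (the queries are illustrative):

```python
from rag_app.structured_tools.structured_tools import knowledgeBase_search, google_search

# Hits the local Chroma store and returns the retrieved documents as a string
print(knowledgeBase_search.invoke("household insurance deductible"))

# Falls back to Google CSE and caches the returned sources via add_many()
print(google_search.invoke("Württembergische household insurance"))
```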
rag_app/templates/__init__.py ADDED
File without changes
rag_app/templates/react_json_with_memory.py ADDED
@@ -0,0 +1,47 @@
+template_system = """
+Answer the following questions as best you can. You have access to the following tools:
+
+<TOOLS>
+{tools}
+</TOOLS>
+
+The way you use the tools is by specifying a json blob.
+Specifically, this json should have an `action` key (with the name of the tool to use) and an `action_input` key (with the input to the tool going here).
+
+The only values that should be in the "action" field are: {tool_names}
+
+The $JSON_BLOB should only contain a SINGLE action, do NOT return a list of multiple actions. Here is an example of a valid $JSON_BLOB:
+
+```
+{{
+    "action": $TOOL_NAME,
+    "action_input": $INPUT
+}}
+```
+
+ALWAYS use the following format:
+
+Question: the input question you must answer
+Thought: you should always think about what to do
+Action:
+```
+$JSON_BLOB
+```
+Observation: the result of the action
+... (this Thought/Action/Observation can repeat N times)
+Thought: I now know the final answer
+Final Answer: the final answer to the original input question
+
+Begin! Reminder to always use the exact characters `Final Answer` when responding.
+
+Previous conversation history:
+<CONVERSATION_HISTORY>
+{chat_history}
+</CONVERSATION_HISTORY>
+
+<NEW_INPUT>
+{input}
+</NEW_INPUT>
+
+{agent_scratchpad}
+"""
rag_app/utils/__init__.py ADDED
File without changes
rag_app/utils/logger.py ADDED
@@ -0,0 +1,20 @@
+# logger.py
+
+import logging
+from rich.logging import RichHandler
+from typing import Optional
+
+
+def get_console_logger(name: Optional[str] = "default") -> logging.Logger:
+    logger = logging.getLogger(name)
+    if not logger.handlers:
+        logger.setLevel(logging.DEBUG)
+        console_handler = RichHandler()
+        console_handler.setLevel(logging.DEBUG)
+        formatter = logging.Formatter(
+            "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+        )
+        console_handler.setFormatter(formatter)
+        logger.addHandler(console_handler)
+
+    return logger
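
Usage is straightforward; a short sketch (the logger name is arbitrary):

```python
from rag_app.utils.logger import get_console_logger

logger = get_console_logger("example")  # "example" is illustrative; db_handler uses "db_handler"
logger.info("vector store loaded")
logger.warning("no documents matched the query")
```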
rag_app/utils/utils.py ADDED
@@ -0,0 +1,115 @@
+import hashlib
+import datetime
+import os
+import uuid
+
+# from rag_app.utils import logger
+
+# logger = logger.get_console_logger("utils")
+
+
+
+def extract_urls(data_list):
+    """
+    Extracts URLs from a list of dictionaries.
+
+    Parameters:
+    - data_list (list): A list of dictionaries, each containing 'Title:', 'link:', and 'summary:'.
+
+    Returns:
+    - list: A list of URLs extracted from the dictionaries.
+    """
+    urls = []
+    print(data_list)
+    for item in data_list:
+        try:
+            # Find the start and end indices of the URL
+            lower_case = item.lower()
+            link_prefix = 'link: '
+            summary_prefix = ', summary:'
+            start_idx = lower_case.index(link_prefix) + len(link_prefix)
+            end_idx = lower_case.index(summary_prefix, start_idx)
+            # Extract the URL using the indices found
+            url = item[start_idx:end_idx]
+            urls.append(url)
+        except ValueError:
+            # Handles the case where 'link: ' or ', summary:' is not found in the string
+            print("Could not find a URL in the item:", item)
+    last_sources = urls[-3:]
+    return last_sources
+
+def format_search_results(search_results):
+    """
+    Formats a list of dictionaries containing search results into a list of strings.
+    Each dictionary is expected to have the keys 'title', 'link', and 'snippet'.
+
+    Parameters:
+    - search_results (list): A list of dictionaries, each containing 'title', 'link', and 'snippet'.
+
+    Returns:
+    - list: A list of formatted strings based on the search results.
+    """
+    if len(search_results)>1:
+        formatted_results = [
+            "Title: {title}, Link: {link}, Summary: {snippet}".format(**i)
+            for i in search_results
+        ]
+        return formatted_results
+
+def parse_list_to_dicts(items: list) -> list:
+    parsed_items = []
+    for item in items:
+        # Extract title, link, and summary from each string
+        title_start = item.find('Title: ') + len('Title: ')
+        link_start = item.find('Link: ') + len('Link: ')
+        summary_start = item.find('Summary: ') + len('Summary: ')
+
+        title_end = item.find(', Link: ')
+        link_end = item.find(', Summary: ')
+        summary_end = len(item)
+
+        title = item[title_start:title_end]
+        link = item[link_start:link_end]
+        summary = item[summary_start:summary_end]
+
+        # Use the hash_text function for the hash_id
+        hash_id = hash_text(link)
+
+        # Construct the dictionary for each item
+        parsed_item = {
+            "url": link,
+            "title": title,
+            "hash_id": hash_id,
+            "summary": summary
+        }
+        parsed_items.append(parsed_item)
+    return parsed_items
+
+def hash_text(text: str) -> str:
+    return hashlib.md5(text.encode()).hexdigest()
+
+
+def convert_timestamp_to_datetime(timestamp: str) -> str:
+    return datetime.datetime.fromtimestamp(int(timestamp)).strftime("%Y-%m-%d %H:%M:%S")
+
+def create_folder_if_not_exists(folder_path: str) -> None:
+    """
+    Create a folder if it doesn't already exist.
+
+    Args:
+    - folder_path (str): The path of the folder to create.
+    """
+    if not os.path.exists(folder_path):
+        os.makedirs(folder_path)
+        print(f"Folder '{folder_path}' created.")
+    else:
+        print(f"Folder '{folder_path}' already exists.")
+
+def generate_uuid() -> str:
+    """
+    Generate a UUID (Universally Unique Identifier) and return it as a string.
+
+    Returns:
+        str: A UUID string.
+    """
+    return str(uuid.uuid4())
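
The two search-result helpers are meant to be chained, which is how google_search in structured_tools.py uses them; a small roundtrip with made-up results:

```python
from rag_app.utils.utils import format_search_results, parse_list_to_dicts

# Shape returned by GoogleSearchAPIWrapper.results(); the values here are made up
raw_results = [
    {"title": "Insurance FAQ", "link": "https://www.example.com/faq", "snippet": "Answers to common questions."},
    {"title": "Contact", "link": "https://www.example.com/contact", "snippet": "How to reach an advisor."},
]

formatted = format_search_results(raw_results)  # ["Title: ..., Link: ..., Summary: ...", ...]
records = parse_list_to_dicts(formatted)        # [{"url": ..., "title": ..., "hash_id": ..., "summary": ...}, ...]
print(records[0]["hash_id"])                    # md5 of the link, via hash_text
```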
requirements.txt CHANGED
@@ -2,12 +2,16 @@
 langchain-community
 langchain-HuggingFace
 langchain-text-splitters
+langchain_google_community
 beautifulsoup4
 faiss-cpu
 chromadb
 validators
 sentence_transformers
 typing-extensions
-unstructured
+unstructured
+unstructured[pdf]
 gradio
-boto3
+boto3
+rich
+sqlmodel
test_this.py CHANGED
@@ -1,23 +1,27 @@
-from rag_app.load_data_from_urls import load_docs_from_urls
+from rag_app.loading_data.load_urls_recurisvely import load_docs_from_urls
 from rag_app.create_embedding import create_embeddings
 from rag_app.generate_summary import generate_description, generate_keywords
-from rag_app.handle_vector_store import build_vector_store
+from rag_app.build_vector_store import build_vector_store
 
 # 1. load the urls
 # 2. build the vectorstore -> the function will create the chunking and embeddings
 # 3. initialize the db retriever
 # 4.
 
-docs = load_docs_from_urls(["https://www.wuerttembergische.de/"],6)
+# docs = load_docs_from_urls(["https://www.wuerttembergische.de/"],6)
 
-# for doc in docs:
-#     keywords=generate_keywords(doc)
-#     description=generate_description(doc)
-#     doc.metadata['keywords']=keywords
-#     doc.metadata['description']=description
-#     print(doc.metadata)
+# # for doc in docs:
+# #     keywords=generate_keywords(doc)
+# #     description=generate_description(doc)
+# #     doc.metadata['keywords']=keywords
+# #     doc.metadata['description']=description
+# #     print(doc.metadata)
 
-build_vector_store(docs, './vectorstore/faiss-insurance-agent-1500','sentence-transformers/multi-qa-mpnet-base-dot-v1',True,1500,150)
+# build_vector_store(docs, './vectorstore/faiss-insurance-agent-1500','sentence-transformers/multi-qa-mpnet-base-dot-v1',True,1500,150)
 
 
-#print(create_embeddings(docs))
+# print(create_embeddings(docs))
+
+from rag_app.loading_data.scrap_website import scrap_website
+
+print(scrap_website(target_url='https://www.wuerttembergische.de/',depth=1))