Asaad Almutareb committed on
Commit fb95c43 · 1 Parent(s): 7acac3e

added ReAct agent with its tools

added a wget-based website scraper as an alternative to the recursive URL tool
fixed some LangChain deprecation warnings

.devcontainer/Dockerfile CHANGED
@@ -23,7 +23,10 @@ RUN DEBIAN_FRONTEND=noninteractive \
     libxml2-dev \
     libxmlsec1-dev \
     libffi-dev \
-    liblzma-dev
+    liblzma-dev \
+    ffmpeg \
+    libsm6 \
+    libxext6
 
 # Python and poetry installation
 USER $USER
.gitignore CHANGED
@@ -161,3 +161,6 @@ cython_debug/
 *.zip
 *.pkl
 *.faiss
+*.sqlite3
+*.bin
+*.pickle
README.md CHANGED
@@ -1,33 +1,21 @@
-# Anatomy of Advanced Enterprise Rag Systems
-
-This repository accompanies the blog series "The Anatomy of Advanced Enterprise Rag Systems" and provides a hands-on learning experience for building sophisticated Rag systems. Dive deep into each component, from setup and evaluation to security and multi-agent interactions.
-
-Explore these key topics:
-
-- Test Setup and Evaluation Metrics: Learn how to assess the performance and effectiveness of your Rag system.
-- Data Preparation and Management: Discover techniques for organizing and optimizing your knowledge base.
-- User Input Processing: Understand how to handle diverse user queries and extract relevant information.
-- Retrieval System: Unleash the power of retrieving relevant passages from your knowledge base.
-- Information Processing and Generation: Craft accurate and informative responses using state-of-the-art techniques.
-- Feedback and Continuous Improvement: Enhance your Rag system over time using user feedback and data analysis.
-- Multi-agents and Agent-services: Explore advanced architectures for distributed and collaborative Rag systems.
-- Monitoring and Security: Ensure the robustness and trustworthiness of your Rag system with proper monitoring and security practices.
-
-What you'll find here:
-
-- Code examples: Implementations of key concepts from each topic, ready to use and adapt.
-- Data samples: Pre-prepared data sets for experimentation and testing.
-- Additional resources: Links to relevant articles, libraries, and tools to deepen your understanding.
-
-Getting started:
-
-- Clone this repository: git clone https://github.com/<username>/advanced-enterprise-rag-systems.git
-- Follow the instructions in each topic directory.
-
-Contributing:
-
-We welcome your contributions! Share your expertise, improve existing code examples, or add new ones. Submit a pull request to share your valuable additions.
-
-License:
-
-This project is licensed under the MIT License: LICENSE.
+# Insurance Advisor Agent
+
+1. Improve Vectorstore search
+   Isayah:
+   - multi index (chunks+summaries)
+   - metadata filtering
+   Karan:
+   - re-ranking
+   - hybrid search (bm25+vectorstore)
+
+
+   - HyDE/HyQE
+2. Use ReAct agent instead of chain [DONE]
+   - add tools (Google search engine) [DONE]
+   - query re-writing
+   - ...
+3. Use Corrective RAG
+4. Workflow for the agents
+   - determine funnel stage of customer
+   - different prompts per step
+   - follow a "script"
 
app_gui.py CHANGED
@@ -1,5 +1,6 @@
 # Import Gradio for UI, along with other necessary libraries
 import gradio as gr
+from rag_app.react_agent import agent_executor
 # need to import the qa!
 
 # Function to add a new input to the chat history
@@ -12,20 +13,27 @@ def add_text(history, text):
 def bot(history):
     # Obtain the response from the 'infer' function using the latest input
     response = infer(history[-1][0], history)
-    sources = [doc.metadata.get("source") for doc in response['source_documents']]
-    src_list = '\n'.join(sources)
-    print_this = response['result'] + "\n\n\n Sources: \n\n\n" + src_list
+    #sources = [doc.metadata.get("source") for doc in response['source_documents']]
+    #src_list = '\n'.join(sources)
+    #print_this = response['result'] + "\n\n\n Sources: \n\n\n" + src_list
 
 
-    history[-1][1] = print_this #response['answer']
+    #history[-1][1] = print_this #response['answer']
     # Update the history with the bot's response
-    #history[-1][1] = response['result']
+    print(*response)
+    history[-1][1] = response['output']
     return history
 
 # Function to infer the response using the RAG model
 def infer(question, history):
     # Use the question and history to query the RAG model
-    result = qa({"query": question, "history": history, "question": question})
+    #result = qa({"query": question, "history": history, "question": question})
+    result = agent_executor.invoke(
+        {
+            "input": question,
+            "chat_history": history
+        }
+    )
     return result
 
 # CSS styling for the Gradio interface
core-langchain-rag.py CHANGED
@@ -274,7 +274,7 @@ def infer(question, history):
     # Use the question and history to query the RAG model
     #result = generate_qa_retriever(history, question)
     result = qa({"query": question, "history": history, "question": question})
-    print(*result)
+    #print(*result)
     return result
 
 # CSS styling for the Gradio interface
example.env CHANGED
@@ -10,7 +10,15 @@ CHROMA_VS_NAME=""
 FAISS_INDEX_PATH = "./vectorstore/faiss-insurance-agent-500"
 CHROMA_DIRECTORY = "./vectorstore/chroma-insurance-agent-500"
 
+# for chromadb
+VECTOR_DATABASE_LOCATION="./vectorstore/chroma-insurance-agent-1500"
+
+# Name for the Conversation Memory Collection
+CONVERSATION_COLLECTION_NAME="ConversationMemory"
+
 # llm and embedding models
-EMBEDDING_MODEL="sentence-transformers/multi-qa-mpnet-base-dot-v1"
+#EMBEDDING_MODEL="sentence-transformers/multi-qa-mpnet-base-dot-v1"
+EMBEDDING_MODEL="microsoft/Multilingual-MiniLM-L12-H384"
 LLM_MODEL="mistralai/Mixtral-8x7B-Instruct-v0.1"
-LLM_MODEL_ARGS=
+LLM_MODEL_ARGS=
+
rag_app/{handle_vector_store.py → build_vector_store.py} RENAMED
@@ -1,7 +1,7 @@
 # vectorization functions
 from langchain_community.vectorstores import FAISS
 from langchain_community.vectorstores import Chroma
-from langchain_community.document_loaders import ReadTheDocsLoader
+from langchain_community.document_loaders import DirectoryLoader
 from langchain_text_splitters import RecursiveCharacterTextSplitter
 from langchain_huggingface import HuggingFaceEmbeddings
 from langchain_community.retrievers import BM25Retriever
rag_app/database/db_handler.py ADDED
@@ -0,0 +1,114 @@
+from sqlmodel import SQLModel, create_engine, Session, select
+from rag_app.database.schema import Sources
+from rag_app.utils.logger import get_console_logger
+import os
+from dotenv import load_dotenv
+
+load_dotenv()
+
+sqlite_file_name = os.getenv('SOURCES_CACHE')
+
+sqlite_url = f"sqlite:///{sqlite_file_name}"
+engine = create_engine(sqlite_url, echo=False)
+
+logger = get_console_logger("db_handler")
+
+SQLModel.metadata.create_all(engine)
+
+
+def read_one(hash_id: str):
+    with Session(engine) as session:
+        statement = select(Sources).where(Sources.hash_id == hash_id)
+        sources = session.exec(statement).first()
+        return sources
+
+
+def add_one(data: dict):
+    with Session(engine) as session:
+        if session.exec(
+            select(Sources).where(Sources.hash_id == data.get("hash_id"))
+        ).first():
+            logger.warning(f"Item with hash_id {data.get('hash_id')} already exists")
+            return None  # or raise an exception, or handle as needed
+        sources = Sources(**data)
+        session.add(sources)
+        session.commit()
+        session.refresh(sources)
+        logger.info(f"Item with hash_id {data.get('hash_id')} added to the database")
+        return sources
+
+
+def update_one(hash_id: str, data: dict):
+    with Session(engine) as session:
+        # Check if the item with the given hash_id exists
+        sources = session.exec(
+            select(Sources).where(Sources.hash_id == hash_id)
+        ).first()
+        if not sources:
+            logger.warning(f"No item with hash_id {hash_id} found for update")
+            return None  # or raise an exception, or handle as needed
+        for key, value in data.items():
+            setattr(sources, key, value)
+        session.commit()
+        logger.info(f"Item with hash_id {hash_id} updated in the database")
+        return sources
+
+
+def delete_one(id: str):
+    with Session(engine) as session:
+        # Check if the item with the given hash_id exists
+        sources = session.exec(
+            select(Sources).where(Sources.hash_id == id)
+        ).first()
+        if not sources:
+            logger.warning(f"No item with hash_id {id} found for deletion")
+            return None  # or raise an exception, or handle as needed
+        session.delete(sources)
+        session.commit()
+        logger.info(f"Item with hash_id {id} deleted from the database")
+
+
+def add_many(data: list):
+    with Session(engine) as session:
+        for info in data:
+            # Reuse add_one function for each item
+            result = add_one(info)
+            if result is None:
+                logger.warning(
+                    f"Item with hash_id {info.get('hash_id')} could not be added"
+                )
+            else:
+                logger.info(
+                    f"Item with hash_id {info.get('hash_id')} added to the database"
+                )
+        session.commit()  # Commit at the end of the loop
+
+
+def delete_many(ids: list):
+    with Session(engine) as session:
+        for id in ids:
+            # Reuse delete_one function for each item
+            result = delete_one(id)
+            if result is None:
+                logger.warning(f"No item with hash_id {id} found for deletion")
+            else:
+                logger.info(f"Item with hash_id {id} deleted from the database")
+        session.commit()  # Commit at the end of the loop
+
+
+def read_all(query: dict = None):
+    with Session(engine) as session:
+        statement = select(Sources)
+        if query:
+            statement = statement.where(
+                *[getattr(Sources, key) == value for key, value in query.items()]
+            )
+        sources = session.exec(statement).all()
+        return sources
+
+
+def delete_all():
+    with Session(engine) as session:
+        session.query(Sources).delete()  # bulk delete; exec(Sources) is not a valid statement
+        session.commit()
+        logger.info("All items deleted from the database")
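
A small usage sketch for the new db_handler helpers, assuming SOURCES_CACHE points at a writable SQLite file; the sample record is illustrative and reuses hash_text from rag_app/utils/utils.py:

```python
from rag_app.database.db_handler import add_one, read_one, update_one
from rag_app.utils.utils import hash_text

url = "https://www.example.com/insurance"  # illustrative URL
record = {
    "url": url,
    "title": "Example insurance page",
    "hash_id": hash_text(url),  # md5 of the URL, as in utils.hash_text
    "summary": "",
}
added = add_one(record)                         # returns None if the hash_id already exists
fetched = read_one(hash_text(url))              # look the row up again by its hash
update_one(hash_text(url), {"embedded": True})  # mark it as embedded
```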
rag_app/database/schema.py ADDED
@@ -0,0 +1,15 @@
+from sqlmodel import SQLModel, Field
+from typing import Optional
+
+import datetime
+
+class Sources(SQLModel, table=True):
+    id: Optional[int] = Field(default=None, primary_key=True)
+    url: str = Field()
+    title: Optional[str] = Field(default="NA", unique=False)
+    hash_id: str = Field(unique=True)
+    created_at: float = Field(default=datetime.datetime.now().timestamp())
+    summary: str = Field(default="")
+    embedded: bool = Field(default=False)
+
+    __table_args__ = {"extend_existing": True}
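
One thing to watch in this model: `default=datetime.datetime.now().timestamp()` is evaluated once at import time, so every row created in the same process gets the same created_at value. If per-row timestamps are wanted, a default_factory avoids that; the sketch below is a hedged alternative, not part of the commit:

```python
import datetime
from typing import Optional
from sqlmodel import SQLModel, Field

class SourcesExample(SQLModel, table=True):  # illustrative variant, not the committed model
    id: Optional[int] = Field(default=None, primary_key=True)
    url: str
    hash_id: str = Field(unique=True)
    # default_factory is evaluated per row, not once at class-definition time
    created_at: float = Field(
        default_factory=lambda: datetime.datetime.now().timestamp()
    )
```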
rag_app/get_db_retriever.py CHANGED
@@ -2,9 +2,9 @@
 
 # HF libraries
 from langchain.llms import HuggingFaceHub
-from langchain.embeddings import HuggingFaceHubEmbeddings
+from langchain_huggingface import HuggingFaceHubEmbeddings
 # vectorestore
-from langchain.vectorstores import FAISS
+from langchain_community.vectorstores import FAISS
 # retrieval chain
 from langchain.chains import RetrievalQA
 # prompt template
rag_app/{load_data_from_urls.py → loading_data/load_urls_recurisvely.py} RENAMED
File without changes
rag_app/loading_data/scrap_website.py ADDED
@@ -0,0 +1,36 @@
+# scrape a given url recursively
+
+import subprocess
+import os
+from urllib.parse import urlparse
+from langchain_community.document_loaders import DirectoryLoader
+
+def runcmd(cmd, verbose = False, *args, **kwargs):
+
+    process = subprocess.Popen(
+        cmd,
+        stdout = subprocess.PIPE,
+        stderr = subprocess.PIPE,
+        text = True,
+        shell = True
+    )
+    std_out, std_err = process.communicate()
+    if verbose:
+        print(std_out.strip(), std_err)
+        pass
+    return process.returncode
+
+def scrap_website(target_url:str, depth:int=5):
+    target_domain = urlparse(target_url).netloc
+    target_directory = './downloads/'
+    # To download the files locally for processing, here's the command line
+    command_this = f'wget -e robots=off --recursive -l {depth} --no-clobber --page-requisites --html-extension \
+        --convert-links --restrict-file-names=windows --force-directories --directory-prefix={target_directory} \
+        --domains {target_domain} --no-parent {target_url}'
+    cmd_status = runcmd(command_this, verbose=True)
+    if cmd_status==0:
+        documents_path = os.path.dirname(os.path.realpath(f'{target_directory}/{target_domain}'))
+        loader = DirectoryLoader(documents_path, silent_errors=True, show_progress=True)
+        docs = loader.load()
+        return docs
+    return []
rag_app/react_agent.py CHANGED
@@ -0,0 +1,77 @@
+# HF libraries
+from langchain_huggingface import HuggingFaceEndpoint
+from langchain.agents import AgentExecutor
+from langchain.agents.format_scratchpad import format_log_to_str
+from langchain.agents.output_parsers import ReActJsonSingleInputOutputParser
+# Import things that are needed generically
+from langchain.tools.render import render_text_description
+import os
+from dotenv import load_dotenv
+from rag_app.structured_tools.structured_tools import (
+    google_search, knowledgeBase_search
+)
+
+from langchain.prompts import PromptTemplate
+from rag_app.templates.react_json_with_memory import template_system
+# from innovation_pathfinder_ai.utils import logger
+# from langchain.globals import set_llm_cache
+# from langchain.cache import SQLiteCache
+
+# set_llm_cache(SQLiteCache(database_path=".cache.db"))
+# logger = logger.get_console_logger("hf_mixtral_agent")
+
+config = load_dotenv(".env")
+HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN')
+GOOGLE_CSE_ID = os.getenv('GOOGLE_CSE_ID')
+GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY')
+# LANGCHAIN_TRACING_V2 = "true"
+# LANGCHAIN_ENDPOINT = "https://api.smith.langchain.com"
+# LANGCHAIN_API_KEY = os.getenv('LANGCHAIN_API_KEY')
+# LANGCHAIN_PROJECT = os.getenv('LANGCHAIN_PROJECT')
+
+# Load the model from the Hugging Face Hub
+llm = HuggingFaceEndpoint(repo_id="mistralai/Mixtral-8x7B-Instruct-v0.1",
+                          temperature=0.1,
+                          max_new_tokens=1024,
+                          repetition_penalty=1.2,
+                          return_full_text=False
+                          )
+
+
+tools = [
+    knowledgeBase_search,
+    google_search,
+]
+
+prompt = PromptTemplate.from_template(
+    template=template_system
+)
+prompt = prompt.partial(
+    tools=render_text_description(tools),
+    tool_names=", ".join([t.name for t in tools]),
+)
+
+
+# define the agent
+chat_model_with_stop = llm.bind(stop=["\nObservation"])
+agent = (
+    {
+        "input": lambda x: x["input"],
+        "agent_scratchpad": lambda x: format_log_to_str(x["intermediate_steps"]),
+        "chat_history": lambda x: x["chat_history"],
+    }
+    | prompt
+    | chat_model_with_stop
+    | ReActJsonSingleInputOutputParser()
+)
+
+# instantiate AgentExecutor
+agent_executor = AgentExecutor(
+    agent=agent,
+    tools=tools,
+    verbose=True,
+    max_iterations=10,       # cap the number of iterations
+    #max_execution_time=60,  # timeout at 60 sec
+    return_intermediate_steps=True,
+    handle_parsing_errors=True,
+)
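
Because the executor is built with `return_intermediate_steps=True`, `invoke` returns the final answer alongside the tool trace. A minimal smoke test, assuming the `.env` variables above (HF token, Google keys, vector store path, embedding model) are set; the question itself is only an example:

```python
from rag_app.react_agent import agent_executor

result = agent_executor.invoke(
    {
        "input": "Which insurance products cover bicycle theft?",  # illustrative question
        "chat_history": "",  # the prompt template interpolates this as plain text
    }
)
print(result["output"])                    # final answer string
for action, observation in result["intermediate_steps"]:
    print(action.tool, action.tool_input)  # which tool the agent called, and with what
```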
rag_app/structured_tools/__init__.py ADDED
File without changes
rag_app/structured_tools/structured_tools.py ADDED
@@ -0,0 +1,98 @@
+from langchain.tools import BaseTool, StructuredTool, tool
+from langchain_community.tools import WikipediaQueryRun
+from langchain_community.utilities import WikipediaAPIWrapper
+#from langchain.tools import Tool
+from langchain_google_community import GoogleSearchAPIWrapper
+from langchain_community.embeddings.sentence_transformer import (
+    SentenceTransformerEmbeddings,
+)
+from langchain_community.vectorstores import Chroma
+import ast
+
+import chromadb
+
+from rag_app.utils.utils import (
+    parse_list_to_dicts, format_search_results
+)
+from rag_app.database.db_handler import (
+    add_many
+)
+
+import os
+# from innovation_pathfinder_ai.utils import create_wikipedia_urls_from_text
+
+persist_directory = os.getenv('VECTOR_DATABASE_LOCATION')
+
+@tool
+def memory_search(query:str) -> str:
+    """Search the memory vector store for existing knowledge and relevant previous research. \
+    This is your primary source: check what you have already learned in the past before going online."""
+    # Since we have more than one collection we should change the name of this tool
+    client = chromadb.PersistentClient(
+        path=persist_directory,
+    )
+
+    collection_name = os.getenv('CONVERSATION_COLLECTION_NAME')
+    #store using envar
+
+    embedding_function = SentenceTransformerEmbeddings(
+        model_name=os.getenv("EMBEDDING_MODEL"),
+    )
+
+    vector_db = Chroma(
+        client=client,  # client for Chroma
+        collection_name=collection_name,
+        embedding_function=embedding_function,
+    )
+
+    retriever = vector_db.as_retriever()
+    docs = retriever.invoke(query)
+
+    return docs.__str__()
+
+@tool
+def knowledgeBase_search(query:str) -> str:
+    """Search the internal knowledge base for relevant documents and chunks"""
+    # Since we have more than one collection we should change the name of this tool
+    client = chromadb.PersistentClient(
+        path=persist_directory,
+    )
+
+    #collection_name="ArxivPapers"
+    #store using envar
+
+    embedding_function = SentenceTransformerEmbeddings(
+        model_name=os.getenv("EMBEDDING_MODEL"),
+    )
+
+    vector_db = Chroma(
+        client=client,  # client for Chroma
+        #collection_name=collection_name,
+        embedding_function=embedding_function,
+    )
+
+    retriever = vector_db.as_retriever()
+    # This is deprecated, changed to invoke
+    # LangChainDeprecationWarning: The method `BaseRetriever.get_relevant_documents` was deprecated in langchain-core 0.1.46 and will be removed in 0.3.0. Use invoke instead.
+    docs = retriever.invoke(query)
+    for doc in docs:
+        print(doc)
+
+    return docs.__str__()
+
+@tool
+def google_search(query: str) -> str:
+    """Search Google for additional results when you can't answer the question using the internal knowledge base."""
+    global all_sources
+
+    websearch = GoogleSearchAPIWrapper()
+    search_results:dict = websearch.results(query, 3)
+    print(search_results)
+    if len(search_results)>1:
+        cleaner_sources = format_search_results(search_results)
+        parsed_csources = parse_list_to_dicts(cleaner_sources)
+        add_many(parsed_csources)
+    else:
+        cleaner_sources = search_results
+
+    return cleaner_sources.__str__()
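
The `@tool`-decorated functions are ordinary LangChain tools, so they can be exercised outside the agent as well; a quick check, assuming VECTOR_DATABASE_LOCATION, EMBEDDING_MODEL, GOOGLE_API_KEY, and GOOGLE_CSE_ID are set in the environment (the queries are illustrative):

```python
from rag_app.structured_tools.structured_tools import knowledgeBase_search, google_search

# Hits the local Chroma store and returns the retrieved documents as a string
print(knowledgeBase_search.invoke("household insurance deductible"))

# Falls back to Google CSE and caches the returned sources via add_many()
print(google_search.invoke("Württembergische household insurance"))
```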
rag_app/templates/__init__.py ADDED
File without changes
rag_app/templates/react_json_with_memory.py ADDED
@@ -0,0 +1,47 @@
+template_system = """
+Answer the following questions as best you can. You have access to the following tools:
+
+<TOOLS>
+{tools}
+</TOOLS>
+
+The way you use the tools is by specifying a json blob.
+Specifically, this json should have an `action` key (with the name of the tool to use) and an `action_input` key (with the input to the tool going here).
+
+The only values that should be in the "action" field are: {tool_names}
+
+The $JSON_BLOB should only contain a SINGLE action, do NOT return a list of multiple actions. Here is an example of a valid $JSON_BLOB:
+
+```
+{{
+    "action": $TOOL_NAME,
+    "action_input": $INPUT
+}}
+```
+
+ALWAYS use the following format:
+
+Question: the input question you must answer
+Thought: you should always think about what to do
+Action:
+```
+$JSON_BLOB
+```
+Observation: the result of the action
+... (this Thought/Action/Observation can repeat N times)
+Thought: I now know the final answer
+Final Answer: the final answer to the original input question
+
+Begin! Reminder to always use the exact characters `Final Answer` when responding.
+
+Previous conversation history:
+<CONVERSATION_HISTORY>
+{chat_history}
+</CONVERSATION_HISTORY>
+
+<NEW_INPUT>
+{input}
+</NEW_INPUT>
+
+{agent_scratchpad}
+"""
rag_app/utils/__init__.py ADDED
File without changes
rag_app/utils/logger.py ADDED
@@ -0,0 +1,20 @@
+# logger.py
+
+import logging
+from rich.logging import RichHandler
+from typing import Optional
+
+
+def get_console_logger(name: Optional[str] = "default") -> logging.Logger:
+    logger = logging.getLogger(name)
+    if not logger.handlers:
+        logger.setLevel(logging.DEBUG)
+        console_handler = RichHandler()
+        console_handler.setLevel(logging.DEBUG)
+        formatter = logging.Formatter(
+            "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+        )
+        console_handler.setFormatter(formatter)
+        logger.addHandler(console_handler)
+
+    return logger
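
Usage is straightforward; a short sketch (the logger name is arbitrary):

```python
from rag_app.utils.logger import get_console_logger

logger = get_console_logger("example")  # "example" is illustrative; db_handler uses "db_handler"
logger.info("vector store loaded")
logger.warning("no documents matched the query")
```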
rag_app/utils/utils.py ADDED
@@ -0,0 +1,115 @@
+import hashlib
+import datetime
+import os
+import uuid
+
+# from rag_app.utils import logger
+
+# logger = logger.get_console_logger("utils")
+
+
+
+def extract_urls(data_list):
+    """
+    Extracts URLs from a list of dictionaries.
+
+    Parameters:
+    - data_list (list): A list of dictionaries, each containing 'Title:', 'link:', and 'summary:'.
+
+    Returns:
+    - list: A list of URLs extracted from the dictionaries.
+    """
+    urls = []
+    print(data_list)
+    for item in data_list:
+        try:
+            # Find the start and end indices of the URL
+            lower_case = item.lower()
+            link_prefix = 'link: '
+            summary_prefix = ', summary:'
+            start_idx = lower_case.index(link_prefix) + len(link_prefix)
+            end_idx = lower_case.index(summary_prefix, start_idx)
+            # Extract the URL using the indices found
+            url = item[start_idx:end_idx]
+            urls.append(url)
+        except ValueError:
+            # Handles the case where 'link: ' or ', summary:' is not found in the string
+            print("Could not find a URL in the item:", item)
+    last_sources = urls[-3:]
+    return last_sources
+
+def format_search_results(search_results):
+    """
+    Formats a list of dictionaries containing search results into a list of strings.
+    Each dictionary is expected to have the keys 'title', 'link', and 'snippet'.
+
+    Parameters:
+    - search_results (list): A list of dictionaries, each containing 'title', 'link', and 'snippet'.
+
+    Returns:
+    - list: A list of formatted strings based on the search results.
+    """
+    if len(search_results)>1:
+        formatted_results = [
+            "Title: {title}, Link: {link}, Summary: {snippet}".format(**i)
+            for i in search_results
+        ]
+        return formatted_results
+
+def parse_list_to_dicts(items: list) -> list:
+    parsed_items = []
+    for item in items:
+        # Extract title, link, and summary from each string
+        title_start = item.find('Title: ') + len('Title: ')
+        link_start = item.find('Link: ') + len('Link: ')
+        summary_start = item.find('Summary: ') + len('Summary: ')
+
+        title_end = item.find(', Link: ')
+        link_end = item.find(', Summary: ')
+        summary_end = len(item)
+
+        title = item[title_start:title_end]
+        link = item[link_start:link_end]
+        summary = item[summary_start:summary_end]
+
+        # Use the hash_text function for the hash_id
+        hash_id = hash_text(link)
+
+        # Construct the dictionary for each item
+        parsed_item = {
+            "url": link,
+            "title": title,
+            "hash_id": hash_id,
+            "summary": summary
+        }
+        parsed_items.append(parsed_item)
+    return parsed_items
+
+def hash_text(text: str) -> str:
+    return hashlib.md5(text.encode()).hexdigest()
+
+
+def convert_timestamp_to_datetime(timestamp: str) -> str:
+    return datetime.datetime.fromtimestamp(int(timestamp)).strftime("%Y-%m-%d %H:%M:%S")
+
+def create_folder_if_not_exists(folder_path: str) -> None:
+    """
+    Create a folder if it doesn't already exist.
+
+    Args:
+    - folder_path (str): The path of the folder to create.
+    """
+    if not os.path.exists(folder_path):
+        os.makedirs(folder_path)
+        print(f"Folder '{folder_path}' created.")
+    else:
+        print(f"Folder '{folder_path}' already exists.")
+
+def generate_uuid() -> str:
+    """
+    Generate a UUID (Universally Unique Identifier) and return it as a string.
+
+    Returns:
+        str: A UUID string.
+    """
+    return str(uuid.uuid4())
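
The two search-result helpers are meant to be chained, which is how google_search in structured_tools.py uses them; a small roundtrip with made-up results:

```python
from rag_app.utils.utils import format_search_results, parse_list_to_dicts

# Shape returned by GoogleSearchAPIWrapper.results(); the values here are made up
raw_results = [
    {"title": "Insurance FAQ", "link": "https://www.example.com/faq", "snippet": "Answers to common questions."},
    {"title": "Contact", "link": "https://www.example.com/contact", "snippet": "How to reach an advisor."},
]

formatted = format_search_results(raw_results)  # ["Title: ..., Link: ..., Summary: ...", ...]
records = parse_list_to_dicts(formatted)        # [{"url": ..., "title": ..., "hash_id": ..., "summary": ...}, ...]
print(records[0]["hash_id"])                    # md5 of the link, via hash_text
```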
requirements.txt CHANGED
@@ -2,12 +2,16 @@
 langchain-community
 langchain-HuggingFace
 langchain-text-splitters
+langchain_google_community
 beautifulsoup4
 faiss-cpu
 chromadb
 validators
 sentence_transformers
 typing-extensions
-unstructured
+unstructured
+unstructured[pdf]
 gradio
-boto3
+boto3
+rich
+sqlmodel
test_this.py CHANGED
@@ -1,23 +1,27 @@
-from rag_app.load_data_from_urls import load_docs_from_urls
+from rag_app.loading_data.load_urls_recurisvely import load_docs_from_urls
 from rag_app.create_embedding import create_embeddings
 from rag_app.generate_summary import generate_description, generate_keywords
-from rag_app.handle_vector_store import build_vector_store
+from rag_app.build_vector_store import build_vector_store
 
 # 1. load the urls
 # 2. build the vectorstore -> the function will create the chunking and embeddings
 # 3. initialize the db retriever
 # 4.
 
-docs = load_docs_from_urls(["https://www.wuerttembergische.de/"],6)
+# docs = load_docs_from_urls(["https://www.wuerttembergische.de/"],6)
 
-# for doc in docs:
-#     keywords=generate_keywords(doc)
-#     description=generate_description(doc)
-#     doc.metadata['keywords']=keywords
-#     doc.metadata['description']=description
-#     print(doc.metadata)
+# # for doc in docs:
+# #     keywords=generate_keywords(doc)
+# #     description=generate_description(doc)
+# #     doc.metadata['keywords']=keywords
+# #     doc.metadata['description']=description
+# #     print(doc.metadata)
 
-build_vector_store(docs, './vectorstore/faiss-insurance-agent-1500','sentence-transformers/multi-qa-mpnet-base-dot-v1',True,1500,150)
+# build_vector_store(docs, './vectorstore/faiss-insurance-agent-1500','sentence-transformers/multi-qa-mpnet-base-dot-v1',True,1500,150)
 
 
-#print(create_embeddings(docs))
+# print(create_embeddings(docs))
+
+from rag_app.loading_data.scrap_website import scrap_website
+
+print(scrap_website(target_url='https://www.wuerttembergische.de/',depth=1))