Asaad Almutareb
committed
Commit fb95c43 · Parent(s): 7acac3e
added ReAct agent with its tools
added wget website scraper as an alternative to the recursive URL tool
fixed some LangChain deprecation warnings
- .devcontainer/Dockerfile +4 -1
- .gitignore +3 -0
- README.md +21 -33
- app_gui.py +14 -6
- core-langchain-rag.py +1 -1
- example.env +10 -2
- rag_app/{handle_vector_store.py → build_vector_store.py} +1 -1
- rag_app/database/db_handler.py +114 -0
- rag_app/database/schema.py +15 -0
- rag_app/get_db_retriever.py +2 -2
- rag_app/{load_data_from_urls.py → loading_data/load_urls_recurisvely.py} +0 -0
- rag_app/loading_data/scrap_website.py +36 -0
- rag_app/react_agent.py +77 -0
- rag_app/structured_tools/__init__.py +0 -0
- rag_app/structured_tools/structured_tools.py +98 -0
- rag_app/templates/__init__.py +0 -0
- rag_app/templates/react_json_with_memory.py +47 -0
- rag_app/utils/__init__.py +0 -0
- rag_app/utils/logger.py +20 -0
- rag_app/utils/utils.py +115 -0
- requirements.txt +6 -2
- test_this.py +15 -11
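The changes below replace the retrieval-QA chain call in the Gradio handler with a ReAct agent executor. A minimal sketch of how the new entry point is meant to be called (illustration only, not part of the commit; the question string is made up, the names mirror rag_app/react_agent.py and app_gui.py below):

```python
# Usage sketch of the new agent entry point (illustration only, not in this commit).
from rag_app.react_agent import agent_executor

result = agent_executor.invoke(
    {
        "input": "Which of your products covers water damage?",  # hypothetical user question
        "chat_history": [],  # prior (user, bot) turns, as passed in from the Gradio history
    }
)
print(result["output"])              # final answer shown in the chat
print(result["intermediate_steps"])  # tool calls, available because return_intermediate_steps=True
```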
.devcontainer/Dockerfile
CHANGED
@@ -23,7 +23,10 @@ RUN DEBIAN_FRONTEND=noninteractive \
     libxml2-dev \
     libxmlsec1-dev \
     libffi-dev \
-    liblzma-dev
+    liblzma-dev \
+    ffmpeg \
+    libsm6 \
+    libxext6
 
 # Python and poetry installation
 USER $USER
.gitignore
CHANGED
@@ -161,3 +161,6 @@ cython_debug/
 *.zip
 *.pkl
 *.faiss
+*.sqlite3
+*.bin
+*.pickle
README.md
CHANGED
@@ -1,33 +1,21 @@
-#
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-Getting started:
-
-- Clone this repository: git clone https://github.com/<username>/advanced-enterprise-rag-systems.git
-- Follow the instructions in each topic directory.
-
-Contributing:
-
-We welcome your contributions! Share your expertise, improve existing code examples, or add new ones. Submit a pull request to share your valuable additions.
-
-License:
-
-This project is licensed under the MIT License: LICENSE.
+# Insurance Advisor Agent
+
+1. Improve Vectorstore search
+Isayah:
+- multi index (chunks+summaries)
+- metadata filtering
+Karan:
+- re-ranking
+- hybrid search (bm25+vectorstore)
+
+
+- HyDE/HyQE
+2. Use ReAct agent instead of chain [DONE]
+- add tools (Google search engine) [DONE]
+- query re-writing
+- ...
+3. Use Corrective RAG
+4. Workflow for the agents
+- determine funnel stage of customer
+- different prompts per step
+- follow a "script"
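The "hybrid search (bm25+vectorstore)" item in the new README is still a TODO. A rough sketch of one way it could look with LangChain's EnsembleRetriever, assuming stand-in chunks and the embedding model already used in this repo (requires the rank_bm25 package):

```python
# Sketch of the hybrid (BM25 + vectorstore) retrieval TODO -- not implemented in this commit.
from langchain_core.documents import Document
from langchain_community.retrievers import BM25Retriever
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.retrievers import EnsembleRetriever

# Stand-in chunks; in the app these would come from build_vector_store.py.
docs = [
    Document(page_content="Household insurance covers water damage from burst pipes."),
    Document(page_content="Liability insurance covers damage you cause to third parties."),
]

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/multi-qa-mpnet-base-dot-v1")
vector_store = FAISS.from_documents(docs, embeddings)

bm25_retriever = BM25Retriever.from_documents(docs)                    # keyword-based
bm25_retriever.k = 2
dense_retriever = vector_store.as_retriever(search_kwargs={"k": 2})    # embedding-based

hybrid_retriever = EnsembleRetriever(
    retrievers=[bm25_retriever, dense_retriever],
    weights=[0.5, 0.5],  # relative contribution of keyword vs. vector scores
)
print(hybrid_retriever.invoke("water damage"))  # hypothetical query
```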
app_gui.py
CHANGED
@@ -1,5 +1,6 @@
 # Import Gradio for UI, along with other necessary libraries
 import gradio as gr
+from rag_app.react_agent import agent_executor
 # need to import the qa!
 
 # Function to add a new input to the chat history
@@ -12,20 +13,27 @@ def add_text(history, text):
 def bot(history):
     # Obtain the response from the 'infer' function using the latest input
     response = infer(history[-1][0], history)
-    sources = [doc.metadata.get("source") for doc in response['source_documents']]
-    src_list = '\n'.join(sources)
-    print_this = response['result'] + "\n\n\n Sources: \n\n\n" + src_list
+    #sources = [doc.metadata.get("source") for doc in response['source_documents']]
+    #src_list = '\n'.join(sources)
+    #print_this = response['result'] + "\n\n\n Sources: \n\n\n" + src_list
 
 
-    history[-1][1] = print_this #response['answer']
+    #history[-1][1] = print_this #response['answer']
     # Update the history with the bot's response
-
+    print(*response)
+    history[-1][1] = response['output']
     return history
 
 # Function to infer the response using the RAG model
 def infer(question, history):
     # Use the question and history to query the RAG model
-    result = qa({"query": question, "history": history, "question": question})
+    #result = qa({"query": question, "history": history, "question": question})
+    result = agent_executor.invoke(
+        {
+            "input": question,
+            "chat_history": history
+        }
+    )
     return result
 
 # CSS styling for the Gradio interface
core-langchain-rag.py
CHANGED
@@ -274,7 +274,7 @@ def infer(question, history):
     # Use the question and history to query the RAG model
     #result = generate_qa_retriever(history, question)
     result = qa({"query": question, "history": history, "question": question})
-    print(*result)
+    #print(*result)
     return result
 
 # CSS styling for the Gradio interface
example.env
CHANGED
@@ -10,7 +10,15 @@ CHROMA_VS_NAME=""
 FAISS_INDEX_PATH = "./vectorstore/faiss-insurance-agent-500"
 CHROMA_DIRECTORY = "./vectorstore/chroma-insurance-agent-500"
 
+# for chromadb
+VECTOR_DATABASE_LOCATION="./vectorstore/chroma-insurance-agent-1500"
+
+# Name for the Conversation Memory Collection
+CONVERSATION_COLLECTION_NAME="ConversationMemory"
+
 # llm and embedding models
-EMBEDDING_MODEL="sentence-transformers/multi-qa-mpnet-base-dot-v1"
+#EMBEDDING_MODEL="sentence-transformers/multi-qa-mpnet-base-dot-v1"
+EMBEDDING_MODEL="microsoft/Multilingual-MiniLM-L12-H384"
 LLM_MODEL="mistralai/Mixtral-8x7B-Instruct-v0.1"
-LLM_MODEL_ARGS=
+LLM_MODEL_ARGS=
+
rag_app/{handle_vector_store.py → build_vector_store.py}
RENAMED
@@ -1,7 +1,7 @@
 # vectorization functions
 from langchain_community.vectorstores import FAISS
 from langchain_community.vectorstores import Chroma
-from langchain_community.document_loaders import
+from langchain_community.document_loaders import DirectoryLoader
 from langchain_text_splitters import RecursiveCharacterTextSplitter
 from langchain_huggingface import HuggingFaceEmbeddings
 from langchain_community.retrievers import BM25Retriever
rag_app/database/db_handler.py
ADDED
@@ -0,0 +1,114 @@
from sqlmodel import SQLModel, create_engine, Session, select
from rag_app.database.schema import Sources
from rag_app.utils.logger import get_console_logger
import os
from dotenv import load_dotenv

load_dotenv()

sqlite_file_name = os.getenv('SOURCES_CACHE')

sqlite_url = f"sqlite:///{sqlite_file_name}"
engine = create_engine(sqlite_url, echo=False)

logger = get_console_logger("db_handler")

SQLModel.metadata.create_all(engine)


def read_one(hash_id: dict):
    with Session(engine) as session:
        statement = select(Sources).where(Sources.hash_id == hash_id)
        sources = session.exec(statement).first()
        return sources


def add_one(data: dict):
    with Session(engine) as session:
        if session.exec(
            select(Sources).where(Sources.hash_id == data.get("hash_id"))
        ).first():
            logger.warning(f"Item with hash_id {data.get('hash_id')} already exists")
            return None  # or raise an exception, or handle as needed
        sources = Sources(**data)
        session.add(sources)
        session.commit()
        session.refresh(sources)
        logger.info(f"Item with hash_id {data.get('hash_id')} added to the database")
        return sources


def update_one(hash_id: dict, data: dict):
    with Session(engine) as session:
        # Check if the item with the given hash_id exists
        sources = session.exec(
            select(Sources).where(Sources.hash_id == hash_id)
        ).first()
        if not sources:
            logger.warning(f"No item with hash_id {hash_id} found for update")
            return None  # or raise an exception, or handle as needed
        for key, value in data.items():
            setattr(sources, key, value)
        session.commit()
        logger.info(f"Item with hash_id {hash_id} updated in the database")
        return sources


def delete_one(id: int):
    with Session(engine) as session:
        # Check if the item with the given hash_id exists
        sources = session.exec(
            select(Sources).where(Sources.hash_id == id)
        ).first()
        if not sources:
            logger.warning(f"No item with hash_id {id} found for deletion")
            return None  # or raise an exception, or handle as needed
        session.delete(sources)
        session.commit()
        logger.info(f"Item with hash_id {id} deleted from the database")


def add_many(data: list):
    with Session(engine) as session:
        for info in data:
            # Reuse add_one function for each item
            result = add_one(info)
            if result is None:
                logger.warning(
                    f"Item with hash_id {info.get('hash_id')} could not be added"
                )
            else:
                logger.info(
                    f"Item with hash_id {info.get('hash_id')} added to the database"
                )
        session.commit()  # Commit at the end of the loop


def delete_many(ids: list):
    with Session(engine) as session:
        for id in ids:
            # Reuse delete_one function for each item
            result = delete_one(id)
            if result is None:
                logger.warning(f"No item with hash_id {id} found for deletion")
            else:
                logger.info(f"Item with hash_id {id} deleted from the database")
        session.commit()  # Commit at the end of the loop


def read_all(query: dict = None):
    with Session(engine) as session:
        statement = select(Sources)
        if query:
            statement = statement.where(
                *[getattr(Sources, key) == value for key, value in query.items()]
            )
        sources = session.exec(statement).all()
        return sources


def delete_all():
    with Session(engine) as session:
        session.exec(Sources).delete()
        session.commit()
        logger.info("All items deleted from the database")
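A hypothetical usage of this sources cache (not part of the commit; it assumes SOURCES_CACHE in .env points to a writable SQLite file, since the engine is created from it at import time):

```python
# Hypothetical usage of the new sources cache -- illustration only.
from rag_app.database.db_handler import add_one, read_one
from rag_app.utils.utils import hash_text

url = "https://www.wuerttembergische.de/"          # example source URL from test_this.py
entry = {
    "url": url,
    "title": "Württembergische",                   # hypothetical title
    "hash_id": hash_text(url),                     # md5 of the link, as in parse_list_to_dicts
    "summary": "Insurance provider homepage",      # hypothetical summary
}
add_one(entry)                    # inserts unless the hash_id already exists
print(read_one(hash_text(url)))   # returns the Sources row, or None
```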
rag_app/database/schema.py
ADDED
@@ -0,0 +1,15 @@
from sqlmodel import SQLModel, Field
from typing import Optional

import datetime

class Sources(SQLModel, table=True):
    id: Optional[int] = Field(default=None, primary_key=True)
    url: str = Field()
    title: Optional[str] = Field(default="NA", unique=False)
    hash_id: str = Field(unique=True)
    created_at: float = Field(default=datetime.datetime.now().timestamp())
    summary: str = Field(default="")
    embedded: bool = Field(default=False)

    __table_args__ = {"extend_existing": True}
rag_app/get_db_retriever.py
CHANGED
@@ -2,9 +2,9 @@
 
 # HF libraries
 from langchain.llms import HuggingFaceHub
-from
+from langchain_huggingface import HuggingFaceHubEmbeddings
 # vectorestore
-from
+from langchain_community.vectorstores import FAISS
 # retrieval chain
 from langchain.chains import RetrievalQA
 # prompt template
rag_app/{load_data_from_urls.py → loading_data/load_urls_recurisvely.py}
RENAMED
File without changes
rag_app/loading_data/scrap_website.py
ADDED
@@ -0,0 +1,36 @@
# scrap a given url recursively

import subprocess
import os
from urllib.parse import urlparse
from langchain_community.document_loaders import DirectoryLoader

def runcmd(cmd, verbose = False, *args, **kwargs):

    process = subprocess.Popen(
        cmd,
        stdout = subprocess.PIPE,
        stderr = subprocess.PIPE,
        text = True,
        shell = True
    )
    std_out, std_err = process.communicate()
    if verbose:
        print(std_out.strip(), std_err)
        pass
    return process.returncode

def scrap_website(target_url:str, depth:int=5):
    target_domain = urlparse(target_url).netloc
    target_directory='./downloads/'
    # To download the files locally for processing, here's the command line
    command_this=f'wget -e robots=off --recursive -l {depth} --no-clobber --page-requisites --html-extension \
        --convert-links --restrict-file-names=windows --force-directories --directory-prefix={target_directory}\
        --domains target_domain --no-parent {target_url}'
    cmd_status = runcmd(command_this, verbose=True)
    if cmd_status==0:
        documents_path = os.path.dirname(os.path.realpath(f'{target_directory}/{target_domain}'))
        loader = DirectoryLoader(documents_path, silent_errors=True, show_progress=True)
        docs = loader.load()

    return docs
rag_app/react_agent.py
CHANGED
@@ -0,0 +1,77 @@
# HF libraries
from langchain_huggingface import HuggingFaceEndpoint
from langchain.agents import AgentExecutor
from langchain.agents.format_scratchpad import format_log_to_str
from langchain.agents.output_parsers import ReActJsonSingleInputOutputParser
# Import things that are needed generically
from langchain.tools.render import render_text_description
import os
from dotenv import load_dotenv
from rag_app.structured_tools.structured_tools import (
    google_search, knowledgeBase_search
)

from langchain.prompts import PromptTemplate
from rag_app.templates.react_json_with_memory import template_system
# from innovation_pathfinder_ai.utils import logger
# from langchain.globals import set_llm_cache
# from langchain.cache import SQLiteCache

# set_llm_cache(SQLiteCache(database_path=".cache.db"))
# logger = logger.get_console_logger("hf_mixtral_agent")

config = load_dotenv(".env")
HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN')
GOOGLE_CSE_ID = os.getenv('GOOGLE_CSE_ID')
GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY')
# LANGCHAIN_TRACING_V2 = "true"
# LANGCHAIN_ENDPOINT = "https://api.smith.langchain.com"
# LANGCHAIN_API_KEY = os.getenv('LANGCHAIN_API_KEY')
# LANGCHAIN_PROJECT = os.getenv('LANGCHAIN_PROJECT')

# Load the model from the Hugging Face Hub
llm = HuggingFaceEndpoint(repo_id="mistralai/Mixtral-8x7B-Instruct-v0.1",
    temperature=0.1,
    max_new_tokens=1024,
    repetition_penalty=1.2,
    return_full_text=False
)


tools = [
    knowledgeBase_search,
    google_search,
]

prompt = PromptTemplate.from_template(
    template=template_system
)
prompt = prompt.partial(
    tools=render_text_description(tools),
    tool_names=", ".join([t.name for t in tools]),
)


# define the agent
chat_model_with_stop = llm.bind(stop=["\nObservation"])
agent = (
    {
        "input": lambda x: x["input"],
        "agent_scratchpad": lambda x: format_log_to_str(x["intermediate_steps"]),
        "chat_history": lambda x: x["chat_history"],
    }
    | prompt
    | chat_model_with_stop
    | ReActJsonSingleInputOutputParser()
)

# instantiate AgentExecutor
agent_executor = AgentExecutor(
    agent=agent,
    tools=tools,
    verbose=True,
    max_iterations=10, # cap number of iterations
    #max_execution_time=60, # timout at 60 sec
    return_intermediate_steps=True,
    handle_parsing_errors=True,
)
rag_app/structured_tools/__init__.py
ADDED
File without changes
rag_app/structured_tools/structured_tools.py
ADDED
@@ -0,0 +1,98 @@
from langchain.tools import BaseTool, StructuredTool, tool
from langchain_community.tools import WikipediaQueryRun
from langchain_community.utilities import WikipediaAPIWrapper
#from langchain.tools import Tool
from langchain_google_community import GoogleSearchAPIWrapper
from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings,
)
from langchain_community.vectorstores import Chroma
import ast

import chromadb

from rag_app.utils.utils import (
    parse_list_to_dicts, format_search_results
)
from rag_app.database.db_handler import (
    add_many
)

import os
# from innovation_pathfinder_ai.utils import create_wikipedia_urls_from_text

persist_directory = os.getenv('VECTOR_DATABASE_LOCATION')

@tool
def memory_search(query:str) -> str:
    """Search the memory vector store for existing knowledge and relevent pervious researches. \
        This is your primary source to start your search with checking what you already have learned from the past, before going online."""
    # Since we have more than one collections we should change the name of this tool
    client = chromadb.PersistentClient(
        path=persist_directory,
    )

    collection_name = os.getenv('CONVERSATION_COLLECTION_NAME')
    #store using envar

    embedding_function = SentenceTransformerEmbeddings(
        model_name=os.getenv("EMBEDDING_MODEL"),
    )

    vector_db = Chroma(
        client=client, # client for Chroma
        collection_name=collection_name,
        embedding_function=embedding_function,
    )

    retriever = vector_db.as_retriever()
    docs = retriever.invoke(query)

    return docs.__str__()

@tool
def knowledgeBase_search(query:str) -> str:
    """Search the internal knowledge base for research papers and relevent chunks"""
    # Since we have more than one collections we should change the name of this tool
    client = chromadb.PersistentClient(
        path=persist_directory,
    )

    #collection_name="ArxivPapers"
    #store using envar

    embedding_function = SentenceTransformerEmbeddings(
        model_name=os.getenv("EMBEDDING_MODEL"),
    )

    vector_db = Chroma(
        client=client, # client for Chroma
        #collection_name=collection_name,
        embedding_function=embedding_function,
    )

    retriever = vector_db.as_retriever()
    # This is deprecated, changed to invoke
    # LangChainDeprecationWarning: The method `BaseRetriever.get_relevant_documents` was deprecated in langchain-core 0.1.46 and will be removed in 0.3.0. Use invoke instead.
    docs = retriever.invoke(query)
    for doc in docs:
        print(doc)

    return docs.__str__()

@tool
def google_search(query: str) -> str:
    """Search Google for additional results when you can't answer questions using arxiv search or wikipedia search."""
    global all_sources

    websearch = GoogleSearchAPIWrapper()
    search_results:dict = websearch.results(query, 3)
    print(search_results)
    if len(search_results)>1:
        cleaner_sources =format_search_results(search_results)
        parsed_csources = parse_list_to_dicts(cleaner_sources)
        add_many(parsed_csources)
    else:
        cleaner_sources = search_results

    return cleaner_sources.__str__()
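Each of these functions is a regular LangChain tool, so it can also be exercised on its own outside the agent. A quick hypothetical check of the knowledge-base tool (assumes VECTOR_DATABASE_LOCATION, EMBEDDING_MODEL and the Chroma store already exist):

```python
# Hypothetical standalone check of the knowledge-base tool -- not part of this commit.
from rag_app.structured_tools.structured_tools import knowledgeBase_search

print(knowledgeBase_search.name)          # tool name as the agent sees it
print(knowledgeBase_search.description)   # docstring rendered into the prompt
print(knowledgeBase_search.invoke("household insurance water damage"))  # hypothetical query
```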
rag_app/templates/__init__.py
ADDED
File without changes
rag_app/templates/react_json_with_memory.py
ADDED
@@ -0,0 +1,47 @@
template_system = """
Answer the following questions as best you can. You have access to the following tools:

<TOOLS>
{tools}
</TOOLS>

The way you use the tools is by specifying a json blob.
Specifically, this json should have a `action` key (with the name of the tool to use) and a `action_input` key (with the input to the tool going here).

The only values that should be in the "action" field are: {tool_names}

The $JSON_BLOB should only contain a SINGLE action, do NOT return a list of multiple actions. Here is an example of a valid $JSON_BLOB:

```
{{
    "action": $TOOL_NAME,
    "action_input": $INPUT
}}
```

ALWAYS use the following format:

Question: the input question you must answer
Thought: you should always think about what to do
Action:
```
$JSON_BLOB
```
Observation: the result of the action
... (this Thought/Action/Observation can repeat N times)
Thought: I now know the final answer
Final Answer: the final answer to the original input question

Begin! Reminder to always use the exact characters `Final Answer` when responding.

Previous conversation history:
<CONVERSATION_HISTORY>
{chat_history}
</CONVERSATION_HISTORY>

<NEW_INPUT>
{input}
</NEW_INPUT>

{agent_scratchpad}
"""
rag_app/utils/__init__.py
ADDED
File without changes
rag_app/utils/logger.py
ADDED
@@ -0,0 +1,20 @@
# logger.py

import logging
from rich.logging import RichHandler
from typing import Optional


def get_console_logger(name: Optional[str] = "default") -> logging.Logger:
    logger = logging.getLogger(name)
    if not logger.handlers:
        logger.setLevel(logging.DEBUG)
        console_handler = RichHandler()
        console_handler.setLevel(logging.DEBUG)
        formatter = logging.Formatter(
            "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
        )
        console_handler.setFormatter(formatter)
        logger.addHandler(console_handler)

    return logger
rag_app/utils/utils.py
ADDED
@@ -0,0 +1,115 @@
import hashlib
import datetime
import os
import uuid

# from rag_app.utils import logger

# logger = logger.get_console_logger("utils")



def extract_urls(data_list):
    """
    Extracts URLs from a list of of dictionaries.

    Parameters:
    - formatted_list (list): A list of dictionaries, each containing 'Title:', 'link:', and 'summary:'.

    Returns:
    - list: A list of URLs extracted from the dictionaries.
    """
    urls = []
    print(data_list)
    for item in data_list:
        try:
            # Find the start and end indices of the URL
            lower_case = item.lower()
            link_prefix = 'link: '
            summary_prefix = ', summary:'
            start_idx = lower_case.index(link_prefix) + len(link_prefix)
            end_idx = lower_case.index(summary_prefix, start_idx)
            # Extract the URL using the indices found
            url = item[start_idx:end_idx]
            urls.append(url)
        except ValueError:
            # Handles the case where 'link: ' or ', summary:' is not found in the string
            print("Could not find a URL in the item:", item)
    last_sources = urls[-3:]
    return last_sources

def format_search_results(search_results):
    """
    Formats a list of dictionaries containing search results into a list of strings.
    Each dictionary is expected to have the keys 'title', 'link', and 'snippet'.

    Parameters:
    - search_results (list): A list of dictionaries, each containing 'title', 'link', and 'snippet'.

    Returns:
    - list: A list of formatted strings based on the search results.
    """
    if len(search_results)>1:
        formatted_results = [
            "Title: {title}, Link: {link}, Summary: {snippet}".format(**i)
            for i in search_results
        ]
    return formatted_results

def parse_list_to_dicts(items: list) -> list:
    parsed_items = []
    for item in items:
        # Extract title, link, and summary from each string
        title_start = item.find('Title: ') + len('Title: ')
        link_start = item.find('Link: ') + len('Link: ')
        summary_start = item.find('Summary: ') + len('Summary: ')

        title_end = item.find(', Link: ')
        link_end = item.find(', Summary: ')
        summary_end = len(item)

        title = item[title_start:title_end]
        link = item[link_start:link_end]
        summary = item[summary_start:summary_end]

        # Use the hash_text function for the hash_id
        hash_id = hash_text(link)

        # Construct the dictionary for each item
        parsed_item = {
            "url": link,
            "title": title,
            "hash_id": hash_id,
            "summary": summary
        }
        parsed_items.append(parsed_item)
    return parsed_items

def hash_text(text: str) -> str:
    return hashlib.md5(text.encode()).hexdigest()


def convert_timestamp_to_datetime(timestamp: str) -> str:
    return datetime.datetime.fromtimestamp(int(timestamp)).strftime("%Y-%m-%d %H:%M:%S")

def create_folder_if_not_exists(folder_path: str) -> None:
    """
    Create a folder if it doesn't already exist.

    Args:
    - folder_path (str): The path of the folder to create.
    """
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)
        print(f"Folder '{folder_path}' created.")
    else:
        print(f"Folder '{folder_path}' already exists.")

def generate_uuid() -> str:
    """
    Generate a UUID (Universally Unique Identifier) and return it as a string.

    Returns:
    str: A UUID string.
    """
    return str(uuid.uuid4())
requirements.txt
CHANGED
@@ -2,12 +2,16 @@
 langchain-community
 langchain-HuggingFace
 langchain-text-splitters
+langchain_google_community
 beautifulsoup4
 faiss-cpu
 chromadb
 validators
 sentence_transformers
 typing-extensions
-unstructured
+unstructured
+unstructured[pdf]
 gradio
-boto3
+boto3
+rich
+sqlmodel
test_this.py
CHANGED
@@ -1,23 +1,27 @@
-from rag_app.
+from rag_app.loading_data.load_urls_recurisvely import load_docs_from_urls
 from rag_app.create_embedding import create_embeddings
 from rag_app.generate_summary import generate_description, generate_keywords
-from rag_app.
+from rag_app.build_vector_store import build_vector_store
 
 # 1. load the urls
 # 2. build the vectorstore -> the function will create the chunking and embeddings
 # 3. initialize the db retriever
 # 4.
 
-docs = load_docs_from_urls(["https://www.wuerttembergische.de/"],6)
+# docs = load_docs_from_urls(["https://www.wuerttembergische.de/"],6)
 
-# for doc in docs:
-# keywords=generate_keywords(doc)
-# description=generate_description(doc)
-# doc.metadata['keywords']=keywords
-# doc.metadata['description']=description
-# print(doc.metadata)
+# # for doc in docs:
+# # keywords=generate_keywords(doc)
+# # description=generate_description(doc)
+# # doc.metadata['keywords']=keywords
+# # doc.metadata['description']=description
+# # print(doc.metadata)
 
-build_vector_store(docs, './vectorstore/faiss-insurance-agent-1500','sentence-transformers/multi-qa-mpnet-base-dot-v1',True,1500,150)
+# build_vector_store(docs, './vectorstore/faiss-insurance-agent-1500','sentence-transformers/multi-qa-mpnet-base-dot-v1',True,1500,150)
 
 
-#print(create_embeddings(docs))
+# print(create_embeddings(docs))
+
+from rag_app.loading_data.scrap_website import scrap_website
+
+print(scrap_website(target_url='https://www.wuerttembergische.de/',depth=1))