Spaces:
Sleeping
Sleeping
fahmiaziz98
commited on
Commit
·
40ca01e
1
Parent(s):
91c6bea
Refactor document processing and retrieval workflow; add utility functions for markdown conversion and logging
Browse files- app.py +24 -11
- requirements.txt +12 -12
- src/indexing/document_processing.py +5 -2
- src/indexing/vectore_store.py +10 -16
- src/tools_retrieval/retriever.py +14 -8
- src/utils.py +95 -0
app.py
CHANGED
|
@@ -4,8 +4,15 @@ from src.indexing.document_processing import DocumentProcessor
|
|
| 4 |
from src.indexing.vectore_store import VectorStoreManager
|
| 5 |
from src.tools_retrieval.retriever import RetrieverManager
|
| 6 |
from src.workflow import RAGWorkflow
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
|
| 8 |
-
|
| 9 |
UPLOAD_FOLDER = "uploads/"
|
| 10 |
PERSIST_DIRECTORY = "./chroma_db"
|
| 11 |
os.makedirs(UPLOAD_FOLDER, exist_ok=True)
|
|
@@ -29,28 +36,34 @@ st.set_page_config(
|
|
| 29 |
st.title("Agentic RAG Chatbot")
|
| 30 |
|
| 31 |
with st.sidebar:
|
| 32 |
-
st.header("
|
| 33 |
-
uploaded_file = st.file_uploader("Upload
|
| 34 |
-
st.
|
| 35 |
-
process_button = st.button("Process PDF")
|
| 36 |
|
| 37 |
if uploaded_file and process_button:
|
| 38 |
-
with st.spinner("Processing
|
| 39 |
file_path = os.path.join(UPLOAD_FOLDER, uploaded_file.name)
|
| 40 |
with open(file_path, "wb") as f:
|
| 41 |
f.write(uploaded_file.getbuffer())
|
| 42 |
-
|
|
|
|
| 43 |
doc_processor = DocumentProcessor()
|
| 44 |
-
chunks = doc_processor.load_and_split_pdf(
|
| 45 |
|
| 46 |
vector_store_manager = VectorStoreManager()
|
| 47 |
-
vector_store = vector_store_manager.index_documents(chunks
|
| 48 |
|
| 49 |
st.session_state.vector_store = vector_store
|
| 50 |
-
st.success("
|
|
|
|
|
|
|
| 51 |
|
| 52 |
retriever_manager = RetrieverManager(vector_store)
|
| 53 |
-
retriever_tool = retriever_manager.create_retriever(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 54 |
st.session_state.retriever = retriever_tool
|
| 55 |
st.success("Retriever tool created successfully!")
|
| 56 |
rag_workflow = RAGWorkflow(retriever_tool)
|
|
|
|
| 4 |
from src.indexing.vectore_store import VectorStoreManager
|
| 5 |
from src.tools_retrieval.retriever import RetrieverManager
|
| 6 |
from src.workflow import RAGWorkflow
|
| 7 |
+
from src.utils import (
|
| 8 |
+
logger,
|
| 9 |
+
convert_document_to_markdown,
|
| 10 |
+
save_to_markdown,
|
| 11 |
+
determine_top_k,
|
| 12 |
+
determine_reranking_top_n
|
| 13 |
+
)
|
| 14 |
|
| 15 |
+
|
| 16 |
UPLOAD_FOLDER = "uploads/"
|
| 17 |
PERSIST_DIRECTORY = "./chroma_db"
|
| 18 |
os.makedirs(UPLOAD_FOLDER, exist_ok=True)
|
|
|
|
| 36 |
st.title("Agentic RAG Chatbot")
|
| 37 |
|
| 38 |
with st.sidebar:
|
| 39 |
+
st.header("Upload")
|
| 40 |
+
uploaded_file = st.file_uploader("Upload Document", type=["pdf", "xlsx", "docx", "txt"])
|
| 41 |
+
process_button = st.button("Process Document")
|
|
|
|
| 42 |
|
| 43 |
if uploaded_file and process_button:
|
| 44 |
+
with st.spinner("Processing Document..."):
|
| 45 |
file_path = os.path.join(UPLOAD_FOLDER, uploaded_file.name)
|
| 46 |
with open(file_path, "wb") as f:
|
| 47 |
f.write(uploaded_file.getbuffer())
|
| 48 |
+
convert_to_md = convert_document_to_markdown(file_path)
|
| 49 |
+
file_path_md = save_to_markdown(convert_to_md, file_path)
|
| 50 |
doc_processor = DocumentProcessor()
|
| 51 |
+
chunks = doc_processor.load_and_split_pdf(file_path_md)
|
| 52 |
|
| 53 |
vector_store_manager = VectorStoreManager()
|
| 54 |
+
vector_store = vector_store_manager.index_documents(chunks)
|
| 55 |
|
| 56 |
st.session_state.vector_store = vector_store
|
| 57 |
+
st.success("Document processed and indexed successfully!")
|
| 58 |
+
top_k = determine_top_k(len(chunks))
|
| 59 |
+
top_n = determine_reranking_top_n(top_k)
|
| 60 |
|
| 61 |
retriever_manager = RetrieverManager(vector_store)
|
| 62 |
+
retriever_tool = retriever_manager.create_retriever(
|
| 63 |
+
documents=chunks,
|
| 64 |
+
top_n=top_n,
|
| 65 |
+
k=top_k
|
| 66 |
+
)
|
| 67 |
st.session_state.retriever = retriever_tool
|
| 68 |
st.success("Retriever tool created successfully!")
|
| 69 |
rag_workflow = RAGWorkflow(retriever_tool)
|
requirements.txt
CHANGED
|
@@ -1,12 +1,12 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
langchain-
|
| 4 |
-
langchain-
|
| 5 |
-
|
| 6 |
-
scikit-learn
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
|
|
|
| 1 |
+
markitdown[all]==0.1.1
|
| 2 |
+
langchain==0.3.24
|
| 3 |
+
langchain-groq==0.3.2
|
| 4 |
+
langchain-community==0.3.23
|
| 5 |
+
langgraph==0.4.3
|
| 6 |
+
scikit-learn==1.6.1
|
| 7 |
+
tiktoken==0.9.0
|
| 8 |
+
rank_bm25==0.2.2
|
| 9 |
+
fastembed==0.6.1
|
| 10 |
+
flashrank==0.2.10
|
| 11 |
+
langchain-unstructured==0.1.6
|
| 12 |
+
unstructured==0.17.2
|
src/indexing/document_processing.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
| 1 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 2 |
-
from
|
|
|
|
| 3 |
|
| 4 |
class DocumentProcessor:
|
| 5 |
def __init__(self, chunk_size=500, chunk_overlap=100):
|
|
@@ -10,7 +11,9 @@ class DocumentProcessor:
|
|
| 10 |
|
| 11 |
def load_and_split_pdf(self, file_path: str):
|
| 12 |
"""Load PDF and split into chunks"""
|
| 13 |
-
|
|
|
|
| 14 |
docs = loader.load()
|
| 15 |
chunks = self.text_splitter.split_documents(docs)
|
|
|
|
| 16 |
return chunks
|
|
|
|
| 1 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 2 |
+
from langchain_community.document_loaders import UnstructuredMarkdownLoader
|
| 3 |
+
from src.utils import logger
|
| 4 |
|
| 5 |
class DocumentProcessor:
|
| 6 |
def __init__(self, chunk_size=500, chunk_overlap=100):
|
|
|
|
| 11 |
|
| 12 |
def load_and_split_pdf(self, file_path: str):
|
| 13 |
"""Load PDF and split into chunks"""
|
| 14 |
+
logger.info(f"Loading and splitting PDF: {file_path}")
|
| 15 |
+
loader = UnstructuredMarkdownLoader(file_path)
|
| 16 |
docs = loader.load()
|
| 17 |
chunks = self.text_splitter.split_documents(docs)
|
| 18 |
+
logger.info(f"Loaded and split PDF into {len(chunks)} chunks")
|
| 19 |
return chunks
|
src/indexing/vectore_store.py
CHANGED
|
@@ -1,27 +1,21 @@
|
|
| 1 |
-
from langchain_huggingface import HuggingFaceEmbeddings
|
| 2 |
from langchain_community.vectorstores import SKLearnVectorStore
|
| 3 |
-
from
|
| 4 |
|
| 5 |
class VectorStoreManager:
|
| 6 |
-
def __init__(self, embedding_model="
|
| 7 |
-
self.embeddings =
|
| 8 |
|
| 9 |
-
def create_vector_store(self
|
| 10 |
"""Create a new vector store"""
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
# )
|
| 15 |
-
vector_store = Chroma(
|
| 16 |
-
collection_name=collection_name,
|
| 17 |
-
embedding_function=self.embeddings,
|
| 18 |
-
persist_directory=presist_directory, # Where to save data locally, remove if not necessary
|
| 19 |
)
|
| 20 |
-
|
| 21 |
return vector_store
|
| 22 |
|
| 23 |
-
def index_documents(self, documents
|
| 24 |
"""Index documents into vector store"""
|
| 25 |
-
vector_store = self.create_vector_store(
|
| 26 |
vector_store.add_documents(documents=documents)
|
| 27 |
return vector_store
|
|
|
|
|
|
|
| 1 |
from langchain_community.vectorstores import SKLearnVectorStore
|
| 2 |
+
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
|
| 3 |
|
| 4 |
class VectorStoreManager:
|
| 5 |
+
def __init__(self, embedding_model="BAAI/bge-base-en-v1.5"):
|
| 6 |
+
self.embeddings = FastEmbedEmbeddings(model_name=embedding_model)
|
| 7 |
|
| 8 |
+
def create_vector_store(self):
|
| 9 |
"""Create a new vector store"""
|
| 10 |
+
vector_store = SKLearnVectorStore.from_documents(
|
| 11 |
+
metric="cosine",
|
| 12 |
+
embedding=self.embeddings,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
)
|
| 14 |
+
|
| 15 |
return vector_store
|
| 16 |
|
| 17 |
+
def index_documents(self, documents):
|
| 18 |
"""Index documents into vector store"""
|
| 19 |
+
vector_store = self.create_vector_store()
|
| 20 |
vector_store.add_documents(documents=documents)
|
| 21 |
return vector_store
|
src/tools_retrieval/retriever.py
CHANGED
|
@@ -10,25 +10,31 @@ class RetrieverManager:
|
|
| 10 |
def __init__(self, vector_store):
|
| 11 |
self.vector_store = vector_store
|
| 12 |
|
| 13 |
-
def create_base_retriever(self, search_type="similarity", k=3):
|
| 14 |
"""Create basic vector store retriever"""
|
| 15 |
return self.vector_store.as_retriever(
|
| 16 |
search_type=search_type,
|
| 17 |
search_kwargs={"k": k}
|
| 18 |
)
|
| 19 |
|
| 20 |
-
def create_ensemble_retriever(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
"""Create ensemble retriever combining vector and keyword search"""
|
| 22 |
-
vector_retriever = self.create_base_retriever()
|
| 23 |
keyword_retriever = BM25Retriever.from_documents(texts)
|
| 24 |
-
keyword_retriever.k =
|
| 25 |
|
| 26 |
return EnsembleRetriever(
|
| 27 |
retrievers=[vector_retriever, keyword_retriever],
|
| 28 |
weights=[vector_weight, keyword_weight]
|
| 29 |
)
|
| 30 |
|
| 31 |
-
def create_compression_retriever(self, base_retriever, top_n
|
| 32 |
"""Create compression retriever with reranking"""
|
| 33 |
compressor = FlashrankRerank(top_n=top_n)
|
| 34 |
return ContextualCompressionRetriever(
|
|
@@ -36,9 +42,9 @@ class RetrieverManager:
|
|
| 36 |
base_retriever=base_retriever
|
| 37 |
)
|
| 38 |
|
| 39 |
-
def create_retriever(self, documents):
|
| 40 |
-
base_retriever = self.create_ensemble_retriever(documents)
|
| 41 |
-
compression_retriever = self.create_compression_retriever(base_retriever=base_retriever)
|
| 42 |
return create_retriever_tool(
|
| 43 |
compression_retriever,
|
| 44 |
"retrieve_docs",
|
|
|
|
| 10 |
def __init__(self, vector_store):
|
| 11 |
self.vector_store = vector_store
|
| 12 |
|
| 13 |
+
def create_base_retriever(self, search_type: str ="similarity", k: int = 3):
|
| 14 |
"""Create basic vector store retriever"""
|
| 15 |
return self.vector_store.as_retriever(
|
| 16 |
search_type=search_type,
|
| 17 |
search_kwargs={"k": k}
|
| 18 |
)
|
| 19 |
|
| 20 |
+
def create_ensemble_retriever(
|
| 21 |
+
self,
|
| 22 |
+
texts,
|
| 23 |
+
k: int = 3,
|
| 24 |
+
vector_weight: float = 0.5,
|
| 25 |
+
keyword_weight: float =0.5
|
| 26 |
+
):
|
| 27 |
"""Create ensemble retriever combining vector and keyword search"""
|
| 28 |
+
vector_retriever = self.create_base_retriever(k=k)
|
| 29 |
keyword_retriever = BM25Retriever.from_documents(texts)
|
| 30 |
+
keyword_retriever.k = k
|
| 31 |
|
| 32 |
return EnsembleRetriever(
|
| 33 |
retrievers=[vector_retriever, keyword_retriever],
|
| 34 |
weights=[vector_weight, keyword_weight]
|
| 35 |
)
|
| 36 |
|
| 37 |
+
def create_compression_retriever(self, base_retriever, top_n: int):
|
| 38 |
"""Create compression retriever with reranking"""
|
| 39 |
compressor = FlashrankRerank(top_n=top_n)
|
| 40 |
return ContextualCompressionRetriever(
|
|
|
|
| 42 |
base_retriever=base_retriever
|
| 43 |
)
|
| 44 |
|
| 45 |
+
def create_retriever(self, documents, top_n: int, k: int = 3, ):
|
| 46 |
+
base_retriever = self.create_ensemble_retriever(texts=documents, k=k)
|
| 47 |
+
compression_retriever = self.create_compression_retriever(base_retriever=base_retriever, top_n=top_n)
|
| 48 |
return create_retriever_tool(
|
| 49 |
compression_retriever,
|
| 50 |
"retrieve_docs",
|
src/utils.py
ADDED
|
@@ -0,0 +1,95 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import logging
|
| 3 |
+
from typing import Any
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
from markitdown import MarkItDown
|
| 6 |
+
|
| 7 |
+
def setup_logging():
|
| 8 |
+
"""Sets up the logging configuration."""
|
| 9 |
+
logging.basicConfig(
|
| 10 |
+
level=logging.INFO,
|
| 11 |
+
format='%(asctime)s - %(levelname)s - %(message)s',
|
| 12 |
+
handlers=[
|
| 13 |
+
logging.FileHandler("app.log", encoding="utf-8"),
|
| 14 |
+
logging.StreamHandler()
|
| 15 |
+
]
|
| 16 |
+
)
|
| 17 |
+
return logging.getLogger(__name__)
|
| 18 |
+
|
| 19 |
+
logger = setup_logging()
|
| 20 |
+
|
| 21 |
+
def extract_filename(filepath: Path) -> str:
|
| 22 |
+
"""Extracts the filename without extension.
|
| 23 |
+
|
| 24 |
+
Args:
|
| 25 |
+
filepath: The complete path to the file.
|
| 26 |
+
|
| 27 |
+
Returns:
|
| 28 |
+
The filename without extension.
|
| 29 |
+
"""
|
| 30 |
+
logger.info(f"Extracting filename from {filepath}")
|
| 31 |
+
return os.path.splitext(os.path.basename(filepath))[0] # More concise way to get filename
|
| 32 |
+
|
| 33 |
+
def convert_document_to_markdown(filepath: Path) -> str:
|
| 34 |
+
"""Converts a document to markdown.
|
| 35 |
+
|
| 36 |
+
Args:
|
| 37 |
+
filepath: The path to the document file.
|
| 38 |
+
|
| 39 |
+
Returns:
|
| 40 |
+
The raw markdown content.
|
| 41 |
+
"""
|
| 42 |
+
logger.info(f"Converting document to markdown: {filepath}")
|
| 43 |
+
md = MarkItDown(enable_plugins=False) # Set to True to enable plugins if needed
|
| 44 |
+
result = md.convert(filepath)
|
| 45 |
+
return result.markdown
|
| 46 |
+
|
| 47 |
+
def save_to_markdown(text: Any, path: Path) -> str:
|
| 48 |
+
"""Saves text content to a markdown file.
|
| 49 |
+
|
| 50 |
+
Args:
|
| 51 |
+
text: The text or markdown content to save.
|
| 52 |
+
path: The complete path to the markdown file.
|
| 53 |
+
|
| 54 |
+
Returns:
|
| 55 |
+
The path to the saved markdown file as a string.
|
| 56 |
+
"""
|
| 57 |
+
|
| 58 |
+
filename = extract_filename(path)
|
| 59 |
+
filepath = f'{filename}.md' # Create the full filepath
|
| 60 |
+
with open(filepath, 'w', encoding='utf-8') as f:
|
| 61 |
+
f.write(text)
|
| 62 |
+
logger.info(f"Markdown file saved successfully at {filepath}")
|
| 63 |
+
return filepath # Return the filepath
|
| 64 |
+
|
| 65 |
+
def determine_top_k(num_chunks: int) -> int:
|
| 66 |
+
"""Determines the top_k value based on the number of chunks.
|
| 67 |
+
|
| 68 |
+
Args:
|
| 69 |
+
num_chunks: The total number of chunks.
|
| 70 |
+
|
| 71 |
+
Returns:
|
| 72 |
+
The appropriate top_k value.
|
| 73 |
+
"""
|
| 74 |
+
if num_chunks <= 5:
|
| 75 |
+
top_k = num_chunks
|
| 76 |
+
else:
|
| 77 |
+
top_k = 5
|
| 78 |
+
logger.info(f"Determined top_k: {top_k} based on num_chunks: {num_chunks}")
|
| 79 |
+
return top_k
|
| 80 |
+
|
| 81 |
+
def determine_reranking_top_n(top_k: int) -> int:
|
| 82 |
+
"""Determines the top_n value for reranking based on top_k.
|
| 83 |
+
Args:
|
| 84 |
+
top_k: The number of top results to consider.
|
| 85 |
+
Returns:
|
| 86 |
+
The appropriate top_n value for reranking.
|
| 87 |
+
"""
|
| 88 |
+
total_top_k = top_k * 2
|
| 89 |
+
|
| 90 |
+
if total_top_k <= 5:
|
| 91 |
+
top_n = round(total_top_k / 2) + 1
|
| 92 |
+
else:
|
| 93 |
+
top_n = 6
|
| 94 |
+
logger.info(f"Determined top_n: {top_n} based on top_k: {top_k}")
|
| 95 |
+
return top_n
|