fahmiaziz98 committed
Commit 40ca01e · Parent: 91c6bea

Refactor document processing and retrieval workflow; add utility functions for markdown conversion and logging

app.py CHANGED
@@ -4,8 +4,15 @@ from src.indexing.document_processing import DocumentProcessor
 from src.indexing.vectore_store import VectorStoreManager
 from src.tools_retrieval.retriever import RetrieverManager
 from src.workflow import RAGWorkflow
+from src.utils import (
+    logger,
+    convert_document_to_markdown,
+    save_to_markdown,
+    determine_top_k,
+    determine_reranking_top_n
+)
 
-
+
 UPLOAD_FOLDER = "uploads/"
 PERSIST_DIRECTORY = "./chroma_db"
 os.makedirs(UPLOAD_FOLDER, exist_ok=True)
@@ -29,28 +36,34 @@ st.set_page_config(
 st.title("Agentic RAG Chatbot")
 
 with st.sidebar:
-    st.header("PDF Upload")
-    uploaded_file = st.file_uploader("Upload your PDF", type=["pdf"])
-    st.info("Supported file type: PDF")
-    process_button = st.button("Process PDF")
+    st.header("Upload")
+    uploaded_file = st.file_uploader("Upload Document", type=["pdf", "xlsx", "docx", "txt"])
+    process_button = st.button("Process Document")
 
 if uploaded_file and process_button:
-    with st.spinner("Processing PDF..."):
+    with st.spinner("Processing Document..."):
         file_path = os.path.join(UPLOAD_FOLDER, uploaded_file.name)
         with open(file_path, "wb") as f:
             f.write(uploaded_file.getbuffer())
-
+        convert_to_md = convert_document_to_markdown(file_path)
+        file_path_md = save_to_markdown(convert_to_md, file_path)
         doc_processor = DocumentProcessor()
-        chunks = doc_processor.load_and_split_pdf(file_path)
+        chunks = doc_processor.load_and_split_pdf(file_path_md)
 
         vector_store_manager = VectorStoreManager()
-        vector_store = vector_store_manager.index_documents(chunks, uploaded_file.name, PERSIST_DIRECTORY)
+        vector_store = vector_store_manager.index_documents(chunks)
 
         st.session_state.vector_store = vector_store
-        st.success("PDF processed and indexed successfully!")
+        st.success("Document processed and indexed successfully!")
+        top_k = determine_top_k(len(chunks))
+        top_n = determine_reranking_top_n(top_k)
 
         retriever_manager = RetrieverManager(vector_store)
-        retriever_tool = retriever_manager.create_retriever(chunks)
+        retriever_tool = retriever_manager.create_retriever(
+            documents=chunks,
+            top_n=top_n,
+            k=top_k
+        )
         st.session_state.retriever = retriever_tool
         st.success("Retriever tool created successfully!")
         rag_workflow = RAGWorkflow(retriever_tool)
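
Condensed, the new upload flow amounts to the following. This is a sketch under the assumption that the modules behave exactly as the diff shows; the `build_retriever_tool` wrapper is hypothetical and not part of the commit.

```python
# Hypothetical wrapper around the new per-upload flow (not in the commit);
# module paths and return types are inferred from the diff above.
from src.indexing.document_processing import DocumentProcessor
from src.indexing.vectore_store import VectorStoreManager
from src.tools_retrieval.retriever import RetrieverManager
from src.utils import (
    convert_document_to_markdown,
    save_to_markdown,
    determine_top_k,
    determine_reranking_top_n,
)

def build_retriever_tool(file_path: str):
    # Convert the uploaded file to markdown, then chunk and index it.
    markdown_text = convert_document_to_markdown(file_path)
    md_path = save_to_markdown(markdown_text, file_path)
    chunks = DocumentProcessor().load_and_split_pdf(md_path)
    vector_store = VectorStoreManager().index_documents(chunks)
    # Size retrieval depth to the document: small files get fewer hits.
    top_k = determine_top_k(len(chunks))
    top_n = determine_reranking_top_n(top_k)
    return RetrieverManager(vector_store).create_retriever(
        documents=chunks, top_n=top_n, k=top_k
    )
```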
requirements.txt CHANGED
@@ -1,12 +1,12 @@
-langchain
-langgraph
-langchain-huggingface
-langchain-groq
-langchain-community
-scikit-learn
-langchain-chroma
-pypdf==5.1.0
-tiktoken
-rank_bm25
-fastembed
-flashrank
+markitdown[all]==0.1.1
+langchain==0.3.24
+langchain-groq==0.3.2
+langchain-community==0.3.23
+langgraph==0.4.3
+scikit-learn==1.6.1
+tiktoken==0.9.0
+rank_bm25==0.2.2
+fastembed==0.6.1
+flashrank==0.2.10
+langchain-unstructured==0.1.6
+unstructured==0.17.2
src/indexing/document_processing.py CHANGED
@@ -1,5 +1,6 @@
 from langchain.text_splitter import RecursiveCharacterTextSplitter
-from langchain.document_loaders import PyPDFLoader
+from langchain_community.document_loaders import UnstructuredMarkdownLoader
+from src.utils import logger
 
 class DocumentProcessor:
     def __init__(self, chunk_size=500, chunk_overlap=100):
@@ -10,7 +11,9 @@ class DocumentProcessor:
 
     def load_and_split_pdf(self, file_path: str):
         """Load PDF and split into chunks"""
-        loader = PyPDFLoader(file_path)
+        logger.info(f"Loading and splitting PDF: {file_path}")
+        loader = UnstructuredMarkdownLoader(file_path)
         docs = loader.load()
         chunks = self.text_splitter.split_documents(docs)
+        logger.info(f"Loaded and split PDF into {len(chunks)} chunks")
        return chunks
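
Note that despite its name, `load_and_split_pdf` now reads the markdown file produced by `save_to_markdown`. A minimal standalone use might look like this (`report.md` is a hypothetical input file):

```python
# Standalone use of the updated processor; "report.md" is a hypothetical
# file produced earlier by save_to_markdown.
processor = DocumentProcessor(chunk_size=500, chunk_overlap=100)
chunks = processor.load_and_split_pdf("report.md")
print(len(chunks), chunks[0].page_content[:80])
```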
src/indexing/vectore_store.py CHANGED
@@ -1,27 +1,22 @@
-from langchain_huggingface import HuggingFaceEmbeddings
 from langchain_community.vectorstores import SKLearnVectorStore
-from langchain_chroma import Chroma
+from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
 
 class VectorStoreManager:
-    def __init__(self, embedding_model="intfloat/multilingual-e5-small"):
-        self.embeddings = HuggingFaceEmbeddings(model_name=embedding_model)
+    def __init__(self, embedding_model="BAAI/bge-base-en-v1.5"):
+        self.embeddings = FastEmbedEmbeddings(model_name=embedding_model)
 
-    def create_vector_store(self, collection_name, presist_directory):
+    def create_vector_store(self):
         """Create a new vector store"""
-        # vector_store = SKLearnVectorStore.from_documents(
-        #     documents=documents,
-        #     embedding=self.embeddings,
-        # )
-        vector_store = Chroma(
-            collection_name=collection_name,
-            embedding_function=self.embeddings,
-            persist_directory=presist_directory,  # Where to save data locally, remove if not necessary
+        # Build an empty in-memory store; documents are added in index_documents.
+        vector_store = SKLearnVectorStore(
+            embedding=self.embeddings,
+            metric="cosine",
         )
-
+
         return vector_store
 
-    def index_documents(self, documents, collection_name, presist_directory):
+    def index_documents(self, documents):
         """Index documents into vector store"""
-        vector_store = self.create_vector_store(collection_name, presist_directory)
+        vector_store = self.create_vector_store()
         vector_store.add_documents(documents=documents)
         return vector_store
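
As a quick smoke test of the new manager, assuming the class behaves as patched above (the sample `Document` and query are illustrative):

```python
# Illustrative smoke test; the sample document and query are made up.
from langchain_core.documents import Document

manager = VectorStoreManager()  # downloads BAAI/bge-base-en-v1.5 on first use
store = manager.index_documents([Document(page_content="hello world")])
print(store.similarity_search("hello", k=1))
```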
src/tools_retrieval/retriever.py CHANGED
@@ -10,25 +10,31 @@ class RetrieverManager:
     def __init__(self, vector_store):
         self.vector_store = vector_store
 
-    def create_base_retriever(self, search_type="similarity", k=3):
+    def create_base_retriever(self, search_type: str = "similarity", k: int = 3):
         """Create basic vector store retriever"""
         return self.vector_store.as_retriever(
             search_type=search_type,
             search_kwargs={"k": k}
         )
 
-    def create_ensemble_retriever(self, texts, vector_weight=0.5, keyword_weight=0.5):
+    def create_ensemble_retriever(
+        self,
+        texts,
+        k: int = 3,
+        vector_weight: float = 0.5,
+        keyword_weight: float = 0.5
+    ):
         """Create ensemble retriever combining vector and keyword search"""
-        vector_retriever = self.create_base_retriever()
+        vector_retriever = self.create_base_retriever(k=k)
         keyword_retriever = BM25Retriever.from_documents(texts)
-        keyword_retriever.k = 3
+        keyword_retriever.k = k
 
         return EnsembleRetriever(
             retrievers=[vector_retriever, keyword_retriever],
             weights=[vector_weight, keyword_weight]
         )
 
-    def create_compression_retriever(self, base_retriever, top_n=5):
+    def create_compression_retriever(self, base_retriever, top_n: int):
         """Create compression retriever with reranking"""
         compressor = FlashrankRerank(top_n=top_n)
         return ContextualCompressionRetriever(
@@ -36,9 +42,9 @@ class RetrieverManager:
             base_retriever=base_retriever
         )
 
-    def create_retriever(self, documents):
-        base_retriever = self.create_ensemble_retriever(documents)
-        compression_retriever = self.create_compression_retriever(base_retriever=base_retriever)
+    def create_retriever(self, documents, top_n: int, k: int = 3):
+        base_retriever = self.create_ensemble_retriever(texts=documents, k=k)
+        compression_retriever = self.create_compression_retriever(base_retriever=base_retriever, top_n=top_n)
         return create_retriever_tool(
             compression_retriever,
             "retrieve_docs",
src/utils.py ADDED
@@ -0,0 +1,95 @@
+import os
+import logging
+from typing import Any
+from pathlib import Path
+from markitdown import MarkItDown
+
+def setup_logging():
+    """Sets up the logging configuration."""
+    logging.basicConfig(
+        level=logging.INFO,
+        format='%(asctime)s - %(levelname)s - %(message)s',
+        handlers=[
+            logging.FileHandler("app.log", encoding="utf-8"),
+            logging.StreamHandler()
+        ]
+    )
+    return logging.getLogger(__name__)
+
+logger = setup_logging()
+
+def extract_filename(filepath: Path) -> str:
+    """Extracts the filename without extension.
+
+    Args:
+        filepath: The complete path to the file.
+
+    Returns:
+        The filename without extension.
+    """
+    logger.info(f"Extracting filename from {filepath}")
+    return os.path.splitext(os.path.basename(filepath))[0]  # More concise way to get filename
+
+def convert_document_to_markdown(filepath: Path) -> str:
+    """Converts a document to markdown.
+
+    Args:
+        filepath: The path to the document file.
+
+    Returns:
+        The raw markdown content.
+    """
+    logger.info(f"Converting document to markdown: {filepath}")
+    md = MarkItDown(enable_plugins=False)  # Set to True to enable plugins if needed
+    result = md.convert(filepath)
+    return result.markdown
+
+def save_to_markdown(text: Any, path: Path) -> str:
+    """Saves text content to a markdown file.
+
+    Args:
+        text: The text or markdown content to save.
+        path: The complete path to the markdown file.
+
+    Returns:
+        The path to the saved markdown file as a string.
+    """
+
+    filename = extract_filename(path)
+    filepath = f'{filename}.md'  # Create the full filepath
+    with open(filepath, 'w', encoding='utf-8') as f:
+        f.write(text)
+    logger.info(f"Markdown file saved successfully at {filepath}")
+    return filepath  # Return the filepath
+
+def determine_top_k(num_chunks: int) -> int:
+    """Determines the top_k value based on the number of chunks.
+
+    Args:
+        num_chunks: The total number of chunks.
+
+    Returns:
+        The appropriate top_k value.
+    """
+    if num_chunks <= 5:
+        top_k = num_chunks
+    else:
+        top_k = 5
+    logger.info(f"Determined top_k: {top_k} based on num_chunks: {num_chunks}")
+    return top_k
+
+def determine_reranking_top_n(top_k: int) -> int:
+    """Determines the top_n value for reranking based on top_k.
+    Args:
+        top_k: The number of top results to consider.
+    Returns:
+        The appropriate top_n value for reranking.
+    """
+    total_top_k = top_k * 2
+
+    if total_top_k <= 5:
+        top_n = round(total_top_k / 2) + 1
+    else:
+        top_n = 6
+    logger.info(f"Determined top_n: {top_n} based on top_k: {top_k}")
+    return top_n
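
To make the sizing heuristics concrete, a few worked values, derived directly from the functions above:

```python
# Worked values for the sizing heuristics above.
# determine_top_k caps retrieval depth at 5:
assert determine_top_k(3) == 3    # small doc: use every chunk
assert determine_top_k(40) == 5   # large doc: cap at 5
# determine_reranking_top_n doubles top_k (one slice per ensemble branch),
# keeps roughly half plus one for small totals, and caps at 6 otherwise:
assert determine_reranking_top_n(2) == 3   # 2*2 = 4 <= 5 -> round(4/2)+1 = 3
assert determine_reranking_top_n(5) == 6   # 2*5 = 10 > 5 -> 6
```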