DenysPetro committed
Commit • 2fa7106
Parent(s): 0619696
changed structure
Browse files
- app.py +22 -503
- assistant.py +97 -0
- citation.py +99 -0
- sherlock.pdf → data/sherlock.pdf +0 -0
- embeddings.py +58 -0
- reranker.py +32 -0
- retrievers.py +174 -0
- text_processing.py +33 -0
app.py
CHANGED
@@ -1,520 +1,37 @@
-import numpy as np
-import pandas as pd
-import faiss
-import fitz
 import nltk
-import re
-import os
-from abc import ABC, abstractmethod
-from sentence_transformers import SentenceTransformer
-from sentence_transformers import CrossEncoder
-from rank_bm25 import BM25Okapi
-from sklearn.preprocessing import normalize
-from langchain.text_splitter import RecursiveCharacterTextSplitter
-from FlagEmbedding import FlagReranker
 import gradio as gr
-from litellm import completion
-import textwrap
-

+from assistant import Assistant
+from citation import Citation
+from retrievers import KeyWordRetriever, SemanticRetriever, HybridRetriever
+from text_processing import extract_text_from_pdf, clean_text
+from embeddings import process_pdf_for_rag
+from reranker import Reranker

 nltk.download('punkt', quiet=True)
 nltk.download('punkt')
 nltk.download('punkt_tab')

-def extract_text_from_pdf(pdf_path):
-    doc = fitz.open(pdf_path)
-    text = ""
-    for page in doc:
-        text += page.get_text()
-    return text
-
-
-def clean_text(text):
-    text = re.sub(r'(\s?\.\s?)+', '', text)
-    text = re.sub(r'\s+', ' ', text)
-    text = re.sub(r'[^\x00-\x7F]+', '', text)
-    return text.strip()
-
-
-def chunk_text(text, chunk_size=1000, chunk_overlap=150):
-    text_splitter = RecursiveCharacterTextSplitter(
-        chunk_size=chunk_size,
-        chunk_overlap=chunk_overlap,
-        separators=["\n\n", "\n", " "]
-    )
-    return text_splitter.split_text(text)
-
-
-def generate_embeddings(chunks, model_name='all-MiniLM-L6-v2'):
-    model = SentenceTransformer(model_name)
-    embeddings = model.encode(chunks, convert_to_numpy=True)
-    return embeddings
-
-
-def process_pdf_for_rag(pdf_path, chunk_size=500, model_name='all-MiniLM-L6-v2'):
-    print("Extracting text from PDF...")
-    raw_text = extract_text_from_pdf(pdf_path)
-    print("Cleaning text...")
-    clean_text_content = clean_text(raw_text)
-    print("Chunking text...")
-    chunks = chunk_text(clean_text_content, chunk_size)
-    print("Generating embeddings...")
-    embeddings = generate_embeddings(chunks, model_name)
-    print("Processing complete!")
-    return chunks
-
-class Citation:
-    def __init__(self, cleaned_text):
-        """
-        Initialize with cleaned text.
-        :param cleaned_text: The cleaned text from the PDF.
-        """
-        self.cleaned_text = cleaned_text
-        self.chapters = self.extract_chapter_names()
-        self.chapter_data = self.extract_chapter_data()
-        self.df = self.to_dataframe()
-
-    def extract_chapter_names(self):
-        """
-        Extract chapter names using the provided `extract_chapter_names` function.
-        :return: List of chapter names.
-        """
-        toc_match = re.search(r"Table of contents(.*?)Table of contents", self.cleaned_text, re.DOTALL)
-        if not toc_match:
-            raise ValueError("Table of Contents section not found.")
-
-        toc_section = toc_match.group(1)
-        chapter_names = re.findall(r"(.*?)\d+", toc_section)
-        return [name.strip() for name in chapter_names if name.strip()]
-
-    def extract_chapter_data(self):
-        """
-        Extract full text for each chapter from the second occurrence of the chapter name
-        to the second occurrence of the next chapter name, or to the end if it's the last chapter.
-        :return: A list of tuples (chapter_name, plain_text).
-        Second occurence because first in text is in table of contents :)
-        """
-        chapter_data = []
-
-        for i, chapter in enumerate(self.chapters):
-            current_chapter_pattern = re.escape(chapter)
-            next_chapter_pattern = re.escape(self.chapters[i + 1]) if i + 1 < len(self.chapters) else None
-
-            if next_chapter_pattern:
-                matches = list(re.finditer(rf"{current_chapter_pattern}(.*?){next_chapter_pattern}", self.cleaned_text, re.DOTALL))
-            else:
-                matches = list(re.finditer(rf"{current_chapter_pattern}(.*)", self.cleaned_text, re.DOTALL))
-
-            if len(matches) >= 2:
-                start = matches[1].start(1)
-                end = matches[1].end(1)
-
-                if next_chapter_pattern:
-                    next_match = re.search(rf"{next_chapter_pattern}", self.cleaned_text[end:])
-                    if next_match:
-                        end += next_match.start()
-
-                chapter_content = self.cleaned_text[start:end].strip()
-            else:
-                chapter_content = ""
-
-            if i == len(self.chapters) - 1:
-                last_chapter_matches = list(re.finditer(rf"{current_chapter_pattern}", self.cleaned_text))
-
-                if len(last_chapter_matches) >= 2:
-                    start = last_chapter_matches[1].start()
-                    chapter_content = self.cleaned_text[start:].strip()
-
-            chapter_data.append((chapter, chapter_content))
-
-        return chapter_data
-
-    def to_dataframe(self):
-        """
-        Converts the chapter data into a Pandas DataFrame.
-        :return: DataFrame with 'Chapter' and 'Content' columns.
-        """
-        df = pd.DataFrame(self.chapter_data, columns=['Chapter', 'Content'])
-        return df
-
-    def search_citate(self, retrieved_chunks):
-        """
-        Finds the chapter name for each chunk in the content and returns a formatted string.
-
-        Args:
-            retrieved_chunks (list): List of text chunks to search in the chapters' content.
-
-        Returns:
-            str: A formatted string of citations, each chunk starting on a new line.
-        """
-        citations = []
-
-        for idx, chunk in enumerate(retrieved_chunks, start=1):
-            for _, row in self.df.iterrows():
-                chapter_name = row['Chapter']
-                chapter_content = row['Content']
-
-                if chunk in chapter_content:
-                    citations.append(f"chunk [{idx}] from chapter: {chapter_name}")
-                    break
-
-        return "\n".join(citations)
-
-
-# pdf_text = extract_text_from_pdf("sherlock.pdf")
-# cleaned_text = clean_text(pdf_text)
-# citation = Citation(cleaned_text)
-
-# df = citation.to_dataframe()
-# df.to_csv('chapters_text.csv', index=True)
-
-
-# # chapter_name, chapter_content = citation.chapter_data[0]
-# # print(f"Chapter: {chapter_name}\nContent:\n{chapter_content}")
-
-# # Assuming 'retrieved_chunks' is the list of chunks you want to search for
-# retrieved_chunks = ['peculiarities of the typewriter', 'Irene Adler']
-
-# # Create a Citation object
-# citation = Citation(cleaned_text)
-
-# # Get the citations for the retrieved chunks
-# citations = citation.search_citate(retrieved_chunks)
-
-# # Print the results
-# print(citations)
-
-class Retriever(ABC):
-    """
-    Abstract base class for retrievers.
-    """
-
-    @abstractmethod
-    def retrieve(self, query, top_k=5):
-        """
-        Retrieve top-k relevant chunks for a query.
-
-        Args:
-            query (str): The search query.
-            top_k (int): Number of top results to retrieve.
-
-        Returns:
-            list: List of (chunk, score) tuples sorted by relevance.
-        """
-        pass
-
-
-class KeyWordRetriever(Retriever):
-    """
-    Keyword-based retriever using BM25.
-    """
-
-    def __init__(self, chunks):
-        """
-        Initialize the BM25 retriever with pre-tokenized chunks.
-
-        Args:
-            chunks (list): List of text chunks to index.
-        """
-        self.tokenized_chunks = [nltk.word_tokenize(chunk) for chunk in chunks]
-        self.bm25 = BM25Okapi(self.tokenized_chunks)
-        self.chunks = chunks
-
-    def retrieve(self, query, top_k=5):
-        """
-        Retrieve top-k chunks based on BM25 scores.
-
-        Args:
-            query (str): The search query.
-            top_k (int): Number of top results to retrieve.
-
-        Returns:
-            list: List of (chunk, score) tuples sorted by relevance.
-        """
-        query_tokens = nltk.word_tokenize(query)
-        scores = self.bm25.get_scores(query_tokens)
-        ranked_indices = np.argsort(scores)[::-1][:top_k]
-        return [(self.chunks[i], scores[i]) for i in ranked_indices]
-
-
-class SemanticRetriever(Retriever):
-    """
-    Semantic retriever using SentenceTransformers and FAISS.
-    """
-
-    def __init__(self, chunks, model_name='all-MiniLM-L6-v2', index_path="faiss_index"):
-        """
-        Initialize the semantic retriever with SentenceTransformers and FAISS.
-
-        Args:
-            chunks (list): List of text chunks.
-            model_name (str): Model name for SentenceTransformers.
-            index_path (str): Path to save/load the FAISS index.
-        """
-        self.chunks = chunks
-        self.model = SentenceTransformer(model_name)
-        self.index_path = index_path
-        self.index = self._create_faiss_index(chunks)
-
-    def _create_faiss_index(self, chunks):
-        """
-        Create a FAISS index from text chunks.
-
-        Args:
-            chunks (list): List of text chunks.
-
-        Returns:
-            faiss.Index: Trained FAISS index.
-        """
-
-        embeddings = generate_embeddings(chunks)
-        dimension = embeddings.shape[1]
-        index = faiss.IndexFlatL2(dimension)
-        index.add(embeddings)
-
-        faiss.write_index(index, self.index_path)
-        print(f"FAISS index saved to {self.index_path}")
-        return index
-
-    def retrieve(self, query, top_k=5):
-        """
-        Retrieve top-k chunks based on semantic similarity.
-
-        Args:
-            query (str): The search query.
-            top_k (int): Number of top results to retrieve.
-
-        Returns:
-            list: List of (chunk, score) tuples sorted by relevance.
-        """
-        query_embedding = self.model.encode([query], convert_to_numpy=True).astype("float32")
-        faiss.normalize_L2(query_embedding)
-        distances, indices = self.index.search(query_embedding, top_k)
-        results = [(self.chunks[idx], 1 / (1 + distances[0][i])) for i, idx in enumerate(indices[0])]
-        return results
-
-class HybridRetriever(Retriever):
-    """
-    Hybrid retriever that combines results from keyword-based and semantic retrievers.
-    """
-
-    def __init__(self, keyword_retriever, semantic_retriever):
-        """
-        Initialize the HybridRetriever.
-
-        Args:
-            keyword_retriever (KeyWordRetriever): An instance of KeyWordRetriever.
-            semantic_retriever (SemanticRetriever): An instance of SemanticRetriever.
-        """
-        self.keyword_retriever = keyword_retriever
-        self.semantic_retriever = semantic_retriever
-
-    def normalize_scores(self, scores):
-        """
-        Normalize a list of scores to a [0, 1] range.
-
-        Args:
-            scores (list): List of scores.
-
-        Returns:
-            list: Normalized scores.
-        """
-        min_score = min(scores)
-        max_score = max(scores)
-        if max_score == min_score:
-            return [0.5] * len(scores)  # Avoid division by zero if all scores are the same
-        return [(score - min_score) / (max_score - min_score) for score in scores]
-
-    def retrieve(self, query, top_k=5):
-        """
-        Retrieve top-k chunks by combining keyword and semantic relevance.
-
-        Args:
-            query (str): The search query.
-            top_k (int): Number of top results to retrieve.
-
-        Returns:
-            list: List of (chunk, combined_score) tuples sorted by combined relevance.
-        """
-        # Retrieve results from both retrievers
-        keyword_results = self.keyword_retriever.retrieve(query, top_k=top_k)
-        semantic_results = self.semantic_retriever.retrieve(query, top_k=top_k)
-
-        # Extract chunks and scores from both retrievers
-        keyword_chunks, keyword_scores = zip(*keyword_results) if keyword_results else ([], [])
-        semantic_chunks, semantic_scores = zip(*semantic_results) if semantic_results else ([], [])
-
-        # Normalize scores for both retrievers
-        normalized_keyword_scores = self.normalize_scores(keyword_scores) if keyword_scores else []
-        normalized_semantic_scores = self.normalize_scores(semantic_scores) if semantic_scores else []
-
-        # Combine results by creating a mapping of chunk -> combined score
-        score_map = {}
-
-        # Add keyword scores to the map
-        for chunk, score in zip(keyword_chunks, normalized_keyword_scores):
-            score_map[chunk] = score_map.get(chunk, 0) + score
-
-        # Add semantic scores to the map
-        for chunk, score in zip(semantic_chunks, normalized_semantic_scores):
-            score_map[chunk] = score_map.get(chunk, 0) + score
-
-        # Sort the results by combined score
-        sorted_results = sorted(score_map.items(), key=lambda x: x[1], reverse=True)
-
-        # Return top-k results
-        return sorted_results[:top_k]
-
-class Reranker:
-    def __init__(self, model_name='BAAI/bge-reranker-large'):
-        self.model = FlagReranker(model_name, use_fp16=True)
-
-    def rerank(self, query, retrieved_results, filter_num=1):
-        """
-        Rerank the retrieved results based on query relevance.
-
-        Args:
-            query (str): Query string.
-            retrieved_results (list): List of chunks in text format.
-
-        Returns:
-            list: Reranked results as a list of chunks (sorted by relevance).
-        """
-        if not retrieved_results:
-            return []
-
-        input_pairs = [(query, chunk) for chunk in retrieved_results]
-        scores = self.model.compute_score(input_pairs)
-
-        reranked_results = sorted(
-            zip(retrieved_results, scores),
-            key=lambda x: x[1],
-            reverse=True
-        )
-
-        reranked_chunks = [chunk for chunk, _ in reranked_results]
-        reranked_chunks = reranked_chunks[:filter_num]
-        return reranked_chunks
-
-class Assistant:
-    def __init__(self, retriever, reranker=None, model="groq/llama3-8b-8192", citation=None):
-        """
-        Initialize the Assistant.
-
-        Args:
-            retriever (Retriever): An instance of a Retriever class (KeyWordRetriever or SemanticRetriever).
-            model (str): The name of the LLM model to use (default is "groq/llama3-8b-8192").
-        """
-        self.retriever = retriever
-        self.model = model
-        self.reranker = reranker
-        self.citation = citation
-
-    def simulate_llm_response(self, prompt, context, api_key):
-        """
-        Simulate an LLM response for demonstration purposes.
-
-        Args:
-            prompt (str): The prompt to send to the simulated LLM.
-            context (str): The context to include in the prompt.
-            api_key (str): The API key for Groq.
-
-        Returns:
-            str: The generated completion text.
-        """
-        os.environ['GROQ_API_KEY'] = api_key
-        instruction = """
-
-        Contextual AI Assistant
-
-        You are an AI assistant designed to provide concise, accurate, and clear responses. Always adhere to the following principles:
-
-        Core Principles:
-
-        - Truthfulness: Prioritize accuracy. If unsure, acknowledge the limitation without guessing.
-        - Contextual Understanding: Analyze the conversation history to understand the user's intent.
-        - Clarity and Conciseness: Provide brief, direct answers without unnecessary elaboration.
-        - Helpful Guidance: Offer practical suggestions when relevant, but keep it concise.
-        - Error Handling: Acknowledge limitations and suggest alternatives when unable to answer.
-        Important! Maximum length of your answer can be of 3-4 sentences.
-        """
-
-
-        response = completion(
-            model=self.model,
-            messages=[
-                {"role": "system", "content": instruction},
-                {"role": "system", "content": context},
-                {"role": "user", "content": prompt}
-            ],
-            stream=True
-        )
-
-        generated_text = ""
-        for chunk in response:
-            generated_text += str(chunk["choices"][0]['delta']['content'])
-        generated_text = generated_text[:-4]
-
-        # max_line_length = 160
-        # final_text = textwrap.fill(generated_text, width=max_line_length)
-        return generated_text
-
-    def handle_query(self, query, api_key, retriever_type="semantic", top_k=5, use_reranker=False):
-        """
-        Handle the user's query by retrieving relevant chunks and generating a simulated LLM response.
-
-        Args:
-            query (str): The user's query.
-            retriever_type (str): Type of retriever to use ("semantic" or "keyword").
-            top_k (int): Number of top results to retrieve.
-
-        Returns:
-            str: The generated response from the simulated LLM.
-        """
-        if retriever_type.lower() == "keyword":
-            retrieved_chunks = [chunk for chunk, _ in self.retriever.retrieve(query, top_k=top_k)]
-        elif retriever_type.lower() == "semantic":
-            retrieved_chunks = [chunk for chunk, _ in self.retriever.retrieve(query, top_k=top_k)]
-        elif retriever_type.lower() == "hybrid":
-            retrieved_chunks = [chunk for chunk, _ in self.retriever.retrieve(query, top_k=top_k)]
-        else:
-            raise ValueError(f"Unknown retriever type: {retriever_type}")
-
-        if use_reranker and self.reranker:
-            reranked_results = self.reranker.rerank(query, retrieved_chunks)
-            citations = self.citation.search_citate(reranked_results)
-            reranked_chunks = " ".join(reranked_results)
-            return self.simulate_llm_response(query, reranked_chunks, api_key), reranked_chunks, citations
-
-        citations = self.citation.search_citate(retrieved_chunks)
-        retrieved_chunks_string = " ".join(retrieved_chunks)
-        print(retrieved_chunks_string)
-        return self.simulate_llm_response(query, retrieved_chunks_string, api_key), retrieved_chunks_string, citations
-
-pdf_path = "sherlock.pdf"
+# Load and preprocess PDF
+data_path = "data"
+pdf_path = data_path + "/sherlock.pdf"

 chunks = process_pdf_for_rag(pdf_path, chunk_size=500)
-pdf_text = extract_text_from_pdf(
+pdf_text = extract_text_from_pdf(pdf_path)

+cleaned_text = clean_text(pdf_text)
+citation = Citation(cleaned_text)
+
+# Initialize retrievers
 keyword_retriever = KeyWordRetriever(chunks)
 semantic_retriever = SemanticRetriever(chunks)
 hybrid_retriever = HybridRetriever(keyword_retriever, semantic_retriever)

+# Initialize assistant
 reranker = Reranker()
-
-cleaned_text = clean_text(pdf_text)
-citation = Citation(cleaned_text)
-query = 'Who is Irene?'
-
-key = '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'
 assistant = Assistant(hybrid_retriever, reranker, citation=citation)
-response = assistant.handle_query(query, api_key=key, retriever_type="hybrid", top_k=5, use_reranker=False)
-
-
-print("Assistant's Response:")
-print(response)

+# Gradio UI
 def run_rag_ui(api_key, query, retriever_type, top_k, use_reranker):
     if retriever_type.lower() == "keyword":
         retriever = keyword_retriever
@@ -530,11 +47,15 @@ def run_rag_ui(api_key, query, retriever_type, top_k, use_reranker):
     cleaned_text = clean_text(pdf_text)
     citation = Citation(cleaned_text)
     assistant = Assistant(retriever, reranker, citation=citation)
-    response, retrieved_chunks, citations = assistant.handle_query(query, api_key, retriever_type=retriever_type, top_k=top_k, use_reranker=use_reranker)
+    response, retrieved_chunks, citations = assistant.handle_query(
+        query, api_key,
+        retriever_type=retriever_type,
+        top_k=top_k,
+        use_reranker=use_reranker
+    )

     return response, citations, retrieved_chunks

-
 iface = gr.Interface(
     fn=run_rag_ui,
     inputs=[
@@ -553,6 +74,4 @@ iface = gr.Interface(
     description="Enter your query, select the retrieval method, and get retrieved chunks along with LLM responses."
 )

-
 iface.launch(share=True)
-
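The refactored entry point can also be exercised without the Gradio UI. The following is a minimal sketch, assuming the new modules are importable from the Space's root, that data/sherlock.pdf is present, and that a Groq key is supplied via the GROQ_API_KEY environment variable rather than the hard-coded key this commit removes:

    import os

    from assistant import Assistant
    from citation import Citation
    from embeddings import process_pdf_for_rag
    from reranker import Reranker
    from retrievers import KeyWordRetriever, SemanticRetriever, HybridRetriever
    from text_processing import extract_text_from_pdf, clean_text

    pdf_path = "data/sherlock.pdf"
    chunks = process_pdf_for_rag(pdf_path, chunk_size=500)
    citation = Citation(clean_text(extract_text_from_pdf(pdf_path)))

    hybrid = HybridRetriever(KeyWordRetriever(chunks), SemanticRetriever(chunks))
    assistant = Assistant(hybrid, Reranker(), citation=citation)

    # handle_query returns a 3-tuple, not just the response string
    response, context_used, cites = assistant.handle_query(
        "Who is Irene Adler?",
        api_key=os.environ["GROQ_API_KEY"],  # supply via env, never hard-code keys
        retriever_type="hybrid",
        top_k=5,
        use_reranker=True,
    )
    print(response)
    print(cites)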
assistant.py
ADDED
@@ -0,0 +1,97 @@
+import os
+from litellm import completion
+
+class Assistant:
+    def __init__(self, retriever, reranker=None, model="groq/llama3-8b-8192", citation=None):
+        """
+        Initialize the Assistant.
+
+        Args:
+            retriever (Retriever): An instance of a Retriever class (KeyWordRetriever or SemanticRetriever).
+            model (str): The name of the LLM model to use (default is "groq/llama3-8b-8192").
+        """
+        self.retriever = retriever
+        self.model = model
+        self.reranker = reranker
+        self.citation = citation
+
+    def simulate_llm_response(self, prompt, context, api_key):
+        """
+        Simulate an LLM response for demonstration purposes.
+
+        Args:
+            prompt (str): The prompt to send to the simulated LLM.
+            context (str): The context to include in the prompt.
+            api_key (str): The API key for Groq.
+
+        Returns:
+            str: The generated completion text.
+        """
+        os.environ['GROQ_API_KEY'] = api_key
+        instruction = """
+
+        Contextual AI Assistant
+
+        You are an AI assistant designed to provide concise, accurate, and clear responses. Always adhere to the following principles:
+
+        Core Principles:
+
+        - Truthfulness: Prioritize accuracy. If unsure, acknowledge the limitation without guessing.
+        - Contextual Understanding: Analyze the conversation history to understand the user's intent.
+        - Clarity and Conciseness: Provide brief, direct answers without unnecessary elaboration.
+        - Helpful Guidance: Offer practical suggestions when relevant, but keep it concise.
+        - Error Handling: Acknowledge limitations and suggest alternatives when unable to answer.
+        Important! Maximum length of your answer can be of 3-4 sentences.
+        """
+
+
+        response = completion(
+            model=self.model,
+            messages=[
+                {"role": "system", "content": instruction},
+                {"role": "system", "content": context},
+                {"role": "user", "content": prompt}
+            ],
+            stream=True
+        )
+
+        generated_text = ""
+        for chunk in response:
+            generated_text += str(chunk["choices"][0]['delta']['content'])
+        generated_text = generated_text[:-4]
+
+        # max_line_length = 160
+        # final_text = textwrap.fill(generated_text, width=max_line_length)
+        return generated_text
+
+    def handle_query(self, query, api_key, retriever_type="semantic", top_k=5, use_reranker=False):
+        """
+        Handle the user's query by retrieving relevant chunks and generating a simulated LLM response.
+
+        Args:
+            query (str): The user's query.
+            retriever_type (str): Type of retriever to use ("semantic" or "keyword").
+            top_k (int): Number of top results to retrieve.
+
+        Returns:
+            str: The generated response from the simulated LLM.
+        """
+        if retriever_type.lower() == "keyword":
+            retrieved_chunks = [chunk for chunk, _ in self.retriever.retrieve(query, top_k=top_k)]
+        elif retriever_type.lower() == "semantic":
+            retrieved_chunks = [chunk for chunk, _ in self.retriever.retrieve(query, top_k=top_k)]
+        elif retriever_type.lower() == "hybrid":
+            retrieved_chunks = [chunk for chunk, _ in self.retriever.retrieve(query, top_k=top_k)]
+        else:
+            raise ValueError(f"Unknown retriever type: {retriever_type}")
+
+        if use_reranker and self.reranker:
+            reranked_results = self.reranker.rerank(query, retrieved_chunks)
+            citations = self.citation.search_citate(reranked_results)
+            reranked_chunks = " ".join(reranked_results)
+            return self.simulate_llm_response(query, reranked_chunks, api_key), reranked_chunks, citations
+
+        citations = self.citation.search_citate(retrieved_chunks)
+        retrieved_chunks_string = " ".join(retrieved_chunks)
+        print(retrieved_chunks_string)
+        return self.simulate_llm_response(query, retrieved_chunks_string, api_key), retrieved_chunks_string, citations
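A note on simulate_llm_response: the trailing generated_text[:-4] appears to compensate for the final streamed delta, whose content is typically None; str(None) appends the four characters "None". A dependency-free sketch of the same accumulation pattern (fake_stream below mimics the shape of litellm's streaming chunks and is illustrative only):

    # Final streamed delta carries content=None; str(None) adds "None".
    fake_stream = [
        {"choices": [{"delta": {"content": "Irene Adler is "}}]},
        {"choices": [{"delta": {"content": "the woman."}}]},
        {"choices": [{"delta": {"content": None}}]},  # end-of-stream delta
    ]

    generated_text = ""
    for chunk in fake_stream:
        generated_text += str(chunk["choices"][0]["delta"]["content"])

    assert generated_text.endswith("None")
    print(generated_text[:-4])  # -> Irene Adler is the woman.

    # A safer pattern skips None deltas instead of slicing off four characters:
    text = "".join(c["choices"][0]["delta"]["content"] or "" for c in fake_stream)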
citation.py
ADDED
@@ -0,0 +1,99 @@
+import re
+import pandas as pd
+
+class Citation:
+    def __init__(self, cleaned_text):
+        """
+        Initialize with cleaned text.
+        :param cleaned_text: The cleaned text from the PDF.
+        """
+        self.cleaned_text = cleaned_text
+        self.chapters = self.extract_chapter_names()
+        self.chapter_data = self.extract_chapter_data()
+        self.df = self.to_dataframe()
+
+    def extract_chapter_names(self):
+        """
+        Extract chapter names using the provided `extract_chapter_names` function.
+        :return: List of chapter names.
+        """
+        toc_match = re.search(r"Table of contents(.*?)Table of contents", self.cleaned_text, re.DOTALL)
+        if not toc_match:
+            raise ValueError("Table of Contents section not found.")
+
+        toc_section = toc_match.group(1)
+        chapter_names = re.findall(r"(.*?)\d+", toc_section)
+        return [name.strip() for name in chapter_names if name.strip()]
+
+    def extract_chapter_data(self):
+        """
+        Extract full text for each chapter from the second occurrence of the chapter name
+        to the second occurrence of the next chapter name, or to the end if it's the last chapter.
+        :return: A list of tuples (chapter_name, plain_text).
+        Second occurence because first in text is in table of contents :)
+        """
+        chapter_data = []
+
+        for i, chapter in enumerate(self.chapters):
+            current_chapter_pattern = re.escape(chapter)
+            next_chapter_pattern = re.escape(self.chapters[i + 1]) if i + 1 < len(self.chapters) else None
+
+            if next_chapter_pattern:
+                matches = list(re.finditer(rf"{current_chapter_pattern}(.*?){next_chapter_pattern}", self.cleaned_text, re.DOTALL))
+            else:
+                matches = list(re.finditer(rf"{current_chapter_pattern}(.*)", self.cleaned_text, re.DOTALL))
+
+            if len(matches) >= 2:
+                start = matches[1].start(1)
+                end = matches[1].end(1)
+
+                if next_chapter_pattern:
+                    next_match = re.search(rf"{next_chapter_pattern}", self.cleaned_text[end:])
+                    if next_match:
+                        end += next_match.start()
+
+                chapter_content = self.cleaned_text[start:end].strip()
+            else:
+                chapter_content = ""
+
+            if i == len(self.chapters) - 1:
+                last_chapter_matches = list(re.finditer(rf"{current_chapter_pattern}", self.cleaned_text))
+
+                if len(last_chapter_matches) >= 2:
+                    start = last_chapter_matches[1].start()
+                    chapter_content = self.cleaned_text[start:].strip()
+
+            chapter_data.append((chapter, chapter_content))
+
+        return chapter_data
+
+    def to_dataframe(self):
+        """
+        Converts the chapter data into a Pandas DataFrame.
+        :return: DataFrame with 'Chapter' and 'Content' columns.
+        """
+        df = pd.DataFrame(self.chapter_data, columns=['Chapter', 'Content'])
+        return df
+
+    def search_citate(self, retrieved_chunks):
+        """
+        Finds the chapter name for each chunk in the content and returns a formatted string.
+
+        Args:
+            retrieved_chunks (list): List of text chunks to search in the chapters' content.
+
+        Returns:
+            str: A formatted string of citations, each chunk starting on a new line.
+        """
+        citations = []
+
+        for idx, chunk in enumerate(retrieved_chunks, start=1):
+            for _, row in self.df.iterrows():
+                chapter_name = row['Chapter']
+                chapter_content = row['Content']
+
+                if chunk in chapter_content:
+                    citations.append(f"chunk [{idx}] from chapter: {chapter_name}")
+                    break
+
+        return "\n".join(citations)
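A self-contained sketch of how Citation can be exercised on synthetic text (toy_text below is invented for illustration): the class expects the chapter list to sit between two occurrences of the literal string "Table of contents", each entry followed by a page number, and each chapter title to reappear where its body begins.

    from citation import Citation

    toy_text = (
        "Table of contents A Scandal In Bohemia 1 The Red-Headed League 2 "
        "Table of contents "
        "A Scandal In Bohemia To Sherlock Holmes she is always the woman. "
        "The Red-Headed League I had called upon my friend one day in autumn."
    )

    citation = Citation(toy_text)
    print(citation.chapters)
    # ['A Scandal In Bohemia', 'The Red-Headed League']
    print(citation.search_citate(["she is always the woman"]))
    # chunk [1] from chapter: A Scandal In Bohemia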
sherlock.pdf → data/sherlock.pdf
RENAMED
File without changes
embeddings.py
ADDED
@@ -0,0 +1,58 @@
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from sentence_transformers import SentenceTransformer
+from text_processing import extract_text_from_pdf, clean_text
+
+def chunk_text(text, chunk_size=1000, chunk_overlap=150):
+    """
+    Split text into overlapping chunks.
+
+    Args:
+        text (str): Input text.
+        chunk_size (int): Size of each chunk.
+        chunk_overlap (int): Overlap between chunks.
+
+    Returns:
+        list: List of text chunks.
+    """
+    text_splitter = RecursiveCharacterTextSplitter(
+        chunk_size=chunk_size,
+        chunk_overlap=chunk_overlap,
+        separators=["\n\n", "\n", " "]
+    )
+    return text_splitter.split_text(text)
+
+def generate_embeddings(chunks, model_name='all-MiniLM-L6-v2'):
+    """
+    Generate embeddings for text chunks.
+
+    Args:
+        chunks (list): List of text chunks.
+        model_name (str): SentenceTransformer model name.
+
+    Returns:
+        np.ndarray: Array of embeddings.
+    """
+    model = SentenceTransformer(model_name)
+    return model.encode(chunks, convert_to_numpy=True)
+
+def process_pdf_for_rag(pdf_path, chunk_size=500):
+    """
+    Process a PDF for RAG by extracting, cleaning, and chunking.
+
+    Args:
+        pdf_path (str): Path to the PDF file.
+        chunk_size (int): Size of each chunk.
+
+    Returns:
+        list: List of text chunks.
+    """
+
+    print("Extracting text from PDF...")
+    raw_text = extract_text_from_pdf(pdf_path)
+    print("Cleaning text...")
+    clean_text_content = clean_text(raw_text)
+    print("Chunking text...")
+    chunks = chunk_text(clean_text_content, chunk_size)
+    print("Processing complete!")
+
+    return chunks
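Note that process_pdf_for_rag now returns only chunks; embeddings are computed later by SemanticRetriever. A small sketch of the two helpers on an in-memory string, assuming langchain and sentence-transformers are installed (the model downloads on first use):

    from embeddings import chunk_text, generate_embeddings

    text = "The quick brown fox jumps over the lazy dog. " * 100
    chunks = chunk_text(text, chunk_size=200, chunk_overlap=20)
    print(len(chunks), "chunks, first chunk:", chunks[0][:60])

    embeddings = generate_embeddings(chunks)  # all-MiniLM-L6-v2 by default
    print(embeddings.shape)                   # (num_chunks, 384)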
reranker.py
ADDED
@@ -0,0 +1,32 @@
+from FlagEmbedding import FlagReranker
+
+class Reranker:
+    def __init__(self, model_name='BAAI/bge-reranker-large'):
+        self.model = FlagReranker(model_name, use_fp16=True)
+
+    def rerank(self, query, retrieved_results, filter_num=1):
+        """
+        Rerank the retrieved results based on query relevance.
+
+        Args:
+            query (str): Query string.
+            retrieved_results (list): List of chunks in text format.
+
+        Returns:
+            list: Reranked results as a list of chunks (sorted by relevance).
+        """
+        if not retrieved_results:
+            return []
+
+        input_pairs = [(query, chunk) for chunk in retrieved_results]
+        scores = self.model.compute_score(input_pairs)
+
+        reranked_results = sorted(
+            zip(retrieved_results, scores),
+            key=lambda x: x[1],
+            reverse=True
+        )
+
+        reranked_chunks = [chunk for chunk, _ in reranked_results]
+        reranked_chunks = reranked_chunks[:filter_num]
+        return reranked_chunks
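A usage sketch for the reranker, assuming FlagEmbedding is installed (BAAI/bge-reranker-large downloads on first use). Worth noting: the default filter_num=1 keeps only the single best chunk, so callers who want more must pass a larger value. The candidate strings below are invented for illustration:

    from reranker import Reranker

    reranker = Reranker()
    candidates = [
        "Irene Adler is always the woman to Sherlock Holmes.",
        "The league was founded by an American millionaire.",
        "Watson had returned to civil practice.",
    ]
    top = reranker.rerank("Who is Irene Adler?", candidates, filter_num=2)
    print(top)  # the two most query-relevant chunks, best first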
retrievers.py
ADDED
@@ -0,0 +1,174 @@
+import numpy as np
+import faiss
+from rank_bm25 import BM25Okapi
+from nltk.tokenize import word_tokenize
+from sentence_transformers import SentenceTransformer
+from embeddings import generate_embeddings
+
+class Retriever:
+    """
+    Abstract base class for retrievers.
+    """
+    def retrieve(self, query, top_k=5):
+        raise NotImplementedError
+
+class KeyWordRetriever(Retriever):
+    """
+    Keyword-based retriever using BM25.
+    """
+
+    def __init__(self, chunks):
+        """
+        Initialize the BM25 retriever with pre-tokenized chunks.
+
+        Args:
+            chunks (list): List of text chunks to index.
+        """
+        self.tokenized_chunks = [word_tokenize(chunk) for chunk in chunks]
+        self.bm25 = BM25Okapi(self.tokenized_chunks)
+        self.chunks = chunks
+
+    def retrieve(self, query, top_k=5):
+        """
+        Retrieve top-k chunks based on BM25 scores.
+
+        Args:
+            query (str): The search query.
+            top_k (int): Number of top results to retrieve.
+
+        Returns:
+            list: List of (chunk, score) tuples sorted by relevance.
+        """
+        query_tokens = word_tokenize(query)
+        scores = self.bm25.get_scores(query_tokens)
+        ranked_indices = np.argsort(scores)[::-1][:top_k]
+        return [(self.chunks[i], scores[i]) for i in ranked_indices]
+
+
+class SemanticRetriever(Retriever):
+    """
+    Semantic retriever using SentenceTransformers and FAISS.
+    """
+
+    def __init__(self, chunks, model_name='all-MiniLM-L6-v2', index_path="faiss_index"):
+        """
+        Initialize the semantic retriever with SentenceTransformers and FAISS.
+
+        Args:
+            chunks (list): List of text chunks.
+            model_name (str): Model name for SentenceTransformers.
+            index_path (str): Path to save/load the FAISS index.
+        """
+        self.chunks = chunks
+        self.model = SentenceTransformer(model_name)
+        self.index_path = index_path
+        self.index = self._create_faiss_index(chunks)
+
+    def _create_faiss_index(self, chunks):
+        """
+        Create a FAISS index from text chunks.
+
+        Args:
+            chunks (list): List of text chunks.
+
+        Returns:
+            faiss.Index: Trained FAISS index.
+        """
+
+        embeddings = generate_embeddings(chunks)
+        dimension = embeddings.shape[1]
+        index = faiss.IndexFlatL2(dimension)
+        index.add(embeddings)
+
+        faiss.write_index(index, self.index_path)
+        print(f"FAISS index saved to {self.index_path}")
+        return index
+
+    def retrieve(self, query, top_k=5):
+        """
+        Retrieve top-k chunks based on semantic similarity.
+
+        Args:
+            query (str): The search query.
+            top_k (int): Number of top results to retrieve.
+
+        Returns:
+            list: List of (chunk, score) tuples sorted by relevance.
+        """
+        query_embedding = self.model.encode([query], convert_to_numpy=True).astype("float32")
+        faiss.normalize_L2(query_embedding)
+        distances, indices = self.index.search(query_embedding, top_k)
+        results = [(self.chunks[idx], 1 / (1 + distances[0][i])) for i, idx in enumerate(indices[0])]
+        return results
+
+class HybridRetriever(Retriever):
+    """
+    Hybrid retriever that combines results from keyword-based and semantic retrievers.
+    """
+
+    def __init__(self, keyword_retriever, semantic_retriever):
+        """
+        Initialize the HybridRetriever.
+
+        Args:
+            keyword_retriever (KeyWordRetriever): An instance of KeyWordRetriever.
+            semantic_retriever (SemanticRetriever): An instance of SemanticRetriever.
+        """
+        self.keyword_retriever = keyword_retriever
+        self.semantic_retriever = semantic_retriever
+
+    def normalize_scores(self, scores):
+        """
+        Normalize a list of scores to a [0, 1] range.
+
+        Args:
+            scores (list): List of scores.
+
+        Returns:
+            list: Normalized scores.
+        """
+        min_score = min(scores)
+        max_score = max(scores)
+        if max_score == min_score:
+            return [0.5] * len(scores)  # Avoid division by zero if all scores are the same
+        return [(score - min_score) / (max_score - min_score) for score in scores]
+
+    def retrieve(self, query, top_k=5):
+        """
+        Retrieve top-k chunks by combining keyword and semantic relevance.
+
+        Args:
+            query (str): The search query.
+            top_k (int): Number of top results to retrieve.
+
+        Returns:
+            list: List of (chunk, combined_score) tuples sorted by combined relevance.
+        """
+        # Retrieve results from both retrievers
+        keyword_results = self.keyword_retriever.retrieve(query, top_k=top_k)
+        semantic_results = self.semantic_retriever.retrieve(query, top_k=top_k)
+
+        # Extract chunks and scores from both retrievers
+        keyword_chunks, keyword_scores = zip(*keyword_results) if keyword_results else ([], [])
+        semantic_chunks, semantic_scores = zip(*semantic_results) if semantic_results else ([], [])
+
+        # Normalize scores for both retrievers
+        normalized_keyword_scores = self.normalize_scores(keyword_scores) if keyword_scores else []
+        normalized_semantic_scores = self.normalize_scores(semantic_scores) if semantic_scores else []
+
+        # Combine results by creating a mapping of chunk -> combined score
+        score_map = {}
+
+        # Add keyword scores to the map
+        for chunk, score in zip(keyword_chunks, normalized_keyword_scores):
+            score_map[chunk] = score_map.get(chunk, 0) + score
+
+        # Add semantic scores to the map
+        for chunk, score in zip(semantic_chunks, normalized_semantic_scores):
+            score_map[chunk] = score_map.get(chunk, 0) + score
+
+        # Sort the results by combined score
+        sorted_results = sorted(score_map.items(), key=lambda x: x[1], reverse=True)
+
+        # Return top-k results
+        return sorted_results[:top_k]
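The hybrid fusion in HybridRetriever.retrieve amounts to min-max normalization per retriever followed by summing the normalized scores of every chunk that appears in either list. A dependency-free sketch of that arithmetic on made-up scores:

    def normalize(scores):
        lo, hi = min(scores), max(scores)
        return [0.5] * len(scores) if hi == lo else [(s - lo) / (hi - lo) for s in scores]

    keyword_results = [("chunk A", 7.2), ("chunk B", 3.1), ("chunk C", 1.0)]   # raw BM25 scores
    semantic_results = [("chunk B", 0.81), ("chunk C", 0.70), ("chunk D", 0.64)]  # similarity scores

    score_map = {}
    for results in (keyword_results, semantic_results):
        chunks, scores = zip(*results)
        for chunk, s in zip(chunks, normalize(list(scores))):
            score_map[chunk] = score_map.get(chunk, 0) + s

    print(sorted(score_map.items(), key=lambda x: x[1], reverse=True))
    # chunk B ranks first: it scores well in both retrievers

One caveat of min-max scaling: each retriever's weakest result is always forced to 0, even if it was a strong match in absolute terms.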
text_processing.py
ADDED
@@ -0,0 +1,33 @@
+import fitz
+import re
+
+def extract_text_from_pdf(pdf_path):
+    """
+    Extract text from a PDF file.
+
+    Args:
+        pdf_path (str): Path to the PDF file.
+
+    Returns:
+        str: Extracted text from the PDF.
+    """
+    doc = fitz.open(pdf_path)
+    text = ""
+    for page in doc:
+        text += page.get_text()
+    return text
+
+def clean_text(text):
+    """
+    Clean and normalize text.
+
+    Args:
+        text (str): Raw text.
+
+    Returns:
+        str: Cleaned text.
+    """
+    text = re.sub(r'(\s?\.\s?)+', '', text)
+    text = re.sub(r'\s+', ' ', text)
+    text = re.sub(r'[^\x00-\x7F]+', '', text)
+    return text.strip()
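One behavior worth flagging: the first pattern in clean_text, (\s?\.\s?)+, matches a single period as well as the dot leaders in the table of contents, so every full stop in the book is stripped. A quick dependency-free check (clean_text copied verbatim so the snippet runs without PyMuPDF installed):

    import re

    def clean_text(text):
        text = re.sub(r'(\s?\.\s?)+', '', text)
        text = re.sub(r'\s+', ' ', text)
        text = re.sub(r'[^\x00-\x7F]+', '', text)
        return text.strip()

    print(clean_text("Chapter I . . . . . 3"))   # -> Chapter I3
    print(clean_text("Mr. Holmes smiled."))      # -> MrHolmes smiled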