Spaces:

asif00
/

Talk_to_Doc-Advanced_RAG_for_Reasoning_and_QA_with_Gemini_Pro

Running

App Files Files Community

asif00 committed on Feb 18

Commit

036f6a2

•

1 Parent(s): 607785c

Update: alpha

Browse files

Files changed (9) hide show

.gitignore +0 -1
app.py +70 -0
requirements.txt +5 -0
src/app.py +0 -15
src/brain.py +6 -2
src/content.html +33 -0
src/helper.py +2 -3
src/init.py +15 -3
src/style.css +61 -0

.gitignore CHANGED Viewed

@@ -3,7 +3,6 @@
 .env
 *.pdf
 *.json
-*.txt
 temp/*

 .env
 *.pdf
 *.json
 temp/*

app.py ADDED Viewed

	@@ -0,0 +1,70 @@

+import os
+import gradio as gr
+from src.init import Initializer
+from dotenv import load_dotenv
+load_dotenv()
+AUG_TOKEN = os.environ.get("AUGMENT_MODEL")
+RES_TOKEN = os.environ.get("RESPONSE_MODEL")
+pdf_loaded = False
+processing = False
+def load_pdf(pdf_file_path):
+    global pdf_loaded
+    filename = pdf_file_path.name
+    global brain
+    brain = Initializer.initialize(AUG_TOKEN, RES_TOKEN, filename)
+    pdf_loaded = True
+    return "Processing complete!"
+def response(query, history):
+    global processing
+    if not pdf_loaded or processing:
+        return "Please wait...", history
+    processing = True
+    output = brain.generate_answers(query)
+    history.append((query, output))
+    processing = False
+    return "", history
+with open("src/style.css", "r") as file:
+    css = file.read()
+with open("src/content.html", "r") as file:
+    html_content = file.read()
+    parts = html_content.split("<!-- split here -->")
+    title_html = parts[0]
+    bts_html = parts[1] if len(parts) > 1 else ""
+def loading():
+    """Return the placeholder status text used while a PDF is processed."""
+    return "Loading ..."
+# Assemble the Gradio UI; the stylesheet read from src/style.css is passed in
+# through ``css``.
+with gr.Blocks(css=css) as app:
+    with gr.Column(elem_id="column_container"):
+        # First part of content.html (everything before the split marker).
+        gr.HTML(title_html)
+        with gr.Column():
+            pdf = gr.File(label="Load your PDF document", file_types=[".pdf"])
+            with gr.Row():
+                status = gr.Label(label="Status", value="")
+                load_pdf_button = gr.Button(value="Process")
+        chatbot = gr.Chatbot([], elem_id="chatbot")
+        query = gr.Textbox(
+            label="Ask a question about the PDF",
+            placeholder="What do you want to know?",
+        )
+        clear = gr.ClearButton([query, chatbot])
+        # Second part of content.html (after the split marker; may be empty).
+        gr.HTML(bts_html)
+    # Two listeners are registered on the same click and both write to
+    # `status`: `loading` supplies the placeholder text, `load_pdf` the final
+    # message.
+    # NOTE(review): presumably `loading` is expected to finish first — confirm
+    # what ordering Gradio guarantees for multiple listeners on one event.
+    load_pdf_button.click(loading, outputs=[status])
+    load_pdf_button.click(load_pdf, inputs=[pdf], outputs=[status])
+    # Submitting the textbox passes (query, chat history) to `response`; its
+    # two return values are written back to the textbox and the chatbot.
+    query.submit(response, [query, chatbot], [query, chatbot])
+# Launched at module top level, i.e. as soon as this file is executed.
+app.launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+google-generativeai
+langchain
+sentence-transformers
+chromadb
+pypdf

src/app.py DELETED Viewed

@@ -1,15 +0,0 @@
-import os
-load_dotenv()
-import gradio as gr
-from init import Initializer
-from dotenv import load_dotenv
-AUG_TOKEN = os.environ.get("AUG_TOKEN")
-RES_TOKEN = os.environ.get("RES_TOKEN")
-chroma_filename = ""
-brain = Initializer.initialize(AUG_TOKEN, RES_TOKEN, chroma_filename)
-# TODO:
-# Chatbot like UI
-# Multiple PDF file handling ability

src/brain.py CHANGED Viewed

@@ -1,4 +1,4 @@
-from helper import load_chroma
 import numpy as np
 import logging
 import time
@@ -203,6 +203,7 @@ class Brain:
     def rag(self, query):
         try:
             if query is None:
                 return None
             results = self.chroma_collection.query(
                 query_texts=[query],
@@ -220,7 +221,10 @@ class Brain:
     def generate_answers(self, query):
         try:
             start_time = time.time()
-            output = self.rag(query=query)
             print(f"\n\nExecution time: {time.time() - start_time} seconds\n")
             if output is None:
                 return None

+from src.helper import load_chroma
 import numpy as np
 import logging
 import time
     def rag(self, query):
         try:
             if query is None:
+                print("No query specified")
                 return None
             results = self.chroma_collection.query(
                 query_texts=[query],
     def generate_answers(self, query):
         try:
             start_time = time.time()
+            if query is None:
+                print("No query")
+                return "No Query"
+            output = self.rag(query)
             print(f"\n\nExecution time: {time.time() - start_time} seconds\n")
             if output is None:
                 return None

src/content.html ADDED Viewed

	@@ -0,0 +1,33 @@

+<div id="column_container">
+    <h1>Talk to Doc: Advanced RAG for Reasoning and QA with Gemini Pro</h1>
+    <p>Welcome to Talk to Doc, where finding answers in PDFs is as easy as chatting. Here's how to get the info you need
+        without any hassle.</p>
+    <h2>How to Use It</h2>
+    <ol>
+        <li><strong>Upload Your PDF:</strong> Pick the PDF you have questions about and upload it here.</li>
+        <li><strong>Click "Process":</strong> It will take a few seconds depending on the size of the PDF. Once the PDF
+            processing is complete, you will be able to ask the bot questions.</li>
+        <li><strong>Start Asking:</strong> Just type your question in the box and hit enter.</li>
+    </ol>
+</div>
+<!-- split here -->
+<div id="column_container">
+    <h2>What's Happening Behind the Scenes?</h2>
+    <ul>
+        <li><strong>Chunking:</strong> Your PDF is divided into smaller sections for better analysis, using LangChain's
+            text splitting capabilities to manage the document's content efficiently.</li>
+        <li><strong>Embedding with Gemini:</strong> Each section is then given a unique embedding, using the
+            'GeminiEmbeddingFunction', which helps in understanding the content better for retrieval.</li>
+        <li><strong>Storing and Searching in ChromaDB:</strong> These embeddings are stored in ChromaDB, allowing for
+            fast and accurate retrieval of information related to your questions.</li>
+        <li><strong>Query Expansion:</strong> To enhance the search, your query is expanded using the
+            'models/text-bison-001' model. This helps in considering various ways the question might be asked or
+            phrased.</li>
+        <li><strong>Cross Encoder Re-ranking:</strong> The potential answers are then re-ranked for relevance using the
+            'cross-encoder/ms-marco-MiniLM-L-6-v2' model, ensuring that the most pertinent information is selected.</li>
+        <li><strong>Final Response Generation:</strong> The final answer is generated by the 'gemini-pro' model, which
+            synthesizes the information into a clear and concise response.</li>
+    </ul>
+</div>

src/helper.py CHANGED Viewed

@@ -4,13 +4,13 @@ from langchain.text_splitter import (
     RecursiveCharacterTextSplitter,
     SentenceTransformersTokenTextSplitter,
 )
 def _read_pdf(filename):
     reader = PdfReader(filename)
     pdf_texts = [p.extract_text().strip() for p in reader.pages]
     pdf_texts = [text for text in pdf_texts if text]
     return pdf_texts
 def _chunk_texts(texts):
     character_splitter = RecursiveCharacterTextSplitter(
         separators=["\n\n", "\n", ". ", " ", ""], chunk_size=1600, chunk_overlap=200
@@ -28,14 +28,13 @@ def load_chroma(filename, collection_name, embedding_function):
     texts = _read_pdf(filename)
     chunks = _chunk_texts(texts)
     chroma_client = chromadb.Client()
-    chroma_collection = chroma_client.create_collection(
         name=collection_name, embedding_function=embedding_function
     )
     ids = [str(i) for i in range(len(chunks))]
     chroma_collection.add(ids=ids, documents=chunks)
     return chroma_collection
 def word_wrap(string, n_chars=72):
     if len(string) < n_chars:
         return string

     RecursiveCharacterTextSplitter,
     SentenceTransformersTokenTextSplitter,
 )
 def _read_pdf(filename):
     reader = PdfReader(filename)
     pdf_texts = [p.extract_text().strip() for p in reader.pages]
     pdf_texts = [text for text in pdf_texts if text]
     return pdf_texts
 def _chunk_texts(texts):
     character_splitter = RecursiveCharacterTextSplitter(
         separators=["\n\n", "\n", ". ", " ", ""], chunk_size=1600, chunk_overlap=200
     texts = _read_pdf(filename)
     chunks = _chunk_texts(texts)
     chroma_client = chromadb.Client()
+    chroma_collection = chroma_client.get_or_create_collection(
         name=collection_name, embedding_function=embedding_function
     )
     ids = [str(i) for i in range(len(chunks))]
     chroma_collection.add(ids=ids, documents=chunks)
     return chroma_collection
 def word_wrap(string, n_chars=72):
     if len(string) < n_chars:
         return string

src/init.py CHANGED Viewed

@@ -1,5 +1,6 @@
-from brain import Brain
 class Initializer:
     @staticmethod
@@ -39,7 +40,17 @@ class Initializer:
             {"category": "HARM_CATEGORY_MEDICAL", "threshold": 4},
             {"category": "HARM_CATEGORY_DANGEROUS", "threshold": 4},
         ]
-        chroma_collection_name = str.upper(chroma_filename) + "_COLLECT"
         return Brain(
             augment_model_name,
@@ -53,3 +64,4 @@ class Initializer:
             chroma_filename,
             chroma_collection_name,
         )

+import os
+from src.brain import Brain
+import re
 class Initializer:
     @staticmethod
             {"category": "HARM_CATEGORY_MEDICAL", "threshold": 4},
             {"category": "HARM_CATEGORY_DANGEROUS", "threshold": 4},
         ]
+        def base_name(file_path):
+            base_name = os.path.basename(file_path)
+            name, extension = os.path.splitext(base_name)
+            return name
+        def clean_up(message):
+            message = re.sub(r"[^\w\s,]", "", message)
+            message = re.sub(r"http\S+|www.\S+", "", message)
+            message = re.sub(r"\s+", "", message)
+            return message[:30]
+        chroma_collection_name = str.upper(clean_up(base_name(chroma_filename))) + "_COLLECT"
         return Brain(
             augment_model_name,
             chroma_filename,
             chroma_collection_name,
         )

src/style.css ADDED Viewed

	@@ -0,0 +1,61 @@

+body {
+    background-color: #171717;
+    font-family: 'San Francisco', 'Helvetica Neue', sans-serif;
+    color: #c9d1d9;
+}
+#column_container {
+    max-width: 700px;
+    margin: auto;
+    background-color: #0d1117;
+    border-radius: 12px;
+    box-shadow: 0 4px 8px rgba(255, 255, 255, 0.1);
+    padding: 40px;
+}
+h1 {
+    color: #58a6ff;
+    text-align: center;
+    font-family: 'Avenir Next', 'Helvetica Neue', sans-serif;
+    font-size: 3em;
+    text-shadow: 1px 1px 2px rgba(255, 255, 255, 0.08);
+}
+p,
+li {
+    color: #c9d1d9;
+    line-height: 1.6;
+    font-family: 'Georgia', serif;
+}
+strong {
+    color: #79c0ff;
+}
+ul {
+    list-style: none;
+    padding: 0;
+    margin: 20px 0;
+}
+li {
+    padding-left: 20px;
+    position: relative;
+}
+button {
+    background-color: #238636;
+    color: #ffffff;
+    border: none;
+    border-radius: 8px;
+    padding: 14px 24px;
+    cursor: pointer;
+    font-family: inherit;
+    font-size: 1em;
+    transition: background-color 0.3s ease;
+    box-shadow: 0 1px 3px rgba(255, 255, 255, 0.1);
+}
+/* The hover state must be its own top-level rule. Nested inside `button { }`
+   without `&`, it is either discarded (no CSS nesting support) or read as
+   the descendant selector `button button:hover`, so it never applied. */
+button:hover {
+    background-color: #196c2e;
+}