Update: alpha
Browse files- .gitignore +0 -1
- app.py +70 -0
- requirements.txt +5 -0
- src/app.py +0 -15
- src/brain.py +6 -2
- src/content.html +33 -0
- src/helper.py +2 -3
- src/init.py +15 -3
- src/style.css +61 -0
.gitignore
CHANGED
@@ -3,7 +3,6 @@
|
|
3 |
.env
|
4 |
*.pdf
|
5 |
*.json
|
6 |
-
*.txt
|
7 |
temp/*
|
8 |
|
9 |
|
|
|
3 |
.env
|
4 |
*.pdf
|
5 |
*.json
|
|
|
6 |
temp/*
|
7 |
|
8 |
|
app.py
ADDED
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import gradio as gr
|
3 |
+
|
4 |
+
from src.init import Initializer
|
5 |
+
from dotenv import load_dotenv
|
6 |
+
|
7 |
+
load_dotenv()
|
8 |
+
AUG_TOKEN = os.environ.get("AUGMENT_MODEL")
|
9 |
+
RES_TOKEN = os.environ.get("RESPONSE_MODEL")
|
10 |
+
|
11 |
+
pdf_loaded = False
|
12 |
+
processing = False
|
13 |
+
|
14 |
+
|
15 |
+
def load_pdf(pdf_file_path):
    """Build the question-answering backend for the uploaded PDF.

    Args:
        pdf_file_path: Value delivered by the ``gr.File`` input. It is
            ``None`` when "Process" is clicked before a file was chosen.
            Otherwise it is either an object exposing the path as ``.name``
            or, presumably on newer Gradio releases, a plain path string.

    Returns:
        The text to show in the "Status" label.
    """
    global brain, pdf_loaded
    if pdf_file_path is None:
        # Nothing uploaded yet: report it instead of failing on ``.name``.
        return "Please upload a PDF file first."
    # Works for both a file-wrapper object and a plain path string.
    filename = getattr(pdf_file_path, "name", pdf_file_path)
    brain = Initializer.initialize(AUG_TOKEN, RES_TOKEN, filename)
    pdf_loaded = True
    return "Processing complete!"
|
22 |
+
|
23 |
+
|
24 |
+
def response(query, history):
    """Answer ``query`` about the loaded PDF and record the turn in the chat.

    Args:
        query: Text submitted from the question box.
        history: Chat history as a list of ``(question, answer)`` pairs; the
            new turn is appended to it in place.

    Returns:
        ``(textbox_value, history)``. The textbox value is ``""`` after an
        answer was produced, or ``"Please wait..."`` when no PDF has been
        processed yet or another question is still being answered.

    Raises:
        Whatever ``brain.generate_answers`` raises; the busy flag is released
        before the exception propagates.
    """
    global processing
    if not pdf_loaded or processing:
        return "Please wait...", history
    processing = True
    try:
        output = brain.generate_answers(query)
    finally:
        # Always release the busy flag; otherwise a single failed call would
        # leave every later question stuck on "Please wait...".
        processing = False
    history.append((query, output))
    return "", history
|
33 |
+
|
34 |
+
|
35 |
+
# Stylesheet passed to gr.Blocks. The encoding is pinned so the result does
# not depend on the platform's locale default.
with open("src/style.css", "r", encoding="utf-8") as file:
    css = file.read()

# content.html holds two fragments separated by a marker comment: the part
# before the marker and the part after it are rendered as separate gr.HTML
# elements.
with open("src/content.html", "r", encoding="utf-8") as file:
    html_content = file.read()

parts = html_content.split("<!-- split here -->")
title_html = parts[0]
# Tolerate a file without the marker: the second fragment is then empty.
bts_html = parts[1] if len(parts) > 1 else ""
|
43 |
+
|
44 |
+
|
45 |
+
def loading():
    """Return the interim text written to the status label on "Process"."""
    status_text = "Loading ..."
    return status_text
|
47 |
+
|
48 |
+
|
49 |
+
with gr.Blocks(css=css) as app:
|
50 |
+
with gr.Column(elem_id="column_container"):
|
51 |
+
gr.HTML(title_html)
|
52 |
+
with gr.Column():
|
53 |
+
pdf = gr.File(label="Load your PDF document", file_types=[".pdf"])
|
54 |
+
with gr.Row():
|
55 |
+
status = gr.Label(label="Status", value="")
|
56 |
+
load_pdf_button = gr.Button(value="Process")
|
57 |
+
|
58 |
+
chatbot = gr.Chatbot([], elem_id="chatbot")
|
59 |
+
query = gr.Textbox(
|
60 |
+
label="Ask a question about the PDF",
|
61 |
+
placeholder="What do you want to know?",
|
62 |
+
)
|
63 |
+
clear = gr.ClearButton([query, chatbot])
|
64 |
+
gr.HTML(bts_html)
|
65 |
+
|
66 |
+
load_pdf_button.click(loading, outputs=[status])
|
67 |
+
load_pdf_button.click(load_pdf, inputs=[pdf], outputs=[status])
|
68 |
+
query.submit(response, [query, chatbot], [query, chatbot])
|
69 |
+
|
70 |
+
app.launch()
|
requirements.txt
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
google-generativeai
|
2 |
+
langchain
|
3 |
+
sentence-transformers
|
4 |
+
chromadb
|
5 |
+
pypdf
|
src/app.py
DELETED
@@ -1,15 +0,0 @@
|
|
1 |
-
import os
|
2 |
-
|
3 |
-
load_dotenv()
|
4 |
-
import gradio as gr
|
5 |
-
from init import Initializer
|
6 |
-
from dotenv import load_dotenv
|
7 |
-
|
8 |
-
AUG_TOKEN = os.environ.get("AUG_TOKEN")
|
9 |
-
RES_TOKEN = os.environ.get("RES_TOKEN")
|
10 |
-
chroma_filename = ""
|
11 |
-
brain = Initializer.initialize(AUG_TOKEN, RES_TOKEN, chroma_filename)
|
12 |
-
|
13 |
-
# TODO:
|
14 |
-
# Chatbot like UI
|
15 |
-
# Multiple PDF file handling ability
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/brain.py
CHANGED
@@ -1,4 +1,4 @@
|
|
1 |
-
from helper import load_chroma
|
2 |
import numpy as np
|
3 |
import logging
|
4 |
import time
|
@@ -203,6 +203,7 @@ class Brain:
|
|
203 |
def rag(self, query):
|
204 |
try:
|
205 |
if query is None:
|
|
|
206 |
return None
|
207 |
results = self.chroma_collection.query(
|
208 |
query_texts=[query],
|
@@ -220,7 +221,10 @@ class Brain:
|
|
220 |
def generate_answers(self, query):
|
221 |
try:
|
222 |
start_time = time.time()
|
223 |
-
|
|
|
|
|
|
|
224 |
print(f"\n\nExecution time: {time.time() - start_time} seconds\n")
|
225 |
if output is None:
|
226 |
return None
|
|
|
1 |
+
from src.helper import load_chroma
|
2 |
import numpy as np
|
3 |
import logging
|
4 |
import time
|
|
|
203 |
def rag(self, query):
|
204 |
try:
|
205 |
if query is None:
|
206 |
+
print("No query specified")
|
207 |
return None
|
208 |
results = self.chroma_collection.query(
|
209 |
query_texts=[query],
|
|
|
221 |
def generate_answers(self, query):
|
222 |
try:
|
223 |
start_time = time.time()
|
224 |
+
if query is None:
|
225 |
+
print("No query")
|
226 |
+
return "No Query"
|
227 |
+
output = self.rag(query)
|
228 |
print(f"\n\nExecution time: {time.time() - start_time} seconds\n")
|
229 |
if output is None:
|
230 |
return None
|
src/content.html
ADDED
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<div id="column_container">
|
2 |
+
<h1>Talk to Doc: Advanced RAG for Reasoning and QA with Gemini Pro</h1>
|
3 |
+
<p>Welcome to Talk to Doc, where finding answers in PDFs is as easy as chatting. Here's how to get the info you need
|
4 |
+
without any hassle.</p>
|
5 |
+
<h2>How to Use It</h2>
|
6 |
+
<ol>
|
7 |
+
<li><strong>Upload Your PDF:</strong> Pick the PDF you have questions about and upload it here.</li>
|
8 |
+
<li><strong>Click "Process":</strong> It will take a few seconds depending on the size of the PDF. Once the PDF
|
9 |
+
processing is complete, you will be able to ask the bot questions.</li>
|
10 |
+
<li><strong>Start Asking:</strong> Just type your question in the box and hit enter.</li>
|
11 |
+
</ol>
|
12 |
+
</div>
|
13 |
+
|
14 |
+
<!-- split here -->
|
15 |
+
|
16 |
+
<div id="column_container">
|
17 |
+
<h2>What's Happening Behind the Scenes?</h2>
|
18 |
+
<ul>
|
19 |
+
<li><strong>Chunking:</strong> Your PDF is divided into smaller sections for better analysis, using LangChain's
|
20 |
+
text splitting capabilities to manage the document's content efficiently.</li>
|
21 |
+
<li><strong>Embedding with Gemini:</strong> Each section is then given a unique embedding, using the
|
22 |
+
'GeminiEmbeddingFunction', which helps in understanding the content better for retrieval.</li>
|
23 |
+
<li><strong>Storing and Searching in ChromaDB:</strong> These embeddings are stored in ChromaDB, allowing for
|
24 |
+
fast and accurate retrieval of information related to your questions.</li>
|
25 |
+
<li><strong>Query Expansion:</strong> To enhance the search, your query is expanded using the
|
26 |
+
'models/text-bison-001' model. This helps in considering various ways the question might be asked or
|
27 |
+
phrased.</li>
|
28 |
+
<li><strong>Cross Encoder Re-ranking:</strong> The potential answers are then re-ranked for relevance using the
|
29 |
+
'cross-encoder/ms-marco-MiniLM-L-6-v2' model, ensuring that the most pertinent information is selected.</li>
|
30 |
+
<li><strong>Final Response Generation:</strong> The final answer is generated by the 'gemini-pro' model, which
|
31 |
+
synthesizes the information into a clear and concise response.</li>
|
32 |
+
</ul>
|
33 |
+
</div>
|
src/helper.py
CHANGED
@@ -4,13 +4,13 @@ from langchain.text_splitter import (
|
|
4 |
RecursiveCharacterTextSplitter,
|
5 |
SentenceTransformersTokenTextSplitter,
|
6 |
)
|
|
|
7 |
def _read_pdf(filename):
|
8 |
reader = PdfReader(filename)
|
9 |
pdf_texts = [p.extract_text().strip() for p in reader.pages]
|
10 |
pdf_texts = [text for text in pdf_texts if text]
|
11 |
return pdf_texts
|
12 |
|
13 |
-
|
14 |
def _chunk_texts(texts):
|
15 |
character_splitter = RecursiveCharacterTextSplitter(
|
16 |
separators=["\n\n", "\n", ". ", " ", ""], chunk_size=1600, chunk_overlap=200
|
@@ -28,14 +28,13 @@ def load_chroma(filename, collection_name, embedding_function):
|
|
28 |
texts = _read_pdf(filename)
|
29 |
chunks = _chunk_texts(texts)
|
30 |
chroma_client = chromadb.Client()
|
31 |
-
chroma_collection = chroma_client.
|
32 |
name=collection_name, embedding_function=embedding_function
|
33 |
)
|
34 |
ids = [str(i) for i in range(len(chunks))]
|
35 |
chroma_collection.add(ids=ids, documents=chunks)
|
36 |
return chroma_collection
|
37 |
|
38 |
-
|
39 |
def word_wrap(string, n_chars=72):
|
40 |
if len(string) < n_chars:
|
41 |
return string
|
|
|
4 |
RecursiveCharacterTextSplitter,
|
5 |
SentenceTransformersTokenTextSplitter,
|
6 |
)
|
7 |
+
|
8 |
def _read_pdf(filename):
|
9 |
reader = PdfReader(filename)
|
10 |
pdf_texts = [p.extract_text().strip() for p in reader.pages]
|
11 |
pdf_texts = [text for text in pdf_texts if text]
|
12 |
return pdf_texts
|
13 |
|
|
|
14 |
def _chunk_texts(texts):
|
15 |
character_splitter = RecursiveCharacterTextSplitter(
|
16 |
separators=["\n\n", "\n", ". ", " ", ""], chunk_size=1600, chunk_overlap=200
|
|
|
28 |
texts = _read_pdf(filename)
|
29 |
chunks = _chunk_texts(texts)
|
30 |
chroma_client = chromadb.Client()
|
31 |
+
chroma_collection = chroma_client.get_or_create_collection(
|
32 |
name=collection_name, embedding_function=embedding_function
|
33 |
)
|
34 |
ids = [str(i) for i in range(len(chunks))]
|
35 |
chroma_collection.add(ids=ids, documents=chunks)
|
36 |
return chroma_collection
|
37 |
|
|
|
38 |
def word_wrap(string, n_chars=72):
|
39 |
if len(string) < n_chars:
|
40 |
return string
|
src/init.py
CHANGED
@@ -1,5 +1,6 @@
|
|
1 |
-
|
2 |
-
|
|
|
3 |
|
4 |
class Initializer:
|
5 |
@staticmethod
|
@@ -39,7 +40,17 @@ class Initializer:
|
|
39 |
{"category": "HARM_CATEGORY_MEDICAL", "threshold": 4},
|
40 |
{"category": "HARM_CATEGORY_DANGEROUS", "threshold": 4},
|
41 |
]
|
42 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
43 |
|
44 |
return Brain(
|
45 |
augment_model_name,
|
@@ -53,3 +64,4 @@ class Initializer:
|
|
53 |
chroma_filename,
|
54 |
chroma_collection_name,
|
55 |
)
|
|
|
|
1 |
+
import os
|
2 |
+
from src.brain import Brain
|
3 |
+
import re
|
4 |
|
5 |
class Initializer:
|
6 |
@staticmethod
|
|
|
40 |
{"category": "HARM_CATEGORY_MEDICAL", "threshold": 4},
|
41 |
{"category": "HARM_CATEGORY_DANGEROUS", "threshold": 4},
|
42 |
]
|
43 |
+
def base_name(file_path):
    """Return the last component of ``file_path`` with its extension removed.

    Only the final extension is dropped, so ``archive.tar.gz`` becomes
    ``archive.tar``.
    """
    stem, _ = os.path.splitext(os.path.basename(file_path))
    return stem
|
47 |
+
def clean_up(message):
    """Reduce ``message`` to at most 30 characters safe for a collection name.

    Only ASCII letters, digits and underscores survive; URL-like tokens and
    all whitespace are removed.

    Args:
        message: Arbitrary text (here: a file's base name).

    Returns:
        The cleaned text, truncated to its first 30 characters. May be empty.
    """
    # Keep ASCII letters, digits, underscores and (for now) whitespace.
    # Commas and non-ASCII letters are dropped as well, because the result is
    # used in a ChromaDB collection name, whose naming rules allow neither.
    message = re.sub(r"[^A-Za-z0-9_\s]", "", message)
    # Remove tokens that look like URLs; their punctuation is already gone,
    # so the match is on the leading "http"/"www" of the remaining token.
    message = re.sub(r"http\S+|www.\S+", "", message)
    message = re.sub(r"\s+", "", message)
    return message[:30]
|
52 |
+
|
53 |
+
chroma_collection_name = str.upper(clean_up(base_name(chroma_filename))) + "_COLLECT"
|
54 |
|
55 |
return Brain(
|
56 |
augment_model_name,
|
|
|
64 |
chroma_filename,
|
65 |
chroma_collection_name,
|
66 |
)
|
67 |
+
|
src/style.css
ADDED
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
body {
|
2 |
+
background-color: #171717;
|
3 |
+
font-family: 'San Francisco', 'Helvetica Neue', sans-serif;
|
4 |
+
color: #c9d1d9;
|
5 |
+
}
|
6 |
+
|
7 |
+
#column_container {
|
8 |
+
max-width: 700px;
|
9 |
+
margin: auto;
|
10 |
+
background-color: #0d1117;
|
11 |
+
border-radius: 12px;
|
12 |
+
box-shadow: 0 4px 8px rgba(255, 255, 255, 0.1);
|
13 |
+
padding: 40px;
|
14 |
+
}
|
15 |
+
|
16 |
+
h1 {
|
17 |
+
color: #58a6ff;
|
18 |
+
text-align: center;
|
19 |
+
font-family: 'Avenir Next', 'Helvetica Neue', sans-serif;
|
20 |
+
font-size: 3em;
|
21 |
+
text-shadow: 1px 1px 2px rgba(255, 255, 255, 0.08);
|
22 |
+
}
|
23 |
+
|
24 |
+
p,
|
25 |
+
li {
|
26 |
+
color: #c9d1d9;
|
27 |
+
line-height: 1.6;
|
28 |
+
font-family: 'Georgia', serif;
|
29 |
+
}
|
30 |
+
|
31 |
+
strong {
|
32 |
+
color: #79c0ff;
|
33 |
+
}
|
34 |
+
|
35 |
+
ul {
|
36 |
+
list-style: none;
|
37 |
+
padding: 0;
|
38 |
+
margin: 20px 0;
|
39 |
+
}
|
40 |
+
|
41 |
+
li {
|
42 |
+
padding-left: 20px;
|
43 |
+
position: relative;
|
44 |
+
}
|
45 |
+
|
46 |
+
/* Primary action button: green fill, white label, rounded corners. */
button {
    background-color: #238636;
    color: #ffffff;
    border: none;
    border-radius: 8px;
    padding: 14px 24px;
    cursor: pointer;
    font-family: inherit;
    font-size: 1em;
    transition: background-color 0.3s ease;
    box-shadow: 0 1px 3px rgba(255, 255, 255, 0.1);
}

/* Darker green on hover. This is a sibling rule on purpose: nested inside
   `button` without `&` it would not target the button's own hover state. */
button:hover {
    background-color: #196c2e;
}
|