asif00 committed on
Commit
036f6a2
1 Parent(s): 607785c

Update: alpha

Browse files
Files changed (9) hide show
  1. .gitignore +0 -1
  2. app.py +70 -0
  3. requirements.txt +5 -0
  4. src/app.py +0 -15
  5. src/brain.py +6 -2
  6. src/content.html +33 -0
  7. src/helper.py +2 -3
  8. src/init.py +15 -3
  9. src/style.css +61 -0
.gitignore CHANGED
@@ -3,7 +3,6 @@
3
  .env
4
  *.pdf
5
  *.json
6
- *.txt
7
  temp/*
8
 
9
 
 
3
  .env
4
  *.pdf
5
  *.json
 
6
  temp/*
7
 
8
 
app.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import gradio as gr
3
+
4
+ from src.init import Initializer
5
+ from dotenv import load_dotenv
6
+
7
+ load_dotenv()
8
+ AUG_TOKEN = os.environ.get("AUGMENT_MODEL")
9
+ RES_TOKEN = os.environ.get("RESPONSE_MODEL")
10
+
11
+ pdf_loaded = False
12
+ processing = False
13
+
14
+
15
def load_pdf(pdf_file_path):
    """Build the question-answering pipeline for the uploaded PDF.

    Args:
        pdf_file_path: Value delivered by the file-upload component. It is
            expected to expose the on-disk path as ``.name``; a plain path
            string is accepted as well. ``None`` means nothing was uploaded.

    Returns:
        Status text for the "Status" label.
    """
    global pdf_loaded
    global brain
    # Pressing "Process" without an upload hands us None; dereferencing
    # ``.name`` on it would raise AttributeError instead of telling the user
    # what to do.
    if pdf_file_path is None:
        return "Please upload a PDF first."
    # Fall back to the value itself when it is already a bare path string.
    filename = getattr(pdf_file_path, "name", pdf_file_path)
    brain = Initializer.initialize(AUG_TOKEN, RES_TOKEN, filename)
    pdf_loaded = True
    return "Processing complete!"
22
+
23
+
24
def response(query, history):
    """Answer one chat question against the currently loaded PDF.

    Args:
        query: Text submitted from the question textbox.
        history: Chat transcript as a list of ``(question, answer)`` pairs;
            the new exchange is appended to it in place.

    Returns:
        A ``(textbox_value, history)`` pair. The textbox value is
        ``"Please wait..."`` when no PDF is loaded yet or another question is
        still being answered, and ``""`` (clearing the box) after a
        successful answer.
    """
    global processing
    if not pdf_loaded or processing:
        return "Please wait...", history
    processing = True
    try:
        output = brain.generate_answers(query)
    finally:
        # Always release the busy flag: if generate_answers raises and the
        # flag stays set, every later question is answered with
        # "Please wait..." until the app is restarted.
        processing = False
    history.append((query, output))
    return "", history
33
+
34
+
35
# Load the page stylesheet. The encoding is pinned so the result does not
# depend on the host's locale default.
with open("src/style.css", "r", encoding="utf-8") as file:
    css = file.read()

# Load the static page copy; it is one HTML file holding two sections.
with open("src/content.html", "r", encoding="utf-8") as file:
    html_content = file.read()

# The marker comment separates the intro (rendered above the chat widgets)
# from the "behind the scenes" notes (rendered below them). If the marker is
# missing, everything becomes the intro and the lower section is left empty.
parts = html_content.split("<!-- split here -->")
title_html = parts[0]
bts_html = parts[1] if len(parts) > 1 else ""
43
+
44
+
45
def loading():
    """Return the placeholder text shown in the status label."""
    placeholder = "Loading ..."
    return placeholder
47
+
48
+
49
+ with gr.Blocks(css=css) as app:
50
+ with gr.Column(elem_id="column_container"):
51
+ gr.HTML(title_html)
52
+ with gr.Column():
53
+ pdf = gr.File(label="Load your PDF document", file_types=[".pdf"])
54
+ with gr.Row():
55
+ status = gr.Label(label="Status", value="")
56
+ load_pdf_button = gr.Button(value="Process")
57
+
58
+ chatbot = gr.Chatbot([], elem_id="chatbot")
59
+ query = gr.Textbox(
60
+ label="Ask a question about the PDF",
61
+ placeholder="What do you want to know?",
62
+ )
63
+ clear = gr.ClearButton([query, chatbot])
64
+ gr.HTML(bts_html)
65
+
66
# Show the placeholder first, then run the (slow) PDF processing. The two
# steps are chained so the final status is always written after the
# placeholder; registering them as two independent click listeners leaves
# the order in which they update `status` unspecified.
load_pdf_button.click(loading, outputs=[status]).then(
    load_pdf, inputs=[pdf], outputs=[status]
)
# Submitting the textbox sends the question plus transcript to `response`,
# which returns the new textbox value and the updated transcript.
query.submit(response, [query, chatbot], [query, chatbot])
69
+
70
+ app.launch()
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ google-generativeai
2
+ langchain
3
+ sentence-transformers
4
+ chromadb
5
+ pypdf
src/app.py DELETED
@@ -1,15 +0,0 @@
1
- import os
2
-
3
- load_dotenv()
4
- import gradio as gr
5
- from init import Initializer
6
- from dotenv import load_dotenv
7
-
8
- AUG_TOKEN = os.environ.get("AUG_TOKEN")
9
- RES_TOKEN = os.environ.get("RES_TOKEN")
10
- chroma_filename = ""
11
- brain = Initializer.initialize(AUG_TOKEN, RES_TOKEN, chroma_filename)
12
-
13
- # TODO:
14
- # Chatbot like UI
15
- # Multiple PDF file handling ability
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/brain.py CHANGED
@@ -1,4 +1,4 @@
1
- from helper import load_chroma
2
  import numpy as np
3
  import logging
4
  import time
@@ -203,6 +203,7 @@ class Brain:
203
  def rag(self, query):
204
  try:
205
  if query is None:
 
206
  return None
207
  results = self.chroma_collection.query(
208
  query_texts=[query],
@@ -220,7 +221,10 @@ class Brain:
220
  def generate_answers(self, query):
221
  try:
222
  start_time = time.time()
223
- output = self.rag(query=query)
 
 
 
224
  print(f"\n\nExecution time: {time.time() - start_time} seconds\n")
225
  if output is None:
226
  return None
 
1
+ from src.helper import load_chroma
2
  import numpy as np
3
  import logging
4
  import time
 
203
  def rag(self, query):
204
  try:
205
  if query is None:
206
+ print("No query specified")
207
  return None
208
  results = self.chroma_collection.query(
209
  query_texts=[query],
 
221
  def generate_answers(self, query):
222
  try:
223
  start_time = time.time()
224
+ if query is None:
225
+ print("No query")
226
+ return "No Query"
227
+ output = self.rag(query)
228
  print(f"\n\nExecution time: {time.time() - start_time} seconds\n")
229
  if output is None:
230
  return None
src/content.html ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <div id="column_container">
2
+ <h1>Talk to Doc: Advanced RAG for Reasoning and QA with Gemini Pro</h1>
3
+ <p>Welcome to Talk to Doc, where finding answers in PDFs is as easy as chatting. Here's how to get the info you need
4
+ without any hassle.</p>
5
+ <h2>How to Use It</h2>
6
+ <ol>
7
+ <li><strong>Upload Your PDF:</strong> Pick the PDF you have questions about and upload it here.</li>
8
+ <li><strong>Click "Process":</strong> It will take a few seconds depending on the size of the PDF. Once the PDF
9
+ processing is complete, you will be able to ask questions to the bot.</li>
10
+ <li><strong>Start Asking:</strong> Just type your question in the box and hit enter.</li>
11
+ </ol>
12
+ </div>
13
+
14
+ <!-- split here -->
15
+
16
+ <div id="column_container">
17
+ <h2>What's Happening Behind the Scenes?</h2>
18
+ <ul>
19
+ <li><strong>Chunking:</strong> Your PDF is divided into smaller sections for better analysis, using LangChain's
20
+ text splitting capabilities to manage the document's content efficiently.</li>
21
+ <li><strong>Embedding with Gemini:</strong> Each section is then given a unique embedding, using the
22
+ 'GeminiEmbeddingFunction', which helps in understanding the content better for retrieval.</li>
23
+ <li><strong>Storing and Searching in ChromaDB:</strong> These embeddings are stored in ChromaDB, allowing for
24
+ fast and accurate retrieval of information related to your questions.</li>
25
+ <li><strong>Query Expansion:</strong> To enhance the search, your query is expanded using the
26
+ 'models/text-bison-001' model. This helps in considering various ways the question might be asked or
27
+ phrased.</li>
28
+ <li><strong>Cross Encoder Re-ranking:</strong> The potential answers are then re-ranked for relevance using the
29
+ 'cross-encoder/ms-marco-MiniLM-L-6-v2' model, ensuring that the most pertinent information is selected.</li>
30
+ <li><strong>Final Response Generation:</strong> The final answer is generated by the 'gemini-pro' model, which
31
+ synthesizes the information into a clear and concise response.</li>
32
+ </ul>
33
+ </div>
src/helper.py CHANGED
@@ -4,13 +4,13 @@ from langchain.text_splitter import (
4
  RecursiveCharacterTextSplitter,
5
  SentenceTransformersTokenTextSplitter,
6
  )
 
7
  def _read_pdf(filename):
8
  reader = PdfReader(filename)
9
  pdf_texts = [p.extract_text().strip() for p in reader.pages]
10
  pdf_texts = [text for text in pdf_texts if text]
11
  return pdf_texts
12
 
13
-
14
  def _chunk_texts(texts):
15
  character_splitter = RecursiveCharacterTextSplitter(
16
  separators=["\n\n", "\n", ". ", " ", ""], chunk_size=1600, chunk_overlap=200
@@ -28,14 +28,13 @@ def load_chroma(filename, collection_name, embedding_function):
28
  texts = _read_pdf(filename)
29
  chunks = _chunk_texts(texts)
30
  chroma_client = chromadb.Client()
31
- chroma_collection = chroma_client.create_collection(
32
  name=collection_name, embedding_function=embedding_function
33
  )
34
  ids = [str(i) for i in range(len(chunks))]
35
  chroma_collection.add(ids=ids, documents=chunks)
36
  return chroma_collection
37
 
38
-
39
  def word_wrap(string, n_chars=72):
40
  if len(string) < n_chars:
41
  return string
 
4
  RecursiveCharacterTextSplitter,
5
  SentenceTransformersTokenTextSplitter,
6
  )
7
+
8
  def _read_pdf(filename):
9
  reader = PdfReader(filename)
10
  pdf_texts = [p.extract_text().strip() for p in reader.pages]
11
  pdf_texts = [text for text in pdf_texts if text]
12
  return pdf_texts
13
 
 
14
  def _chunk_texts(texts):
15
  character_splitter = RecursiveCharacterTextSplitter(
16
  separators=["\n\n", "\n", ". ", " ", ""], chunk_size=1600, chunk_overlap=200
 
28
  texts = _read_pdf(filename)
29
  chunks = _chunk_texts(texts)
30
  chroma_client = chromadb.Client()
31
+ chroma_collection = chroma_client.get_or_create_collection(
32
  name=collection_name, embedding_function=embedding_function
33
  )
34
  ids = [str(i) for i in range(len(chunks))]
35
  chroma_collection.add(ids=ids, documents=chunks)
36
  return chroma_collection
37
 
 
38
  def word_wrap(string, n_chars=72):
39
  if len(string) < n_chars:
40
  return string
src/init.py CHANGED
@@ -1,5 +1,6 @@
1
- from brain import Brain
2
-
 
3
 
4
  class Initializer:
5
  @staticmethod
@@ -39,7 +40,17 @@ class Initializer:
39
  {"category": "HARM_CATEGORY_MEDICAL", "threshold": 4},
40
  {"category": "HARM_CATEGORY_DANGEROUS", "threshold": 4},
41
  ]
42
- chroma_collection_name = str.upper(chroma_filename) + "_COLLECT"
 
 
 
 
 
 
 
 
 
 
43
 
44
  return Brain(
45
  augment_model_name,
@@ -53,3 +64,4 @@ class Initializer:
53
  chroma_filename,
54
  chroma_collection_name,
55
  )
 
 
1
+ import os
2
+ from src.brain import Brain
3
+ import re
4
 
5
  class Initializer:
6
  @staticmethod
 
40
  {"category": "HARM_CATEGORY_MEDICAL", "threshold": 4},
41
  {"category": "HARM_CATEGORY_DANGEROUS", "threshold": 4},
42
  ]
43
def base_name(file_path):
    """Return the file name of *file_path* without directory or last extension.

    Only the final extension is removed, so ``"a/b/report.tar.gz"`` yields
    ``"report.tar"``.
    """
    # Distinct local names: the previous version rebound `base_name` inside
    # its own body and kept an unused `extension` variable.
    stem, _extension = os.path.splitext(os.path.basename(file_path))
    return stem
47
def clean_up(message):
    """Reduce free text to a compact fragment of at most 30 characters.

    URLs are dropped, then every character that is not a word character,
    whitespace or a comma is removed, then all whitespace is removed, and the
    result is truncated to 30 characters.

    Args:
        message: Text to sanitise.

    Returns:
        The sanitised text, possibly empty.
    """
    # Strip URLs first, while "://" and "." are still present. Doing it after
    # punctuation removal means the patterns can only match bare "http..." or
    # "www" + any character, which deletes ordinary words such as "httpd".
    # The dot after "www" is escaped so it matches a literal dot only.
    message = re.sub(r"https?://\S+|www\.\S+", "", message)
    # NOTE(review): commas are deliberately preserved here as in the original
    # pattern — confirm the consumer of this value accepts them.
    message = re.sub(r"[^\w\s,]", "", message)
    message = re.sub(r"\s+", "", message)
    return message[:30]
52
+
53
+ chroma_collection_name = str.upper(clean_up(base_name(chroma_filename))) + "_COLLECT"
54
 
55
  return Brain(
56
  augment_model_name,
 
64
  chroma_filename,
65
  chroma_collection_name,
66
  )
67
+
src/style.css ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ body {
2
+ background-color: #171717;
3
+ font-family: 'San Francisco', 'Helvetica Neue', sans-serif;
4
+ color: #c9d1d9;
5
+ }
6
+
7
+ #column_container {
8
+ max-width: 700px;
9
+ margin: auto;
10
+ background-color: #0d1117;
11
+ border-radius: 12px;
12
+ box-shadow: 0 4px 8px rgba(255, 255, 255, 0.1);
13
+ padding: 40px;
14
+ }
15
+
16
+ h1 {
17
+ color: #58a6ff;
18
+ text-align: center;
19
+ font-family: 'Avenir Next', 'Helvetica Neue', sans-serif;
20
+ font-size: 3em;
21
+ text-shadow: 1px 1px 2px rgba(255, 255, 255, 0.08);
22
+ }
23
+
24
+ p,
25
+ li {
26
+ color: #c9d1d9;
27
+ line-height: 1.6;
28
+ font-family: 'Georgia', serif;
29
+ }
30
+
31
+ strong {
32
+ color: #79c0ff;
33
+ }
34
+
35
+ ul {
36
+ list-style: none;
37
+ padding: 0;
38
+ margin: 20px 0;
39
+ }
40
+
41
+ li {
42
+ padding-left: 20px;
43
+ position: relative;
44
+ }
45
+
46
/* Primary action button. */
button {
  background-color: #238636;
  color: #ffffff;
  border: none;
  border-radius: 8px;
  padding: 14px 24px;
  cursor: pointer;
  font-family: inherit;
  font-size: 1em;
  transition: background-color 0.3s ease;
  box-shadow: 0 1px 3px rgba(255, 255, 255, 0.1);
}

/* Hover state as its own top-level rule. When it sat inside the unclosed
   `button` block it was parsed as a nested rule, which does not target the
   button itself. */
button:hover {
  background-color: #196c2e;
}