Spaces:
Runtime error
Runtime error
Chintan Donda
committed on
Commit
•
8f40cff
1
Parent(s):
2cd07d5
Fixing issues
Browse files
- app.py +36 -18
- src/constants.py +2 -1
- src/data_loader.py +17 -2
- src/langchain_utils.py +20 -16
app.py
CHANGED
@@ -39,8 +39,8 @@ class DomState:
|
|
39 |
|
40 |
def click_handler_for_get_relevant_paragraphs(
|
41 |
self,
|
42 |
-
|
43 |
-
|
44 |
):
|
45 |
self.relevant_paragraphs = self.kkms_kssw_obj.query(
|
46 |
question=question,
|
@@ -69,19 +69,24 @@ class DomState:
|
|
69 |
|
70 |
def click_handler_for_get_answer(
|
71 |
self,
|
72 |
-
relevant_paragraphs,
|
|
|
73 |
):
|
74 |
-
self.answer = self.kkms_kssw_obj.langchain_utils_obj.get_answer_from_para(
|
|
|
|
|
|
|
75 |
return self.answer
|
76 |
|
77 |
|
78 |
-
def click_handler_for_mandi_price(
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
|
|
85 |
if state_name and apmc_name and commodity_name and from_date and to_date:
|
86 |
self.mandi_price = self.kkms_kssw_obj.mandi_utils_obj.get_mandi_price(state_name, apmc_name, commodity_name, from_date, to_date)
|
87 |
return self.mandi_price
|
@@ -117,12 +122,12 @@ class DomState:
|
|
117 |
self,
|
118 |
doc_type,
|
119 |
files_or_urls,
|
120 |
-
|
121 |
):
|
122 |
self.kkms_kssw_obj.upload_data(
|
123 |
doc_type=constants_utils.DATA_SOURCES[doc_type],
|
124 |
files_or_urls=files_or_urls,
|
125 |
-
index_category=
|
126 |
)
|
127 |
|
128 |
|
@@ -262,13 +267,16 @@ with gr.Blocks(title='KKMS-KSSW Demo') as demo:
|
|
262 |
with gr.Row(visible=True) as rowCustomQuery:
|
263 |
with gr.Column(scale=1, min_width=600):
|
264 |
with gr.Tab(label='Relevant paragraphs'):
|
|
|
|
|
|
|
265 |
question = gr.Textbox(label="Enter your question", placeholder='Type the question here')
|
266 |
# Get the Relevant paragraphs for the question asked
|
267 |
relevant_paragraphs = gr.Textbox(label="Relevant paragraphs are:", value=dom.relevant_paragraphs, interactive=False)
|
268 |
b_relevant_paragraphs = gr.Button("Get Relevant paragraphs").style(size='sm')
|
269 |
b_relevant_paragraphs.click(
|
270 |
fn=dom.click_handler_for_get_relevant_paragraphs,
|
271 |
-
inputs=question,
|
272 |
outputs=[relevant_paragraphs]
|
273 |
)
|
274 |
|
@@ -396,6 +404,10 @@ with gr.Blocks(title='KKMS-KSSW Demo') as demo:
|
|
396 |
with gr.Row(visible=False) as rowLoadCustomData:
|
397 |
with gr.Column(scale=1, min_width=600):
|
398 |
with gr.Tab(label='Load Custom Data'):
|
|
|
|
|
|
|
|
|
399 |
doc_type = gr.Radio(
|
400 |
list(constants_utils.DATA_SOURCES.keys()),
|
401 |
label="Select data source (Supports uploading multiple Files/URLs)",
|
@@ -414,14 +426,17 @@ with gr.Blocks(title='KKMS-KSSW Demo') as demo:
|
|
414 |
b_files = gr.Button("Load PDF Files").style(size='sm')
|
415 |
b_files.click(
|
416 |
fn=dom.click_handler_for_load_files_urls,
|
417 |
-
inputs=[doc_type, upload_button]
|
418 |
)
|
419 |
|
420 |
with gr.Row(visible=False) as rowUploadOnlinePdf:
|
421 |
with gr.Column(scale=1, min_width=600):
|
422 |
urls = gr.Textbox(label="Enter URLs for Online PDF (Supports uploading from multiple URLs. Enter the URLs in comma (,) separated format)", placeholder='Type the URLs here')
|
423 |
b_urls = gr.Button("Load Online PDFs").style(size='sm')
|
424 |
-
b_urls.click(
|
|
|
|
|
|
|
425 |
|
426 |
with gr.Row(visible=False) as rowUploadTextFile:
|
427 |
with gr.Column(scale=1, min_width=600):
|
@@ -435,14 +450,17 @@ with gr.Blocks(title='KKMS-KSSW Demo') as demo:
|
|
435 |
b_files = gr.Button("Load Text Files").style(size='sm')
|
436 |
b_files.click(
|
437 |
fn=dom.click_handler_for_load_files_urls,
|
438 |
-
inputs=[doc_type, file_output]
|
439 |
)
|
440 |
|
441 |
with gr.Row(visible=False) as rowUploadUrls:
|
442 |
with gr.Column(scale=1, min_width=600):
|
443 |
urls = gr.Textbox(label="Enter URLs (Supports uploading from multiple URLs. Enter the URLs in comma (,) separated format)", placeholder='Type the URLs here')
|
444 |
b_urls = gr.Button("Load URLs").style(size='sm')
|
445 |
-
b_urls.click(
|
|
|
|
|
|
|
446 |
|
447 |
doc_type.change(
|
448 |
fn=dom.select_files_urls,
|
|
|
39 |
|
40 |
def click_handler_for_get_relevant_paragraphs(
|
41 |
self,
|
42 |
+
question_category,
|
43 |
+
question
|
44 |
):
|
45 |
self.relevant_paragraphs = self.kkms_kssw_obj.query(
|
46 |
question=question,
|
|
|
69 |
|
70 |
def click_handler_for_get_answer(
|
71 |
self,
|
72 |
+
relevant_paragraphs,
|
73 |
+
question
|
74 |
):
|
75 |
+
self.answer = self.kkms_kssw_obj.langchain_utils_obj.get_answer_from_para(
|
76 |
+
relevant_paragraphs,
|
77 |
+
question
|
78 |
+
)
|
79 |
return self.answer
|
80 |
|
81 |
|
82 |
+
def click_handler_for_mandi_price(
|
83 |
+
self,
|
84 |
+
state_name,
|
85 |
+
apmc_name,
|
86 |
+
commodity_name,
|
87 |
+
from_date,
|
88 |
+
to_date
|
89 |
+
):
|
90 |
if state_name and apmc_name and commodity_name and from_date and to_date:
|
91 |
self.mandi_price = self.kkms_kssw_obj.mandi_utils_obj.get_mandi_price(state_name, apmc_name, commodity_name, from_date, to_date)
|
92 |
return self.mandi_price
|
|
|
122 |
self,
|
123 |
doc_type,
|
124 |
files_or_urls,
|
125 |
+
question_category
|
126 |
):
|
127 |
self.kkms_kssw_obj.upload_data(
|
128 |
doc_type=constants_utils.DATA_SOURCES[doc_type],
|
129 |
files_or_urls=files_or_urls,
|
130 |
+
index_category=question_category
|
131 |
)
|
132 |
|
133 |
|
|
|
267 |
with gr.Row(visible=True) as rowCustomQuery:
|
268 |
with gr.Column(scale=1, min_width=600):
|
269 |
with gr.Tab(label='Relevant paragraphs'):
|
270 |
+
question_category = gr.Dropdown(
|
271 |
+
constants_utils.INDEX_CATEGORY,
|
272 |
+
label="Select Query Type")
|
273 |
question = gr.Textbox(label="Enter your question", placeholder='Type the question here')
|
274 |
# Get the Relevant paragraphs for the question asked
|
275 |
relevant_paragraphs = gr.Textbox(label="Relevant paragraphs are:", value=dom.relevant_paragraphs, interactive=False)
|
276 |
b_relevant_paragraphs = gr.Button("Get Relevant paragraphs").style(size='sm')
|
277 |
b_relevant_paragraphs.click(
|
278 |
fn=dom.click_handler_for_get_relevant_paragraphs,
|
279 |
+
inputs=[question_category, question],
|
280 |
outputs=[relevant_paragraphs]
|
281 |
)
|
282 |
|
|
|
404 |
with gr.Row(visible=False) as rowLoadCustomData:
|
405 |
with gr.Column(scale=1, min_width=600):
|
406 |
with gr.Tab(label='Load Custom Data'):
|
407 |
+
question_category = gr.Dropdown(
|
408 |
+
constants_utils.INDEX_CATEGORY,
|
409 |
+
label="Select Query Type")
|
410 |
+
|
411 |
doc_type = gr.Radio(
|
412 |
list(constants_utils.DATA_SOURCES.keys()),
|
413 |
label="Select data source (Supports uploading multiple Files/URLs)",
|
|
|
426 |
b_files = gr.Button("Load PDF Files").style(size='sm')
|
427 |
b_files.click(
|
428 |
fn=dom.click_handler_for_load_files_urls,
|
429 |
+
inputs=[doc_type, upload_button, question_category]
|
430 |
)
|
431 |
|
432 |
with gr.Row(visible=False) as rowUploadOnlinePdf:
|
433 |
with gr.Column(scale=1, min_width=600):
|
434 |
urls = gr.Textbox(label="Enter URLs for Online PDF (Supports uploading from multiple URLs. Enter the URLs in comma (,) separated format)", placeholder='Type the URLs here')
|
435 |
b_urls = gr.Button("Load Online PDFs").style(size='sm')
|
436 |
+
b_urls.click(
|
437 |
+
fn=dom.click_handler_for_load_files_urls,
|
438 |
+
inputs=[doc_type, urls, question_category]
|
439 |
+
)
|
440 |
|
441 |
with gr.Row(visible=False) as rowUploadTextFile:
|
442 |
with gr.Column(scale=1, min_width=600):
|
|
|
450 |
b_files = gr.Button("Load Text Files").style(size='sm')
|
451 |
b_files.click(
|
452 |
fn=dom.click_handler_for_load_files_urls,
|
453 |
+
inputs=[doc_type, file_output, question_category]
|
454 |
)
|
455 |
|
456 |
with gr.Row(visible=False) as rowUploadUrls:
|
457 |
with gr.Column(scale=1, min_width=600):
|
458 |
urls = gr.Textbox(label="Enter URLs (Supports uploading from multiple URLs. Enter the URLs in comma (,) separated format)", placeholder='Type the URLs here')
|
459 |
b_urls = gr.Button("Load URLs").style(size='sm')
|
460 |
+
b_urls.click(
|
461 |
+
fn=dom.click_handler_for_load_files_urls,
|
462 |
+
inputs=[doc_type, urls, question_category]
|
463 |
+
)
|
464 |
|
465 |
doc_type.change(
|
466 |
fn=dom.select_files_urls,
|
src/constants.py
CHANGED
@@ -22,7 +22,7 @@ INDEX_CATEGORY = [
|
|
22 |
# 'insurance',
|
23 |
# 'soil',
|
24 |
'general',
|
25 |
-
'vegetables'
|
26 |
]
|
27 |
|
28 |
# Doctype of the master index of each index category. Master index for each index category would be stored under this key.
|
@@ -43,6 +43,7 @@ DATA_SOURCES = {
|
|
43 |
# LangChain related constants
|
44 |
TEXT_SPLITTER_CHUNK_SIZE = 1000
|
45 |
TEXT_SPLITTER_CHUNK_OVERLAP = 0
|
|
|
46 |
|
47 |
|
48 |
URLS = [
|
|
|
22 |
# 'insurance',
|
23 |
# 'soil',
|
24 |
'general',
|
25 |
+
'vegetables'
|
26 |
]
|
27 |
|
28 |
# Doctype of the master index of each index category. Master index for each index category would be stored under this key.
|
|
|
43 |
# LangChain related constants
|
44 |
TEXT_SPLITTER_CHUNK_SIZE = 1000
|
45 |
TEXT_SPLITTER_CHUNK_OVERLAP = 0
|
46 |
+
TEXT_SPLITTER_SEPARATOR = '\n\n'
|
47 |
|
48 |
|
49 |
URLS = [
|
src/data_loader.py
CHANGED
@@ -135,7 +135,17 @@ class DATA_LOADER:
|
|
135 |
|
136 |
# Load data from files on the local directory (files may be of type .pdf, .txt, .doc, etc.)
|
137 |
elif doc_type == 'directory':
|
138 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
139 |
|
140 |
# Load data from URLs in Knowledge Base format
|
141 |
elif doc_type == 'url-kb':
|
@@ -190,7 +200,12 @@ class DATA_LOADER:
|
|
190 |
):
|
191 |
cleaned_documents = []
|
192 |
for document in documents:
|
193 |
-
|
|
|
|
|
|
|
|
|
|
|
194 |
cleaned_documents.append(document)
|
195 |
return cleaned_documents
|
196 |
|
|
|
135 |
|
136 |
# Load data from files on the local directory (files may be of type .pdf, .txt, .doc, etc.)
|
137 |
elif doc_type == 'directory':
|
138 |
+
# Load multiple PDFs from directory
|
139 |
+
if os.path.isdir(doc_filepath):
|
140 |
+
documents = SimpleDirectoryReader(
|
141 |
+
input_dir=doc_filepath
|
142 |
+
).load_data()
|
143 |
+
|
144 |
+
# Loading from a file
|
145 |
+
elif os.path.isfile(doc_filepath):
|
146 |
+
documents.extend(SimpleDirectoryReader(
|
147 |
+
input_files=[doc_filepath]
|
148 |
+
).load_data())
|
149 |
|
150 |
# Load data from URLs in Knowledge Base format
|
151 |
elif doc_type == 'url-kb':
|
|
|
200 |
):
|
201 |
cleaned_documents = []
|
202 |
for document in documents:
|
203 |
+
if hasattr(document, 'page_content'):
|
204 |
+
document.page_content = self.utils_obj.replace_newlines_and_spaces(document.page_content)
|
205 |
+
elif hasattr(document, 'text'):
|
206 |
+
document.text = self.utils_obj.replace_newlines_and_spaces(document.text)
|
207 |
+
else:
|
208 |
+
document = self.utils_obj.replace_newlines_and_spaces(document)
|
209 |
cleaned_documents.append(document)
|
210 |
return cleaned_documents
|
211 |
|
src/langchain_utils.py
CHANGED
@@ -22,7 +22,6 @@ from typing import Dict, List, Optional
|
|
22 |
|
23 |
import os
|
24 |
os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY')
|
25 |
-
os.environ['HUGGINGFACEHUB_API_TOKEN'] = os.getenv('HUGGINGFACEHUB_API_TOKEN')
|
26 |
|
27 |
import logging
|
28 |
logger = logging.getLogger(__name__)
|
@@ -186,7 +185,11 @@ class LANGCHAIN_UTILS:
|
|
186 |
custom_prompt=True
|
187 |
):
|
188 |
# Prepare data (Split paragraph into chunks of small documents)
|
189 |
-
text_splitter = CharacterTextSplitter(
|
|
|
|
|
|
|
|
|
190 |
texts = text_splitter.split_text(para)
|
191 |
|
192 |
if self.index_type == 'FAISS':
|
@@ -299,15 +302,16 @@ class LANGCHAIN_UTILS:
|
|
299 |
|
300 |
logger.info(f'Creating index')
|
301 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
302 |
############## Build the Vector store for docs ##############
|
303 |
# Vector store using Facebook AI Similarity Search
|
304 |
if self.index_type == 'FAISS':
|
305 |
-
text_splitter = CharacterTextSplitter(
|
306 |
-
chunk_size=constants_utils.TEXT_SPLITTER_CHUNK_SIZE,
|
307 |
-
chunk_overlap=constants_utils.TEXT_SPLITTER_CHUNK_OVERLAP,
|
308 |
-
)
|
309 |
-
self.documents = text_splitter.split_documents(self.documents)
|
310 |
-
|
311 |
self.index = FAISS.from_documents(
|
312 |
self.documents,
|
313 |
self.embeddings
|
@@ -318,11 +322,6 @@ class LANGCHAIN_UTILS:
|
|
318 |
if not os.path.exists(self.index_filepath):
|
319 |
os.makedirs(self.index_filepath)
|
320 |
|
321 |
-
text_splitter = CharacterTextSplitter(
|
322 |
-
chunk_size=constants_utils.TEXT_SPLITTER_CHUNK_SIZE,
|
323 |
-
chunk_overlap=constants_utils.TEXT_SPLITTER_CHUNK_OVERLAP
|
324 |
-
)
|
325 |
-
self.documents = text_splitter.split_documents(self.documents)
|
326 |
self.index = Chroma.from_documents(
|
327 |
self.documents,
|
328 |
self.embeddings,
|
@@ -478,7 +477,7 @@ class LANGCHAIN_UTILS:
|
|
478 |
|
479 |
logger.info(f'Saving index to: {index_filepath}')
|
480 |
|
481 |
-
if not os.path.exists(index_filepath):
|
482 |
os.makedirs(index_filepath)
|
483 |
|
484 |
if self.index_type == 'FAISS':
|
@@ -598,6 +597,7 @@ class LANGCHAIN_UTILS:
|
|
598 |
if not index or not isinstance(index, GPTSimpleVectorIndex):
|
599 |
logger.warning(f'{doc_type} index to be merged is not an instance of type llama_index.GPTSimpleVectorIndex')
|
600 |
continue
|
|
|
601 |
raise NotImplementedError
|
602 |
|
603 |
# Store index_category master index
|
@@ -634,7 +634,11 @@ class LANGCHAIN_UTILS:
|
|
634 |
logger.info('Chroma DB initialized successfully!')
|
635 |
|
636 |
|
637 |
-
def query_chromadb(
|
|
|
|
|
|
|
|
|
638 |
return self.index.similarity_search(query=question, k=k)
|
639 |
|
640 |
|
@@ -658,7 +662,7 @@ class LANGCHAIN_UTILS:
|
|
658 |
response = None
|
659 |
|
660 |
# Get the index of the given question_category
|
661 |
-
index = self.index_category_doc_type_wise_index[question_category][
|
662 |
|
663 |
if self.index_type == 'FAISS':
|
664 |
response = index.similarity_search(
|
|
|
22 |
|
23 |
import os
|
24 |
os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY')
|
|
|
25 |
|
26 |
import logging
|
27 |
logger = logging.getLogger(__name__)
|
|
|
185 |
custom_prompt=True
|
186 |
):
|
187 |
# Prepare data (Split paragraph into chunks of small documents)
|
188 |
+
text_splitter = CharacterTextSplitter(
|
189 |
+
chunk_size=constants_utils.TEXT_SPLITTER_CHUNK_SIZE,
|
190 |
+
chunk_overlap=constants_utils.TEXT_SPLITTER_CHUNK_OVERLAP,
|
191 |
+
separator=constants_utils.TEXT_SPLITTER_SEPARATOR
|
192 |
+
)
|
193 |
texts = text_splitter.split_text(para)
|
194 |
|
195 |
if self.index_type == 'FAISS':
|
|
|
302 |
|
303 |
logger.info(f'Creating index')
|
304 |
|
305 |
+
text_splitter = CharacterTextSplitter(
|
306 |
+
chunk_size=constants_utils.TEXT_SPLITTER_CHUNK_SIZE,
|
307 |
+
chunk_overlap=constants_utils.TEXT_SPLITTER_CHUNK_OVERLAP,
|
308 |
+
separator=constants_utils.TEXT_SPLITTER_SEPARATOR
|
309 |
+
)
|
310 |
+
self.documents = text_splitter.split_documents(self.documents)
|
311 |
+
|
312 |
############## Build the Vector store for docs ##############
|
313 |
# Vector store using Facebook AI Similarity Search
|
314 |
if self.index_type == 'FAISS':
|
|
|
|
|
|
|
|
|
|
|
|
|
315 |
self.index = FAISS.from_documents(
|
316 |
self.documents,
|
317 |
self.embeddings
|
|
|
322 |
if not os.path.exists(self.index_filepath):
|
323 |
os.makedirs(self.index_filepath)
|
324 |
|
|
|
|
|
|
|
|
|
|
|
325 |
self.index = Chroma.from_documents(
|
326 |
self.documents,
|
327 |
self.embeddings,
|
|
|
477 |
|
478 |
logger.info(f'Saving index to: {index_filepath}')
|
479 |
|
480 |
+
if not os.path.exists(index_filepath) and os.path.isdir(index_filepath):
|
481 |
os.makedirs(index_filepath)
|
482 |
|
483 |
if self.index_type == 'FAISS':
|
|
|
597 |
if not index or not isinstance(index, GPTSimpleVectorIndex):
|
598 |
logger.warning(f'{doc_type} index to be merged is not an instance of type llama_index.GPTSimpleVectorIndex')
|
599 |
continue
|
600 |
+
import pdb; pdb.set_trace()
|
601 |
raise NotImplementedError
|
602 |
|
603 |
# Store index_category master index
|
|
|
634 |
logger.info('Chroma DB initialized successfully!')
|
635 |
|
636 |
|
637 |
+
def query_chromadb(
|
638 |
+
self,
|
639 |
+
question,
|
640 |
+
k=1
|
641 |
+
):
|
642 |
return self.index.similarity_search(query=question, k=k)
|
643 |
|
644 |
|
|
|
662 |
response = None
|
663 |
|
664 |
# Get the index of the given question_category
|
665 |
+
index = self.index_category_doc_type_wise_index[question_category][constants_utils.INDEX_CATEGORY_MASTER_INDEX_DOC_TYPE]
|
666 |
|
667 |
if self.index_type == 'FAISS':
|
668 |
response = index.similarity_search(
|