Update app.py
app.py CHANGED
@@ -1,7 +1,6 @@
 import os
-import gradio as gr
 from dotenv import load_dotenv
-
+import gradio as gr
 from langchain_core.document_loaders import BaseLoader
 from langchain_core.documents import Document as LCDocument
 from docling.document_converter import DocumentConverter
@@ -13,11 +12,17 @@ from langchain_core.prompts import PromptTemplate
 from langchain_core.runnables import RunnablePassthrough
 from langchain_core.output_parsers import StrOutputParser
 from tempfile import TemporaryDirectory
+from typing import Iterator
 
 # Load environment variables
 load_dotenv()
 HF_API_KEY = os.environ.get("HF_API_KEY")
 
+# Constants
+FILE_PATH = "10_Pages_Vol_5.pdf"  # Your hardcoded PDF path
+HF_EMBED_MODEL_ID = "BAAI/bge-small-en-v1.5"
+HF_LLM_MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.3"
+
 class DoclingPDFLoader(BaseLoader):
     def __init__(self, file_path: str | list[str]) -> None:
         self._file_paths = file_path if isinstance(file_path, list) else [file_path]
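(Aside: the hunks skip the unchanged body of DoclingPDFLoader, old lines 24-31. Based on the standard Docling-to-LangChain loader pattern — which the newly imported Iterator fits — the elided part plausibly looks like the sketch below. This is an assumption, not something visible in the diff; in particular, self._converter would have to be a DocumentConverter created in __init__.)

    # Sketch only: assumed continuation of DoclingPDFLoader, not shown in this diff.
    # Assumes __init__ also sets: self._converter = DocumentConverter()
    def lazy_load(self) -> Iterator[LCDocument]:
        for source in self._file_paths:
            dl_doc = self._converter.convert(source).document  # parse the PDF with Docling
            text = dl_doc.export_to_markdown()                 # serialize to Markdown text
            yield LCDocument(page_content=text)                # wrap for LangChain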
@@ -32,84 +37,65 @@
 def format_docs(docs):
     return "\n\n".join(doc.page_content for doc in docs)
 
-
-
-
-
-
-
-
-
-    splits = text_splitter.split_documents(docs)
-
-    # Setup embeddings
-    embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-small-en-v1.5")
-
-    # Setup Milvus vectorstore
-    tmp_dir = TemporaryDirectory()
-    MILVUS_URI = f"{tmp_dir.name}/milvus_demo.db"
-    vectorstore = Milvus.from_documents(
-        splits,
-        embeddings,
-        connection_args={"uri": MILVUS_URI},
-        drop_old=True,
-        index_params={"index_type": "IVF_FLAT", "metric_type": "L2"},
-    )
-
-    # Setup LLM
-    llm = HuggingFaceEndpoint(
-        repo_id="mistralai/Mistral-7B-Instruct-v0.3",
-        huggingfacehub_api_token=HF_API_KEY,
-        task="text-generation",
-    )
-
-    # Setup RAG chain
-    retriever = vectorstore.as_retriever()
-    prompt = PromptTemplate.from_template(
-        "Context information is below.\n---------------------\n{context}\n---------------------\nUse the context of the work you have been currently trained on, not your prior knowledge, to answer the queries asked. Please use Chapter numbers and page numbers as references as well.\nQuery: {question}\nAnswer:\n"
-    )
-
-    return (
-        {"context": retriever | format_docs, "question": RunnablePassthrough()}
-        | prompt
-        | llm
-        | StrOutputParser()
-    )
+# Setup the RAG pipeline
+loader = DoclingPDFLoader(file_path=FILE_PATH)
+text_splitter = RecursiveCharacterTextSplitter(
+    chunk_size=1000,
+    chunk_overlap=200,
+)
+docs = loader.load()
+splits = text_splitter.split_documents(docs)
 
-
-
-
-
-
-
-
-
-
+embeddings = HuggingFaceEmbeddings(model_name=HF_EMBED_MODEL_ID)
+
+# Setup Milvus
+tmp_dir = TemporaryDirectory()
+MILVUS_URI = f"{tmp_dir.name}/milvus_demo.db"
+vectorstore = Milvus.from_documents(
+    splits,
+    embeddings,
+    connection_args={"uri": MILVUS_URI},
+    drop_old=True,
+    index_params={"index_type": "IVF_FLAT", "metric_type": "L2"},
+)
+
+# Setup LLM
+llm = HuggingFaceEndpoint(
+    repo_id=HF_LLM_MODEL_ID,
+    huggingfacehub_api_token=HF_API_KEY,
+    task="text-generation",
+)
+
+# Setup RAG chain
+retriever = vectorstore.as_retriever()
+prompt = PromptTemplate.from_template(
+    "Context information is below.\n---------------------\n{context}\n---------------------\nUse the context of the work you have been currently trained on, not your prior knowledge, to answer the queries asked. Please use Chapter numbers and page numbers as references as well.\nQuery: {question}\nAnswer:\n"
+)
+
+rag_chain = (
+    {"context": retriever | format_docs, "question": RunnablePassthrough()}
+    | prompt
+    | llm
+    | StrOutputParser()
+)
+
+def process_query(query):
     try:
-        # Setup and run the RAG chain
-        rag_chain = setup_rag_chain(temp_pdf_path)
         response = rag_chain.invoke(query)
         return response
     except Exception as e:
         return f"An error occurred: {str(e)}"
-    finally:
-        # Clean up temporary file
-        if os.path.exists(temp_pdf_path):
-            os.remove(temp_pdf_path)
 
 # Create Gradio interface
 demo = gr.Interface(
     fn=process_query,
-    inputs=
-        gr.File(label="Upload PDF", file_types=[".pdf"]),
-        gr.Textbox(label="Enter your question")
-    ],
+    inputs=gr.Textbox(label="Enter your question about the document"),
     outputs=gr.Textbox(label="Answer"),
-    title="
-    description="
+    title="Document Q&A System",
+    description=f"Ask questions about {FILE_PATH}",
     examples=[
-        [
-        [
+        ["Who are the members of the Sanhedrin who are present?"],
+        ["What are the main themes discussed in the document?"]
     ]
 )
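For readers new to LCEL pipe syntax, the rag_chain composed in the new code has roughly the data flow of the imperative sketch below. run_chain is a hypothetical name; retriever, format_docs, prompt, and llm are the module-level objects from the diff:

def run_chain(question: str) -> str:
    # The dict step fans the input out: "context" sends the question through
    # the retriever and format_docs, while RunnablePassthrough() forwards the
    # raw question string unchanged into the prompt.
    docs = retriever.invoke(question)          # top-k chunks from Milvus
    context = format_docs(docs)                # join chunks with blank lines
    text = prompt.format(context=context, question=question)
    # StrOutputParser() is effectively an identity step for a string-output LLM.
    return llm.invoke(text)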
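A minimal way to exercise the rebuilt module-level pipeline, reusing one of the example queries the interface now ships with. The demo.launch() call is an assumption about how the Space serves the UI; it is not visible in these hunks:

if __name__ == "__main__":
    # Quick smoke test before serving the UI (query taken from examples above)
    print(rag_chain.invoke("What are the main themes discussed in the document?"))
    demo.launch()  # assumed: the standard Gradio entry point for a Space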