Update app.py
app.py CHANGED
@@ -1,7 +1,6 @@
 import os
-import gradio as gr
 from dotenv import load_dotenv
-
+import gradio as gr
 from langchain_core.document_loaders import BaseLoader
 from langchain_core.documents import Document as LCDocument
 from docling.document_converter import DocumentConverter
@@ -13,11 +12,17 @@ from langchain_core.prompts import PromptTemplate
 from langchain_core.runnables import RunnablePassthrough
 from langchain_core.output_parsers import StrOutputParser
 from tempfile import TemporaryDirectory
+from typing import Iterator
 
 # Load environment variables
 load_dotenv()
 HF_API_KEY = os.environ.get("HF_API_KEY")
 
+# Constants
+FILE_PATH = "10_Pages_Vol_5.pdf"  # Your hardcoded PDF path
+HF_EMBED_MODEL_ID = "BAAI/bge-small-en-v1.5"
+HF_LLM_MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.3"
+
 class DoclingPDFLoader(BaseLoader):
     def __init__(self, file_path: str | list[str]) -> None:
         self._file_paths = file_path if isinstance(file_path, list) else [file_path]
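(Aside: the hunks skip the unchanged body of DoclingPDFLoader, old lines 24-31. Based on the standard Docling-to-LangChain loader pattern — which the newly imported Iterator fits — the elided part plausibly looks like the sketch below. This is an assumption, not something visible in the diff; in particular, self._converter would have to be a DocumentConverter created in __init__.)

    # Sketch only: assumed continuation of DoclingPDFLoader, not shown in this diff.
    # Assumes __init__ also sets: self._converter = DocumentConverter()
    def lazy_load(self) -> Iterator[LCDocument]:
        for source in self._file_paths:
            dl_doc = self._converter.convert(source).document  # parse the PDF with Docling
            text = dl_doc.export_to_markdown()                 # serialize to Markdown text
            yield LCDocument(page_content=text)                # wrap for LangChain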
@@ -32,84 +37,65 @@
 def format_docs(docs):
     return "\n\n".join(doc.page_content for doc in docs)
 
-
-
-
-
-
-
-
-
-    splits = text_splitter.split_documents(docs)
-
-    # Setup embeddings
-    embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-small-en-v1.5")
-
-    # Setup Milvus vectorstore
-    tmp_dir = TemporaryDirectory()
-    MILVUS_URI = f"{tmp_dir.name}/milvus_demo.db"
-    vectorstore = Milvus.from_documents(
-        splits,
-        embeddings,
-        connection_args={"uri": MILVUS_URI},
-        drop_old=True,
-        index_params={"index_type": "IVF_FLAT", "metric_type": "L2"},
-    )
-
-    # Setup LLM
-    llm = HuggingFaceEndpoint(
-        repo_id="mistralai/Mistral-7B-Instruct-v0.3",
-        huggingfacehub_api_token=HF_API_KEY,
-        task="text-generation",
-    )
-
-    # Setup RAG chain
-    retriever = vectorstore.as_retriever()
-    prompt = PromptTemplate.from_template(
-        "Context information is below.\n---------------------\n{context}\n---------------------\nUse the context of the work you have been currently trained on, not your prior knowledge, to answer the queries asked. Please use Chapter numbers and page numbers as references as well.\nQuery: {question}\nAnswer:\n"
-    )
-
-    return (
-        {"context": retriever | format_docs, "question": RunnablePassthrough()}
-        | prompt
-        | llm
-        | StrOutputParser()
-    )
+# Setup the RAG pipeline
+loader = DoclingPDFLoader(file_path=FILE_PATH)
+text_splitter = RecursiveCharacterTextSplitter(
+    chunk_size=1000,
+    chunk_overlap=200,
+)
+docs = loader.load()
+splits = text_splitter.split_documents(docs)
 
-
-
-
-
-
-
-
-
-
+embeddings = HuggingFaceEmbeddings(model_name=HF_EMBED_MODEL_ID)
+
+# Setup Milvus
+tmp_dir = TemporaryDirectory()
+MILVUS_URI = f"{tmp_dir.name}/milvus_demo.db"
+vectorstore = Milvus.from_documents(
+    splits,
+    embeddings,
+    connection_args={"uri": MILVUS_URI},
+    drop_old=True,
+    index_params={"index_type": "IVF_FLAT", "metric_type": "L2"},
+)
+
+# Setup LLM
+llm = HuggingFaceEndpoint(
+    repo_id=HF_LLM_MODEL_ID,
+    huggingfacehub_api_token=HF_API_KEY,
+    task="text-generation",
+)
+
+# Setup RAG chain
+retriever = vectorstore.as_retriever()
+prompt = PromptTemplate.from_template(
+    "Context information is below.\n---------------------\n{context}\n---------------------\nUse the context of the work you have been currently trained on, not your prior knowledge, to answer the queries asked. Please use Chapter numbers and page numbers as references as well.\nQuery: {question}\nAnswer:\n"
+)
+
+rag_chain = (
+    {"context": retriever | format_docs, "question": RunnablePassthrough()}
+    | prompt
+    | llm
+    | StrOutputParser()
+)
+
+def process_query(query):
     try:
-        # Setup and run the RAG chain
-        rag_chain = setup_rag_chain(temp_pdf_path)
         response = rag_chain.invoke(query)
         return response
     except Exception as e:
         return f"An error occurred: {str(e)}"
-    finally:
-        # Clean up temporary file
-        if os.path.exists(temp_pdf_path):
-            os.remove(temp_pdf_path)
 
 # Create Gradio interface
 demo = gr.Interface(
     fn=process_query,
-    inputs=
-        gr.File(label="Upload PDF", file_types=[".pdf"]),
-        gr.Textbox(label="Enter your question")
-    ],
+    inputs=gr.Textbox(label="Enter your question about the document"),
     outputs=gr.Textbox(label="Answer"),
-    title="
-    description="
+    title="Document Q&A System",
+    description=f"Ask questions about {FILE_PATH}",
     examples=[
-        [
-        [
+        ["Who are the members of the Sanhedrin who are present?"],
+        ["What are the main themes discussed in the document?"]
     ]
 )
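For readers new to LCEL pipe syntax, the rag_chain composed in the new code has roughly the data flow of the imperative sketch below. run_chain is a hypothetical name; retriever, format_docs, prompt, and llm are the module-level objects from the diff:

def run_chain(question: str) -> str:
    # The dict step fans the input out: "context" sends the question through
    # the retriever and format_docs, while RunnablePassthrough() forwards the
    # raw question string unchanged into the prompt.
    docs = retriever.invoke(question)          # top-k chunks from Milvus
    context = format_docs(docs)                # join chunks with blank lines
    text = prompt.format(context=context, question=question)
    # StrOutputParser() is effectively an identity step for a string-output LLM.
    return llm.invoke(text)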
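A minimal way to exercise the rebuilt module-level pipeline, reusing one of the example queries the interface now ships with. The demo.launch() call is an assumption about how the Space serves the UI; it is not visible in these hunks:

if __name__ == "__main__":
    # Quick smoke test before serving the UI (query taken from examples above)
    print(rag_chain.invoke("What are the main themes discussed in the document?"))
    demo.launch()  # assumed: the standard Gradio entry point for a Space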