Spaces:

TomsTech
/

PoemTest

Sleeping

App Files Files Community

thomasjacob04 committed on Feb 19

Commit

5061d0f

verified ·

1 Parent(s): 89e9286

Update app.py

Browse files

Files changed (1) hide show

app.py +94 -80

app.py CHANGED Viewed

@@ -1,103 +1,117 @@
 import os
 from dotenv import load_dotenv
-load_dotenv()
 from typing import Iterator
 from langchain_core.document_loaders import BaseLoader
 from langchain_core.documents import Document as LCDocument
 from docling.document_converter import DocumentConverter
-# import gradio as gr
-from typing import Iterator
 class DoclingPDFLoader(BaseLoader):
     def __init__(self, file_path: str | list[str]) -> None:
         self._file_paths = file_path if isinstance(file_path, list) else [file_path]
         self._converter = DocumentConverter()
     def lazy_load(self) -> Iterator[LCDocument]:
         for source in self._file_paths:
             dl_doc = self._converter.convert(source).document
             text = dl_doc.export_to_markdown()
             yield LCDocument(page_content=text)
-FILE_PATH = "10_Pages_Vol_5.pdf"  # test paper
-from langchain_text_splitters import RecursiveCharacterTextSplitter
-loader = DoclingPDFLoader(file_path=FILE_PATH)
-text_splitter = RecursiveCharacterTextSplitter(
-    chunk_size=1000,
-    chunk_overlap=200,
-)
-docs = loader.load()
-splits = text_splitter.split_documents(docs)
-from langchain_huggingface.embeddings import HuggingFaceEmbeddings
-HF_EMBED_MODEL_ID = "BAAI/bge-small-en-v1.5"
-embeddings = HuggingFaceEmbeddings(model_name=HF_EMBED_MODEL_ID)
-from tempfile import TemporaryDirectory
-from langchain_milvus import Milvus
-MILVUS_URI = os.environ.get(
-    "MILVUS_URI", f"{(tmp_dir := TemporaryDirectory()).name}/milvus_demo.db"
-)
-vectorstore = Milvus.from_documents(
-    splits,
-    embeddings,
-    connection_args={"uri": MILVUS_URI},
-    drop_old=True,
-    index_params={"index_type": "IVF_FLAT", "metric_type": "L2"},
-)
-from langchain_huggingface import HuggingFaceEndpoint
-HF_API_KEY = os.environ.get("HF_API_KEY")
-HF_LLM_MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.3"
-llm = HuggingFaceEndpoint(
-    repo_id=HF_LLM_MODEL_ID,
-    huggingfacehub_api_token=HF_API_KEY,
-    task="text-generation",  # Add this line to specify the task
-    )
-from typing import Iterable
-from langchain_core.documents import Document as LCDocument
-from langchain_core.output_parsers import StrOutputParser
-from langchain_core.prompts import PromptTemplate
-from langchain_core.runnables import RunnablePassthrough
-def format_docs(docs: Iterable[LCDocument]):
     return "\n\n".join(doc.page_content for doc in docs)
-retriever = vectorstore.as_retriever()
-prompt = PromptTemplate.from_template(
-    "Context information is below.\n---------------------\n{context}\n---------------------\nUse the context of the work you have been currently trained on, not your prior knowledge, to answer the queries asked. Please use Chapter numbers and page numbers as references as well.\nQuery: {question}\nAnswer:\n"
-)
-rag_chain = (
-    {"context": retriever | format_docs, "question": RunnablePassthrough()}
-    | prompt
-    | llm
-    | StrOutputParser()
 )
-rag_chain.invoke("who are the members of the Sanhedrin who are present?")

 import os
+import gradio as gr
 from dotenv import load_dotenv
 from typing import Iterator
 from langchain_core.document_loaders import BaseLoader
 from langchain_core.documents import Document as LCDocument
 from docling.document_converter import DocumentConverter
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+from langchain_huggingface.embeddings import HuggingFaceEmbeddings
+from langchain_milvus import Milvus
+from langchain_huggingface import HuggingFaceEndpoint
+from langchain_core.prompts import PromptTemplate
+from langchain_core.runnables import RunnablePassthrough
+from langchain_core.output_parsers import StrOutputParser
+from tempfile import TemporaryDirectory
+# Load environment variables
+load_dotenv()
+HF_API_KEY = os.environ.get("HF_API_KEY")
 class DoclingPDFLoader(BaseLoader):
+    # Document loader that runs each source through Docling's DocumentConverter
+    # and yields one LangChain Document per source holding its Markdown export.
     def __init__(self, file_path: str | list[str]) -> None:
+        # Normalize to a list so a single path and a list of paths are handled alike.
         self._file_paths = file_path if isinstance(file_path, list) else [file_path]
         self._converter = DocumentConverter()
     def lazy_load(self) -> Iterator[LCDocument]:
+        # Generator: a source is only converted when iteration reaches it.
         for source in self._file_paths:
             dl_doc = self._converter.convert(source).document
             text = dl_doc.export_to_markdown()
+            # Only the Markdown text is passed on; no metadata is attached here.
             yield LCDocument(page_content=text)
+def format_docs(docs):
+    """Return the page content of every document in ``docs``, joined by blank lines."""
+    contents = [doc.page_content for doc in docs]
+    return "\n\n".join(contents)
+def setup_rag_chain(pdf_path):
+    """Build a retrieval-augmented generation chain over the PDF at ``pdf_path``.
+
+    The PDF is converted to Markdown by ``DoclingPDFLoader``, split into
+    1000-character chunks with 200 characters of overlap, embedded, and
+    indexed in a Milvus store whose URI points at a file inside a fresh
+    temporary directory. The returned runnable takes a question string and
+    returns the model's answer as a string.
+
+    The temporary directory is removed once the returned chain is no longer
+    referenced, so callers only need to keep the chain itself alive.
+    """
+    # Initialize loader and split documents
+    loader = DoclingPDFLoader(file_path=pdf_path)
+    text_splitter = RecursiveCharacterTextSplitter(
+        chunk_size=1000,
+        chunk_overlap=200,
+    )
+    docs = loader.load()
+    splits = text_splitter.split_documents(docs)
+    # Setup embeddings
+    embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-small-en-v1.5")
+    # Setup Milvus vectorstore
+    tmp_dir = TemporaryDirectory()
+    MILVUS_URI = f"{tmp_dir.name}/milvus_demo.db"
+    vectorstore = Milvus.from_documents(
+        splits,
+        embeddings,
+        connection_args={"uri": MILVUS_URI},
+        drop_old=True,
+        index_params={"index_type": "IVF_FLAT", "metric_type": "L2"},
+    )
+    # Setup LLM
+    llm = HuggingFaceEndpoint(
+        repo_id="mistralai/Mistral-7B-Instruct-v0.3",
+        huggingfacehub_api_token=HF_API_KEY,
+        task="text-generation",
+    )
+    # Setup RAG chain
+    retriever = vectorstore.as_retriever()
+    def format_context(retrieved_docs):
+        # A TemporaryDirectory deletes its directory as soon as the object is
+        # garbage-collected. Without this reference, tmp_dir would die when
+        # setup_rag_chain returns and the Milvus database file would vanish
+        # before the first query. Capturing it in this closure keeps the
+        # directory alive exactly as long as the chain that uses it.
+        _ = tmp_dir
+        return format_docs(retrieved_docs)
+    prompt = PromptTemplate.from_template(
+        "Context information is below.\n---------------------\n{context}\n---------------------\nUse the context of the work you have been currently trained on, not your prior knowledge, to answer the queries asked. Please use Chapter numbers and page numbers as references as well.\nQuery: {question}\nAnswer:\n"
+    )
+    return (
+        {"context": retriever | format_context, "question": RunnablePassthrough()}
+        | prompt
+        | llm
+        | StrOutputParser()
+    )
+def process_query(pdf_file, query):
+    """Answer ``query`` using the uploaded PDF as the knowledge source.
+
+    ``pdf_file`` is whatever the Gradio ``File`` input delivers: a path to
+    the saved upload, raw bytes, or an object exposing the saved path as
+    ``.name``. ``None`` means nothing was uploaded.
+
+    Returns the answer text, or a human-readable message when the input is
+    missing or processing fails (errors are reported, never raised, so the
+    UI always has something to display).
+    """
+    if pdf_file is None:
+        return "Please upload a PDF file first."
+    if not query or not query.strip():
+        return "Please enter a question."
+    try:
+        if isinstance(pdf_file, (bytes, bytearray)):
+            # Raw bytes must be written to disk for the loader. A per-request
+            # temporary directory avoids the collisions a fixed file name in
+            # the working directory would cause between concurrent requests,
+            # and it is removed automatically when the block exits.
+            with TemporaryDirectory() as upload_dir:
+                temp_pdf_path = os.path.join(upload_dir, "upload.pdf")
+                with open(temp_pdf_path, "wb") as f:
+                    f.write(pdf_file)
+                return setup_rag_chain(temp_pdf_path).invoke(query)
+        # Otherwise the upload is already on disk: use it in place and leave
+        # its cleanup to whoever created it.
+        if isinstance(pdf_file, (str, os.PathLike)):
+            pdf_path = pdf_file
+        else:
+            pdf_path = pdf_file.name
+        return setup_rag_chain(pdf_path).invoke(query)
+    except Exception as e:
+        # UI boundary: surface the failure as text instead of crashing the app.
+        return f"An error occurred: {str(e)}"
+# Gradio UI: a PDF upload plus a question box in, the answer text out.
+demo = gr.Interface(
+    title="PDF Question Answering System",
+    description="Upload a PDF and ask questions about its content. The system will use RAG to provide relevant answers.",
+    fn=process_query,
+    inputs=[
+        gr.File(file_types=[".pdf"], label="Upload PDF"),
+        gr.Textbox(label="Enter your question"),
+    ],
+    outputs=gr.Textbox(label="Answer"),
+    # Example rows carry no file (None); they only pre-fill the question box.
+    examples=[
+        [None, question]
+        for question in (
+            "Who are the members of the Sanhedrin who are present?",
+            "What are the main themes discussed in the document?",
+        )
+    ],
+)
+# Start the web server only when run as a script, not when imported.
+if __name__ == "__main__":
+    demo.launch()