thomasjacob04 committed
Commit bfb8470 · verified · 1 parent: 5061d0f

Update app.py

Files changed (1): app.py (+54, -68)
app.py CHANGED
@@ -1,7 +1,6 @@
 import os
-import gradio as gr
 from dotenv import load_dotenv
-from typing import Iterator
+import gradio as gr
 from langchain_core.document_loaders import BaseLoader
 from langchain_core.documents import Document as LCDocument
 from docling.document_converter import DocumentConverter
@@ -13,11 +12,17 @@ from langchain_core.prompts import PromptTemplate
 from langchain_core.runnables import RunnablePassthrough
 from langchain_core.output_parsers import StrOutputParser
 from tempfile import TemporaryDirectory
+from typing import Iterator
 
 # Load environment variables
 load_dotenv()
 HF_API_KEY = os.environ.get("HF_API_KEY")
 
+# Constants
+FILE_PATH = "10_Pages_Vol_5.pdf"  # Your hardcoded PDF path
+HF_EMBED_MODEL_ID = "BAAI/bge-small-en-v1.5"
+HF_LLM_MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.3"
+
 class DoclingPDFLoader(BaseLoader):
     def __init__(self, file_path: str | list[str]) -> None:
         self._file_paths = file_path if isinstance(file_path, list) else [file_path]
@@ -32,84 +37,65 @@ class DoclingPDFLoader(BaseLoader):
 def format_docs(docs):
     return "\n\n".join(doc.page_content for doc in docs)
 
-def setup_rag_chain(pdf_path):
-    # Initialize loader and split documents
-    loader = DoclingPDFLoader(file_path=pdf_path)
-    text_splitter = RecursiveCharacterTextSplitter(
-        chunk_size=1000,
-        chunk_overlap=200,
-    )
-    docs = loader.load()
-    splits = text_splitter.split_documents(docs)
-
-    # Setup embeddings
-    embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-small-en-v1.5")
-
-    # Setup Milvus vectorstore
-    tmp_dir = TemporaryDirectory()
-    MILVUS_URI = f"{tmp_dir.name}/milvus_demo.db"
-    vectorstore = Milvus.from_documents(
-        splits,
-        embeddings,
-        connection_args={"uri": MILVUS_URI},
-        drop_old=True,
-        index_params={"index_type": "IVF_FLAT", "metric_type": "L2"},
-    )
-
-    # Setup LLM
-    llm = HuggingFaceEndpoint(
-        repo_id="mistralai/Mistral-7B-Instruct-v0.3",
-        huggingfacehub_api_token=HF_API_KEY,
-        task="text-generation",
-    )
-
-    # Setup RAG chain
-    retriever = vectorstore.as_retriever()
-    prompt = PromptTemplate.from_template(
-        "Context information is below.\n---------------------\n{context}\n---------------------\nUse the context of the work you have been currently trained on, not your prior knowledge, to answer the queries asked. Please use Chapter numbers and page numbers as references as well.\nQuery: {question}\nAnswer:\n"
-    )
-
-    return (
-        {"context": retriever | format_docs, "question": RunnablePassthrough()}
-        | prompt
-        | llm
-        | StrOutputParser()
-    )
+# Setup the RAG pipeline
+loader = DoclingPDFLoader(file_path=FILE_PATH)
+text_splitter = RecursiveCharacterTextSplitter(
+    chunk_size=1000,
+    chunk_overlap=200,
+)
+docs = loader.load()
+splits = text_splitter.split_documents(docs)
 
-def process_query(pdf_file, query):
-    if pdf_file is None:
-        return "Please upload a PDF file first."
-
-    # Save the uploaded file temporarily
-    temp_pdf_path = "temp_upload.pdf"
-    with open(temp_pdf_path, "wb") as f:
-        f.write(pdf_file)
-
+embeddings = HuggingFaceEmbeddings(model_name=HF_EMBED_MODEL_ID)
+
+# Setup Milvus
+tmp_dir = TemporaryDirectory()
+MILVUS_URI = f"{tmp_dir.name}/milvus_demo.db"
+vectorstore = Milvus.from_documents(
+    splits,
+    embeddings,
+    connection_args={"uri": MILVUS_URI},
+    drop_old=True,
+    index_params={"index_type": "IVF_FLAT", "metric_type": "L2"},
+)
+
+# Setup LLM
+llm = HuggingFaceEndpoint(
+    repo_id=HF_LLM_MODEL_ID,
+    huggingfacehub_api_token=HF_API_KEY,
+    task="text-generation",
+)
+
+# Setup RAG chain
+retriever = vectorstore.as_retriever()
+prompt = PromptTemplate.from_template(
+    "Context information is below.\n---------------------\n{context}\n---------------------\nUse the context of the work you have been currently trained on, not your prior knowledge, to answer the queries asked. Please use Chapter numbers and page numbers as references as well.\nQuery: {question}\nAnswer:\n"
+)
+
+rag_chain = (
+    {"context": retriever | format_docs, "question": RunnablePassthrough()}
+    | prompt
+    | llm
+    | StrOutputParser()
+)
+
+def process_query(query):
     try:
-        # Setup and run the RAG chain
-        rag_chain = setup_rag_chain(temp_pdf_path)
         response = rag_chain.invoke(query)
         return response
     except Exception as e:
         return f"An error occurred: {str(e)}"
-    finally:
-        # Clean up temporary file
-        if os.path.exists(temp_pdf_path):
-            os.remove(temp_pdf_path)
 
 # Create Gradio interface
 demo = gr.Interface(
     fn=process_query,
-    inputs=[
-        gr.File(label="Upload PDF", file_types=[".pdf"]),
-        gr.Textbox(label="Enter your question")
-    ],
+    inputs=gr.Textbox(label="Enter your question about the document"),
     outputs=gr.Textbox(label="Answer"),
-    title="PDF Question Answering System",
-    description="Upload a PDF and ask questions about its content. The system will use RAG to provide relevant answers.",
+    title="Document Q&A System",
+    description=f"Ask questions about {FILE_PATH}",
     examples=[
-        [None, "Who are the members of the Sanhedrin who are present?"],
-        [None, "What are the main themes discussed in the document?"]
+        ["Who are the members of the Sanhedrin who are present?"],
+        ["What are the main themes discussed in the document?"]
     ]
 )
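The hunks above elide the body of DoclingPDFLoader (old lines 24-31). For orientation, a docling-backed BaseLoader of this shape typically converts each PDF with docling's DocumentConverter and yields the export as one LangChain document. A minimal sketch of what the elided methods likely look like, assuming docling's v2 convert() API; the commit's actual body may differ:

from typing import Iterator

from docling.document_converter import DocumentConverter
from langchain_core.document_loaders import BaseLoader
from langchain_core.documents import Document as LCDocument

class DoclingPDFLoader(BaseLoader):
    def __init__(self, file_path: str | list[str]) -> None:
        # Accept a single path or a list of paths.
        self._file_paths = file_path if isinstance(file_path, list) else [file_path]
        self._converter = DocumentConverter()

    def lazy_load(self) -> Iterator[LCDocument]:
        # Convert each PDF with docling and yield it as a single
        # markdown-formatted LangChain document; BaseLoader.load()
        # is derived from this iterator.
        for source in self._file_paths:
            dl_doc = self._converter.convert(source).document
            yield LCDocument(page_content=dl_doc.export_to_markdown())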
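The net effect of the commit is to build the index and chain once at import time against the hardcoded FILE_PATH, rather than re-running setup_rag_chain() for every uploaded file. Nothing in the shown hunks serves the interface, so a launch call presumably sits below the last hunk; a hypothetical usage sketch (the names demo, process_query, and rag_chain come from the diff above):

# Hypothetical usage sketch -- not part of this commit's hunks.
if __name__ == "__main__":
    # Exercise the module-level chain directly...
    print(process_query("What are the main themes discussed in the document?"))
    # ...or serve the Gradio UI (assumes app.py ends with a launch call).
    demo.launch()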
 
 
101