Spaces:

maaz77
/

pdf_query

Sleeping

App Files Files Community

maaz77 commited on May 15, 2024

Commit

8b7951f

verified ·

1 Parent(s): 0f832f3

Create app.py

Browse files

Files changed (1) hide show

app.py +68 -0

app.py ADDED Viewed

	@@ -0,0 +1,68 @@

+import streamlit as st
+import pdfplumber
+from sentence_transformers import SentenceTransformer
+from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, ServiceContext
+from llama_index.llms.huggingface import HuggingFaceLLM as LlamaHuggingFaceLLM
+from llama_index.core.prompts.prompts import SimpleInputPrompt
+from llama_index.legacy.embeddings.langchain import LangchainEmbedding
+# Setup for caching the index and LLM to avoid reloading
+@st.cache(allow_output_mutation=True, suppress_st_warning=True)
+def setup_llama_index():
+    # Define and configure the embedding model
+    embed_model = LangchainEmbedding(SentenceTransformer('sentence-transformers/all-mpnet-base-v2'))
+    # Define and configure the Llama LLM
+    llama_llm = LlamaHuggingFaceLLM(
+        context_window=4096,
+        max_new_tokens=256,
+        generate_kwargs={"temperature": 0.0, "do_sample": False},
+        system_prompt="You are a Q&A assistant...",
+        query_wrapper_prompt=SimpleInputPrompt("{query_str}"),
+        tokenizer_name="HuggingFaceH4/zephyr-7b-beta",
+        model_name="HuggingFaceH4/zephyr-7b-beta",
+        device_map="auto",
+        model_kwargs={"torch_dtype": torch.float16, "load_in_8bit": True}
+    )
+    # Load documents and create the index
+    documents = SimpleDirectoryReader('/content/data').load_data()  # Assuming document data is in this directory
+    service_context = ServiceContext.from_defaults(chunk_size=1024, llm=llama_llm, embed_model=embed_model)
+    index = VectorStoreIndex.from_documents(documents, service_context=service_context)
+    return index.as_query_engine()
+def extract_text_from_pdf(file):
+    """ Extract text from the uploaded PDF file using pdfplumber. """
+    text = []
+    with pdfplumber.open(file) as pdf:
+        for page in pdf.pages:
+            page_text = page.extract_text()
+            if page_text:  # Ensure that text extraction was successful
+                text.append(page_text)
+    return " ".join(text)
+def main():
+    st.title('PDF Reader and Question Answering with RAG-like Model')
+    # Load the query engine only once
+    query_engine = setup_llama_index()
+    uploaded_file = st.file_uploader("Upload your PDF", type=["pdf"])
+    if uploaded_file is not None:
+        document_text = extract_text_from_pdf(uploaded_file)
+        if document_text:
+            st.text_area("Extracted Text", document_text, height=300)
+        else:
+            st.error("No text could be extracted from the PDF. Please check the file and try again.")
+        question = st.text_input("Ask a question based on the PDF")
+        if st.button("Get Answer"):
+            if question:
+                # Simulate RAG-like query using the index and LLM
+                response = query_engine.query(question)
+                st.text_area("Answer", response, height=150)
+            else:
+                st.error("Please enter a question to get an answer.")
+if __name__ == "__main__":
+    main()