pdf_query / app.py
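"""Streamlit app: upload a PDF, extract its text with pdfplumber, index it with
llama_index, and answer questions about it with a Hugging Face LLM.

Run locally with: streamlit run app.py
"""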
import torch
import streamlit as st
import pdfplumber
from langchain_community.embeddings import HuggingFaceEmbeddings
from llama_index.core import VectorStoreIndex, ServiceContext, Document
from llama_index.core.prompts.prompts import SimpleInputPrompt
from llama_index.llms.huggingface import HuggingFaceLLM as LlamaHuggingFaceLLM
from llama_index.embeddings.langchain import LangchainEmbedding
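# Assumed dependencies (exact package names may vary with your llama_index version):
#   streamlit, pdfplumber, torch, transformers, accelerate, bitsandbytes,
#   sentence-transformers, langchain-community,
#   llama-index, llama-index-llms-huggingface, llama-index-embeddings-langchain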

# Cache the index and LLM so they are not rebuilt on every Streamlit rerun.
@st.cache_resource
def setup_llama_index(documents):
    """Build a vector index over the given texts and return a query engine."""
    # Embedding model: all-mpnet-base-v2 via LangChain's HuggingFaceEmbeddings,
    # wrapped so llama_index can use it.
    embed_model = LangchainEmbedding(
        HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
    )
    # LLM: Zephyr-7B through llama_index's HuggingFaceLLM wrapper.
    # 8-bit loading requires bitsandbytes and a CUDA-capable GPU.
    llama_llm = LlamaHuggingFaceLLM(
        context_window=4096,
        max_new_tokens=256,
        generate_kwargs={"temperature": 0.0, "do_sample": False},
        system_prompt="You are a Q&A assistant...",
        query_wrapper_prompt=SimpleInputPrompt("{query_str}"),
        tokenizer_name="HuggingFaceH4/zephyr-7b-beta",
        model_name="HuggingFaceH4/zephyr-7b-beta",
        device_map="auto",
        model_kwargs={"torch_dtype": torch.float16, "load_in_8bit": True},
    )
    # Wrap the raw strings as llama_index Documents before indexing.
    documents = [Document(text=text) for text in documents]
    # Note: ServiceContext is deprecated in newer llama_index releases (Settings replaces it).
    service_context = ServiceContext.from_defaults(
        chunk_size=1024, llm=llama_llm, embed_model=embed_model
    )
    index = VectorStoreIndex.from_documents(documents, service_context=service_context)
    return index.as_query_engine()

def extract_text_from_pdf(file):
    """Extract text from the uploaded PDF file using pdfplumber."""
    text = []
    with pdfplumber.open(file) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if page_text:  # Ensure that text extraction was successful
                text.append(page_text)
    return " ".join(text)
def main():
    st.title('PDF Reader and Question Answering')
    uploaded_file = st.file_uploader("Upload your PDF", type=["pdf"])
    if uploaded_file is not None:
        document_text = extract_text_from_pdf(uploaded_file)
        if document_text:
            st.text_area("Extracted Text", document_text, height=300)
            # Process the uploaded document
            documents = [document_text]
            query_engine = setup_llama_index(documents)
            question = st.text_input("Ask a question based on the PDF")
            if st.button("Get Answer"):
                if question:
                    # RAG-style query: retrieve relevant chunks, then answer with the LLM.
                    response = query_engine.query(question)
                    st.text_area("Answer", str(response), height=150)
                else:
                    st.error("Please enter a question to get an answer.")
        else:
            st.error("No text could be extracted from the PDF. Please check the file and try again.")


if __name__ == "__main__":
    main()