pdf_query / app.py
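"""Streamlit app: upload a PDF, extract its text with pdfplumber, index it with
llama_index, and answer questions about it with a Hugging Face LLM.

Run locally with: streamlit run app.py
"""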
import torch
import streamlit as st
import pdfplumber
from langchain_community.embeddings import HuggingFaceEmbeddings
from llama_index.core import VectorStoreIndex, ServiceContext, Document
from llama_index.core.prompts.prompts import SimpleInputPrompt
from llama_index.llms.huggingface import HuggingFaceLLM as LlamaHuggingFaceLLM
from llama_index.embeddings.langchain import LangchainEmbedding
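# Assumed dependencies (exact package names may vary with your llama_index version):
#   streamlit, pdfplumber, torch, transformers, accelerate, bitsandbytes,
#   sentence-transformers, langchain-community,
#   llama-index, llama-index-llms-huggingface, llama-index-embeddings-langchain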

# Cache the index and LLM so they are not rebuilt on every Streamlit rerun.
@st.cache_resource
def setup_llama_index(documents):
    """Build a vector index over the given texts and return a query engine."""
    # Embedding model: all-mpnet-base-v2 via LangChain's HuggingFaceEmbeddings,
    # wrapped so llama_index can use it.
    embed_model = LangchainEmbedding(
        HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
    )
    # LLM: Zephyr-7B through llama_index's HuggingFaceLLM wrapper.
    # 8-bit loading requires bitsandbytes and a CUDA-capable GPU.
    llama_llm = LlamaHuggingFaceLLM(
        context_window=4096,
        max_new_tokens=256,
        generate_kwargs={"temperature": 0.0, "do_sample": False},
        system_prompt="You are a Q&A assistant...",
        query_wrapper_prompt=SimpleInputPrompt("{query_str}"),
        tokenizer_name="HuggingFaceH4/zephyr-7b-beta",
        model_name="HuggingFaceH4/zephyr-7b-beta",
        device_map="auto",
        model_kwargs={"torch_dtype": torch.float16, "load_in_8bit": True},
    )
    # Wrap the raw strings as llama_index Documents before indexing.
    documents = [Document(text=text) for text in documents]
    # Note: ServiceContext is deprecated in newer llama_index releases (Settings replaces it).
    service_context = ServiceContext.from_defaults(
        chunk_size=1024, llm=llama_llm, embed_model=embed_model
    )
    index = VectorStoreIndex.from_documents(documents, service_context=service_context)
    return index.as_query_engine()

def extract_text_from_pdf(file):
    """Extract text from the uploaded PDF file using pdfplumber."""
    text = []
    with pdfplumber.open(file) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if page_text:  # Ensure that text extraction was successful
                text.append(page_text)
    return " ".join(text)
def main():
    st.title('PDF Reader and Question Answering')
    uploaded_file = st.file_uploader("Upload your PDF", type=["pdf"])
    if uploaded_file is not None:
        document_text = extract_text_from_pdf(uploaded_file)
        if document_text:
            st.text_area("Extracted Text", document_text, height=300)
            # Process the uploaded document
            documents = [document_text]
            query_engine = setup_llama_index(documents)
            question = st.text_input("Ask a question based on the PDF")
            if st.button("Get Answer"):
                if question:
                    # RAG-style query: retrieve relevant chunks, then answer with the LLM.
                    response = query_engine.query(question)
                    st.text_area("Answer", str(response), height=150)
                else:
                    st.error("Please enter a question to get an answer.")
        else:
            st.error("No text could be extracted from the PDF. Please check the file and try again.")


if __name__ == "__main__":
    main()