import os
import streamlit as st
import pdfplumber
from sentence_transformers import SentenceTransformer
from llama_index.core import VectorStoreIndex, ServiceContext
from llama_index.llms.huggingface import HuggingFaceLLM as LlamaHuggingFaceLLM
from llama_index.core.prompts.prompts import SimpleInputPrompt
from llama_index.legacy.embeddings.langchain import LangchainEmbedding
import torch
class _SentenceTransformerEmbeddings:
    """Expose a SentenceTransformer through the LangChain embeddings interface.

    ``LangchainEmbedding`` delegates to ``embed_documents`` and ``embed_query``
    on the object it wraps. A raw ``SentenceTransformer`` only offers
    ``encode``, so this adapter supplies the two expected methods.
    """

    def __init__(self, model_name):
        # Kept as a public attribute so wrappers can report a readable name.
        self.model_name = model_name
        self._model = SentenceTransformer(model_name)

    def embed_documents(self, texts):
        """Return one embedding (a list of floats) per input text."""
        return self._model.encode(list(texts)).tolist()

    def embed_query(self, text):
        """Return the embedding (a list of floats) of a single query string."""
        return self._model.encode(text).tolist()


# Cache the query engine so the embedding model, the LLM and the index are
# not rebuilt on every Streamlit rerun. ``st.cache_resource`` replaces the
# deprecated ``st.cache(allow_output_mutation=True)``.
@st.cache_resource
def setup_llama_index(documents):
    """Build a query engine over ``documents``.

    Args:
        documents: Iterable of document texts. Plain strings are wrapped in
            LlamaIndex ``Document`` objects; any other item is passed through
            unchanged.

    Returns:
        The query engine produced by ``VectorStoreIndex.as_query_engine()``.
    """
    # Local import keeps this block self-contained; ``llama_index.core`` is
    # already a dependency of this file.
    from llama_index.core import Document

    # ``VectorStoreIndex.from_documents`` works on Document objects, while the
    # caller in this file hands over raw strings.
    docs = [
        Document(text=item) if isinstance(item, str) else item
        for item in documents
    ]

    # Define and configure the embedding model.
    embed_model = LangchainEmbedding(
        _SentenceTransformerEmbeddings("sentence-transformers/all-mpnet-base-v2")
    )

    # Define and configure the Llama LLM.
    # NOTE(review): no model_name/tokenizer_name is passed, so the wrapper's
    # default checkpoint is used - confirm that this is intended.
    # NOTE(review): "load_in_8bit" presumably needs bitsandbytes and a CUDA
    # device - confirm against the deployment environment.
    llama_llm = LlamaHuggingFaceLLM(
        generate_kwargs={"temperature": 0.0, "do_sample": False},
        system_prompt="You are a Q&A assistant...",
        model_kwargs={"torch_dtype": torch.float16, "load_in_8bit": True},
    )

    # Create the index.
    # NOTE(review): ServiceContext is a legacy configuration object in recent
    # LlamaIndex releases - verify it exists in the installed version.
    service_context = ServiceContext.from_defaults(
        chunk_size=1024, llm=llama_llm, embed_model=embed_model
    )
    index = VectorStoreIndex.from_documents(docs, service_context=service_context)
    return index.as_query_engine()
def extract_text_from_pdf(file):
    """Extract text from the uploaded PDF file using pdfplumber.

    Args:
        file: The PDF to read, passed straight to ``pdfplumber.open``.

    Returns:
        The text of every page that yielded text, joined with single spaces.
        An empty string when no page yielded any text.
    """
    text = []
    with pdfplumber.open(file) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if page_text:  # Skip pages for which nothing was extracted.
                text.append(page_text)
    # Join only after every page has been visited, so no page is dropped.
    return " ".join(text)
def main():
    """Run the Streamlit page: upload a PDF, show its text, answer questions.

    The uploaded file is passed to ``extract_text_from_pdf``. When text is
    found it is displayed and indexed via ``setup_llama_index``, and the
    user's question is answered with the resulting query engine. Error
    messages are shown only for the case they describe: an empty question
    or a PDF without extractable text.
    """
    st.title('PDF Reader and Question Answering')
    uploaded_file = st.file_uploader("Upload your PDF", type=["pdf"])
    if uploaded_file is None:
        # Nothing to do until the user provides a file.
        return

    document_text = extract_text_from_pdf(uploaded_file)
    if not document_text:
        st.error("No text could be extracted from the PDF. Please check the file and try again.")
        return

    st.text_area("Extracted Text", document_text, height=300)

    # Process the uploaded document.
    documents = [document_text]
    query_engine = setup_llama_index(documents)

    question = st.text_input("Ask a question based on the PDF")
    if st.button("Get Answer"):
        if question:
            # Query the index; the LLM answers from the retrieved chunks.
            response = query_engine.query(question)
            # Convert explicitly so the widget receives plain text.
            st.text_area("Answer", str(response), height=150)
        else:
            st.error("Please enter a question to get an answer.")
if __name__ == "__main__":