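"""Streamlit app: upload a PDF, index it with llama-index, and answer
questions about it with a locally loaded Zephyr-7B model (a simple RAG
pipeline).

Run with:
    streamlit run app.py

Assumed dependencies (package names are a sketch, not pinned by the
original source):
    pip install streamlit pdfplumber torch bitsandbytes langchain-community \
        llama-index llama-index-llms-huggingface llama-index-embeddings-langchain
"""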
import streamlit as st
import pdfplumber
import torch
from langchain_community.embeddings import HuggingFaceEmbeddings
from llama_index.core import Document, ServiceContext, VectorStoreIndex
from llama_index.core.prompts.prompts import SimpleInputPrompt
from llama_index.embeddings.langchain import LangchainEmbedding
from llama_index.llms.huggingface import HuggingFaceLLM as LlamaHuggingFaceLLM

# Cache the query engine so the models are loaded only once per session.
# st.cache(allow_output_mutation=...) is deprecated/removed in current
# Streamlit releases; st.cache_resource is the replacement for heavyweight,
# non-serializable resources such as models and indexes.
@st.cache_resource
def setup_llama_index(document_text: str):
    # LangchainEmbedding expects a LangChain embeddings object, not a raw
    # SentenceTransformer, so wrap the model via HuggingFaceEmbeddings
    embed_model = LangchainEmbedding(
        HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
    )

    # Define and configure the Llama LLM (Zephyr-7B loaded locally via
    # Hugging Face; load_in_8bit requires bitsandbytes and a CUDA GPU)
    llama_llm = LlamaHuggingFaceLLM(
        context_window=4096,
        max_new_tokens=256,
        generate_kwargs={"temperature": 0.0, "do_sample": False},
        system_prompt="You are a Q&A assistant...",
        query_wrapper_prompt=SimpleInputPrompt("{query_str}"),
        tokenizer_name="HuggingFaceH4/zephyr-7b-beta",
        model_name="HuggingFaceH4/zephyr-7b-beta",
        device_map="auto",
        model_kwargs={"torch_dtype": torch.float16, "load_in_8bit": True},
    )

    # Build the index; VectorStoreIndex.from_documents expects Document
    # objects, so wrap the raw text first
    service_context = ServiceContext.from_defaults(
        chunk_size=1024, llm=llama_llm, embed_model=embed_model
    )
    index = VectorStoreIndex.from_documents(
        [Document(text=document_text)], service_context=service_context
    )
    return index.as_query_engine()
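
# Note: ServiceContext is deprecated in llama-index >= 0.10 in favor of the
# global Settings object. A minimal sketch of the equivalent configuration,
# assuming one of those releases:
#
#     from llama_index.core import Settings
#     Settings.llm = llama_llm
#     Settings.embed_model = embed_model
#     Settings.chunk_size = 1024
#     index = VectorStoreIndex.from_documents([Document(text=document_text)])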

def extract_text_from_pdf(file):
    """ Extract text from the uploaded PDF file using pdfplumber. """
    text = []
    with pdfplumber.open(file) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if page_text:  # Ensure that text extraction was successful
                text.append(page_text)
    return " ".join(text)

def main():
    st.title('PDF Reader and Question Answering')

    uploaded_file = st.file_uploader("Upload your PDF", type=["pdf"])
    if uploaded_file is not None:
        document_text = extract_text_from_pdf(uploaded_file)
        if document_text:
            st.text_area("Extracted Text", document_text, height=300)

            # Build (or reuse the cached) query engine for this document
            query_engine = setup_llama_index(document_text)

            question = st.text_input("Ask a question based on the PDF")
            if st.button("Get Answer"):
                if question:
                    # Run the RAG query: retrieve relevant chunks, then generate;
                    # query() returns a Response object, so convert it to a string
                    response = query_engine.query(question)
                    st.text_area("Answer", str(response), height=150)
                else:
                    st.error("Please enter a question to get an answer.")
        else:
            st.error("No text could be extracted from the PDF. Please check the file and try again.")

if __name__ == "__main__":
    main()