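"""Streamlit app that extracts text from an uploaded PDF with pdfplumber and answers
questions through a llama-index query engine backed by HuggingFaceH4/zephyr-7b-beta."""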
import torch
import pdfplumber
import streamlit as st

from langchain_community.embeddings import HuggingFaceEmbeddings  # langchain-community
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, ServiceContext
from llama_index.core.prompts.prompts import SimpleInputPrompt
from llama_index.embeddings.langchain import LangchainEmbedding  # llama-index-embeddings-langchain
from llama_index.llms.huggingface import HuggingFaceLLM as LlamaHuggingFaceLLM  # llama-index-llms-huggingface


# Cache the heavy model/index setup so it only runs once per Streamlit session.
@st.cache_resource
def setup_llama_index():
    # Embedding model: all-mpnet-base-v2, wrapped for llama-index via LangChain.
    embed_model = LangchainEmbedding(
        HuggingFaceEmbeddings(model_name='sentence-transformers/all-mpnet-base-v2')
    )
    # Zephyr-7B-beta served through llama-index's HuggingFaceLLM wrapper.
    llama_llm = LlamaHuggingFaceLLM(
        context_window=4096,
        max_new_tokens=256,
        # do_sample=False means greedy decoding; temperature is effectively ignored.
        generate_kwargs={"temperature": 0.0, "do_sample": False},
        system_prompt="You are a Q&A assistant...",
        query_wrapper_prompt=SimpleInputPrompt("{query_str}"),
        tokenizer_name="HuggingFaceH4/zephyr-7b-beta",
        model_name="HuggingFaceH4/zephyr-7b-beta",
        device_map="auto",
        # 8-bit loading requires bitsandbytes and a CUDA-capable GPU.
        model_kwargs={"torch_dtype": torch.float16, "load_in_8bit": True},
    )
    # The index is built from whatever files live in /content/data (a Colab-style path);
    # note that the PDF uploaded through the UI is not added to this index.
    documents = SimpleDirectoryReader('/content/data').load_data()

    # ServiceContext is deprecated in newer llama-index releases (Settings replaces it),
    # but it matches the API used here.
    service_context = ServiceContext.from_defaults(
        chunk_size=1024, llm=llama_llm, embed_model=embed_model
    )
    index = VectorStoreIndex.from_documents(documents, service_context=service_context)
    return index.as_query_engine()


def extract_text_from_pdf(file):
    """Extract text from the uploaded PDF file using pdfplumber."""
    text = []
    with pdfplumber.open(file) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if page_text:
                text.append(page_text)
    return " ".join(text)


def main():
    st.title('PDF Reader and Question Answering with RAG-like Model')

    # Build (or reuse the cached) query engine.
    query_engine = setup_llama_index()

    uploaded_file = st.file_uploader("Upload your PDF", type=["pdf"])
    if uploaded_file is not None:
        document_text = extract_text_from_pdf(uploaded_file)
        if document_text:
            st.text_area("Extracted Text", document_text, height=300)
        else:
            st.error("No text could be extracted from the PDF. Please check the file and try again.")
    question = st.text_input("Ask a question based on the PDF")
    if st.button("Get Answer"):
        if question:
            # Query the index; the Response object is converted to a string for display.
            response = query_engine.query(question)
            st.text_area("Answer", str(response), height=150)
        else:
            st.error("Please enter a question to get an answer.")


if __name__ == "__main__":
    main()
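# Usage sketch (assumptions, not part of the original script): save this file as,
# e.g., app.py (name is illustrative) and start it with
#   streamlit run app.py
# after installing streamlit, pdfplumber, torch, transformers, accelerate, bitsandbytes,
# langchain-community, llama-index-core, llama-index-llms-huggingface and
# llama-index-embeddings-langchain, and placing source documents in /content/data.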