maaz77 commited on
Commit
8b7951f
1 Parent(s): 0f832f3

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +68 -0
app.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import torch

import streamlit as st
import pdfplumber
from sentence_transformers import SentenceTransformer
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, ServiceContext
from llama_index.llms.huggingface import HuggingFaceLLM as LlamaHuggingFaceLLM
from llama_index.core.prompts.prompts import SimpleInputPrompt
from llama_index.legacy.embeddings.langchain import LangchainEmbedding
# Setup for caching the index and LLM to avoid reloading on every rerun.
# st.cache_resource replaces the removed st.cache(allow_output_mutation=...)
# API and is the documented way to cache models/engines in Streamlit.
@st.cache_resource
def setup_llama_index():
    """Build the RAG query engine once and cache it across Streamlit reruns.

    Returns:
        A llama_index query engine over the documents found in
        ``/content/data``, using Zephyr-7B for generation.
    """
    # Embedding model used to vectorize document chunks.
    # NOTE(review): LangchainEmbedding expects a LangChain Embeddings object;
    # wrapping a raw SentenceTransformer here may fail at embed time — verify.
    embed_model = LangchainEmbedding(
        SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
    )

    # Deterministic Zephyr-7B generator (temperature 0, sampling disabled).
    llama_llm = LlamaHuggingFaceLLM(
        context_window=4096,
        max_new_tokens=256,
        generate_kwargs={"temperature": 0.0, "do_sample": False},
        system_prompt="You are a Q&A assistant...",
        query_wrapper_prompt=SimpleInputPrompt("{query_str}"),
        tokenizer_name="HuggingFaceH4/zephyr-7b-beta",
        model_name="HuggingFaceH4/zephyr-7b-beta",
        device_map="auto",
        # Requires the module-level `import torch` (was missing originally,
        # causing a NameError the first time this function ran).
        model_kwargs={"torch_dtype": torch.float16, "load_in_8bit": True},
    )

    # Index the corpus and expose it as a query engine.
    documents = SimpleDirectoryReader('/content/data').load_data()  # assumes docs live here — TODO confirm
    service_context = ServiceContext.from_defaults(
        chunk_size=1024, llm=llama_llm, embed_model=embed_model
    )
    index = VectorStoreIndex.from_documents(documents, service_context=service_context)
    return index.as_query_engine()
33
def extract_text_from_pdf(file):
    """Return the concatenated text of every readable page in *file*.

    Pages for which pdfplumber yields no text are skipped; the remaining
    page texts are joined with single spaces.
    """
    with pdfplumber.open(file) as pdf:
        page_texts = (page.extract_text() for page in pdf.pages)
        # Filter out None/empty results from pages with no extractable text.
        return " ".join(content for content in page_texts if content)
43
def main():
    """Streamlit entry point: upload a PDF, preview its text, answer questions."""
    st.title('PDF Reader and Question Answering with RAG-like Model')

    # Build (or fetch from cache) the query engine once per session.
    query_engine = setup_llama_index()

    uploaded_file = st.file_uploader("Upload your PDF", type=["pdf"])
    if uploaded_file is not None:
        document_text = extract_text_from_pdf(uploaded_file)
        if document_text:
            st.text_area("Extracted Text", document_text, height=300)
        else:
            st.error("No text could be extracted from the PDF. Please check the file and try again.")

    question = st.text_input("Ask a question based on the PDF")
    if st.button("Get Answer"):
        if question:
            # NOTE(review): the answer is drawn from the pre-indexed
            # /content/data corpus, not the uploaded PDF — confirm intent.
            response = query_engine.query(question)
            # st.text_area expects a string; query() returns a Response
            # object, so convert it explicitly for display.
            st.text_area("Answer", str(response), height=150)
        else:
            st.error("Please enter a question to get an answer.")


if __name__ == "__main__":
    main()