import os
import streamlit as st
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, ServiceContext
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.core.prompts.prompts import SimpleInputPrompt
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from llama_index.embeddings.langchain import LangchainEmbedding
import torch
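# Note: these imports target the llama-index 0.10.x API; ServiceContext is
# deprecated in newer releases in favor of the global Settings object.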
# Read the Hugging Face token (needed to download the gated Gemma model).
# Assumes the token is stored in Streamlit secrets as HF_TOKEN; adjust if it
# is provided another way (e.g. already set in the environment).
os.environ["HF_TOKEN"] = st.secrets["HF_TOKEN"]
# Streamlit app
st.title("PDF Data Extractor")
# File uploader for PDFs
uploaded_files = st.file_uploader("Upload PDF files", type="pdf", accept_multiple_files=True)
# Text input for the query prompt
user_query = st.text_input("Enter your query")
# Button to trigger processing
if st.button("Extract Data"):
    if uploaded_files and user_query:
        # Save uploaded files to a local directory (created if it does not exist)
        os.makedirs("./docs", exist_ok=True)
        for uploaded_file in uploaded_files:
            with open(os.path.join("./docs", uploaded_file.name), "wb") as f:
                f.write(uploaded_file.getbuffer())
        # Load documents from the specified directory
        documents = SimpleDirectoryReader("./docs").load_data()
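        # SimpleDirectoryReader parses PDFs via pypdf; scanned (image-only)
        # PDFs would need OCR first, since they contain no extractable text.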
        # Define the system prompt for the LLM
        system_prompt = """
        You are a data extractor. Your goal is to analyze the given PDF document and extract the table containing information relevant to the user query.
        """
        # Define the query wrapper prompt
        query_wrapper_prompt = SimpleInputPrompt("{query_str}")
        # Initialize the LLM (full Hugging Face repo id, including the org prefix)
        llm = HuggingFaceLLM(
            context_window=4096,
            max_new_tokens=256,
            generate_kwargs={"temperature": 0.0, "do_sample": False},
            system_prompt=system_prompt,
            query_wrapper_prompt=query_wrapper_prompt,
            tokenizer_name="google/gemma-1.1-2b-it",
            model_name="google/gemma-1.1-2b-it",
            device_map="auto",
            model_kwargs={"torch_dtype": torch.float16}
        )
        st.write("LLM loaded successfully")
        # Initialize the embedding model
        embed_model = LangchainEmbedding(
            HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
        )
        # Create a service context with the LLM, embedding model, and chunking configuration
        service_context = ServiceContext.from_defaults(
            chunk_size=1024,
            llm=llm,
            embed_model=embed_model
        )
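        # chunk_size=1024 controls how documents are split before embedding;
        # smaller chunks give more precise retrieval but cost more embedding calls.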
st.write("Before Vector Index")
index = VectorStoreIndex.from_documents(documents, service_context=service_context)
st.write("After Vector Index")
        # Create a query engine from the index
        query_engine = index.as_query_engine()
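        # as_query_engine() uses LlamaIndex defaults (similarity top-k of 2);
        # pass similarity_top_k=... to retrieve more chunks if tables span pages.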
        # Execute the query against the indexed documents
        response = query_engine.query(user_query)
        # Display the response text
        st.write("Generated Response:")
        st.write(str(response))
    else:
        st.error("Please upload PDF files and enter a query.")