# rag/app.py — Streamlit RAG demo: parse an uploaded PDF with LlamaParse,
# index it in Chroma, and answer example questions via a Groq-hosted LLM.
import os
import streamlit as st
from llama_parse import LlamaParse
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_community.document_loaders import UnstructuredMarkdownLoader
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain_groq import ChatGroq
import joblib
import tempfile
# API keys
#
# SECURITY: never commit secrets to source control. The previously hard-coded
# keys must be considered compromised and rotated. Keys are now read from the
# environment: set LLAMA_CLOUD_API_KEY and GROQ_API_KEY before launching the app
# (on Hugging Face Spaces, configure them as repository secrets).
llama_cloud_api_key = os.environ.get("LLAMA_CLOUD_API_KEY", "")
groq_api_key = os.environ.get("GROQ_API_KEY", "")
# Function to parse data from an uploaded PDF file
def load_or_parse_data(uploaded_file):
    """Parse an uploaded PDF with LlamaParse and return the parsed documents.

    The upload is spooled to a temporary file because LlamaParse consumes a
    filesystem path, not an in-memory buffer.

    Args:
        uploaded_file: Streamlit ``UploadedFile`` containing the PDF bytes.

    Returns:
        The list of documents produced by LlamaParse (markdown result type).
    """
    # Write the upload to disk so the parser has a real path to read.
    with tempfile.NamedTemporaryFile(delete=False) as temp_file:
        temp_file.write(uploaded_file.getvalue())
        temp_file_path = temp_file.name
    parsing_instruction = """The provided document is a quarterly report filed by Uber Technologies,
    Inc. with the Securities and Exchange Commission (SEC)...
    """
    parser = LlamaParse(
        api_key=llama_cloud_api_key,
        result_type="markdown",
        parsing_instruction=parsing_instruction,
        max_timeout=5000,
    )
    try:
        llama_parse_documents = parser.load_data(temp_file_path)
    finally:
        # Remove the temp file even if parsing raises; the original leaked it
        # on the error path. (The unused "./data/parsed_data.pkl" cache path
        # was dead code and has been dropped.)
        os.remove(temp_file_path)
    return llama_parse_documents
# --- Streamlit UI: upload -> parse -> index -> answer example questions ---
uploaded_file = st.file_uploader("Upload a PDF file", type="pdf")

if uploaded_file is not None:
    llama_parse_documents = load_or_parse_data(uploaded_file)
    if llama_parse_documents:
        # Create data directory if it doesn't exist
        os.makedirs("data", exist_ok=True)

        # Dump the parsed markdown so UnstructuredMarkdownLoader can read it.
        # NOTE: mode must be 'w', not 'a' — Streamlit re-runs this whole script
        # on every user interaction, and append mode would duplicate the
        # document text on each rerun, polluting the vector store.
        with open('data/output.md', 'w', encoding='utf-8') as f:
            for doc in llama_parse_documents:
                f.write(doc.text + '\n')

        markdown_path = "data/output.md"
        loader = UnstructuredMarkdownLoader(markdown_path)
        documents = loader.load()

        # Split loaded documents into overlapping chunks for retrieval.
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=100)
        docs = text_splitter.split_documents(documents)

        # Initialize the embedding model used to vectorize the chunks.
        embed_model = FastEmbedEmbeddings(model_name="BAAI/bge-base-en-v1.5")

        if docs:
            # Create and persist a Chroma vector database from the chunked documents.
            vs = Chroma.from_documents(
                documents=docs,
                embedding=embed_model,
                persist_directory="chroma_db_llamaparse1",
                collection_name="rag"
            )

            # Groq-hosted chat model; temperature=0 for deterministic answers.
            chat_model = ChatGroq(
                temperature=0,
                model_name="mixtral-8x7b-32768",
                api_key=groq_api_key
            )

            # Prompt that grounds the LLM's answer in the retrieved context only.
            custom_prompt_template = """
Use the following pieces of information to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
Context: {context}
Question: {question}
Only return the helpful answer below and nothing else.
Helpful answer:
"""
            prompt = PromptTemplate(template=custom_prompt_template, input_variables=['context', 'question'])

            # Retrieval-augmented QA chain: the top-3 matching chunks are
            # "stuffed" into the prompt's {context} slot.
            qa = RetrievalQA.from_chain_type(
                llm=chat_model,
                chain_type="stuff",
                retriever=vs.as_retriever(search_kwargs={'k': 3}),
                return_source_documents=True,
                chain_type_kwargs={"prompt": prompt}
            )

            def ask_question(question):
                """Run one query through the QA chain and return the answer text."""
                response = qa.invoke({"query": question})
                return response["result"]

            # Example questions
            example_questions = [
                "What is the Balance of UBER TECHNOLOGIES, INC. as of December 31, 2021?",
                "What is the Cash flows from operating activities associated with bad expense specified in the document?",
                "What is Loss (income) from equity method investments, net?"
            ]

            # Ask questions and display answers
            for idx, question in enumerate(example_questions, start=1):
                st.subheader(f"Question {idx}: {question}")
                answer = ask_question(question)
                st.write(f"Answer: {answer}")
    else:
        st.write("No documents were parsed.")