import os
import tempfile

import joblib
import streamlit as st
from llama_parse import LlamaParse
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_community.document_loaders import UnstructuredMarkdownLoader
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain_groq import ChatGroq

# API keys: read from the environment rather than hardcoding secrets in source
llama_cloud_api_key = os.environ.get("LLAMA_CLOUD_API_KEY")
groq_api_key = os.environ.get("GROQ_API_KEY")

# Function to load cached parse results, or parse the uploaded PDF with LlamaParse
def load_or_parse_data(uploaded_file):
    data_file = "./data/parsed_data.pkl"

    # Reuse a previously parsed result if one was cached with joblib. Note: the
    # cache is keyed to a fixed path, so delete parsed_data.pkl to re-parse a
    # different file.
    if os.path.exists(data_file):
        return joblib.load(data_file)

    # Write the upload to a temporary file so LlamaParse can read it from disk;
    # the .pdf suffix lets the parser infer the file type
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
        temp_file.write(uploaded_file.getvalue())
        temp_file_path = temp_file.name

    parsing_instruction = """The provided document is a quarterly report filed by Uber Technologies, Inc. with the Securities and Exchange Commission (SEC)... """

    parser = LlamaParse(
        api_key=llama_cloud_api_key,
        result_type="markdown",
        parsing_instruction=parsing_instruction,
        max_timeout=5000,
    )
    llama_parse_documents = parser.load_data(temp_file_path)
    os.remove(temp_file_path)

    # Cache the parsed documents for later runs
    os.makedirs("data", exist_ok=True)
    joblib.dump(llama_parse_documents, data_file)
    return llama_parse_documents

# User uploads a PDF file
uploaded_file = st.file_uploader("Upload a PDF file", type="pdf")

if uploaded_file is not None:
    llama_parse_documents = load_or_parse_data(uploaded_file)

    if llama_parse_documents:
        # Create the data directory if it doesn't exist
        os.makedirs("data", exist_ok=True)

        # Write the parsed Markdown to disk ("w" rather than "a", so Streamlit
        # reruns don't append duplicate content to the same file)
        with open("data/output.md", "w") as f:
            for doc in llama_parse_documents:
                f.write(doc.text + "\n")

        markdown_path = "data/output.md"
        loader = UnstructuredMarkdownLoader(markdown_path)
        documents = loader.load()

        # Split the loaded documents into chunks
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=100)
        docs = text_splitter.split_documents(documents)

        # Initialize embeddings
        embed_model = FastEmbedEmbeddings(model_name="BAAI/bge-base-en-v1.5")

        if docs:
            # Create and persist a Chroma vector database from the chunked documents
            vs = Chroma.from_documents(
                documents=docs,
                embedding=embed_model,
                persist_directory="chroma_db_llamaparse1",
                collection_name="rag",
            )

            # Initialize the ChatGroq model
            chat_model = ChatGroq(
                temperature=0,
                model_name="mixtral-8x7b-32768",
                api_key=groq_api_key,
            )

            # Prompt that grounds answers in the retrieved context
            custom_prompt_template = """Use the following pieces of information to answer the user's question.
If you don't know the answer, just say that you don't know; don't try to make up an answer.

Context: {context}
Question: {question}

Only return the helpful answer below and nothing else.
Helpful answer:
"""
            prompt = PromptTemplate(
                template=custom_prompt_template,
                input_variables=["context", "question"],
            )

            # Initialize the RetrievalQA chain over the top-3 retrieved chunks
            qa = RetrievalQA.from_chain_type(
                llm=chat_model,
                chain_type="stuff",
                retriever=vs.as_retriever(search_kwargs={"k": 3}),
                return_source_documents=True,
                chain_type_kwargs={"prompt": prompt},
            )

            # Define a function to interactively ask questions and retrieve answers
            def ask_question(question):
                response = qa.invoke({"query": question})
                return response["result"]

            # Example questions
            example_questions = [
                "What is the Balance of UBER TECHNOLOGIES, INC. as of December 31, 2021?",
                "What is the Cash flows from operating activities associated with bad expense specified in the document?",
                "What is Loss (income) from equity method investments, net?",
            ]

            # Ask the questions and display the answers
            for idx, question in enumerate(example_questions, start=1):
                st.subheader(f"Question {idx}: {question}")
                answer = ask_question(question)
                st.write(f"Answer: {answer}")
    else:
        st.write("No documents were parsed.")