import os
import streamlit as st
from llama_parse import LlamaParse
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_community.document_loaders import UnstructuredMarkdownLoader
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain_groq import ChatGroq
import joblib
import tempfile

# API keys: load from environment variables rather than hardcoding secrets in source
llama_cloud_api_key = os.environ.get("LLAMA_CLOUD_API_KEY")
groq_api_key = os.environ.get("GROQ_API_KEY")

# Load previously parsed data from disk if available; otherwise parse the uploaded PDF
def load_or_parse_data(uploaded_file):
    data_file = "./data/parsed_data.pkl"
    # Reuse the cached parse to avoid a repeat API call on every Streamlit rerun
    # (a single cache file means re-uploading a different PDF requires deleting it)
    if os.path.exists(data_file):
        return joblib.load(data_file)
    # LlamaParse infers the file type from the extension, so keep the .pdf suffix
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
        temp_file.write(uploaded_file.getvalue())
        temp_file_path = temp_file.name
    parsing_instruction = """The provided document is a quarterly report filed by Uber Technologies,
    Inc. with the Securities and Exchange Commission (SEC)...
    """
    parser = LlamaParse(
        api_key=llama_cloud_api_key,
        result_type="markdown",
        parsing_instruction=parsing_instruction,
        max_timeout=5000,
    )
    llama_parse_documents = parser.load_data(temp_file_path)
    os.remove(temp_file_path)
    # Cache the result so subsequent runs skip the parsing step
    os.makedirs("data", exist_ok=True)
    joblib.dump(llama_parse_documents, data_file)
    return llama_parse_documents

# User uploads PDF file
uploaded_file = st.file_uploader("Upload a PDF file", type="pdf")
if uploaded_file is not None:
    llama_parse_documents = load_or_parse_data(uploaded_file)

    if llama_parse_documents:
        # Create data directory if it doesn't exist
        os.makedirs("data", exist_ok=True)

        # Write the parsed markdown to disk so it can be loaded as LangChain documents
        # ('w' rather than 'a' so reruns do not append duplicate content)
        with open('data/output.md', 'w') as f:
            for doc in llama_parse_documents:
                f.write(doc.text + '\n')

        markdown_path = "data/output.md"
        loader = UnstructuredMarkdownLoader(markdown_path)
        documents = loader.load()

        # Split loaded documents into chunks
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=100)
        docs = text_splitter.split_documents(documents)
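
        # The 100-character overlap keeps a sliver of shared context between adjacent
        # chunks, so a sentence cut at a chunk boundary still appears whole in one of them.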

        # Initialize Embeddings
        embed_model = FastEmbedEmbeddings(model_name="BAAI/bge-base-en-v1.5")
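
        # Optional sanity check (my addition, not in the original script): embedding a
        # short string forces the FastEmbed model download and confirms it loads;
        # bge-base-en-v1.5 produces 768-dimensional vectors.
        # assert len(embed_model.embed_query("hello")) == 768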

        if docs:
            # Create and persist a Chroma vector database from the chunked documents
            vs = Chroma.from_documents(
                documents=docs,
                embedding=embed_model,
                persist_directory="chroma_db_llamaparse1",
                collection_name="rag"
            )
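
            # Because persist_directory is set, the store can be reopened later without
            # re-embedding, e.g. Chroma(persist_directory="chroma_db_llamaparse1",
            # embedding_function=embed_model, collection_name="rag").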

            # Initialize ChatGroq model
            chat_model = ChatGroq(
                temperature=0,
                model_name="mixtral-8x7b-32768",
                api_key=groq_api_key
            )
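
            # temperature=0 keeps generation deterministic, which suits extracting
            # figures from a financial filing rather than open-ended writing.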

            # Prompt template that constrains answers to the retrieved context
            custom_prompt_template = """
            Use the following pieces of information to answer the user's question.
            If you don't know the answer, just say that you don't know, don't try to make up an answer.

            Context: {context}
            Question: {question}

            Only return the helpful answer below and nothing else.
            Helpful answer:
            """
            prompt = PromptTemplate(template=custom_prompt_template, input_variables=['context', 'question'])

            # Initialize RetrievalQA
            qa = RetrievalQA.from_chain_type(
                llm=chat_model,
                chain_type="stuff",
                retriever=vs.as_retriever(search_kwargs={'k': 3}),
                return_source_documents=True,
                chain_type_kwargs={"prompt": prompt}
            )
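
            # With return_source_documents=True, qa.invoke returns a dict holding both
            # "result" (the answer string) and "source_documents" (the retrieved chunks).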

            # Define function to interactively ask questions and retrieve answers
            def ask_question(question):
                response = qa.invoke({"query": question})
                return response["result"]
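
            # Optional extension (my addition, not in the original script): a free-form
            # question box. st.text_input is standard Streamlit; the label text is my own.
            user_question = st.text_input("Ask your own question about the document")
            if user_question:
                st.write(f"Answer: {ask_question(user_question)}")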

            # Example questions
            example_questions = [
                "What is the Balance of UBER TECHNOLOGIES, INC. as of December 31, 2021?",
                "What is the Cash flows from operating activities associated with bad expense specified in the document?",
                "What is Loss (income) from equity method investments, net?"
            ]

            # Ask questions and display answers
            for idx, question in enumerate(example_questions, start=1):
                st.subheader(f"Question {idx}: {question}")
                answer = ask_question(question)
                st.write(f"Answer: {answer}")
    else:
        st.write("No documents were parsed.")