Commit a422b23
Parent(s): 9a90d0f
Update app.py

app.py CHANGED
@@ -9,67 +9,100 @@ from langchain.chains import RetrievalQA
 from langchain_groq import ChatGroq
 import joblib
 import os
+import tempfile
 
 # Streamlit specific setup
 st.title("Uber Quarterly Report QA")
 user_llama_key = st.text_input("Enter your LLAMA Cloud API key:")
 user_groq_key = st.text_input("Enter your GROQ API key:")
 
-#
-
-
-# Function to load or parse data
-def load_or_parse_data():
+# Function to load or parse data from uploaded PDF file
+def load_or_parse_data(uploaded_file):
     data_file = "./data/parsed_data.pkl"
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-llama_parse_documents = load_or_parse_data()
+    with tempfile.NamedTemporaryFile(delete=False) as temp_file:
+        temp_file.write(uploaded_file.getvalue())
+        temp_file_path = temp_file.name
+    parsing_instruction = """The provided document is a quarterly report filed by Uber Technologies,
+    Inc. with the Securities and Exchange Commission (SEC)...
+    """
+    parser = LlamaParse(api_key=user_llama_key, result_type="markdown", parsing_instruction=parsing_instruction, max_timeout=5000)
+    llama_parse_documents = parser.load_data(temp_file_path)
+    os.remove(temp_file_path)
+    return llama_parse_documents
+
+# User uploads PDF file
+uploaded_file = st.file_uploader("Upload a PDF file", type="pdf")
+if uploaded_file is not None:
+    llama_parse_documents = load_or_parse_data(uploaded_file)
+
+# Further processing of the parsed data...
+# Further processing of the parsed data
 with open('data/output.md', 'a') as f:
     for doc in llama_parse_documents:
         f.write(doc.text + '\n')
+
 markdown_path = "data/output.md"
 loader = UnstructuredMarkdownLoader(markdown_path)
 documents = loader.load()
+
+# Split loaded documents into chunks
 text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=100)
 docs = text_splitter.split_documents(documents)
+
+# Initialize Embeddings
 embed_model = FastEmbedEmbeddings(model_name="BAAI/bge-base-en-v1.5")
+
+# Create and persist a Chroma vector database from the chunked documents
 vs = Chroma.from_documents(
     documents=docs,
     embedding=embed_model,
     persist_directory="chroma_db_llamaparse1",
     collection_name="rag"
 )
-
-
-
-
-
-
-
-
-#
-custom_prompt_template = """
-
-
+
+# Initialize ChatGroq model
+chat_model = ChatGroq(
+    temperature=0,
+    model_name="mixtral-8x7b-32768",
+    api_key=user_groq_key
+)
+
+# Convert retrieved documents into QA format
+custom_prompt_template = """
+Use the following pieces of information to answer the user's question.
+If you don't know the answer, just say that you don't know, don't try to make up an answer.
+
+Context: {context}
+Question: {question}
+
+Only return the helpful answer below and nothing else.
+Helpful answer:
+"""
 prompt = PromptTemplate(template=custom_prompt_template, input_variables=['context', 'question'])
-
-
-
-
-
-
-
-
-
+
+# Initialize RetrievalQA
+qa = RetrievalQA.from_chain_type(
+    llm=chat_model,
+    chain_type="stuff",
+    retriever=vs.as_retriever(search_kwargs={'k': 3}),
+    return_source_documents=True,
+    chain_type_kwargs={"prompt": prompt}
+)
+
+# Define function to interactively ask questions and retrieve answers
+def ask_question(question):
+    response = qa.invoke({"query": question})
+    return response["result"]
+
+# Example questions
+example_questions = [
+    "What is the Balance of UBER TECHNOLOGIES, INC. as of December 31, 2021?",
+    "What is the Cash flows from operating activities associated with bad expense specified in the document?",
+    "What is Loss (income) from equity method investments, net?"
+]
+
+# Ask questions and display answers
+for idx, question in enumerate(example_questions, start=1):
+    st.subheader(f"Question {idx}: {question}")
+    answer = ask_question(question)
+    st.write(f"Answer: {answer}")
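
A side effect of this change: joblib is still imported and the new function still assigns data_file = "./data/parsed_data.pkl", but neither is used, so every upload re-parses the PDF through LlamaParse. A minimal sketch of the caching this path presumably existed for (the wrapper name and single-file cache keying are assumptions, not part of the commit):

import os
import joblib

def load_or_parse_data_cached(uploaded_file):
    """Hypothetical wrapper around load_or_parse_data from the diff above."""
    data_file = "./data/parsed_data.pkl"
    # Reuse a previously parsed result if one exists; a single shared
    # cache file assumes the app only ever handles one document.
    if os.path.exists(data_file):
        return joblib.load(data_file)
    documents = load_or_parse_data(uploaded_file)
    os.makedirs(os.path.dirname(data_file), exist_ok=True)
    joblib.dump(documents, data_file)
    return documents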
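The chain is built with return_source_documents=True, yet ask_question only surfaces response["result"]. If the retrieved chunks should appear in the UI, a hedged variant (the st.caption rendering is one arbitrary choice):

def ask_question_with_sources(question):
    response = qa.invoke({"query": question})
    # return_source_documents=True makes the retrieved chunks available here
    return response["result"], response["source_documents"]

# Usage inside the Streamlit loop:
# answer, sources = ask_question_with_sources(question)
# st.write(f"Answer: {answer}")
# for doc in sources:
#     st.caption(doc.page_content[:200])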
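Two smaller behavioral notes: open('data/output.md', 'a') appends, so text from earlier uploads accumulates into the index, and everything after the if uploaded_file is not None: block runs at top level, so a rerun without an upload hits an undefined llama_parse_documents. A sketch of the likely intended control flow (an assumption; the commit keeps both behaviors as shown in the diff):

uploaded_file = st.file_uploader("Upload a PDF file", type="pdf")
if uploaded_file is not None:
    llama_parse_documents = load_or_parse_data(uploaded_file)
    # Overwrite rather than append so only the current upload is indexed
    with open('data/output.md', 'w') as f:
        for doc in llama_parse_documents:
            f.write(doc.text + '\n')
    # ... loading, splitting, embedding, and QA setup continue here,
    # guarded by the upload check ...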