import os
import streamlit as st
from llama_parse import LlamaParse
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_community.document_loaders import UnstructuredMarkdownLoader
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain_groq import ChatGroq
import joblib
import tempfile
# API keys (read from the environment rather than hardcoding secrets in the source)
llama_cloud_api_key = os.environ.get("LLAMA_CLOUD_API_KEY")
groq_api_key = os.environ.get("GROQ_API_KEY")
# Function to load or parse data from an uploaded PDF file
def load_or_parse_data(uploaded_file):
    data_file = "./data/parsed_data.pkl"  # cache path for parsed results (see the caching sketch below)
    # Write the upload to a temporary file; keep the .pdf suffix so LlamaParse
    # can recognize the file type from the extension
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
        temp_file.write(uploaded_file.getvalue())
        temp_file_path = temp_file.name
    parsing_instruction = """The provided document is a quarterly report filed by Uber Technologies,
    Inc. with the Securities and Exchange Commission (SEC)...
    """
    parser = LlamaParse(
        api_key=llama_cloud_api_key,
        result_type="markdown",
        parsing_instruction=parsing_instruction,
        max_timeout=5000,
    )
    llama_parse_documents = parser.load_data(temp_file_path)
    os.remove(temp_file_path)
    return llama_parse_documents
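
# A minimal caching sketch, assuming the pickle path above is meant as a cache (joblib is
# imported but never used). This helper is illustrative only and is not called by the app below;
# a real cache would key on the uploaded file's contents rather than a fixed path.
def load_or_parse_data_cached(uploaded_file, cache_path="./data/parsed_data.pkl"):
    """Return cached parsed documents if present; otherwise parse and cache them."""
    if os.path.exists(cache_path):
        return joblib.load(cache_path)
    parsed = load_or_parse_data(uploaded_file)
    os.makedirs(os.path.dirname(cache_path), exist_ok=True)
    joblib.dump(parsed, cache_path)
    return parsed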
# User uploads a PDF file
uploaded_file = st.file_uploader("Upload a PDF file", type="pdf")

if uploaded_file is not None:
    llama_parse_documents = load_or_parse_data(uploaded_file)
    if llama_parse_documents:
        # Create the data directory if it doesn't exist
        os.makedirs("data", exist_ok=True)
        # Write the parsed documents to a markdown file (append mode, so repeated uploads accumulate)
        with open('data/output.md', 'a') as f:
            for doc in llama_parse_documents:
                f.write(doc.text + '\n')
        # Load the markdown output
        markdown_path = "data/output.md"
        loader = UnstructuredMarkdownLoader(markdown_path)
        documents = loader.load()

        # Split loaded documents into chunks
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=100)
        docs = text_splitter.split_documents(documents)

        # Initialize embeddings
        embed_model = FastEmbedEmbeddings(model_name="BAAI/bge-base-en-v1.5")
        if docs:
            # Create and persist a Chroma vector database from the chunked documents
            vs = Chroma.from_documents(
                documents=docs,
                embedding=embed_model,
                persist_directory="chroma_db_llamaparse1",
                collection_name="rag"
            )
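
            # Assumption (not in the original flow): because the index is persisted,
            # a later run could reopen it without re-embedding, e.g.:
            #   vs = Chroma(
            #       collection_name="rag",
            #       embedding_function=embed_model,
            #       persist_directory="chroma_db_llamaparse1",
            #   )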
            # Initialize the ChatGroq model
            chat_model = ChatGroq(
                temperature=0,
                model_name="mixtral-8x7b-32768",
                api_key=groq_api_key
            )
            # Prompt template used to answer questions from the retrieved context
            custom_prompt_template = """
            Use the following pieces of information to answer the user's question.
            If you don't know the answer, just say that you don't know; don't try to make up an answer.

            Context: {context}
            Question: {question}

            Only return the helpful answer below and nothing else.
            Helpful answer:
            """
            prompt = PromptTemplate(template=custom_prompt_template, input_variables=['context', 'question'])
            # Initialize the RetrievalQA chain
            qa = RetrievalQA.from_chain_type(
                llm=chat_model,
                chain_type="stuff",
                retriever=vs.as_retriever(search_kwargs={'k': 3}),
                return_source_documents=True,
                chain_type_kwargs={"prompt": prompt}
            )
            # Helper to ask a question and retrieve the answer
            def ask_question(question):
                response = qa.invoke({"query": question})
                return response["result"]
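
            # Optional sketch (an assumed addition, not in the original app): a free-form
            # question box alongside the example questions below.
            user_question = st.text_input("Ask your own question about the document")
            if user_question:
                st.write(f"Answer: {ask_question(user_question)}")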
            # Example questions
            example_questions = [
                "What is the Balance of UBER TECHNOLOGIES, INC. as of December 31, 2021?",
                "What is the Cash flows from operating activities associated with bad debt expense specified in the document?",
                "What is Loss (income) from equity method investments, net?"
            ]

            # Ask the example questions and display the answers
            for idx, question in enumerate(example_questions, start=1):
                st.subheader(f"Question {idx}: {question}")
                answer = ask_question(question)
                st.write(f"Answer: {answer}")
    else:
        st.write("No documents were parsed.")