Spaces:

andre-fichel
/

meta10-assistant

Sleeping

App Files Files Community

meta10-assistant / app.py

andre-fichel

AIE2 midterm submission

d62fb85 verified 3 months ago

raw

history blame contribute delete

No virus

6.89 kB

	# import dependencies
	import os
	import requests
	from operator import itemgetter
	import chainlit as cl
	from llama_parse import LlamaParse
	from llama_parse.utils import ResultType
	from llama_index.core import SimpleDirectoryReader
	from langchain.text_splitter import (
	RecursiveCharacterTextSplitter,
	MarkdownHeaderTextSplitter,
	)
	from langchain.schema.runnable.config import RunnableConfig
	from langchain_community.vectorstores import Qdrant
	from langchain_community.document_loaders import DirectoryLoader
	from langchain_openai import ChatOpenAI
	from langchain_openai.embeddings import OpenAIEmbeddings
	from langchain_core.prompts import ChatPromptTemplate
	from langchain_core.runnables import RunnablePassthrough
	from langchain_core.output_parsers import StrOutputParser
	from qdrant_client import QdrantClient

	# load keys and variables
	from dotenv import load_dotenv

	load_dotenv()

	os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")
	os.environ["LLAMA_CLOUD_API_KEY"] = os.getenv("LLAMA_CLOUD_API_KEY")
	QDRANT_URL = os.getenv("QDRANT_URL")
	QDRANT_API_KEY = os.getenv("QDRANT_API_KEY")

	# pdf url
	URL = "https://d18rn0p25nwr6d.cloudfront.net/CIK-0001326801/c7318154-f6ae-4866-89fa-f0c589f2ee3d.pdf"

	# parse instructions
	PARSING_INSTRUCTION = """The provided document is an annual report filed by Meta Platforms, Inc. with the Securities and Exchange Commission (SEC).
	This form provides detailed financial information about the company's performance for a specific year.
	It includes financial statements, management discussion and analysis, and other relevant disclosures required by the SEC.
	It contains many tables and some signature pages.

	Replace the signatures with tables containing the headers for each element.
	"""

	# rag prompt
	RAG_PROMPT = """
	CONTEXT:
	{context}

	QUERY:
	{question}

	The provided context is an annual report filed by Meta Platforms, Inc. with the Securities and Exchange Commission (SEC).
	This form provides detailed financial information about the company's performance for a specific year.
	It includes financial statements, management discussion and analysis, and other relevant disclosures required by the SEC.
	It contains many tables and some signature pages. All members of the board need to sign the document.

	Answer the query above only using the context provided. If you don't know the answer, simply say 'I don't know'.
	"""


	def create_new_vectorstore(collection_name, embeddings) -> Qdrant:
	data_file = "./data/output.md"
	# Check if the file exists
	if os.path.exists(data_file):
	print("The 10-K form has been parsed already. Using the cached version.")
	else:
	print("Cache is empty. Parsing will begin.")
	parse_10K_file()

	# load the document
	loader = DirectoryLoader(path="data/", glob="*/.md", show_progress=True)
	documents = loader.load()

	# split the document into chunks

	# split markdown headers
	headers_to_split_on = [
	("#", "Header 1"),
	("##", "Header 2"),
	("###", "Header 3"),
	]

	md_text_splitter = MarkdownHeaderTextSplitter(
	headers_to_split_on=headers_to_split_on, strip_headers=False
	)

	md_splits = md_text_splitter.split_text(documents[0].page_content)

	# recursive character text splitter
	text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=100)
	docs = text_splitter.split_documents(md_splits)

	# create the vectorstore
	qdrant_vector_store = Qdrant.from_documents(
	documents=docs,
	embedding=embeddings,
	url=QDRANT_URL,
	api_key=QDRANT_API_KEY,
	collection_name=collection_name,
	)

	return qdrant_vector_store


	def get_vectorstore(client, collection_name, embeddings) -> Qdrant:

	try:
	vector_store = Qdrant(
	client=client,
	collection_name=collection_name,
	embeddings=embeddings,
	)
	print(vector_store.embeddings)
	except:
	# create the vectorstore
	vector_store = create_new_vectorstore(collection_name, embeddings)

	return vector_store


	def parse_10K_file() -> None:

	# Create the data directory if it doesn't exist
	data_dir = "data"
	os.makedirs(data_dir, exist_ok=True)

	# Check if the file already exists
	file_path = os.path.join(data_dir, "Meta_10k.pdf")
	if not os.path.exists(file_path):
	# Download the file
	url = URL
	response = requests.get(url)

	# Save the file to the data directory
	with open(file_path, "wb") as file:
	file.write(response.content)
	print("File downloaded successfully.")
	else:
	print("File already exists. Skipping download.")

	# setup parser
	parser = LlamaParse(
	result_type=ResultType.MD, parsing_instruction=PARSING_INSTRUCTION
	)

	# load and parse the documet
	file_extractor = {".pdf": parser}
	llama_parse_documents = SimpleDirectoryReader(
	input_files=["data/Meta_10k.pdf"], file_extractor=file_extractor
	).load_data()

	# save markdown file
	data_file = "./data/output.md"
	with open(data_file, "a") as f:
	for doc in llama_parse_documents:
	f.write(doc.text + "\n")


	@cl.on_chat_start
	async def start() -> None:
	# instantiate embeddings
	embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

	# get vectorstore
	client = QdrantClient(
	QDRANT_URL,
	api_key=QDRANT_API_KEY, # For Qdrant Cloud, None for local instance
	)
	collection_name = "meta_10k"
	qdrant_vectorstore = get_vectorstore(client, collection_name, embeddings)

	# setup our retriever
	qdrant_retriever = qdrant_vectorstore.as_retriever()

	# setup rag prompt
	rag_prompt = ChatPromptTemplate.from_template(RAG_PROMPT)

	# setup the rag chain
	chat_model = ChatOpenAI(model="gpt-3.5-turbo")

	rag_chain = (
	{
	"question": itemgetter("question"),
	"context": itemgetter("question") \| qdrant_retriever,
	}
	\| RunnablePassthrough().assign(context=itemgetter("context"))
	\| {
	"response": rag_prompt \| chat_model \| StrOutputParser(),
	"context": itemgetter("context"),
	}
	)

	cl.user_session.set("rag_chain", rag_chain)
	await cl.Message(
	author="Assistant", content="Hi, I'm your Meta 10K Assistant! How can I help?"
	).send()


	@cl.on_message
	async def main(message: cl.Message) -> None:
	chain = cl.user_session.get("rag_chain")
	cb = cl.AsyncLangchainCallbackHandler()
	cb.answer_reached = True
	# res=await chain.acall(message, callbacks=[cb])
	res = await chain.ainvoke(
	{"question": message.content}, config=RunnableConfig(callbacks=[cb])
	)
	print(f"response: {res}")
	answer = res["response"]

	await cl.Message(author="Assistant", content=answer).send()