Spaces:

bullyellis
/

META-10K-RAG

Sleeping

App Files Files Community

bullyellis committed on May 1

Commit

033d8e0

•

1 Parent(s): 9135594

another

Browse files

Files changed (1) hide show

app.py +25 -27

app.py CHANGED Viewed

@@ -6,51 +6,47 @@ from langchain.document_loaders import PyMuPDFLoader
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_openai.embeddings import OpenAIEmbeddings
 from langchain_community.vectorstores import Qdrant
-from operator import itemgetter
 from langchain.schema.runnable import RunnablePassthrough
-from langchain.schema.output_parser import StrOutputParser
-from qdrant_client import QdrantClient
-from langchain_core.prompts import ChatPromptTemplate
-# Split documents into chunks
 def tiktoken_len(text):
     """Return the number of tokens in ``text`` under the gpt-3.5-turbo encoding.

     Used as the ``length_function`` of the text splitter so chunk sizes are
     measured in tokens rather than characters.
     """
     encoder = tiktoken.encoding_for_model("gpt-3.5-turbo")
     token_ids = encoder.encode(text)
     return len(token_ids)
-# docs = PyMuPDFLoader("Meta10k.pdf").load()
-# text_splitter = RecursiveCharacterTextSplitter(
-#     chunk_size = 1000,
-#     chunk_overlap = 200,
-#     length_function = tiktoken_len,
-# )
-# split_chunks = text_splitter.split_documents(docs)
 embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")
-# qdrant_vectorstore = Qdrant.from_documents(
-#     split_chunks,
-#     embedding_model,
-#     path="./data/embeddings",
-#     collection_name="Meta10k",
-# )
-client = QdrantClient(path="./data/embeddings")
-db = Qdrant(client=client, collection_name="Meta10k", embeddings=embedding_model,)
-qdrant_retriever = db.as_retriever()
 @cl.on_chat_start
 def chat_start():
-    openai_chat_model = ChatOpenAI(model="gpt-3.5-turbo")
-    openai_chat_model_4 = ChatOpenAI(model="gpt-4-turbo")
     RAG_PROMPT = """
-    You are an expert financial analyst.  You will be provided CONTEXT excerpts from the META company 10K annual report.  Your job is to answer the QUERY as correctly as you can using the information provided by the CONTEXT and your skills as an expert financial analyst. IF the context provided does give you enough information to answer the question, respond "I do not know"
     CONTEXT:
     {context}
@@ -83,7 +79,9 @@ def chat_start():
         ("human", EVAL_USER_TEMPLATE)
     ])
-    chain = ({"context": itemgetter("question") | qdrant_retriever, "question": itemgetter("question")} | RunnablePassthrough.assign(context=itemgetter("context")) | {"response": rag_prompt | openai_chat_model, "context": itemgetter("context")})
     eval_chain = eval_prompt | openai_chat_model_4
     cl.user_session.set("chain", chain)

 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_openai.embeddings import OpenAIEmbeddings
 from langchain_community.vectorstores import Qdrant
 from langchain.schema.runnable import RunnablePassthrough
+from operator import itemgetter
+from langchain.retrievers.multi_query import MultiQueryRetriever
+openai_chat_model = ChatOpenAI(model="gpt-3.5-turbo")
+openai_chat_model_4 = ChatOpenAI(model="gpt-4-turbo")
 def tiktoken_len(text):
     """Return the number of tokens in ``text`` under the gpt-3.5-turbo encoding.

     Passed as ``length_function`` to the text splitter so that chunk size and
     overlap are measured in tokens rather than characters.
     """
     encoder = tiktoken.encoding_for_model("gpt-3.5-turbo")
     token_ids = encoder.encode(text)
     return len(token_ids)
+docs = PyMuPDFLoader("Meta10k.pdf").load()
+text_splitter = RecursiveCharacterTextSplitter(
+    chunk_size = 1000,
+    chunk_overlap = 50,
+    length_function = tiktoken_len,
+)
+split_chunks = text_splitter.split_documents(docs)
 embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")
+qdrant_vectorstore = Qdrant.from_documents(
+    split_chunks,
+    embedding_model,
+    location=":memory:",
+    collection_name="Meta10k",
+)
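+# THE SECRET SAUCE: retrieve with MMR (max marginal relevance) instead of plain similarity search, returning k=6 chunks with lambda_mult=0.25 to favor diversity among them.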
+qdrant_retriever = qdrant_vectorstore.as_retriever(search_type="mmr", search_kwargs={'k': 6, 'lambda_mult': 0.25})
 @cl.on_chat_start
 def chat_start():
     RAG_PROMPT = """
+    You are an expert financial analyst.  You will be provided CONTEXT excerpts from the META company 10K annual report.  Your job is to answer the QUERY as correctly as you can using the information provided by the CONTEXT and your skills as an expert financial analyst.  For questions regarding money do not over think it and begin adding values unless you are specifically asked to.  Use the simplest most obvious choice. IF the context provided does give you enough information to answer the question, respond "I do not know"
     CONTEXT:
     {context}
         ("human", EVAL_USER_TEMPLATE)
     ])
+    retriever_from_llm = MultiQueryRetriever.from_llm(retriever=qdrant_retriever, llm=openai_chat_model)
+    chain = ({"context": itemgetter("question") | retriever_from_llm, "question": itemgetter("question")} | RunnablePassthrough.assign(context=itemgetter("context")) | {"response": rag_prompt | openai_chat_model, "context": itemgetter("context")})
     eval_chain = eval_prompt | openai_chat_model_4
     cl.user_session.set("chain", chain)