bullyellis committed on
Commit
033d8e0
1 Parent(s): 9135594
Files changed (1) hide show
  1. app.py +25 -27
app.py CHANGED
@@ -6,51 +6,47 @@ from langchain.document_loaders import PyMuPDFLoader
6
  from langchain.text_splitter import RecursiveCharacterTextSplitter
7
  from langchain_openai.embeddings import OpenAIEmbeddings
8
  from langchain_community.vectorstores import Qdrant
9
- from operator import itemgetter
10
  from langchain.schema.runnable import RunnablePassthrough
11
- from langchain.schema.output_parser import StrOutputParser
12
- from qdrant_client import QdrantClient
13
- from langchain_core.prompts import ChatPromptTemplate
 
 
 
14
 
15
- # Split documents into chunks
16
  def tiktoken_len(text):
17
  tokens = tiktoken.encoding_for_model("gpt-3.5-turbo").encode(
18
  text,
19
  )
20
  return len(tokens)
21
 
22
- # docs = PyMuPDFLoader("Meta10k.pdf").load()
23
 
24
- # text_splitter = RecursiveCharacterTextSplitter(
25
- # chunk_size = 1000,
26
- # chunk_overlap = 200,
27
- # length_function = tiktoken_len,
28
- # )
29
 
30
- # split_chunks = text_splitter.split_documents(docs)
31
 
32
  embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")
33
 
34
- # qdrant_vectorstore = Qdrant.from_documents(
35
- # split_chunks,
36
- # embedding_model,
37
- # path="./data/embeddings",
38
- # collection_name="Meta10k",
39
- # )
40
-
41
- client = QdrantClient(path="./data/embeddings")
42
- db = Qdrant(client=client, collection_name="Meta10k", embeddings=embedding_model,)
43
 
44
- qdrant_retriever = db.as_retriever()
 
45
 
46
  @cl.on_chat_start
47
  def chat_start():
48
 
49
- openai_chat_model = ChatOpenAI(model="gpt-3.5-turbo")
50
- openai_chat_model_4 = ChatOpenAI(model="gpt-4-turbo")
51
-
52
  RAG_PROMPT = """
53
- You are an expert financial analyst. You will be provided CONTEXT excerpts from the META company 10K annual report. Your job is to answer the QUERY as correctly as you can using the information provided by the CONTEXT and your skills as an expert financial analyst. IF the context provided does give you enough information to answer the question, respond "I do not know"
54
 
55
  CONTEXT:
56
  {context}
@@ -83,7 +79,9 @@ def chat_start():
83
  ("human", EVAL_USER_TEMPLATE)
84
  ])
85
 
86
- chain = ({"context": itemgetter("question") | qdrant_retriever, "question": itemgetter("question")} | RunnablePassthrough.assign(context=itemgetter("context")) | {"response": rag_prompt | openai_chat_model, "context": itemgetter("context")})
 
 
87
  eval_chain = eval_prompt | openai_chat_model_4
88
 
89
  cl.user_session.set("chain", chain)
 
6
  from langchain.text_splitter import RecursiveCharacterTextSplitter
7
  from langchain_openai.embeddings import OpenAIEmbeddings
8
  from langchain_community.vectorstores import Qdrant
 
9
  from langchain.schema.runnable import RunnablePassthrough
10
+ from operator import itemgetter
11
+ from langchain.retrievers.multi_query import MultiQueryRetriever
12
+
13
+ openai_chat_model = ChatOpenAI(model="gpt-3.5-turbo")
14
+ openai_chat_model_4 = ChatOpenAI(model="gpt-4-turbo")
15
+
16
 
 
17
  def tiktoken_len(text):
18
  tokens = tiktoken.encoding_for_model("gpt-3.5-turbo").encode(
19
  text,
20
  )
21
  return len(tokens)
22
 
23
+ docs = PyMuPDFLoader("Meta10k.pdf").load()
24
 
25
+ text_splitter = RecursiveCharacterTextSplitter(
26
+ chunk_size = 1000,
27
+ chunk_overlap = 50,
28
+ length_function = tiktoken_len,
29
+ )
30
 
31
+ split_chunks = text_splitter.split_documents(docs)
32
 
33
  embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")
34
 
35
+ qdrant_vectorstore = Qdrant.from_documents(
36
+ split_chunks,
37
+ embedding_model,
38
+ location=":memory:",
39
+ collection_name="Meta10k",
40
+ )
 
 
 
41
 
42
+ # THE SECRET SAUCE
43
+ qdrant_retriever = qdrant_vectorstore.as_retriever(search_type="mmr", search_kwargs={'k': 6, 'lambda_mult': 0.25})
44
 
45
  @cl.on_chat_start
46
  def chat_start():
47
 
 
 
 
48
  RAG_PROMPT = """
49
+ You are an expert financial analyst. You will be provided CONTEXT excerpts from the META company 10K annual report. Your job is to answer the QUERY as correctly as you can using the information provided by the CONTEXT and your skills as an expert financial analyst. For questions regarding money do not over think it and begin adding values unless you are specifically asked to. Use the simplest most obvious choice. IF the context provided does give you enough information to answer the question, respond "I do not know"
50
 
51
  CONTEXT:
52
  {context}
 
79
  ("human", EVAL_USER_TEMPLATE)
80
  ])
81
 
82
+ retriever_from_llm = MultiQueryRetriever.from_llm(retriever=qdrant_retriever, llm=openai_chat_model)
83
+
84
+ chain = ({"context": itemgetter("question") | retriever_from_llm, "question": itemgetter("question")} | RunnablePassthrough.assign(context=itemgetter("context")) | {"response": rag_prompt | openai_chat_model, "context": itemgetter("context")})
85
  eval_chain = eval_prompt | openai_chat_model_4
86
 
87
  cl.user_session.set("chain", chain)