Spaces:
Sleeping
Sleeping
bullyellis
committed on
Commit
·
033d8e0
1
Parent(s):
9135594
another
Browse files
app.py
CHANGED
@@ -6,51 +6,47 @@ from langchain.document_loaders import PyMuPDFLoader
|
|
6 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
7 |
from langchain_openai.embeddings import OpenAIEmbeddings
|
8 |
from langchain_community.vectorstores import Qdrant
|
9 |
-
from operator import itemgetter
|
10 |
from langchain.schema.runnable import RunnablePassthrough
|
11 |
-
from
|
12 |
-
from
|
13 |
-
|
|
|
|
|
|
|
14 |
|
15 |
-
# Split documents into chunks
|
16 |
def tiktoken_len(text):
    """Count the tokens that tiktoken's gpt-3.5-turbo encoding produces for *text*.

    Used as the length function for the text splitter so chunk sizes are
    measured in model tokens rather than characters.
    """
    encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
    return len(encoding.encode(text))
|
21 |
|
22 |
-
|
23 |
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
|
30 |
-
|
31 |
|
32 |
embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")
|
33 |
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
client = QdrantClient(path="./data/embeddings")
|
42 |
-
db = Qdrant(client=client, collection_name="Meta10k", embeddings=embedding_model,)
|
43 |
|
44 |
-
|
|
|
45 |
|
46 |
@cl.on_chat_start
|
47 |
def chat_start():
|
48 |
|
49 |
-
openai_chat_model = ChatOpenAI(model="gpt-3.5-turbo")
|
50 |
-
openai_chat_model_4 = ChatOpenAI(model="gpt-4-turbo")
|
51 |
-
|
52 |
RAG_PROMPT = """
|
53 |
-
You are an expert financial analyst. You will be provided CONTEXT excerpts from the META company 10K annual report. Your job is to answer the QUERY as correctly as you can using the information provided by the CONTEXT and your skills as an expert financial analyst. IF the context provided does give you enough information to answer the question, respond "I do not know"
|
54 |
|
55 |
CONTEXT:
|
56 |
{context}
|
@@ -83,7 +79,9 @@ def chat_start():
|
|
83 |
("human", EVAL_USER_TEMPLATE)
|
84 |
])
|
85 |
|
86 |
-
|
|
|
|
|
87 |
eval_chain = eval_prompt | openai_chat_model_4
|
88 |
|
89 |
cl.user_session.set("chain", chain)
|
|
|
6 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import Qdrant
from langchain.schema.runnable import RunnablePassthrough
from operator import itemgetter
from langchain.retrievers.multi_query import MultiQueryRetriever

# Chat models created once at module load:
#  - gpt-3.5-turbo answers user queries and generates the MultiQueryRetriever's
#    query variants;
#  - gpt-4-turbo is reserved for the evaluation chain assembled in chat_start().
# NOTE(review): ChatOpenAI, chainlit (cl) and tiktoken are presumably imported
# above this diff hunk — confirm in the full file.
openai_chat_model = ChatOpenAI(model="gpt-3.5-turbo")
openai_chat_model_4 = ChatOpenAI(model="gpt-4-turbo")
|
15 |
+
|
16 |
|
|
|
17 |
def tiktoken_len(text):
    """Return how many tokens *text* occupies under the gpt-3.5-turbo encoding.

    Passed to RecursiveCharacterTextSplitter as ``length_function`` so the
    splitter budgets chunks in model tokens, not characters.
    """
    return len(tiktoken.encoding_for_model("gpt-3.5-turbo").encode(text))
|
22 |
|
23 |
+
# --- Module-level RAG indexing pipeline (runs once at app start) ---
# Load the Meta 10-K annual report as a list of page documents.
docs = PyMuPDFLoader("Meta10k.pdf").load()

# Split into ~1000-token chunks with a 50-token overlap, measuring length with
# the same tokenizer the chat model uses (tiktoken_len) so chunks fit the
# model's context budget.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1000,
    chunk_overlap = 50,
    length_function = tiktoken_len,
)

split_chunks = text_splitter.split_documents(docs)

embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")

# Embed every chunk into an in-memory Qdrant collection.
# NOTE(review): location=":memory:" means the index is rebuilt (and the
# embedding API re-billed) on every process start — confirm this is intended.
qdrant_vectorstore = Qdrant.from_documents(
    split_chunks,
    embedding_model,
    location=":memory:",
    collection_name="Meta10k",
)

# THE SECRET SAUCE
# MMR (maximal marginal relevance) search returns k=6 chunks; lambda_mult=0.25
# weights heavily toward diversity over pure similarity.
qdrant_retriever = qdrant_vectorstore.as_retriever(search_type="mmr", search_kwargs={'k': 6, 'lambda_mult': 0.25})
|
44 |
|
45 |
@cl.on_chat_start
|
46 |
def chat_start():
|
47 |
|
|
|
|
|
|
|
48 |
RAG_PROMPT = """
|
49 |
+
You are an expert financial analyst. You will be provided CONTEXT excerpts from the META company 10K annual report. Your job is to answer the QUERY as correctly as you can using the information provided by the CONTEXT and your skills as an expert financial analyst. For questions regarding money do not over think it and begin adding values unless you are specifically asked to. Use the simplest most obvious choice. IF the context provided does not give you enough information to answer the question, respond "I do not know"
|
50 |
|
51 |
CONTEXT:
|
52 |
{context}
|
|
|
79 |
("human", EVAL_USER_TEMPLATE)
|
80 |
])
|
81 |
|
82 |
+
retriever_from_llm = MultiQueryRetriever.from_llm(retriever=qdrant_retriever, llm=openai_chat_model)
|
83 |
+
|
84 |
+
chain = ({"context": itemgetter("question") | retriever_from_llm, "question": itemgetter("question")} | RunnablePassthrough.assign(context=itemgetter("context")) | {"response": rag_prompt | openai_chat_model, "context": itemgetter("context")})
|
85 |
eval_chain = eval_prompt | openai_chat_model_4
|
86 |
|
87 |
cl.user_session.set("chain", chain)
|