santhoshml committed
Commit • 66d0a2d
1 parent: 234f73e

Commit after using contextual compression

Files changed:
- app.py (+20 −9)
- requirements.txt (+2 −1)
- utils.py (+30 −3)
app.py
CHANGED

@@ -1,21 +1,17 @@
 from dotenv import load_dotenv
 load_dotenv()

-from langchain_openai import ChatOpenAI
-from langchain_core.prompts import ChatPromptTemplate
-import tiktoken
-from langchain.text_splitter import RecursiveCharacterTextSplitter
-from langchain_community.document_loaders import PyMuPDFLoader
-from langchain_community.vectorstores import Qdrant
-from langchain_openai.embeddings import OpenAIEmbeddings
 from operator import itemgetter
-from langchain.schema.output_parser import StrOutputParser
 from langchain.schema.runnable import RunnablePassthrough
 from utils import *
 import os
 import getpass
 from langchain.globals import set_debug
 import chainlit as cl
+from langchain_openai import ChatOpenAI, OpenAI
+from langchain.retrievers.multi_query import MultiQueryRetriever
+from langchain.retrievers import ContextualCompressionRetriever
+from langchain.retrievers.document_compressors import LLMChainExtractor


 class RAGMeta10K:

@@ -28,11 +24,25 @@ class RAGMeta10K:
         self.UtilsObject = Utils()
         self.rag_prompt_template = self.UtilsObject.init_prompt()
         self.UtilsObject.split_into_chunks()
+        # normal retriever
         self.qdrant_retriever = self.UtilsObject.get_vector_store().as_retriever()
+
+        # MultiQuery retriever
+        # self.mqr_retriever = MultiQueryRetriever.from_llm(
+        #     retriever=self.qdrant_retriever, llm=ChatOpenAI(temperature=0)
+        # )
+
+        # Contextual compression:
+        # gives the correct answer for the board question but messes up on the cash question
+        compressor = LLMChainExtractor.from_llm(OpenAI(temperature=0))
+        self.compression_retriever = ContextualCompressionRetriever(
+            base_compressor=compressor, base_retriever=self.qdrant_retriever
+        )
+

     def ask_question(self, question: str):
         retrieval_augmented_qa_chain = (
-            {"context": itemgetter("question") | self.qdrant_retriever, "question": itemgetter("question")}
+            {"context": itemgetter("question") | self.compression_retriever, "question": itemgetter("question")}
             | RunnablePassthrough.assign(context=itemgetter("context"))
             | {"response": self.rag_prompt_template | self.UtilsObject.get_llm_model(), "context": itemgetter("context")}
        )

@@ -55,6 +65,7 @@ class RAGMeta10K:
 @cl.on_chat_start
 async def start_chat():
     ragObject = RAGMeta10K()
+    # ragObject.UtilsObject.generate_test_set()
     cl.user_session.set("ragObject", ragObject)

 @cl.on_message
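For readers following the change: the commit swaps the plain Qdrant retriever for a contextual-compression retriever as the first stage of the LCEL chain. Below is a minimal, self-contained sketch of that pattern. It is not code from this Space: the toy documents, the `:memory:` Qdrant collection, and the query are placeholders, and it assumes an `OPENAI_API_KEY` in the environment plus the same LangChain-era APIs the commit imports.

```python
# Minimal sketch of the contextual-compression pattern adopted in app.py.
# The corpus and query below are illustrative stand-ins, not the Space's data.
from langchain_core.documents import Document
from langchain_community.vectorstores import Qdrant
from langchain_openai import OpenAI
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor

# Toy corpus standing in for the meta-10k.pdf chunks.
docs = [
    Document(page_content="The board of directors is elected annually by stockholders."),
    Document(page_content="Cash and cash equivalents are reported under current assets."),
]
vector_store = Qdrant.from_documents(
    docs, OpenAIEmbeddings(), location=":memory:", collection_name="demo"
)

# LLMChainExtractor prompts an LLM to keep only the parts of each retrieved
# document that are relevant to the query, so less noise reaches the RAG prompt.
compressor = LLMChainExtractor.from_llm(OpenAI(temperature=0))
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor, base_retriever=vector_store.as_retriever()
)
print(compression_retriever.get_relevant_documents("Who is on the board?"))
```

Note that swapping back to the base retriever (or to the commented-out MultiQueryRetriever) only changes the first element of the input dict in ask_question; the prompt and model stages of the chain are untouched.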
requirements.txt
CHANGED

@@ -6,4 +6,5 @@ langchain-community
 chainlit
 uvicorn
 pymupdf
-qdrant-client
+qdrant-client
+ragas
utils.py
CHANGED

@@ -4,9 +4,8 @@ from langchain_community.document_loaders import PyMuPDFLoader
 from langchain_core.prompts import ChatPromptTemplate
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_openai.embeddings import OpenAIEmbeddings
-from typing import List
-from langchain_core.documents.base import Document
 from langchain_community.vectorstores import Qdrant
+from ragas.testset.generator import TestsetGenerator


 class Utils:

@@ -15,19 +14,33 @@ class Utils:
         llm_name: str = "gpt-3.5-turbo",
         pdf_name: str = "meta-10k.pdf",
         embedding_model: str = "text-embedding-3-small",
+        generator_llm: str = "gpt-3.5-turbo-16k",
+        critic_llm: str = "gpt-4-turbo",
     ) -> None:
         self.openai_chat_model = ChatOpenAI(model=llm_name)
         self.enc = tiktoken.encoding_for_model(llm_name)
         self.docs = PyMuPDFLoader(pdf_name).load()
         self.embedding_model = OpenAIEmbeddings(model=embedding_model)
+        self.generator_llm = ChatOpenAI(model=generator_llm)
+        self.critic_llm = ChatOpenAI(model=critic_llm)
+        self.test_generator = TestsetGenerator.from_langchain(
+            self.generator_llm,
+            self.critic_llm,
+            OpenAIEmbeddings(),
+        )
+

     # semanticTextSplitter
     # tokenTextSplitter
     def split_into_chunks(self):
         text_splitter = RecursiveCharacterTextSplitter(
-            chunk_size=
+            chunk_size=2000, chunk_overlap=200, length_function=self.tiktoken_len
         )
         self.split_chunks = text_splitter.split_documents(self.docs)
+
+        # semantic splitter
+        # text_splitter = SemanticChunker(OpenAIEmbeddings())
+        # self.split_chunks = text_splitter.split_documents(self.docs)
         return self.split_chunks

     def get_llm_model(self):

@@ -61,3 +74,17 @@ class Utils:
             collection_name="meta-10k",
         )
         return self.qdrant_vectorstore
+
+    def generate_test_set(self) -> None:
+        text_splitter_eval = RecursiveCharacterTextSplitter(
+            chunk_size=600,
+            chunk_overlap=50,
+        )
+        eval_documents = text_splitter_eval.split_documents(self.docs)
+        distributions = {
+            "simple": 0.5,
+            "multi_context": 0.4,
+            "reasoning": 0.1,
+        }
+        testset = self.test_generator.generate_with_langchain_docs(eval_documents, 20, distributions, is_async=False)
+        print("santhosh:" + str(len(testset.to_pandas())))