santhoshml committed on
Commit
66d0a2d
1 Parent(s): 234f73e

Commit after using contextual compression

Files changed (3)
  1. app.py +20 -9
  2. requirements.txt +2 -1
  3. utils.py +30 -3
app.py CHANGED
@@ -1,21 +1,17 @@
 from dotenv import load_dotenv
 load_dotenv()
 
-from langchain_openai import ChatOpenAI
-from langchain_core.prompts import ChatPromptTemplate
-import tiktoken
-from langchain.text_splitter import RecursiveCharacterTextSplitter
-from langchain_community.document_loaders import PyMuPDFLoader
-from langchain_community.vectorstores import Qdrant
-from langchain_openai.embeddings import OpenAIEmbeddings
 from operator import itemgetter
-from langchain.schema.output_parser import StrOutputParser
 from langchain.schema.runnable import RunnablePassthrough
 from utils import *
 import os
 import getpass
 from langchain.globals import set_debug
 import chainlit as cl
+from langchain_openai import ChatOpenAI, OpenAI
+from langchain.retrievers.multi_query import MultiQueryRetriever
+from langchain.retrievers import ContextualCompressionRetriever
+from langchain.retrievers.document_compressors import LLMChainExtractor
 
 
 class RAGMeta10K:
@@ -28,11 +24,25 @@ class RAGMeta10K:
         self.UtilsObject = Utils()
         self.rag_prompt_template = self.UtilsObject.init_prompt()
         self.UtilsObject.split_into_chunks()
+        # plain vector store retriever
         self.qdrant_retriever = self.UtilsObject.get_vector_store().as_retriever()
+
+        # MultiQuery retriever
+        # self.mqr_retriever = MultiQueryRetriever.from_llm(
+        #     retriever=self.qdrant_retriever, llm=ChatOpenAI(temperature=0)
+        # )
+
+        # contextual compression: gives the correct answer for the board
+        # question but messes up on the cash question
+        compressor = LLMChainExtractor.from_llm(OpenAI(temperature=0))
+        self.compression_retriever = ContextualCompressionRetriever(
+            base_compressor=compressor, base_retriever=self.qdrant_retriever
+        )
+
 
     def ask_question(self, question: str):
         retrieval_augmented_qa_chain = (
-            {"context": itemgetter("question") | self.qdrant_retriever, "question": itemgetter("question")}
+            {"context": itemgetter("question") | self.compression_retriever, "question": itemgetter("question")}
             | RunnablePassthrough.assign(context=itemgetter("context"))
             | {"response": self.rag_prompt_template | self.UtilsObject.get_llm_model(), "context": itemgetter("context")}
         )
@@ -55,6 +65,7 @@ class RAGMeta10K:
 @cl.on_chat_start
 async def start_chat():
     ragObject = RAGMeta10K()
+    # ragObject.UtilsObject.generate_test_set()
     cl.user_session.set("ragObject", ragObject)
 
 @cl.on_message
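
Note: the contextual-compression pattern adopted in this file can be tried outside the Chainlit app. A minimal sketch, assuming OPENAI_API_KEY is set and that `base_retriever` is any already-built LangChain retriever (such as the Qdrant retriever from utils.py); the query string is illustrative only:

from langchain_openai import OpenAI
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor

# LLMChainExtractor runs one LLM call per retrieved chunk and keeps only the
# passages it judges relevant to the query; chunks with nothing relevant
# are dropped entirely.
compressor = LLMChainExtractor.from_llm(OpenAI(temperature=0))
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor, base_retriever=base_retriever
)

docs = compression_retriever.get_relevant_documents(
    "Who serves on Meta's board of directors?"
)
for doc in docs:
    print(doc.page_content[:200])

The board/cash behavior noted in the commit comment is a plausible consequence of this design: the extractor can trim away tabular or numeric detail it does not recognize as relevant to the question.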
requirements.txt CHANGED
@@ -6,4 +6,5 @@ langchain-community
 chainlit
 uvicorn
 pymupdf
-qdrant-client
+qdrant-client
+ragas
utils.py CHANGED
@@ -4,9 +4,9 @@ from langchain_community.document_loaders import PyMuPDFLoader
 from langchain_core.prompts import ChatPromptTemplate
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_openai.embeddings import OpenAIEmbeddings
-from typing import List
-from langchain_core.documents.base import Document
 from langchain_community.vectorstores import Qdrant
+from ragas.testset.generator import TestsetGenerator
+from ragas.testset.evolutions import simple, reasoning, multi_context
 
 
 class Utils:
@@ -15,19 +14,33 @@ class Utils:
         llm_name: str = "gpt-3.5-turbo",
         pdf_name: str = "meta-10k.pdf",
         embedding_model: str = "text-embedding-3-small",
+        generator_llm: str = "gpt-3.5-turbo-16k",
+        critic_llm: str = "gpt-4-turbo",
     ) -> None:
         self.openai_chat_model = ChatOpenAI(model=llm_name)
         self.enc = tiktoken.encoding_for_model(llm_name)
         self.docs = PyMuPDFLoader(pdf_name).load()
         self.embedding_model = OpenAIEmbeddings(model=embedding_model)
+        self.generator_llm = ChatOpenAI(model=generator_llm)
+        self.critic_llm = ChatOpenAI(model=critic_llm)
+        self.test_generator = TestsetGenerator.from_langchain(
+            self.generator_llm,
+            self.critic_llm,
+            OpenAIEmbeddings(),
+        )
+
 
     # alternative splitters to evaluate:
     # SemanticChunker, TokenTextSplitter
     def split_into_chunks(self):
         text_splitter = RecursiveCharacterTextSplitter(
-            chunk_size=500, chunk_overlap=50, length_function=self.tiktoken_len
+            chunk_size=2000, chunk_overlap=200, length_function=self.tiktoken_len
         )
         self.split_chunks = text_splitter.split_documents(self.docs)
+
+        # semantic splitter
+        # text_splitter = SemanticChunker(OpenAIEmbeddings())
+        # self.split_chunks = text_splitter.split_documents(self.docs)
         return self.split_chunks
 
     def get_llm_model(self):
@@ -61,3 +74,17 @@ class Utils:
             collection_name="meta-10k",
         )
         return self.qdrant_vectorstore
+
+    def generate_test_set(self) -> None:
+        text_splitter_eval = RecursiveCharacterTextSplitter(
+            chunk_size=600,
+            chunk_overlap=50,
+        )
+        eval_documents = text_splitter_eval.split_documents(self.docs)
+        distributions = {
+            simple: 0.5,
+            multi_context: 0.4,
+            reasoning: 0.1,
+        }
+        testset = self.test_generator.generate_with_langchain_docs(eval_documents, 20, distributions, is_async=False)
+        print("testset size: " + str(len(testset.to_pandas())))