parvezalmuqtadir committed on
Commit
db5a295
1 Parent(s): de7a2e9

Update pdfquery.py

Browse files
Files changed (1) hide show
  1. pdfquery.py +16 -12
pdfquery.py CHANGED
@@ -7,32 +7,36 @@ from langchain.chains.question_answering import load_qa_chain
7
  from langchain.llms import OpenAI
8
  from langchain.chat_models import ChatOpenAI
9
 
 
10
  class PDFQuery:
11
- def __init__(self, openai_api_key = None) -> None:
12
  self.embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
13
  os.environ["OPENAI_API_KEY"] = openai_api_key
14
- self.text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
15
- # self.llm = OpenAI(temperature=0, openai_api_key=openai_api_key)
16
- self.llm = ChatOpenAI(temperature=0, openai_api_key=openai_api_key)
17
  self.chain = None
18
  self.db = None
19
-
 
20
  def ask(self, question: str) -> str:
21
  if self.chain is None:
22
  response = "Please, add a document."
23
  else:
24
  docs = self.db.get_relevant_documents(question)
 
25
  response = self.chain.run(input_documents=docs, question=question)
26
  return response
27
 
 
 
28
  def ingest(self, file_path: os.PathLike) -> None:
29
- loader = PyPDFium2Loader(file_path)
30
- documents = loader.load()
31
- splitted_documents = self.text_splitter.split_documents(documents)
32
- self.db = Chroma.from_documents(splitted_documents, self.embeddings).as_retriever()
33
- # self.chain = load_qa_chain(OpenAI(temperature=0), chain_type="stuff")
34
- self.chain = load_qa_chain(ChatOpenAI(temperature=0), chain_type="stuff")
35
 
36
  def forget(self) -> None:
37
  self.db = None
38
- self.chain = None
 
7
  from langchain.llms import OpenAI
8
  from langchain.chat_models import ChatOpenAI
9
 
10
+
11
  class PDFQuery:
12
+ def __init__(self, openai_api_key=None) -> None:
13
  self.embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
14
  os.environ["OPENAI_API_KEY"] = openai_api_key
15
+ # Adjust chunk_size and chunk_overlap for better handling of large documents
16
+ self.text_splitter = RecursiveCharacterTextSplitter(chunk_size=5000, chunk_overlap=500)
17
+ self.llm = ChatOpenAI(model="gpt-4", temperature=0.5, openai_api_key=openai_api_key)
18
  self.chain = None
19
  self.db = None
20
+
21
+
22
  def ask(self, question: str) -> str:
23
  if self.chain is None:
24
  response = "Please, add a document."
25
  else:
26
  docs = self.db.get_relevant_documents(question)
27
+ # to better suit GPT-4's input format for optimal results.
28
  response = self.chain.run(input_documents=docs, question=question)
29
  return response
30
 
31
+
32
+
33
  def ingest(self, file_path: os.PathLike) -> None:
34
+ loader = PyPDFium2Loader(file_path)
35
+ documents = loader.load()
36
+ splitted_documents = self.text_splitter.split_documents(documents)
37
+ self.db = Chroma.from_documents(splitted_documents, self.embeddings).as_retriever()
38
+ self.chain = load_qa_chain(self.llm, chain_type="stuff")
 
39
 
40
  def forget(self) -> None:
41
  self.db = None
42
+ self.chain = None