SwatGarg committed on
Commit
10bbba1
1 Parent(s): 36196b5

Update retriever.py

Browse files
Files changed (1) hide show
  1. retriever.py +32 -0
retriever.py CHANGED
@@ -14,6 +14,38 @@ import os
14
  curr_dir = os.getcwd()
15
  db_path = 'chroma_db_v2'
16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  def process_pdf_document(file_path, parent_chunk_size=2000, child_chunk_size=500):
18
  '''
19
  Process a PDF document and return the documents and text splitters
 
14
  curr_dir = os.getcwd()
15
  db_path = 'chroma_db_v2'
16
 
17
class QuestionRetriever:
    '''
    Find the stored question that is most similar to a user's query.

    Documents are read from a text file, split into small chunks, embedded
    and indexed in a Chroma vector store; queries are answered with a
    similarity search over that store.
    '''

    def load_documents(self, file_name):
        '''
        Load a text file from the "data" folder of the current working directory.

        Parameters:
            file_name: name of the file inside "<cwd>/data".

        Returns:
            Whatever TextLoader(file_path).load() returns for that file.
        '''
        file_path = os.path.join(os.getcwd(), "data", file_name)
        return TextLoader(file_path).load()

    def store_data_in_vector_db(self, documents):
        '''
        Split the documents into chunks and index them in a new Chroma store.

        Parameters:
            documents: documents as returned by load_documents.

        Returns:
            The Chroma store built from the chunks.
        '''
        # Split on newlines with a small chunk size and no overlap.
        text_splitter = CharacterTextSplitter(
            chunk_size=100, chunk_overlap=0, separator="\n"
        )
        chunks = text_splitter.split_documents(documents)
        # Open-source sentence-transformers embedding model.
        embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
        # No persist directory is passed here.
        return Chroma.from_documents(chunks, embedding_function)

    def get_response(self, user_query, documents=None):
        '''
        Return the stored question most similar to user_query.

        If the best match is exactly the query itself, the second-best match
        is returned instead, so the caller gets a different question back.

        Parameters:
            user_query: the question text to search for.
            documents: documents to index and search. When omitted, a
                module-level variable named `documents` is used if it
                exists (this is what the previous implementation relied on).

        Returns:
            The first line of the best matching chunk, or None when the
            search yields no usable match.

        Raises:
            ValueError: if no documents are passed and no module-level
                `documents` variable is defined.
        '''
        if documents is None:
            # Backward compatibility: the earlier body referenced a bare
            # `documents` name, which only resolves as a module global.
            documents = globals().get("documents")
            if documents is None:
                raise ValueError(
                    "No documents available: pass `documents` to get_response()."
                )

        # NOTE: the index is rebuilt on every call, as before.
        db = self.store_data_in_vector_db(documents)
        results = db.similarity_search(user_query)

        # Only the first line of each of the two best hits is relevant.
        candidates = [doc.page_content.split("\n")[0] for doc in results[:2]]
        if not candidates:
            return None

        most_similar_question = candidates[0]
        if user_query == most_similar_question:
            # The top hit is the query itself; fall back to the runner-up.
            if len(candidates) < 2:
                return None
            most_similar_question = candidates[1]

        print(most_similar_question)
        return most_similar_question
48
+
49
  def process_pdf_document(file_path, parent_chunk_size=2000, child_chunk_size=500):
50
  '''
51
  Process a PDF document and return the documents and text splitters