Thomas (Tom) Gardos committed on
Commit
166d2a9
·
2 Parent(s): ccfbb8c 1052297

Merge pull request #50 from DL4DS/text_extraction

Browse files
code/main.py CHANGED
@@ -67,16 +67,19 @@ class Chatbot:
67
  async def setup_llm(self):
68
  """
69
  Set up the LLM with the provided settings. Update the configuration and initialize the LLM tutor.
 
 
70
  """
71
  start_time = time.time()
72
 
73
  llm_settings = cl.user_session.get("llm_settings", {})
74
- chat_profile, retriever_method, memory_window, llm_style, generate_follow_up = (
75
  llm_settings.get("chat_model"),
76
  llm_settings.get("retriever_method"),
77
  llm_settings.get("memory_window"),
78
  llm_settings.get("llm_style"),
79
  llm_settings.get("follow_up_questions"),
 
80
  )
81
 
82
  chain = cl.user_session.get("chain")
@@ -96,6 +99,7 @@ class Chatbot:
96
  self.config["llm_params"]["llm_style"] = llm_style
97
  self.config["llm_params"]["llm_loader"] = chat_profile
98
  self.config["llm_params"]["generate_follow_up"] = generate_follow_up
 
99
 
100
  self.llm_tutor.update_llm(
101
  old_config, self.config
@@ -173,6 +177,12 @@ class Chatbot:
173
  label="Stream response",
174
  initial=config["llm_params"]["stream"],
175
  ),
 
 
 
 
 
 
176
  cl.input_widget.Switch(
177
  id="follow_up_questions",
178
  label="Generate follow up questions",
 
67
  async def setup_llm(self):
68
  """
69
  Set up the LLM with the provided settings. Update the configuration and initialize the LLM tutor.
70
+
71
+ #TODO: Clean this up.
72
  """
73
  start_time = time.time()
74
 
75
  llm_settings = cl.user_session.get("llm_settings", {})
76
+ chat_profile, retriever_method, memory_window, llm_style, generate_follow_up, chunking_mode = (
77
  llm_settings.get("chat_model"),
78
  llm_settings.get("retriever_method"),
79
  llm_settings.get("memory_window"),
80
  llm_settings.get("llm_style"),
81
  llm_settings.get("follow_up_questions"),
82
+ llm_settings.get("chunking_mode"),
83
  )
84
 
85
  chain = cl.user_session.get("chain")
 
99
  self.config["llm_params"]["llm_style"] = llm_style
100
  self.config["llm_params"]["llm_loader"] = chat_profile
101
  self.config["llm_params"]["generate_follow_up"] = generate_follow_up
102
+ self.config["splitter_options"]["chunking_mode"] = chunking_mode
103
 
104
  self.llm_tutor.update_llm(
105
  old_config, self.config
 
177
  label="Stream response",
178
  initial=config["llm_params"]["stream"],
179
  ),
180
+ cl.input_widget.Select(
181
+ id="chunking_mode",
182
+ label="Chunking mode",
183
+ values=['fixed', 'semantic'],
184
+ initial_index=1,
185
+ ),
186
  cl.input_widget.Switch(
187
  id="follow_up_questions",
188
  label="Generate follow up questions",
code/modules/config/config.yml CHANGED
@@ -39,6 +39,7 @@ llm_params:
39
  filename: 'tinyllama-1.1b-chat-v1.0.Q5_0.gguf' # Specific name of gguf file in the repo
40
  pdf_reader: 'pymupdf' # str [llama, pymupdf, gpt]
41
  stream: False # bool
 
42
 
43
  chat_logging:
44
  log_chat: True # bool
@@ -50,6 +51,7 @@ splitter_options:
50
  split_by_token : True # bool
51
  remove_leftover_delimiters: True # bool
52
  remove_chunks: False # bool
 
53
  chunk_size : 300 # int
54
  chunk_overlap : 30 # int
55
  chunk_separators : ["\n\n", "\n", " ", ""] # list of strings
 
39
  filename: 'tinyllama-1.1b-chat-v1.0.Q5_0.gguf' # Specific name of gguf file in the repo
40
  pdf_reader: 'pymupdf' # str [llama, pymupdf, gpt]
41
  stream: False # bool
42
+ pdf_reader: 'gpt' # str [llama, pymupdf, gpt]
43
 
44
  chat_logging:
45
  log_chat: True # bool
 
51
  split_by_token : True # bool
52
  remove_leftover_delimiters: True # bool
53
  remove_chunks: False # bool
54
+ chunking_mode: 'semantic' # str [fixed, semantic]
55
  chunk_size : 300 # int
56
  chunk_overlap : 30 # int
57
  chunk_separators : ["\n\n", "\n", " ", ""] # list of strings
code/modules/dataloader/data_loader.py CHANGED
@@ -14,6 +14,8 @@ from llama_parse import LlamaParse
14
  from langchain.schema import Document
15
  import logging
16
  from langchain.text_splitter import RecursiveCharacterTextSplitter
 
 
17
  from ragatouille import RAGPretrainedModel
18
  from langchain.chains import LLMChain
19
  from langchain_community.llms import OpenAI
@@ -63,12 +65,11 @@ class HTMLReader:
63
  href = href.replace("http", "https")
64
 
65
  absolute_url = urljoin(base_url, href)
66
- link["href"] = absolute_url
67
 
68
  resp = requests.head(absolute_url)
69
  if resp.status_code != 200:
70
- logger.warning(f"Link {absolute_url} is broken")
71
- logger.warning(f"Status code: {resp.status_code}")
72
 
73
  return str(soup)
74
 
@@ -84,7 +85,6 @@ class HTMLReader:
84
  else:
85
  return None
86
 
87
-
88
  class FileReader:
89
  def __init__(self, logger, kind):
90
  self.logger = logger
@@ -96,9 +96,7 @@ class FileReader:
96
  else:
97
  self.pdf_reader = PDFReader()
98
  self.web_reader = HTMLReader()
99
- self.logger.info(
100
- f"Initialized FileReader with {kind} PDF reader and HTML reader"
101
- )
102
 
103
  def extract_text_from_pdf(self, pdf_path):
104
  text = ""
@@ -156,21 +154,31 @@ class ChunkProcessor:
156
  self.document_metadata = {}
157
  self.document_chunks_full = []
158
 
 
 
 
159
  if config["splitter_options"]["use_splitter"]:
160
- if config["splitter_options"]["split_by_token"]:
161
- self.splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
162
- chunk_size=config["splitter_options"]["chunk_size"],
163
- chunk_overlap=config["splitter_options"]["chunk_overlap"],
164
- separators=config["splitter_options"]["chunk_separators"],
165
- disallowed_special=(),
166
- )
 
 
 
 
 
 
 
 
167
  else:
168
- self.splitter = RecursiveCharacterTextSplitter(
169
- chunk_size=config["splitter_options"]["chunk_size"],
170
- chunk_overlap=config["splitter_options"]["chunk_overlap"],
171
- separators=config["splitter_options"]["chunk_separators"],
172
- disallowed_special=(),
173
  )
 
174
  else:
175
  self.splitter = None
176
  self.logger.info("ChunkProcessor instance created")
@@ -193,16 +201,12 @@ class ChunkProcessor:
193
  def process_chunks(
194
  self, documents, file_type="txt", source="", page=0, metadata={}
195
  ):
 
196
  documents = [Document(page_content=documents, source=source, page=page)]
197
- if (
198
- file_type == "txt"
199
- or file_type == "docx"
200
- or file_type == "srt"
201
- or file_type == "tex"
202
- ):
203
  document_chunks = self.splitter.split_documents(documents)
204
- elif file_type == "pdf":
205
- document_chunks = documents # Full page for now
206
 
207
  # add the source and page number back to the metadata
208
  for chunk in document_chunks:
@@ -296,9 +300,6 @@ class ChunkProcessor:
296
  def process_file(self, file_path, file_index, file_reader, addl_metadata):
297
  file_name = os.path.basename(file_path)
298
 
299
- if file_name in self.document_data:
300
- return
301
-
302
  file_type = file_name.split(".")[-1]
303
 
304
  read_methods = {
@@ -313,7 +314,12 @@ class ChunkProcessor:
313
  return
314
 
315
  try:
316
- documents = read_methods[file_type](file_path)
 
 
 
 
 
317
 
318
  self.process_documents(
319
  documents, file_path, file_type, "file", addl_metadata
@@ -372,13 +378,14 @@ class ChunkProcessor:
372
  f"{self.config['log_chunk_dir']}/metadata/doc_metadata.json", "r"
373
  ) as json_file:
374
  self.document_metadata = json.load(json_file)
 
 
 
375
 
376
 
377
  class DataLoader:
378
  def __init__(self, config, logger=None):
379
- self.file_reader = FileReader(
380
- logger=logger, kind=config["llm_params"]["pdf_reader"]
381
- )
382
  self.chunk_processor = ChunkProcessor(config, logger=logger)
383
 
384
  def get_chunks(self, uploaded_files, weblinks):
@@ -396,22 +403,19 @@ if __name__ == "__main__":
396
  with open("../code/modules/config/config.yml", "r") as f:
397
  config = yaml.safe_load(f)
398
 
399
- STORAGE_DIR = os.path.join(BASE_DIR, config["vectorstore"]["data_path"])
400
  uploaded_files = [
401
- os.path.join(STORAGE_DIR, file)
402
- for file in os.listdir(STORAGE_DIR)
403
- if file != "urls.txt"
404
  ]
405
 
406
  data_loader = DataLoader(config, logger=logger)
407
  document_chunks, document_names, documents, document_metadata = (
408
  data_loader.get_chunks(
409
- [
410
- "https://dl4ds.github.io/sp2024/static_files/discussion_slides/00_discussion.pdf"
411
- ],
412
  [],
413
  )
414
  )
415
 
416
  print(document_names[:5])
417
  print(len(document_chunks))
 
 
14
  from langchain.schema import Document
15
  import logging
16
  from langchain.text_splitter import RecursiveCharacterTextSplitter
17
+ from langchain_experimental.text_splitter import SemanticChunker
18
+ from langchain_openai.embeddings import OpenAIEmbeddings
19
  from ragatouille import RAGPretrainedModel
20
  from langchain.chains import LLMChain
21
  from langchain_community.llms import OpenAI
 
65
  href = href.replace("http", "https")
66
 
67
  absolute_url = urljoin(base_url, href)
68
+ link['href'] = absolute_url
69
 
70
  resp = requests.head(absolute_url)
71
  if resp.status_code != 200:
72
+ logger.warning(f"Link {absolute_url} is broken. Status code: {resp.status_code}")
 
73
 
74
  return str(soup)
75
 
 
85
  else:
86
  return None
87
 
 
88
  class FileReader:
89
  def __init__(self, logger, kind):
90
  self.logger = logger
 
96
  else:
97
  self.pdf_reader = PDFReader()
98
  self.web_reader = HTMLReader()
99
+ self.logger.info(f"Initialized FileReader with {kind} PDF reader and HTML reader")
 
 
100
 
101
  def extract_text_from_pdf(self, pdf_path):
102
  text = ""
 
154
  self.document_metadata = {}
155
  self.document_chunks_full = []
156
 
157
+ if not config['vectorstore']['embedd_files']:
158
+ self.load_document_data()
159
+
160
  if config["splitter_options"]["use_splitter"]:
161
+ if config["splitter_options"]["chunking_mode"] == "fixed":
162
+ if config["splitter_options"]["split_by_token"]:
163
+ self.splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
164
+ chunk_size=config["splitter_options"]["chunk_size"],
165
+ chunk_overlap=config["splitter_options"]["chunk_overlap"],
166
+ separators=config["splitter_options"]["chunk_separators"],
167
+ disallowed_special=(),
168
+ )
169
+ else:
170
+ self.splitter = RecursiveCharacterTextSplitter(
171
+ chunk_size=config["splitter_options"]["chunk_size"],
172
+ chunk_overlap=config["splitter_options"]["chunk_overlap"],
173
+ separators=config["splitter_options"]["chunk_separators"],
174
+ disallowed_special=(),
175
+ )
176
  else:
177
+ self.splitter = SemanticChunker(
178
+ OpenAIEmbeddings(),
179
+ breakpoint_threshold_type="percentile"
 
 
180
  )
181
+
182
  else:
183
  self.splitter = None
184
  self.logger.info("ChunkProcessor instance created")
 
201
  def process_chunks(
202
  self, documents, file_type="txt", source="", page=0, metadata={}
203
  ):
204
+ # TODO: Clear up this pipeline of re-adding metadata
205
  documents = [Document(page_content=documents, source=source, page=page)]
206
+ if file_type == "pdf" and self.config["splitter_options"]["chunking_mode"] == "fixed":
207
+ document_chunks = documents
208
+ else:
 
 
 
209
  document_chunks = self.splitter.split_documents(documents)
 
 
210
 
211
  # add the source and page number back to the metadata
212
  for chunk in document_chunks:
 
300
  def process_file(self, file_path, file_index, file_reader, addl_metadata):
301
  file_name = os.path.basename(file_path)
302
 
 
 
 
303
  file_type = file_name.split(".")[-1]
304
 
305
  read_methods = {
 
314
  return
315
 
316
  try:
317
+
318
+ if file_path in self.document_data:
319
+ self.logger.warning(f"File {file_name} already processed")
320
+ documents = [Document(page_content=content) for content in self.document_data[file_path].values()]
321
+ else:
322
+ documents = read_methods[file_type](file_path)
323
 
324
  self.process_documents(
325
  documents, file_path, file_type, "file", addl_metadata
 
378
  f"{self.config['log_chunk_dir']}/metadata/doc_metadata.json", "r"
379
  ) as json_file:
380
  self.document_metadata = json.load(json_file)
381
+ self.logger.info(
382
+ f"Loaded document content from {self.config['log_chunk_dir']}/docs/doc_content.json. Total documents: {len(self.document_data)}"
383
+ )
384
 
385
 
386
  class DataLoader:
387
  def __init__(self, config, logger=None):
388
+ self.file_reader = FileReader(logger=logger, kind=config["llm_params"]["pdf_reader"])
 
 
389
  self.chunk_processor = ChunkProcessor(config, logger=logger)
390
 
391
  def get_chunks(self, uploaded_files, weblinks):
 
403
  with open("../code/modules/config/config.yml", "r") as f:
404
  config = yaml.safe_load(f)
405
 
406
+ STORAGE_DIR = os.path.join(BASE_DIR, config['vectorstore']["data_path"])
407
  uploaded_files = [
408
+ os.path.join(STORAGE_DIR, file) for file in os.listdir(STORAGE_DIR) if file != "urls.txt"
 
 
409
  ]
410
 
411
  data_loader = DataLoader(config, logger=logger)
412
  document_chunks, document_names, documents, document_metadata = (
413
  data_loader.get_chunks(
414
+ ["https://dl4ds.github.io/sp2024/static_files/lectures/05_loss_functions_v2.pdf"],
 
 
415
  [],
416
  )
417
  )
418
 
419
  print(document_names[:5])
420
  print(len(document_chunks))
421
+
code/modules/dataloader/pdf_readers/gpt.py CHANGED
@@ -23,7 +23,7 @@ class GPTParser:
23
  The goal is to extract the text, images and equations from the slides and convert everything to markdown format. Some of the equations may be complicated.
24
  The markdown should be clean and easy to read, and any math equation should be converted to LaTeX, between $$.
25
  For images, give a description and if you can, a source. Separate each page with '---'.
26
- Just respond with the markdown.
27
  """
28
 
29
  def parse(self, pdf_path):
 
23
  The goal is to extract the text, images and equations from the slides and convert everything to markdown format. Some of the equations may be complicated.
24
  The markdown should be clean and easy to read, and any math equation should be converted to LaTeX, between $$.
25
  For images, give a description and if you can, a source. Separate each page with '---'.
26
+ Just respond with the markdown. Do not include page numbers or any other metadata. Do not try to provide titles. Strictly the content.
27
  """
28
 
29
  def parse(self, pdf_path):
code/modules/vectorstore/faiss.py CHANGED
@@ -14,6 +14,10 @@ class FaissVectorStore(VectorStoreBase):
14
  def __init__(self, config):
15
  self.config = config
16
  self._init_vector_db()
 
 
 
 
17
 
18
  def _init_vector_db(self):
19
  self.faiss = FAISS(
@@ -25,24 +29,12 @@ class FaissVectorStore(VectorStoreBase):
25
  documents=document_chunks, embedding=embedding_model
26
  )
27
  self.vectorstore.save_local(
28
- os.path.join(
29
- self.config["vectorstore"]["db_path"],
30
- "db_"
31
- + self.config["vectorstore"]["db_option"]
32
- + "_"
33
- + self.config["vectorstore"]["model"],
34
- )
35
  )
36
 
37
  def load_database(self, embedding_model):
38
  self.vectorstore = self.faiss.load_local(
39
- os.path.join(
40
- self.config["vectorstore"]["db_path"],
41
- "db_"
42
- + self.config["vectorstore"]["db_option"]
43
- + "_"
44
- + self.config["vectorstore"]["model"],
45
- ),
46
  embedding_model,
47
  allow_dangerous_deserialization=True,
48
  )
 
14
  def __init__(self, config):
15
  self.config = config
16
  self._init_vector_db()
17
+ self.local_path = os.path.join(self.config["vectorstore"]["db_path"],
18
+ "db_" + self.config["vectorstore"]["db_option"]
19
+ + "_" + self.config["vectorstore"]["model"]
20
+ + "_" + config["splitter_options"]["chunking_mode"])
21
 
22
  def _init_vector_db(self):
23
  self.faiss = FAISS(
 
29
  documents=document_chunks, embedding=embedding_model
30
  )
31
  self.vectorstore.save_local(
32
+ self.local_path
 
 
 
 
 
 
33
  )
34
 
35
  def load_database(self, embedding_model):
36
  self.vectorstore = self.faiss.load_local(
37
+ self.local_path,
 
 
 
 
 
 
38
  embedding_model,
39
  allow_dangerous_deserialization=True,
40
  )