alexkueck committed on
Commit
9915a53
1 Parent(s): f57497d

Update utils.py

Browse files
Files changed (1) hide show
  1. utils.py +11 -2
utils.py CHANGED
@@ -348,11 +348,20 @@ def document_loading_splitting():
348
  # Document splitting
349
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)# RecursiveCharacterTextSplitter(chunk_overlap = 150, chunk_size = 1500)
350
  splits = text_splitter.split_documents(preprocessed_docs)
 
 
 
 
 
 
 
 
351
  print("Splits...........................")
352
- for split in splits:
353
  if 'divis' in split.page_content:
354
  print("DIVIS found in chunk:", split)
355
- return splits
 
356
 
357
  ###########################################
358
  #Chroma DB die splits ablegen - vektorisiert...
 
348
  # Document splitting
349
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)# RecursiveCharacterTextSplitter(chunk_overlap = 150, chunk_size = 1500)
350
  splits = text_splitter.split_documents(preprocessed_docs)
351
+ # Split sowohl für originale als auch für vorverarbeitete Dokumente
352
+ original_splits = text_splitter.split_documents(docs)
353
+ preprocessed_splits = text_splitter.split_documents(preprocessed_docs)
354
+
355
+ # Mapping von vorverarbeiteten Splits zu Originalsplits
356
+ split_to_original_mapping = {p_split: o_split for p_split, o_split in zip(preprocessed_splits, original_splits)}
357
+
358
+
359
  print("Splits...........................")
360
+ for split in preprocessed_splits:
361
  if 'divis' in split.page_content:
362
  print("DIVIS found in chunk:", split)
363
+
364
+ return preprocessed_splits, split_to_original_mapping
365
 
366
  ###########################################
367
  #Chroma DB die splits ablegen - vektorisiert...