alexkueck committed on
Commit
f11e3cf
1 Parent(s): 6b4c237

Update utils.py

Browse files
Files changed (1) hide show
  1. utils.py +20 -8
utils.py CHANGED
@@ -302,6 +302,17 @@ def load_word_with_metadata(file_path):
302
 
303
  ################################################
304
  #die Inhalte splitten, um in Vektordatenbank entsprechend zu laden als Splits
 
 
 
 
 
 
 
 
 
 
 
305
  def document_loading_splitting():
306
  ##############################
307
  # Document loading
@@ -335,6 +346,7 @@ def document_loading_splitting():
335
  # Vorverarbeitung der Dokumente
336
  preprocessed_docs = []
337
  original_docs = []
 
338
  for doc in docs:
339
  doc_id = str(uuid.uuid4()) # Erzeuge eine eindeutige ID
340
  preprocessed_content = preprocess_text(doc.page_content)
@@ -342,7 +354,8 @@ def document_loading_splitting():
342
  preprocessed_metadata = {
343
  "title": preprocessed_title,
344
  "page": doc.metadata["page"],
345
- "path": doc.metadata["path"]
 
346
  }
347
  preprocessed_doc = Document(content=preprocessed_content, title=preprocessed_metadata["title"], page=preprocessed_metadata["page"], path=preprocessed_metadata["path"], doc_id=doc_id)
348
  original_doc = Document(content=doc.page_content, title=doc.metadata["title"], page=doc.metadata["page"], path=doc.metadata["path"], doc_id=doc_id)
@@ -354,12 +367,13 @@ def document_loading_splitting():
354
  # Document splitting
355
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)# RecursiveCharacterTextSplitter(chunk_overlap = 150, chunk_size = 1500)
356
  splits = text_splitter.split_documents(preprocessed_docs)
 
357
  # Split sowohl für originale als auch für vorverarbeitete Dokumente
358
- original_splits = text_splitter.split_documents(original_docs)
359
- preprocessed_splits = text_splitter.split_documents(preprocessed_docs)
360
 
361
  # Mapping von vorverarbeiteten Splits zu Originalsplits anhand der IDs
362
- split_to_original_mapping = {p_split.id: o_split for p_split, o_split in zip(preprocessed_splits, original_splits)}
363
 
364
 
365
  print("Splits...........................")
@@ -720,11 +734,9 @@ class Document:
720
  self.metadata = {
721
  "title": title,
722
  "page": page,
723
- "path": path
 
724
  }
725
- self.doc_id = doc_id
726
-
727
-
728
 
729
 
730
  def is_stop_word_or_prefix(s: str, stop_words: list) -> bool:
 
302
 
303
  ################################################
304
  #die Inhalte splitten, um in Vektordatenbank entsprechend zu laden als Splits
305
+ # Funktion zum Splitten und Zuweisen der doc_id
306
+ def split_documents_with_id(docs):
307
+ splits = []
308
+ for doc in docs:
309
+ doc_splits = text_splitter.split_text(doc.page_content)
310
+ for split_content in doc_splits:
311
+ split_doc = Document(content=split_content, title=doc.metadata["title"], page=doc.metadata["page"], path=doc.metadata["path"], doc_id=doc.doc_id)
312
+ splits.append(split_doc)
313
+ return splits
314
+
315
+ #finally die Splits erzeugen und laden.....
316
  def document_loading_splitting():
317
  ##############################
318
  # Document loading
 
346
  # Vorverarbeitung der Dokumente
347
  preprocessed_docs = []
348
  original_docs = []
349
+
350
  for doc in docs:
351
  doc_id = str(uuid.uuid4()) # Erzeuge eine eindeutige ID
352
  preprocessed_content = preprocess_text(doc.page_content)
 
354
  preprocessed_metadata = {
355
  "title": preprocessed_title,
356
  "page": doc.metadata["page"],
357
+ "path": doc.metadata["path"],
358
+ "doc_id": doc_id # Füge die ID in die Metadaten ein
359
  }
360
  preprocessed_doc = Document(content=preprocessed_content, title=preprocessed_metadata["title"], page=preprocessed_metadata["page"], path=preprocessed_metadata["path"], doc_id=doc_id)
361
  original_doc = Document(content=doc.page_content, title=doc.metadata["title"], page=doc.metadata["page"], path=doc.metadata["path"], doc_id=doc_id)
 
367
  # Document splitting
368
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)# RecursiveCharacterTextSplitter(chunk_overlap = 150, chunk_size = 1500)
369
  splits = text_splitter.split_documents(preprocessed_docs)
370
+
371
  # Split sowohl für originale als auch für vorverarbeitete Dokumente
372
+ original_splits = split_documents_with_id(original_docs)
373
+ preprocessed_splits = split_documents_with_id(preprocessed_docs)
374
 
375
  # Mapping von vorverarbeiteten Splits zu Originalsplits anhand der IDs
376
+ split_to_original_mapping = {p_split.metadata["doc_id"]: o_split for p_split, o_split in zip(preprocessed_splits, original_splits)}
377
 
378
 
379
  print("Splits...........................")
 
734
  self.metadata = {
735
  "title": title,
736
  "page": page,
737
+ "path": path,
738
+ "doc_id": doc_id # Füge die ID in die Metadaten ein
739
  }
 
 
 
740
 
741
 
742
  def is_stop_word_or_prefix(s: str, stop_words: list) -> bool: