alexkueck committed on
Commit
5f98ae1
1 Parent(s): 4f55e88

Update utils.py

Browse files
Files changed (1) hide show
  1. utils.py +26 -1
utils.py CHANGED
@@ -244,6 +244,31 @@ def create_directory_loader(file_type, directory_path):
244
  ################################################
245
  # Custom loader functions for the DirectoryLoader
246
  # Custom loader functions
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
247
  def load_pdf_with_metadata(file_path):
248
  document = fitz.open(file_path)
249
  documents = []
@@ -269,7 +294,7 @@ def load_word_with_metadata(file_path):
269
  content = para.text
270
  contents.append({"page_content": content, "metadata": {**metadata, "page": 1}})
271
  return contents
272
-
273
 
274
 
275
  ################################################
 
244
  ################################################
245
  # Custom loader functions for the DirectoryLoader
246
  # Custom loader functions
247
def load_pdf_with_metadata(file_path):
    """Load a PDF and return one ``Document`` per page.

    Args:
        file_path: Path of the PDF file. It is passed to ``fitz.open`` and
            stored on every returned document as ``path``.

    Returns:
        A list of ``Document`` objects in page order. Each one carries the
        page's plain text, the PDF title and the 1-based page number.
    """
    # The context manager closes the PDF handle even if text extraction
    # raises; the original never closed it.
    with fitz.open(file_path) as pdf:
        # The title is identical for every page, so read it once. PyMuPDF
        # keeps the "title" key present with an empty/None value when the PDF
        # has no title (and ``metadata`` itself may be None), so a plain
        # ``.get("title", default)`` would never fall back to the default.
        title = (pdf.metadata or {}).get("title") or "Unbekannt"
        return [
            Document(
                content=page.get_text("text"),
                title=title,
                page=page_number,
                path=file_path,
            )
            for page_number, page in enumerate(pdf, start=1)
        ]
257
+
258
def load_word_with_metadata(file_path):
    """Load a Word file and return one ``Document`` per paragraph.

    Args:
        file_path: Path of the Word file. It is passed to ``docx.Document``
            and stored on every returned document as ``path``.

    Returns:
        A list of ``Document`` objects, one for each paragraph of the file
        in order. Every entry uses the fixed title "Dokument" and page 1.
    """
    word_file = docx.Document(file_path)
    # Word documents have no page numbers in this context, so every
    # paragraph is reported as page 1.
    return [
        Document(content=paragraph.text, title="Dokument", page=1, path=file_path)
        for paragraph in word_file.paragraphs
    ]
268
+
269
+
270
+ """
271
+ # Custom loader functions
272
  def load_pdf_with_metadata(file_path):
273
  document = fitz.open(file_path)
274
  documents = []
 
294
  content = para.text
295
  contents.append({"page_content": content, "metadata": {**metadata, "page": 1}})
296
  return contents
297
+ """
298
 
299
 
300
  ################################################