alexkueck committed on
Commit
e34b246
1 Parent(s): 686cd9f

Update utils.py

Browse files
Files changed (1) hide show
  1. utils.py +15 -1
utils.py CHANGED
@@ -329,10 +329,24 @@ def document_loading_splitting():
329
  # Load YouTube
330
  #loader = GenericLoader(YoutubeAudioLoader([YOUTUBE_URL_1,YOUTUBE_URL_2], PATH_WORK + YOUTUBE_DIR), OpenAIWhisperParser())
331
  #docs.extend(loader.load())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
332
  ################################
333
  # Document splitting
334
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)# RecursiveCharacterTextSplitter(chunk_overlap = 150, chunk_size = 1500)
335
- splits = text_splitter.split_documents(docs)
336
  print("Splits...........................")
337
  for split in splits:
338
  if 'DIVIS' in split.page_content:
 
329
  # Load YouTube
330
  #loader = GenericLoader(YoutubeAudioLoader([YOUTUBE_URL_1,YOUTUBE_URL_2], PATH_WORK + YOUTUBE_DIR), OpenAIWhisperParser())
331
  #docs.extend(loader.load())
332
+
333
+
334
+ # Vorverarbeitung der Dokumente - passend zu dem der Prompts...
335
+ preprocessed_docs = []
336
+ for doc in docs:
337
+ preprocessed_content = preprocess_text(doc.page_content)
338
+ preprocessed_title = preprocess_text(doc.metadata["title"])
339
+ preprocessed_metadata = {
340
+ "title": preprocessed_title,
341
+ "page": doc.metadata["page"],
342
+ "path": doc.metadata["path"]
343
+ }
344
+ preprocessed_docs.append(Document(metadata=preprocessed_metadata, page_content=preprocessed_content))
345
+
346
  ################################
347
  # Document splitting
348
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)# RecursiveCharacterTextSplitter(chunk_overlap = 150, chunk_size = 1500)
349
+ splits = text_splitter.split_documents(preprocessed_docs)
350
  print("Splits...........................")
351
  for split in splits:
352
  if 'DIVIS' in split.page_content: