YchKhan committed on
Commit
9767141
1 Parent(s): 9ea18b7

Update split_files_to_excel.py

Browse files
Files changed (1) hide show
  1. split_files_to_excel.py +7 -4
split_files_to_excel.py CHANGED
@@ -359,10 +359,13 @@ def split_chunks_by_tokens_period(documents, max_length=170, overlap=10, min_chu
359
  is_first_chunk = True # Keep track of the first chunk in the document
360
  to_encode += doc.page_content
361
  # if last chunk < min_chunk_size we add it to the previous chunk for the splitting.
362
- if (documents[i+1] is documents[-1] or documents[i+1].metadata['source'] != documents[i+2].metadata['source']) and len(tokenizer.encode(documents[i+1].page_content)) < min_chunk_size: # if the next doc is the last doc of the current file or the last of the corpus
363
- # print('SAME DOC')
364
- skip_next = True
365
- to_encode += documents[i+1].page_content
 
 
 
366
  #print(f"to_encode:\n{to_encode}")
367
  encoded = tokenizer.encode(to_encode)#encode the current document
368
  if len(encoded) < min_chunk_size and not skip_next:
 
359
  is_first_chunk = True # Keep track of the first chunk in the document
360
  to_encode += doc.page_content
361
  # if last chunk < min_chunk_size we add it to the previous chunk for the splitting.
362
+ try:
363
+ if (documents[i+1] is documents[-1] or documents[i+1].metadata['source'] != documents[i+2].metadata['source']) and len(tokenizer.encode(documents[i+1].page_content)) < min_chunk_size: # if the next doc is the last doc of the current file or the last of the corpus
364
+ # print('SAME DOC')
365
+ skip_next = True
366
+ to_encode += documents[i+1].page_content
367
+ except Exception as e:
368
+ print(e)
369
  #print(f"to_encode:\n{to_encode}")
370
  encoded = tokenizer.encode(to_encode)#encode the current document
371
  if len(encoded) < min_chunk_size and not skip_next: