YchKhan committed on
Commit 40b1456
1 Parent(s): a0ac111

Update split_files_to_excel.py

Files changed (1)
  1. split_files_to_excel.py +7 -7
split_files_to_excel.py CHANGED
@@ -347,7 +347,7 @@ def split_chunks_by_tokens_period(documents, max_length=170, overlap=10, min_chu
  # print(f"len(encoded):{len(encoded)}<min_chunk_size:{min_chunk_size}")
  continue
  elif skip_next:
- split_doc = Document(page_content=tokenizer.decode(encoded), metadata=doc.metadata.copy())
+ split_doc = Document(page_content=tokenizer.decode(encoded).replace('<s> ', ''), metadata=doc.metadata.copy())
  split_doc.metadata['token_length'] = len(tokenizer.encode(split_doc.page_content))
  resized.append(split_doc)
  # print(f"Added a document of {split_doc.metadata['token_length']} tokens 1")
@@ -371,7 +371,7 @@ def split_chunks_by_tokens_period(documents, max_length=170, overlap=10, min_chu
  # print("not len(remaining_encoded)>max_length + min_chunk_size")
  current_encoded = remaining_encoded #if the last chunk is to small, concatenate it with the previous one
  is_last_chunk = True
- split_doc = Document(page_content=tokenizer.decode(current_encoded), metadata=doc.metadata.copy())
+ split_doc = Document(page_content=tokenizer.decode(current_encoded).replace('<s> ', ''), metadata=doc.metadata.copy())
  split_doc.metadata['token_length'] = len(tokenizer.encode(split_doc.page_content))
  resized.append(split_doc)
  # print(f"Added a document of {split_doc.metadata['token_length']} tokens 2")
@@ -390,18 +390,18 @@ def split_chunks_by_tokens_period(documents, max_length=170, overlap=10, min_chu
  # print("not is_first_chunk", period_index_b)
  if period_index_b == -1:# Period not found in overlap
  # print(". not found in overlap")
- split_doc = Document(page_content=tokenizer.decode(current_encoded)[:period_index_e], metadata=doc.metadata.copy()) # Keep regular splitting
+ split_doc = Document(page_content=tokenizer.decode(current_encoded)[:period_index_e].replace('<s> ', ''), metadata=doc.metadata.copy()) # Keep regular splitting
  else:
  if is_last_chunk : #not the first but the last
  # print("is_last_chunk")
- split_doc = Document(page_content=tokenizer.decode(current_encoded)[period_index_b+1:], metadata=doc.metadata.copy())
+ split_doc = Document(page_content=tokenizer.decode(current_encoded)[period_index_b+1:].replace('<s> ', ''), metadata=doc.metadata.copy())
  #print("Should start after \".\"")
  else:
  # print("not is_last_chunk", period_index_e, len(to_encode))
- split_doc = Document(page_content=tokenizer.decode(current_encoded)[period_index_b+1:period_index_e], metadata=doc.metadata.copy()) # Split at the begining and the end
+ split_doc = Document(page_content=tokenizer.decode(current_encoded)[period_index_b+1:period_index_e].replace('<s> ', ''), metadata=doc.metadata.copy()) # Split at the begining and the end
  else:#first chunk
  # print("else")
- split_doc = Document(page_content=tokenizer.decode(current_encoded)[:period_index_e], metadata=doc.metadata.copy()) # split only at the end if its first chunk
+ split_doc = Document(page_content=tokenizer.decode(current_encoded)[:period_index_e].replace('<s> ', ''), metadata=doc.metadata.copy()) # split only at the end if its first chunk
  if 'titles' in split_doc.metadata:
  # print("title in metadata")
  chunk_counter += 1
@@ -425,7 +425,7 @@ def split_chunks_by_tokens_period(documents, max_length=170, overlap=10, min_chu
  chunk_counter += 1
  doc.metadata['chunk_id'] = chunk_counter
  doc.metadata['token_length'] = len(encoded)
- doc.page_content = tokenizer.decode(encoded)
+ doc.page_content = tokenizer.decode(encoded).replace('<s> ', '')
  resized.append(doc)
  print(f"Added a document of {doc.metadata['token_length']} tokens 4")
  print(f"Number of chunks before resplitting: {len(documents)} \nAfter splitting: {len(resized)}")