Update split_files_to_excel.py
split_files_to_excel.py  CHANGED  (+62 -14)
@@ -29,8 +29,8 @@ import requests
 import json
 
 MODEL = "thenlper/gte-base"
-CHUNK_SIZE =
-CHUNK_OVERLAP =
+CHUNK_SIZE = 1500
+CHUNK_OVERLAP = 400
 
 embeddings = HuggingFaceEmbeddings(
     model_name=MODEL,
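CHUNK_SIZE and CHUNK_OVERLAP are set to 1500 and 400 by this commit. As an illustrative sketch only (the splitter that actually consumes these constants elsewhere in split_files_to_excel.py is not shown in this diff), this is how such constants are commonly wired into LangChain's RecursiveCharacterTextSplitter:

# Illustrative sketch; RecursiveCharacterTextSplitter is an assumption here,
# not necessarily the splitter that split_files_to_excel.py actually uses.
from langchain.text_splitter import RecursiveCharacterTextSplitter

CHUNK_SIZE = 1500     # value introduced by this commit
CHUNK_OVERLAP = 400   # value introduced by this commit

splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
chunks = splitter.split_text("some very long document text " * 200)
print(len(chunks), len(chunks[0]))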
@@ -323,15 +323,41 @@ def split_chunks_by_tokens_period(documents, max_length=170, overlap=10, min_chu
     # Create an empty list to store the resized documents
     resized = []
     previous_file=""
+    to_encode = ""
+    skip_next = False
     # Iterate through the original documents list
-    for doc in documents:
+    for i, doc in enumerate(documents):
+        if skip_next:
+            skip_next = False
+            continue
         current_file = doc.metadata['source']
         if current_file != previous_file: #chunk counting
             previous_file = current_file
             chunk_counter = 0
             is_first_chunk = True # Keep track of the first chunk in the document
-        encoded = tokenizer.encode(doc.page_content)
+        to_encode += doc.page_content
+        # if the last chunk is < min_chunk_size, we add it to the previous chunk before splitting.
+        if (documents[i+1] is documents[-1] or documents[i+1].metadata['source'] != documents[i+2].metadata['source']) and len(tokenizer.encode(documents[i+1].page_content)) < min_chunk_size: # if the next doc is the last doc of the current file or the last of the corpus
+            # print('SAME DOC')
+            skip_next = True
+            to_encode += documents[i+1].page_content
+        #print(f"to_encode:\n{to_encode}")
+        encoded = tokenizer.encode(to_encode)#encode the current document
+        if len(encoded) < min_chunk_size and not skip_next:
+            # print(f"len(encoded):{len(encoded)}<min_chunk_size:{min_chunk_size}")
+            continue
+        elif skip_next:
+            split_doc = Document(page_content=tokenizer.decode(encoded), metadata=doc.metadata.copy())
+            split_doc.metadata['token_length'] = len(tokenizer.encode(split_doc.page_content))
+            resized.append(split_doc)
+            # print(f"Added a document of {split_doc.metadata['token_length']} tokens 1")
+            to_encode = ""
+            continue
+        else:
+            # print(f"len(encoded):{len(encoded)}>=min_chunk_size:{min_chunk_size}")
+            to_encode = ""
         if len(encoded) > max_length:
+            # print(f"len(encoded):{len(encoded)}>=max_length:{max_length}")
             remaining_encoded = encoded
             is_last_chunk = False
             while len(remaining_encoded) > 1 and not is_last_chunk:
@@ -339,47 +365,69 @@
                 overlap_text = tokenizer.decode(remaining_encoded[:overlap])# Index by token
                 period_index_b = overlap_text.find('.')# Index by character
                 if len(remaining_encoded)>max_length + min_chunk_size:
+                    # print("len(remaining_encoded)>max_length + min_chunk_size")
                     current_encoded = remaining_encoded[:max(10, max_length)]
                 else:
-                    current_encoded = remaining_encoded
+                    # print("not len(remaining_encoded)>max_length + min_chunk_size")
+                    current_encoded = remaining_encoded #if the last chunk is too small, concatenate it with the previous one
                     is_last_chunk = True
+                    split_doc = Document(page_content=tokenizer.decode(current_encoded), metadata=doc.metadata.copy())
+                    split_doc.metadata['token_length'] = len(tokenizer.encode(split_doc.page_content))
+                    resized.append(split_doc)
+                    # print(f"Added a document of {split_doc.metadata['token_length']} tokens 2")
+                    break
+                period_index_e = -1 # an amount of characters that I am sure will be greater than or equal to the max length of a chunk; could have done len(tokenizer.decode(current_encoded))
                 if len(remaining_encoded)>max_length+min_chunk_size:# If it is not the last sub chunk
+                    # print("len(remaining_encoded)>max_length+min_chunk_size")
                     overlap_text_last = tokenizer.decode(current_encoded[-overlap:])
                     period_index_last = overlap_text_last.find('.')
                     if period_index_last != -1 and period_index_last < len(overlap_text_last) - 1:
-                        #print(f"period index last found at {period_index_last}")
-                        period_index_e = period_index_last - len(overlap_text_last)
-                        #print(f"period_index_e :{period_index_e}")
-                        #print(f"last :{overlap_text_last}")
+                        # print(f"period index last found at {period_index_last}")
+                        period_index_e = period_index_last - len(overlap_text_last)
+                        # print(f"period_index_e :{period_index_e}")
+                        # print(f"last :{overlap_text_last}")
                 if not is_first_chunk:#starting after the period in overlap
+                    # print("not is_first_chunk", period_index_b)
                     if period_index_b == -1:# Period not found in overlap
-                        #print(". not found in overlap")
+                        # print(". not found in overlap")
                         split_doc = Document(page_content=tokenizer.decode(current_encoded)[:period_index_e], metadata=doc.metadata.copy()) # Keep regular splitting
                     else:
                         if is_last_chunk : #not the first but the last
+                            # print("is_last_chunk")
                            split_doc = Document(page_content=tokenizer.decode(current_encoded)[period_index_b+1:], metadata=doc.metadata.copy())
                             #print("Should start after \".\"")
                         else:
+                            # print("not is_last_chunk", period_index_e, len(to_encode))
                            split_doc = Document(page_content=tokenizer.decode(current_encoded)[period_index_b+1:period_index_e], metadata=doc.metadata.copy()) # Split at the beginning and the end
                 else:#first chunk
+                    # print("else")
                     split_doc = Document(page_content=tokenizer.decode(current_encoded)[:period_index_e], metadata=doc.metadata.copy()) # split only at the end if it's the first chunk
                 if 'titles' in split_doc.metadata:
+                    # print("title in metadata")
                     chunk_counter += 1
                     split_doc.metadata['chunk_id'] = chunk_counter
                 #A1 We could round chunk length in tokens if we ignore the '.' position in the overlap and save computation time
                 split_doc.metadata['token_length'] = len(tokenizer.encode(split_doc.page_content))
                 resized.append(split_doc)
+                print(f"Added a document of {split_doc.metadata['token_length']} tokens 3")
                 remaining_encoded = remaining_encoded[max(10, max_length - overlap):]
                 is_first_chunk = False
-                #print(len(tokenizer.encode(split_doc.page_content)), split_doc.page_content, "\n-----------------")
+                # # print(len(tokenizer.encode(split_doc.page_content)), split_doc.page_content[:50], "\n-----------------")
+                # print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
+                # print(split_doc.page_content[:100])
+                # # print("😂😂😂😂")
+                # print(split_doc.page_content[-100:])
+                # print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
+        else:# len(encoded)>min_chunk_size:#ignore the chunks that are too small
+            print(f"found a chunk with the perfect size:{len(encoded)}")
             #print(f"◀Document:{{ {doc.page_content} }} was not added because too short▶")
             if 'titles' in doc.metadata:#check if it was split by or split_docx
                 chunk_counter += 1
                 doc.metadata['chunk_id'] = chunk_counter
+            doc.metadata['token_length'] = len(encoded)
+            doc.page_content = tokenizer.decode(encoded)
             resized.append(doc)
+            print(f"Added a document of {doc.metadata['token_length']} tokens 4")
     print(f"Number of chunks before resplitting: {len(documents)} \nAfter splitting: {len(resized)}")
     return resized
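Taken together, the reworked split_chunks_by_tokens_period accumulates each document's text into to_encode, merges a trailing piece shorter than min_chunk_size into the chunk before it via skip_next, resplits anything longer than max_length on token boundaries while trying to cut at the nearest period inside the overlap window, and records chunk_id and token_length in the chunk metadata. Below is a minimal usage sketch, not part of the commit: the Document class, the tokenizer, the file names, the texts, and min_chunk_size=20 are stand-ins for illustration, not values taken from the repository.

# Minimal sketch with hypothetical inputs; Document, tokenizer, and min_chunk_size
# here are assumptions standing in for what split_files_to_excel.py already defines.
from langchain.docstore.document import Document
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("thenlper/gte-base")

docs = [
    Document(page_content="A long first section about chunking. " * 60,
             metadata={"source": "report.docx", "titles": "Intro"}),
    Document(page_content="A long middle section about overlaps. " * 60,
             metadata={"source": "report.docx", "titles": "Body"}),
    Document(page_content="A short closing note.",
             metadata={"source": "report.docx", "titles": "Outro"}),
]

resized = split_chunks_by_tokens_period(docs, max_length=170, overlap=10, min_chunk_size=20)
for d in resized:
    print(d.metadata.get("chunk_id"), d.metadata["token_length"], repr(d.page_content[:40]))

As written in this revision, the merge path (elif skip_next) appends the combined text as a single chunk without re-entering the max_length splitting loop, so a merged tail can end up longer than max_length.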