Browse files- +62 -14
@@ -29,8 +29,8 @@ import requests
29 |
import json
30 |
31 |
MODEL = "thenlper/gte-base"
32 |
33 |
34 |
35 |
embeddings = HuggingFaceEmbeddings(
36 |
@@ -323,15 +323,41 @@ def split_chunks_by_tokens_period(documents, max_length=170, overlap=10, min_chu
323 |
# Create an empty list to store the resized documents
324 |
resized = []
325 |
326 |
# Iterate through the original documents list
327 |
for doc in documents:
328 |
current_file = doc.metadata['source']
329 |
if current_file != previous_file: #chunk counting
330 |
previous_file = current_file
331 |
chunk_counter = 0
332 |
is_first_chunk = True # Keep track of the first chunk in the document
333 |
334 |
if len(encoded) > max_length:
335 |
remaining_encoded = encoded
336 |
is_last_chunk = False
337 |
while len(remaining_encoded) > 1 and not is_last_chunk:
@@ -339,47 +365,69 @@ def split_chunks_by_tokens_period(documents, max_length=170, overlap=10, min_chu
339 |
overlap_text = tokenizer.decode(remaining_encoded[:overlap])# Index by token
340 |
period_index_b = overlap_text.find('.')# Index by character
341 |
if len(remaining_encoded)>max_length + min_chunk_size:
342 |
current_encoded = remaining_encoded[:max(10, max_length)]
343 |
344 |
345 |
is_last_chunk = True
346 |
347 |
if len(remaining_encoded)>max_length+min_chunk_size:# If it is not the last sub chunk
348 |
overlap_text_last = tokenizer.decode(current_encoded[-overlap:])
349 |
period_index_last = overlap_text_last.find('.')
350 |
if period_index_last != -1 and period_index_last < len(overlap_text_last) - 1:
351 |
#print(f"period index last found at {period_index_last}")
352 |
period_index_e = period_index_last - len(overlap_text_last)
353 |
#print(f"period_index_e :{period_index_e}")
354 |
#print(f"last :{overlap_text_last}")
355 |
if not is_first_chunk:#starting after the period in overlap
356 |
if period_index_b == -1:# Period not found in overlap
357 |
#print(". not found in overlap")
358 |
split_doc = Document(page_content=tokenizer.decode(current_encoded)[:period_index_e], metadata=doc.metadata.copy()) # Keep regular splitting
359 |
360 |
if is_last_chunk : #not the first but the last
361 |
split_doc = Document(page_content=tokenizer.decode(current_encoded)[period_index_b+1:], metadata=doc.metadata.copy())
362 |
#print("Should start after \".\"")
363 |
364 |
split_doc = Document(page_content=tokenizer.decode(current_encoded)[period_index_b+1:period_index_e], metadata=doc.metadata.copy()) # Split at the beginning and the end
365 |
else:#first chunk
366 |
split_doc = Document(page_content=tokenizer.decode(current_encoded)[:period_index_e], metadata=doc.metadata.copy()) # split only at the end if it is the first chunk
367 |
if 'titles' in split_doc.metadata:
368 |
chunk_counter += 1
369 |
split_doc.metadata['chunk_id'] = chunk_counter
370 |
#A1 We could approximate the chunk length in tokens by ignoring the '.' position in the overlap, which would save computation time
371 |
split_doc.metadata['token_length'] = len(tokenizer.encode(split_doc.page_content))
372 |
373 |
remaining_encoded = remaining_encoded[max(10, max_length - overlap):]
374 |
is_first_chunk = False
375 |
#print(len(tokenizer.encode(split_doc.page_content)), split_doc.page_content, "\n-----------------")
376 |
377 |
#print(f"◀Document:{{ {doc.page_content} }} was not added because to short▶")
378 |
if 'titles' in doc.metadata:#check if it was splitted by or split_docx
379 |
chunk_counter += 1
380 |
doc.metadata['chunk_id'] = chunk_counter
381 |
382 |
383 |
print(f"Number of chunks before resplitting: {len(documents)} \nAfter splitting: {len(resized)}")
384 |
return resized
385 |
29 |
import json
30 |
31 |
MODEL = "thenlper/gte-base"
32 |
33 |
34 |
35 |
embeddings = HuggingFaceEmbeddings(
36 |
323 |
# Create an empty list to store the resized documents
324 |
resized = []
325 |
326 |
to_encode = ""
327 |
skip_next = False
328 |
# Iterate through the original documents list
329 |
for i, doc in enumerate(documents):
330 |
if skip_next:
331 |
skip_next = False
332 |
333 |
current_file = doc.metadata['source']
334 |
if current_file != previous_file: #chunk counting
335 |
previous_file = current_file
336 |
chunk_counter = 0
337 |
is_first_chunk = True # Keep track of the first chunk in the document
338 |
to_encode += doc.page_content
339 |
# If the last chunk is shorter than min_chunk_size, append it to the previous chunk before splitting.
340 |
if (documents[i+1] is documents[-1] or documents[i+1].metadata['source'] != documents[i+2].metadata['source']) and len(tokenizer.encode(documents[i+1].page_content)) < min_chunk_size: # if the next doc is the last doc of the current file or the last of the corpus
341 |
# print('SAME DOC')
342 |
skip_next = True
343 |
to_encode += documents[i+1].page_content
344 |
345 |
encoded = tokenizer.encode(to_encode)#encode the current document
346 |
if len(encoded) < min_chunk_size and not skip_next:
347 |
# print(f"len(encoded):{len(encoded)}<min_chunk_size:{min_chunk_size}")
348 |
349 |
elif skip_next:
350 |
split_doc = Document(page_content=tokenizer.decode(encoded), metadata=doc.metadata.copy())
351 |
split_doc.metadata['token_length'] = len(tokenizer.encode(split_doc.page_content))
352 |
353 |
# print(f"Added a document of {split_doc.metadata['token_length']} tokens 1")
354 |
to_encode = ""
355 |
356 |
357 |
# print(f"len(encoded):{len(encoded)}>=min_chunk_size:{min_chunk_size}")
358 |
to_encode = ""
359 |
if len(encoded) > max_length:
360 |
# print(f"len(encoded):{len(encoded)}>=max_length:{max_length}")
361 |
remaining_encoded = encoded
362 |
is_last_chunk = False
363 |
while len(remaining_encoded) > 1 and not is_last_chunk:
365 |
overlap_text = tokenizer.decode(remaining_encoded[:overlap])# Index by token
366 |
period_index_b = overlap_text.find('.')# Index by character
367 |
if len(remaining_encoded)>max_length + min_chunk_size:
368 |
# print("len(remaining_encoded)>max_length + min_chunk_size")
369 |
current_encoded = remaining_encoded[:max(10, max_length)]
370 |
371 |
# print("not len(remaining_encoded)>max_length + min_chunk_size")
372 |
current_encoded = remaining_encoded # if the last chunk is too small, concatenate it with the previous one
373 |
is_last_chunk = True
374 |
split_doc = Document(page_content=tokenizer.decode(current_encoded), metadata=doc.metadata.copy())
375 |
split_doc.metadata['token_length'] = len(tokenizer.encode(split_doc.page_content))
376 |
377 |
# print(f"Added a document of {split_doc.metadata['token_length']} tokens 2")
378 |
379 |
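period_index_e = -1 # default slice end when no period is found; meant to be a number of characters greater than or equal to the max length of a chunk (could have used len(tokenizer.decode(current_encoded))). NOTE(review): as a slice end, -1 drops the last character of the chunk - confirm this is intended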
380 |
if len(remaining_encoded)>max_length+min_chunk_size:# If it is not the last sub chunk
381 |
# print("len(remaining_encoded)>max_length+min_chunk_size")
382 |
overlap_text_last = tokenizer.decode(current_encoded[-overlap:])
383 |
period_index_last = overlap_text_last.find('.')
384 |
if period_index_last != -1 and period_index_last < len(overlap_text_last) - 1:
385 |
# print(f"period index last found at {period_index_last}")
386 |
period_index_e = period_index_last - len(overlap_text_last)
387 |
# print(f"period_index_e :{period_index_e}")
388 |
# print(f"last :{overlap_text_last}")
389 |
if not is_first_chunk:#starting after the period in overlap
390 |
# print("not is_first_chunk", period_index_b)
391 |
if period_index_b == -1:# Period not found in overlap
392 |
# print(". not found in overlap")
393 |
split_doc = Document(page_content=tokenizer.decode(current_encoded)[:period_index_e], metadata=doc.metadata.copy()) # Keep regular splitting
394 |
395 |
if is_last_chunk : #not the first but the last
396 |
# print("is_last_chunk")
397 |
split_doc = Document(page_content=tokenizer.decode(current_encoded)[period_index_b+1:], metadata=doc.metadata.copy())
398 |
#print("Should start after \".\"")
399 |
400 |
# print("not is_last_chunk", period_index_e, len(to_encode))
401 |
split_doc = Document(page_content=tokenizer.decode(current_encoded)[period_index_b+1:period_index_e], metadata=doc.metadata.copy()) # Split at the beginning and the end
402 |
else:#first chunk
403 |
# print("else")
404 |
split_doc = Document(page_content=tokenizer.decode(current_encoded)[:period_index_e], metadata=doc.metadata.copy()) # split only at the end if it is the first chunk
405 |
if 'titles' in split_doc.metadata:
406 |
# print("title in metadata")
407 |
chunk_counter += 1
408 |
split_doc.metadata['chunk_id'] = chunk_counter
409 |
#A1 We could approximate the chunk length in tokens by ignoring the '.' position in the overlap, which would save computation time
410 |
split_doc.metadata['token_length'] = len(tokenizer.encode(split_doc.page_content))
411 |
412 |
print(f"Added a document of {split_doc.metadata['token_length']} tokens 3")
413 |
remaining_encoded = remaining_encoded[max(10, max_length - overlap):]
414 |
is_first_chunk = False
415 |
# # print(len(tokenizer.encode(split_doc.page_content)), split_doc.page_content[:50], "\n-----------------")
416 |
# print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
417 |
# print(split_doc.page_content[:100])
418 |
# # print("😂😂😂😂")
419 |
# print(split_doc.page_content[-100:])
420 |
# print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
421 |
else:# len(encoded)>min_chunk_size:#ignore the chunks that are too small
422 |
print(f"found a chunk with the perfect size:{len(encoded)}")
423 |
#print(f"◀Document:{{ {doc.page_content} }} was not added because to short▶")
424 |
if 'titles' in doc.metadata:#check if it was splitted by or split_docx
425 |
chunk_counter += 1
426 |
doc.metadata['chunk_id'] = chunk_counter
427 |
doc.metadata['token_length'] = len(encoded)
428 |
doc.page_content = tokenizer.decode(encoded)
429 |
430 |
print(f"Added a document of {doc.metadata['token_length']} tokens 4")
431 |
print(f"Number of chunks before resplitting: {len(documents)} \nAfter splitting: {len(resized)}")
432 |
return resized
433 |