YchKhan committed on
Commit 40b1456
1 Parent(s): a0ac111

Update split_files_to_excel.py

Files changed (1)
  1. split_files_to_excel.py +7 -7
split_files_to_excel.py CHANGED
@@ -347,7 +347,7 @@ def split_chunks_by_tokens_period(documents, max_length=170, overlap=10, min_chu
  # print(f"len(encoded):{len(encoded)}<min_chunk_size:{min_chunk_size}")
  continue
  elif skip_next:
- split_doc = Document(page_content=tokenizer.decode(encoded), metadata=doc.metadata.copy())
+ split_doc = Document(page_content=tokenizer.decode(encoded).replace('<s> ', ''), metadata=doc.metadata.copy())
  split_doc.metadata['token_length'] = len(tokenizer.encode(split_doc.page_content))
  resized.append(split_doc)
  # print(f"Added a document of {split_doc.metadata['token_length']} tokens 1")
@@ -371,7 +371,7 @@ def split_chunks_by_tokens_period(documents, max_length=170, overlap=10, min_chu
  # print("not len(remaining_encoded)>max_length + min_chunk_size")
  current_encoded = remaining_encoded #if the last chunk is to small, concatenate it with the previous one
  is_last_chunk = True
- split_doc = Document(page_content=tokenizer.decode(current_encoded), metadata=doc.metadata.copy())
+ split_doc = Document(page_content=tokenizer.decode(current_encoded).replace('<s> ', ''), metadata=doc.metadata.copy())
  split_doc.metadata['token_length'] = len(tokenizer.encode(split_doc.page_content))
  resized.append(split_doc)
  # print(f"Added a document of {split_doc.metadata['token_length']} tokens 2")
@@ -390,18 +390,18 @@ def split_chunks_by_tokens_period(documents, max_length=170, overlap=10, min_chu
  # print("not is_first_chunk", period_index_b)
  if period_index_b == -1:# Period not found in overlap
  # print(". not found in overlap")
- split_doc = Document(page_content=tokenizer.decode(current_encoded)[:period_index_e], metadata=doc.metadata.copy()) # Keep regular splitting
+ split_doc = Document(page_content=tokenizer.decode(current_encoded)[:period_index_e].replace('<s> ', ''), metadata=doc.metadata.copy()) # Keep regular splitting
  else:
  if is_last_chunk : #not the first but the last
  # print("is_last_chunk")
- split_doc = Document(page_content=tokenizer.decode(current_encoded)[period_index_b+1:], metadata=doc.metadata.copy())
+ split_doc = Document(page_content=tokenizer.decode(current_encoded)[period_index_b+1:].replace('<s> ', ''), metadata=doc.metadata.copy())
  #print("Should start after \".\"")
  else:
  # print("not is_last_chunk", period_index_e, len(to_encode))
- split_doc = Document(page_content=tokenizer.decode(current_encoded)[period_index_b+1:period_index_e], metadata=doc.metadata.copy()) # Split at the begining and the end
+ split_doc = Document(page_content=tokenizer.decode(current_encoded)[period_index_b+1:period_index_e].replace('<s> ', ''), metadata=doc.metadata.copy()) # Split at the begining and the end
  else:#first chunk
  # print("else")
- split_doc = Document(page_content=tokenizer.decode(current_encoded)[:period_index_e], metadata=doc.metadata.copy()) # split only at the end if its first chunk
+ split_doc = Document(page_content=tokenizer.decode(current_encoded)[:period_index_e].replace('<s> ', ''), metadata=doc.metadata.copy()) # split only at the end if its first chunk
  if 'titles' in split_doc.metadata:
  # print("title in metadata")
  chunk_counter += 1
@@ -425,7 +425,7 @@ def split_chunks_by_tokens_period(documents, max_length=170, overlap=10, min_chu
  chunk_counter += 1
  doc.metadata['chunk_id'] = chunk_counter
  doc.metadata['token_length'] = len(encoded)
- doc.page_content = tokenizer.decode(encoded)
+ doc.page_content = tokenizer.decode(encoded).replace('<s> ', '')
  resized.append(doc)
  print(f"Added a document of {doc.metadata['token_length']} tokens 4")
  print(f"Number of chunks before resplitting: {len(documents)} \nAfter splitting: {len(resized)}")