# Source: Harry_Potter / chunker_final.py (Sonja-Subt)
# Commit c55e75f (verified): "Upload folder using huggingface_hub"
# Size: 953 bytes
def chunk_document_to_dict(doc: str, doc_name: str, desired_chunk_size: int = 400, max_chunk_size: int = 500) -> dict:
    """Split a document into line-aligned text chunks keyed by chunk id.

    Lines from ``doc.splitlines()`` are accumulated, each re-terminated with
    a single ``"\\n"``, until the buffer holds at least
    ``desired_chunk_size`` characters; the buffer is then emitted as a chunk.
    No chunk is ever longer than ``max_chunk_size`` characters: text beyond
    that limit is carried over into the following chunk(s) instead of being
    dropped, so concatenating the returned values in order reproduces every
    line of the input.

    Args:
        doc: Full text of the document.
        doc_name: Prefix for chunk ids; ids have the form
            ``f"{doc_name}_{n}"`` with ``n`` starting at 1.
        desired_chunk_size: Buffer length (in characters) at which a chunk
            is emitted.
        max_chunk_size: Hard upper bound on the length of any single chunk.

    Returns:
        A dict mapping chunk id to chunk text, in document order. Empty when
        ``doc`` contains no lines.

    Raises:
        ValueError: If ``max_chunk_size`` is smaller than 1.
    """
    if max_chunk_size < 1:
        # A non-positive limit could never make progress when splitting.
        raise ValueError("max_chunk_size must be a positive integer")

    chunks = {}
    chunk_number = 1
    pending = ''

    for line in doc.splitlines():
        pending += line + '\n'
        # Emit while the buffer is large enough. The loop (rather than a
        # single `if`) matters for over-long lines: whatever does not fit
        # into max_chunk_size stays in `pending` instead of being lost.
        # The `pending` guard prevents emitting empty chunks when
        # desired_chunk_size <= 0.
        while pending and len(pending) >= desired_chunk_size:
            chunks[f"{doc_name}_{chunk_number}"] = pending[:max_chunk_size]
            pending = pending[max_chunk_size:]
            chunk_number += 1

    # Put the leftover text into the final chunk(s), still honouring the
    # size limit (only relevant when desired_chunk_size > max_chunk_size).
    while pending:
        chunks[f"{doc_name}_{chunk_number}"] = pending[:max_chunk_size]
        pending = pending[max_chunk_size:]
        chunk_number += 1

    return chunks
def chunk_documents_to_dict(docs: dict, desired_chunk_size: int = 400, max_chunk_size: int = 500):
    """Chunk every document in ``docs`` and merge the results into one dict.

    Args:
        docs: Mapping of document name to document text.
        desired_chunk_size: Forwarded unchanged to ``chunk_document_to_dict``.
        max_chunk_size: Forwarded unchanged to ``chunk_document_to_dict``.

    Returns:
        A single dict holding the chunk-id -> chunk-text entries produced
        for each document, in the iteration order of ``docs``.
    """
    merged = {}
    for name, text in docs.items():
        merged.update(
            chunk_document_to_dict(
                text,
                name,
                desired_chunk_size,
                max_chunk_size,
            )
        )
    return merged