import config

from langchain.docstore.document import Document as LangchainDocument
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import AutoTokenizer
from tqdm.notebook import tqdm
from typing import List
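# `import config` above assumes a small local config module that holds the
# embedding model name. A hedged sketch of what it might contain (the model
# name below is an illustrative assumption, not from the source):
#
#   # config.py
#   EMBEDDING_MODEL_NAME = "thenlper/gte-small"  # hypothetical embedding model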
def split_documents(chunk_size: int, knowledge_base: List[LangchainDocument]) -> List[LangchainDocument]:
    """
    Split documents into chunks of maximum size `chunk_size` tokens and return a list of documents.
    """
    # Separators are tried in order, from coarsest (Markdown headings, code
    # fences, horizontal rules) to finest (single characters), so chunks
    # break at natural document boundaries whenever possible.
    MARKDOWN_SEPARATORS = [
        "\n#{1,6} ",
        "```\n",
        "\n\\*\\*\\*+\n",
        "\n---+\n",
        "\n___+\n",
        "\n\n",
        "\n",
        " ",
        "",
    ]
    # Measure chunk length in tokens of the embedding model rather than in
    # characters, so each chunk fits the model's context window. Overlap is
    # 10% of the chunk size to preserve context across chunk boundaries.
    text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
        AutoTokenizer.from_pretrained(config.EMBEDDING_MODEL_NAME),
        chunk_size=chunk_size,
        chunk_overlap=int(chunk_size / 10),
        add_start_index=True,
        strip_whitespace=True,
        separators=MARKDOWN_SEPARATORS,
    )

    docs_processed = []
    for doc in tqdm(knowledge_base):
        docs_processed += text_splitter.split_documents([doc])

    # Drop chunks with duplicate text so the index stores each passage once.
    unique_texts = {}
    docs_processed_unique = []
    for doc in docs_processed:
        if doc.page_content not in unique_texts:
            unique_texts[doc.page_content] = True
            docs_processed_unique.append(doc)

    return docs_processed_unique
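

if __name__ == "__main__":
    # Minimal usage sketch, not from the source: `sample_texts` is a stand-in
    # corpus and chunk_size=512 is an illustrative value; adjust both to your
    # data and embedding model.
    sample_texts = [
        "# Title\n\nSome Markdown body text about the first topic.",
        "Another document, plain text this time, about a second topic.",
    ]
    raw_docs = [LangchainDocument(page_content=text) for text in sample_texts]
    chunks = split_documents(chunk_size=512, knowledge_base=raw_docs)
    print(f"Produced {len(chunks)} unique chunks")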