import config
from langchain.docstore.document import Document as LangchainDocument
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import AutoTokenizer
from tqdm.notebook import tqdm
from typing import List


def split_documents(chunk_size: int, knowledge_base: List[LangchainDocument]) -> List[LangchainDocument]:
    """
    Split documents into chunks of maximum size `chunk_size` tokens and return a list of documents.
    """
    # Separators ordered from coarsest (markdown headings, code fences, horizontal rules)
    # to finest (whitespace), so chunks break at natural markdown boundaries first.
    MARKDOWN_SEPARATORS = [
        "\n#{1,6} ",
        "```\n",
        "\n\\*\\*\\*+\n",
        "\n---+\n",
        "\n___+\n",
        "\n\n",
        "\n",
        " ",
        "",
    ]

    text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
        AutoTokenizer.from_pretrained(config.EMBEDDING_MODEL_NAME),
        chunk_size=chunk_size,
        chunk_overlap=int(chunk_size / 10),
        add_start_index=True,
        strip_whitespace=True,
        separators=MARKDOWN_SEPARATORS,
    )

    docs_processed = []
    for doc in tqdm(knowledge_base):
        docs_processed += text_splitter.split_documents([doc])

    # Drop duplicate chunks, keeping the first occurrence of each page_content.
    unique_texts = {}
    docs_processed_unique = []
    for doc in docs_processed:
        if doc.page_content not in unique_texts:
            unique_texts[doc.page_content] = True
            docs_processed_unique.append(doc)

    return docs_processed_unique  # , docs_processed
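

# Minimal usage sketch (hypothetical, not part of the Space's code): it assumes
# config.EMBEDDING_MODEL_NAME names a tokenizer available on the Hugging Face Hub
# and that the knowledge base is built from raw markdown strings. The example
# document content and source paths below are placeholders for illustration only.
if __name__ == "__main__":
    raw_knowledge_base = [
        LangchainDocument(page_content=text, metadata={"source": source})
        for text, source in [
            ("# Title\n\nSome markdown body text...", "docs/intro.md"),
        ]
    ]

    # chunk_size=512 is an arbitrary example value; pick one suited to the embedding model.
    chunks = split_documents(chunk_size=512, knowledge_base=raw_knowledge_base)
    print(f"{len(chunks)} chunks produced")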