langchain-chatchat / tests /custom_splitter /test_different_splitter.py
Zulelee's picture
Upload 254 files
5e9cd1d verified
raw
history blame
No virus
1.64 kB
import os
from transformers import AutoTokenizer
import sys
sys.path.append("../..")
from configs import (
CHUNK_SIZE,
OVERLAP_SIZE
)
from server.knowledge_base.utils import make_text_splitter
def text(splitter_name):
from langchain import document_loaders
# 使用DocumentLoader读取文件
filepath = "../../knowledge_base/samples/content/test.txt"
loader = document_loaders.UnstructuredFileLoader(filepath, autodetect_encoding=True)
docs = loader.load()
text_splitter = make_text_splitter(splitter_name, CHUNK_SIZE, OVERLAP_SIZE)
if splitter_name == "MarkdownHeaderTextSplitter":
docs = text_splitter.split_text(docs[0].page_content)
for doc in docs:
if doc.metadata:
doc.metadata["source"] = os.path.basename(filepath)
else:
docs = text_splitter.split_documents(docs)
for doc in docs:
print(doc)
return docs
import pytest
from langchain.docstore.document import Document
@pytest.mark.parametrize("splitter_name",
[
"ChineseRecursiveTextSplitter",
"SpacyTextSplitter",
"RecursiveCharacterTextSplitter",
"MarkdownHeaderTextSplitter"
])
def test_different_splitter(splitter_name):
try:
docs = text(splitter_name)
assert isinstance(docs, list)
if len(docs)>0:
assert isinstance(docs[0], Document)
except Exception as e:
pytest.fail(f"test_different_splitter failed with {splitter_name}, error: {str(e)}")