Spaces:
Running
Running
import os | |
from transformers import AutoTokenizer | |
import sys | |
sys.path.append("../..") | |
from configs import ( | |
CHUNK_SIZE, | |
OVERLAP_SIZE | |
) | |
from server.knowledge_base.utils import make_text_splitter | |
def text(splitter_name): | |
from langchain import document_loaders | |
# 使用DocumentLoader读取文件 | |
filepath = "../../knowledge_base/samples/content/test.txt" | |
loader = document_loaders.UnstructuredFileLoader(filepath, autodetect_encoding=True) | |
docs = loader.load() | |
text_splitter = make_text_splitter(splitter_name, CHUNK_SIZE, OVERLAP_SIZE) | |
if splitter_name == "MarkdownHeaderTextSplitter": | |
docs = text_splitter.split_text(docs[0].page_content) | |
for doc in docs: | |
if doc.metadata: | |
doc.metadata["source"] = os.path.basename(filepath) | |
else: | |
docs = text_splitter.split_documents(docs) | |
for doc in docs: | |
print(doc) | |
return docs | |
import pytest | |
from langchain.docstore.document import Document | |
def test_different_splitter(splitter_name): | |
try: | |
docs = text(splitter_name) | |
assert isinstance(docs, list) | |
if len(docs)>0: | |
assert isinstance(docs[0], Document) | |
except Exception as e: | |
pytest.fail(f"test_different_splitter failed with {splitter_name}, error: {str(e)}") | |