# Spaces:
# Runtime error
# Runtime error
"""Test text splitters that require an integration."""
import pytest | |
from langchain.text_splitter import CharacterTextSplitter, TokenTextSplitter | |
def test_huggingface_type_check() -> None:
    """Test that type checks are done properly on input.

    A plain string is passed where a tokenizer object is expected; the
    test passes only if ``from_huggingface_tokenizer`` raises
    ``ValueError`` for it.
    """
    with pytest.raises(ValueError):
        CharacterTextSplitter.from_huggingface_tokenizer("foo")
def test_huggingface_tokenizer() -> None:
    """Test text splitter that uses a HuggingFace tokenizer.

    Builds a ``CharacterTextSplitter`` from the pretrained ``gpt2``
    tokenizer with a single-space separator, a chunk size of 1 and no
    overlap, then checks that ``"foo bar"`` is split into its two words.
    """
    # Kept as a function-local import, as in the original test, so the
    # module itself can be imported without ``transformers``.
    from transformers import GPT2TokenizerFast

    tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
    text_splitter = CharacterTextSplitter.from_huggingface_tokenizer(
        tokenizer, separator=" ", chunk_size=1, chunk_overlap=0
    )
    output = text_splitter.split_text("foo bar")
    assert output == ["foo", "bar"]
class TestTokenTextSplitter:
    """Test token text splitter.

    Both tests split the same 30-character input (``"abcdef" * 5``) with
    a chunk size of 5 and differ only in the ``chunk_overlap`` setting.
    """

    def test_basic(self) -> None:
        """Test no overlap.

        With ``chunk_overlap=0`` the input is expected to come back as two
        consecutive, non-overlapping 15-character chunks.
        """
        splitter = TokenTextSplitter(chunk_size=5, chunk_overlap=0)
        output = splitter.split_text("abcdef" * 5)  # 10 token string
        expected_output = ["abcdefabcdefabc", "defabcdefabcdef"]
        assert output == expected_output

    def test_overlap(self) -> None:
        """Test with overlap.

        With ``chunk_overlap=1`` three chunks are expected; the final one
        is shorter because the input runs out.
        """
        splitter = TokenTextSplitter(chunk_size=5, chunk_overlap=1)
        output = splitter.split_text("abcdef" * 5)  # 10 token string
        # NOTE(review): these values are consistent with each chunk
        # starting on the last token of the previous one, assuming
        # three-character tokens -- confirm against the tokenizer in use.
        expected_output = ["abcdefabcdefabc", "abcdefabcdefabc", "abcdef"]
        assert output == expected_output