|
from typing import List |
|
from pytest import fixture |
|
from create_db import split_text |
|
|
|
|
|
@fixture
def sample_text():
    """Provide the input strings that ``test_split_text`` passes to ``split_text``.

    Returns:
        A two-element list of strings: one Lorem-ipsum paragraph followed by
        one shorter English passage.
    """
    # Adjacent string literals are concatenated at compile time, so the five
    # lines below form a single list element.
    lorem_paragraph = (
        "Lorem ipsum dolor sit amet, consectetur adipiscing elit. "
        "Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. "
        "Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. "
        "Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. "
        "Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum."
    )
    follow_up_passage = (
        "Another long text string to demonstrate the splitting functionality. This text should also be split into multiple chunks."
    )
    return [lorem_paragraph, follow_up_passage]
|
|
|
|
|
def test_split_text(sample_text):
    """Check structural properties of what ``split_text`` returns for ``sample_text``.

    The assertions run in this order:

    1. every chunk is a list whose elements are all strings;
    2. no chunk is empty;
    3. each chunk's joined length lies in the half-open range [1350, 1500);
    4. every piece of every chunk is a substring of the space-joined input;
    5. for each pair of neighbouring chunks, the elements they share join to
       either exactly 150 characters or to nothing at all.
    """
    max_chunk_chars = 1500
    overlap_chars = 150

    chunks = split_text(sample_text)

    # 1. Shape: a collection of lists that contain only strings.
    for chunk in chunks:
        assert isinstance(chunk, list)
        assert all(isinstance(piece, str) for piece in chunk)

    # 2. An empty list is falsy, so this rejects empty chunks.
    assert all(chunks)

    # 3. Joined size of each chunk must fall just under the maximum.
    # NOTE(review): the fixture text looks far shorter than 1350 characters;
    # confirm these bounds are reachable for this input.
    min_chunk_chars = max_chunk_chars - overlap_chars
    joined_lengths = [len("".join(chunk)) for chunk in chunks]
    assert all(
        min_chunk_chars <= length < max_chunk_chars for length in joined_lengths
    )

    # 4. Nothing may appear in a chunk that is not present in the input.
    source_text = " ".join(sample_text)
    for chunk in chunks:
        for piece in chunk:
            assert piece in source_text

    # 5. Compare each chunk with its successor.
    # NOTE(review): chunks are lists, so these slices select up to 150 list
    # elements (not characters), and the set intersection keeps only elements
    # that are equal in both chunks -- confirm this is the intended check.
    for earlier, later in zip(chunks, chunks[1:]):
        shared_pieces = set(earlier[-overlap_chars:]) & set(later[:overlap_chars])
        shared_length = len("".join(shared_pieces))
        assert shared_length == overlap_chars or shared_length == 0
|
|