File size: 1,891 Bytes
e60c070
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
from typing import List
from pytest import fixture
from create_db import split_text


@fixture
def sample_text():
    """Provide the two sample strings fed to the text-splitting test.

    Returns a two-element list: a Lorem-ipsum paragraph assembled from
    adjacent string literals, followed by a shorter two-sentence string.
    """
    # Adjacent literals are concatenated by the parser into one string.
    lorem_paragraph = (
        "Lorem ipsum dolor sit amet, consectetur adipiscing elit. "
        "Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. "
        "Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. "
        "Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. "
        "Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum."
    )
    follow_up = "Another long text string to demonstrate the splitting functionality. This text should also be split into multiple chunks."
    return [lorem_paragraph, follow_up]


def test_split_text(sample_text):
    """Check structure, size, provenance and overlap of ``split_text`` output.

    Passes the ``sample_text`` fixture to ``split_text`` and asserts, in
    order, that:

    1. every chunk is a list containing only strings;
    2. no chunk is empty;
    3. the joined length of each chunk lies in ``[1350, 1500)``;
    4. every piece of every chunk occurs in the space-joined input;
    5. neighbouring chunks share either nothing or common pieces whose
       combined length is exactly 150.
    """
    chunk_size = 1500
    overlap_size = 150

    chunks = split_text(sample_text)

    # 1. Structure: a chunk is a list, and each of its items is a string.
    assert all(
        isinstance(pieces, list)
        and all(isinstance(piece, str) for piece in pieces)
        for pieces in chunks
    )

    # 2. Emptiness: a falsy (empty) chunk fails the test.
    assert all(chunks)

    # 3. Size: chunk size minus overlap is the lower bound, chunk size the
    #    exclusive upper bound.
    # NOTE(review): the sample_text fixture totals far fewer than 1350
    # characters; confirm split_text can produce chunks in this range.
    minimum_length = chunk_size - overlap_size
    assert all(
        minimum_length <= len(''.join(pieces)) < chunk_size
        for pieces in chunks
    )

    # 4. Provenance: each piece must be a substring of the joined input.
    joined_input = ' '.join(sample_text)
    for pieces in chunks:
        for piece in pieces:
            assert piece in joined_input

    # 5. Overlap between each chunk and its successor.
    # NOTE(review): the slices below take up to 150 list *items* (not
    # characters), and the set intersection ignores order and duplicates;
    # confirm this is the intended way to measure the overlap.
    for position in range(1, len(chunks)):
        tail_items = set(chunks[position - 1][-overlap_size:])
        head_items = set(chunks[position][:overlap_size])
        shared = ''.join(tail_items & head_items)
        assert not shared or len(shared) == overlap_size