File size: 3,961 Bytes
58d33f0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
"""Test functionality related to combining documents."""

from typing import Any, List, Tuple

import pytest

from langchain.chains.combine_documents.map_reduce import (
    _collapse_docs,
    _split_list_of_docs,
)
from langchain.docstore.document import Document


def _fake_docs_len_func(docs: List[Document]) -> int:
    """Return the character count of the documents' concatenated contents."""
    combined_text, _ = _fake_combine_docs_func(docs)
    return len(combined_text)


def _fake_combine_docs_func(docs: List[Document], **kwargs: Any) -> Tuple[str, dict]:
    """Concatenate every document's page content with no separator.

    Extra keyword arguments are accepted and ignored. The second element of
    the returned tuple is always an empty dict.
    """
    combined_text = "".join(doc.page_content for doc in docs)
    return combined_text, {}


def test__split_list_long_single_doc() -> None:
    """Check that one document longer than the limit raises ``ValueError``."""
    # 300 characters of content against a limit of 100.
    oversized_doc = Document(page_content="foo" * 100)
    with pytest.raises(ValueError):
        _split_list_of_docs([oversized_doc], _fake_docs_len_func, 100)


def test__split_list_long_pair_doc() -> None:
    """Check that two medium docs that cannot be paired raise ``ValueError``."""
    # Each document is 90 characters: under the limit of 100 on its own,
    # but 180 once the two are combined.
    medium_doc = Document(page_content="foo" * 30)
    pair = [medium_doc, medium_doc]
    with pytest.raises(ValueError):
        _split_list_of_docs(pair, _fake_docs_len_func, 100)


def test__split_list_single_doc() -> None:
    """Check that a single short document comes back as one group."""
    single = [Document(page_content="foo")]
    assert _split_list_of_docs(single, _fake_docs_len_func, 100) == [single]


def test__split_list_double_doc() -> None:
    """Check that two short documents stay together in one group."""
    pair = [Document(page_content=text) for text in ("foo", "bar")]
    groups = _split_list_of_docs(pair, _fake_docs_len_func, 100)
    assert groups == [pair]


def test__split_list_works_correctly() -> None:
    """Check the grouping of six documents under a limit of 10."""
    contents = ["foo", "bar", "baz", "foo" * 2, "bar", "baz"]
    docs = [Document(page_content=text) for text in contents]

    doc_list = _split_list_of_docs(docs, _fake_docs_len_func, 10)

    expected_contents = [
        # A group of three.
        ["foo", "bar", "baz"],
        # A group of two, where one is bigger.
        ["foo" * 2, "bar"],
        # No errors on the last, left-over document.
        ["baz"],
    ]
    # Build fresh Document instances so the comparison relies on equality,
    # not on object identity with the inputs.
    expected_result = [
        [Document(page_content=text) for text in group]
        for group in expected_contents
    ]
    assert doc_list == expected_result


def test__collapse_docs_no_metadata() -> None:
    """Check collapsing documents that carry no metadata."""
    docs = [Document(page_content=text) for text in ("foo", "bar", "baz")]
    collapsed = _collapse_docs(docs, _fake_combine_docs_func)
    assert collapsed == Document(page_content="foobarbaz")


def test__collapse_docs_one_doc() -> None:
    """Check that collapsing a single document yields an equal document."""
    cases = (
        # Without metadata.
        Document(page_content="foo"),
        # With metadata.
        Document(page_content="foo", metadata={"source": "a"}),
    )
    for doc in cases:
        collapsed = _collapse_docs([doc], _fake_combine_docs_func)
        assert collapsed == doc


def test__collapse_docs_metadata() -> None:
    """Check how metadata is merged when two documents are collapsed.

    The expected result joins the values of keys present in both documents
    as ``", "``-separated strings, and keeps keys present in only one
    document as they are.
    """
    first = Document(
        page_content="foo",
        metadata={"source": "a", "foo": 2, "bar": "1", "extra1": "foo"},
    )
    second = Document(
        page_content="bar",
        metadata={"source": "b", "foo": "3", "bar": 2, "extra2": "bar"},
    )

    collapsed = _collapse_docs([first, second], _fake_combine_docs_func)

    expected_output = Document(
        page_content="foobar",
        metadata={
            "source": "a, b",
            "foo": "2, 3",
            "bar": "1, 2",
            "extra1": "foo",
            "extra2": "bar",
        },
    )
    assert collapsed == expected_output