File size: 3,022 Bytes
4623a33
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import os
import pytest
from langchain_core.documents import Document
from RAG_BOT.pdf_processor import load_pdf

# Directory holding the Hindi sample PDFs, resolved relative to this file:
# one level up from this file's directory, then into data/hindi.
# (Assumes this test file lives in RAG_BOT/tests/integration/ so that the
# result points at RAG_BOT/tests/data/hindi/.)
TEST_DATA_DIR = os.path.join(
    os.path.dirname(__file__),
    os.pardir,
    'data',
    'hindi',
)

# Helper function to check if a path is a valid file
def check_pdf_exists(pdf_name):
    """
    Build the full path of a sample PDF and make sure it is present.

    Joins ``pdf_name`` onto ``TEST_DATA_DIR``. If nothing at that path is a
    regular file, the calling test is skipped via ``pytest.skip``; otherwise
    the joined path is returned.
    """
    pdf_file = os.path.join(TEST_DATA_DIR, pdf_name)
    is_present = os.path.isfile(pdf_file)
    if not is_present:
        # Missing fixture data should skip the test rather than fail it.
        pytest.skip(f"Test PDF not found: {pdf_file}")
    return pdf_file

@pytest.fixture
def sakar_pdf_path():
    """Path to the sample Sakar Murli PDF; skips the test if the file is missing."""
    sample_name = "01.03.14-h.pdf"
    return check_pdf_exists(sample_name)

@pytest.fixture
def avyakt_pdf_path():
    """Path to the sample Avyakt Murli PDF; skips the test if the file is missing."""
    sample_name = "03. AV-H-07.01.1980.pdf"
    return check_pdf_exists(sample_name)

@pytest.fixture
def multi_date_header_pdf_path():
    """Path to the sample PDF used for the multiple-header-dates test; skips the test if missing."""
    sample_name = "FHM - 17-11-2013 (AM Revised - 31-12-1996).pdf"
    return check_pdf_exists(sample_name)

def test_load_pdf_sakar_murli(sakar_pdf_path):
    """
    Verify the metadata load_pdf produces for a standard Sakar Murli PDF.

    Every returned Document must carry the date 2014-03-01 and must not
    have an ``is_avyakt`` key in its metadata.
    """
    loaded = load_pdf(sakar_pdf_path)

    # Shape of the result: a non-empty list made up solely of Documents.
    assert isinstance(loaded, list)
    assert len(loaded) >= 1
    for item in loaded:
        assert isinstance(item, Document)

    # Per-document metadata expectations.
    wanted_date = "2014-03-01"
    for item in loaded:
        meta = item.metadata
        assert "date" in meta
        assert meta["date"] == wanted_date
        assert "is_avyakt" not in meta

def test_load_pdf_avyakt_murli(avyakt_pdf_path):
    """
    Verify the metadata load_pdf produces for a standard Avyakt Murli PDF.

    Every returned Document must carry the date 1980-01-07 and have
    ``is_avyakt`` set to exactly ``True`` in its metadata.
    """
    loaded = load_pdf(avyakt_pdf_path)

    # Shape of the result: a non-empty list made up solely of Documents.
    assert isinstance(loaded, list)
    assert len(loaded) >= 1
    for item in loaded:
        assert isinstance(item, Document)

    # Per-document metadata expectations.
    wanted_date = "1980-01-07"
    for item in loaded:
        meta = item.metadata
        assert "date" in meta
        assert meta["date"] == wanted_date
        assert "is_avyakt" in meta
        # Identity check: the flag must be the boolean True, not merely truthy.
        assert meta["is_avyakt"] is True

def test_load_pdf_multiple_header_dates(multi_date_header_pdf_path):
    """
    Verify date extraction for a PDF whose header has several dates.

    The header contains both an original and a revised date; the test
    expects the *first* one (2013-11-17) on every returned Document.
    """
    loaded = load_pdf(multi_date_header_pdf_path)

    # Shape of the result: a non-empty list made up solely of Documents.
    assert isinstance(loaded, list)
    assert len(loaded) >= 1
    for item in loaded:
        assert isinstance(item, Document)

    # The extraction logic is expected to keep the first header date it finds.
    wanted_date = "2013-11-17"
    for item in loaded:
        meta = item.metadata
        assert "date" in meta
        assert meta["date"] == wanted_date
        # NOTE(review): this file is assumed to be a Sakar Murli based on its
        # filename pattern, but the loader decides from the PDF content. If
        # the content check detects 'avyakt', this expectation must change.
        # For now the test targets the primary date extraction logic.
        assert "is_avyakt" not in meta