Spaces:
Building
Building
File size: 3,022 Bytes
4623a33 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 |
import os
import pytest
from langchain_core.documents import Document
from RAG_BOT.pdf_processor import load_pdf
# Directory holding the Hindi sample PDFs, resolved relative to this test file.
# Assumes this file lives in RAG_BOT/tests/integration/ and the data in
# RAG_BOT/tests/data/hindi/ (layout not verifiable from this file alone).
# NOTE: os.path.join does not normalise the path, so the '..' segment is kept
# literally in the resulting string.
TEST_DATA_DIR = os.path.join(os.path.dirname(__file__), '..', 'data', 'hindi')
# Helper that builds the full path to a test PDF and skips the test if the file is missing
def check_pdf_exists(pdf_name):
    """Return the path of ``pdf_name`` inside ``TEST_DATA_DIR``.

    Args:
        pdf_name: File name of the sample PDF, relative to ``TEST_DATA_DIR``.

    Returns:
        The joined path when a regular file exists at that location.

    When no such file exists the calling test is skipped through
    ``pytest.skip`` (which raises), so nothing is returned in that case.
    """
    candidate = os.path.join(TEST_DATA_DIR, pdf_name)
    if os.path.isfile(candidate):
        return candidate
    # Missing fixture data is treated as "skip", not as a test failure.
    pytest.skip(f"Test PDF not found: {candidate}")
@pytest.fixture
def sakar_pdf_path():
    """Provide the path of the Sakar Murli sample PDF (test is skipped if absent)."""
    sample_name = "01.03.14-h.pdf"
    return check_pdf_exists(sample_name)
@pytest.fixture
def avyakt_pdf_path():
    """Provide the path of the Avyakt Murli sample PDF (test is skipped if absent)."""
    sample_name = "03. AV-H-07.01.1980.pdf"
    return check_pdf_exists(sample_name)
@pytest.fixture
def multi_date_header_pdf_path():
    """Provide the path of the sample PDF whose header carries two dates.

    The test is skipped if the file is absent.
    """
    sample_name = "FHM - 17-11-2013 (AM Revised - 31-12-1996).pdf"
    return check_pdf_exists(sample_name)
def test_load_pdf_sakar_murli(sakar_pdf_path):
    """
    Load a standard Sakar Murli PDF and verify its metadata.

    Every returned Document must carry the date "2014-03-01" and must not
    have an ``is_avyakt`` metadata key.
    """
    loaded = load_pdf(sakar_pdf_path)

    # Shape of the result: a non-empty list made up solely of Documents.
    assert isinstance(loaded, list)
    assert len(loaded) >= 1
    assert all(isinstance(item, Document) for item in loaded)

    for item in loaded:
        meta = item.metadata
        assert "date" in meta
        assert meta["date"] == "2014-03-01"
        assert "is_avyakt" not in meta
def test_load_pdf_avyakt_murli(avyakt_pdf_path):
    """
    Load a standard Avyakt Murli PDF and verify its metadata.

    Every returned Document must carry the date "1980-01-07" and have
    ``is_avyakt`` set to exactly ``True``.
    """
    loaded = load_pdf(avyakt_pdf_path)

    # Shape of the result: a non-empty list made up solely of Documents.
    assert isinstance(loaded, list)
    assert len(loaded) >= 1
    assert all(isinstance(item, Document) for item in loaded)

    for item in loaded:
        meta = item.metadata
        assert "date" in meta
        assert meta["date"] == "1980-01-07"
        assert "is_avyakt" in meta
        # Identity check: a merely truthy value is not accepted.
        assert meta["is_avyakt"] is True
def test_load_pdf_multiple_header_dates(multi_date_header_pdf_path):
    """
    Load a PDF whose header holds several dates (original/revised).

    Every returned Document is expected to carry the *first* header date,
    "2013-11-17", and no ``is_avyakt`` metadata key.
    """
    loaded = load_pdf(multi_date_header_pdf_path)

    # Shape of the result: a non-empty list made up solely of Documents.
    assert isinstance(loaded, list)
    assert len(loaded) >= 1
    assert all(isinstance(item, Document) for item in loaded)

    # The loader is expected to pick the first date it finds in the header.
    first_header_date = "2013-11-17"
    for item in loaded:
        meta = item.metadata
        assert "date" in meta
        assert meta["date"] == first_header_date
        # NOTE(review): assumed to be a Sakar document based on the filename
        # pattern, but the loader reportedly inspects content. If the content
        # check detects 'avyakt', this assertion may need adjusting.
        assert "is_avyakt" not in meta
|