File size: 2,822 Bytes
9041389
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import logging
import unittest
import uuid
from typing import List

from domain.chunk_d import DocumentD
from extraction_pipeline.document_metadata_extractor.openai_document_metadata_extractor import OpenAIDocumentMetadataExtractor, AuthorsError, CreationDateError
from llm_handler.mock_llm_handler import MockLLMHandler

# Canned JSON text (an "authors" list plus a "publish_date") that
# test__process_element passes to MockLLMHandler as its single
# chat_completion entry.
DOCUMENT_METADATA_EXTRACTION_RESPONSE = '''
{
    "authors": ["BofA Global Research", "Michael Hartnett", "Elyas Galou", "Anya Shelekhin", "Myung-Jee Jung"],
    "publish_date": "2023-04-13"
}
'''


class TestOpenAIDocumentMetadataExtractor(unittest.TestCase):
    """Unit tests for ``OpenAIDocumentMetadataExtractor``.

    Exercises the private ``_validate_text`` check with a payload lacking
    authors, a payload with a malformed publish date and a well-formed
    payload, and runs ``_process_element`` on an extractor built with a
    ``MockLLMHandler``.
    """

    @staticmethod
    def _author_names():
        """Return a fresh list of the author names used by the payload tests."""
        return [
            "BofA Global Research",
            "Michael Hartnett",
            "Elyas Galou",
            "Anya Shelekhin",
            "Myung-Jee Jung",
        ]

    @classmethod
    def setUpClass(cls) -> None:
        """Build the shared document fixtures and a default extractor once."""
        pdf_path = "extraction_pipeline/test_data/test.pdf"
        cls.test_pdf_path = pdf_path

        # Input document: same file path, metadata fields left empty.
        cls.start_document_d = DocumentD(file_path=pdf_path, authors="", publish_date="")

        # Document the extractor is expected to produce for that input.
        expected_authors = (
            "BofA Global Research, Michael Hartnett, Elyas Galou, Anya Shelekhin, Myung-Jee Jung"
        )
        cls.final_document_d = DocumentD(
            file_path=pdf_path,
            authors=expected_authors,
            publish_date="2023-04-13",
        )

        # Extractor constructed without an explicit handler; used only by
        # the _validate_text tests.
        cls.openai_publish_details_extractor = OpenAIDocumentMetadataExtractor()

    def test__validate_text_missing_publishers(self):
        """A payload without an ``authors`` key must raise ``AuthorsError``."""
        payload = {"publish_date": "2023-12-13"}
        extractor = self.openai_publish_details_extractor
        with self.assertRaises(AuthorsError):
            extractor._validate_text(payload)  # type: ignore

    def test__validate_text_invalid_date(self):
        """A payload whose ``publish_date`` is ``"2-13"`` must raise ``CreationDateError``."""
        payload = {
            "authors": self._author_names(),
            "publish_date": "2-13",
        }
        extractor = self.openai_publish_details_extractor
        with self.assertRaises(CreationDateError):
            extractor._validate_text(payload)  # type: ignore

    def test__validate_text_valid(self):
        """A payload with authors and a ``2023-04-13`` date must not raise."""
        payload = {
            "authors": self._author_names(),
            "publish_date": "2023-04-13",
        }
        # No assertion needed: the test passes when no exception propagates.
        self.openai_publish_details_extractor._validate_text(payload)  # type: ignore

    def test__process_element(self):
        """The first item yielded by ``_process_element`` equals the expected document."""
        mock_handler = MockLLMHandler(chat_completion=[DOCUMENT_METADATA_EXTRACTION_RESPONSE])
        extractor = OpenAIDocumentMetadataExtractor(mock_handler)
        # Materialise the full result before inspecting its first element.
        results = list(extractor._process_element(self.start_document_d))
        self.assertEqual(results[0], self.final_document_d)


# Script entry point: only runs when this module is executed directly.
if __name__ == '__main__':
    # Configure logging at INFO level before handing control to unittest.
    logging.basicConfig(level=logging.INFO)
    unittest.main()