"""Unit tests for ``OpenAIDocumentMetadataExtractor`` driven by a mocked LLM handler."""
import logging
import unittest
import uuid
from typing import List

from domain.chunk_d import DocumentD
from extraction_pipeline.document_metadata_extractor.openai_document_metadata_extractor import (
    OpenAIDocumentMetadataExtractor,
    AuthorsError,
    CreationDateError,
)
from llm_handler.mock_llm_handler import MockLLMHandler

# Canned chat-completion payload handed to MockLLMHandler in test__process_element.
DOCUMENT_METADATA_EXTRACTION_RESPONSE = ''' { "authors": ["BofA Global Research", "Michael Hartnett", "Elyas Galou", "Anya Shelekhin", "Myung-Jee Jung"], "publish_date": "2023-04-13" } '''


class TestOpenAIDocumentMetadataExtractor(unittest.TestCase):
    """Exercises ``_validate_text`` and ``_process_element`` of the extractor."""

    @classmethod
    def setUpClass(cls) -> None:
        """Build the fixtures shared by every test in this class.

        Creates an "empty" document (blank authors / publish date), the
        document expected after extraction, and an extractor constructed with
        no arguments that the ``_validate_text`` tests call directly.
        """
        pdf_path = "extraction_pipeline/test_data/test.pdf"
        cls.test_pdf_path = pdf_path
        cls.start_document_d = DocumentD(
            file_path=pdf_path,
            authors="",
            publish_date="",
        )
        cls.final_document_d = DocumentD(
            file_path=pdf_path,
            authors="BofA Global Research, Michael Hartnett, Elyas Galou, Anya Shelekhin, Myung-Jee Jung",
            publish_date="2023-04-13",
        )
        cls.openai_publish_details_extractor = OpenAIDocumentMetadataExtractor()

    def test__validate_text_missing_publishers(self):
        """A payload that has no ``authors`` key is rejected with AuthorsError."""
        payload = {"publish_date": "2023-12-13"}
        extractor = self.openai_publish_details_extractor
        with self.assertRaises(AuthorsError):
            extractor._validate_text(payload)  # type: ignore

    def test__validate_text_invalid_date(self):
        """A payload whose ``publish_date`` is "2-13" is rejected with CreationDateError."""
        authors = [
            "BofA Global Research",
            "Michael Hartnett",
            "Elyas Galou",
            "Anya Shelekhin",
            "Myung-Jee Jung",
        ]
        payload = {"authors": authors, "publish_date": "2-13"}
        extractor = self.openai_publish_details_extractor
        with self.assertRaises(CreationDateError):
            extractor._validate_text(payload)  # type: ignore

    def test__validate_text_valid(self):
        """A payload with authors and a full date validates without raising."""
        authors = [
            "BofA Global Research",
            "Michael Hartnett",
            "Elyas Galou",
            "Anya Shelekhin",
            "Myung-Jee Jung",
        ]
        payload = {"authors": authors, "publish_date": "2023-04-13"}
        # No assertion needed: the test passes as long as nothing is raised.
        self.openai_publish_details_extractor._validate_text(payload)  # type: ignore

    def test__process_element(self):
        """Processing the blank document yields the fully populated document first."""
        mock_handler = MockLLMHandler(
            chat_completion=[DOCUMENT_METADATA_EXTRACTION_RESPONSE],
        )
        extractor = OpenAIDocumentMetadataExtractor(mock_handler)
        # Materialise everything the extractor produces before inspecting it.
        results = list(extractor._process_element(self.start_document_d))
        self.assertEqual(results[0], self.final_document_d)


if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    unittest.main()