metisllm-dashboard / extraction_pipeline /document_metadata_extractor /openai_document_metadata_extractor_test.py
Gateston Johns
first real commit
9041389
raw
history blame
No virus
2.82 kB
import logging
import unittest
import uuid
from typing import List
from domain.chunk_d import DocumentD
from extraction_pipeline.document_metadata_extractor.openai_document_metadata_extractor import OpenAIDocumentMetadataExtractor, AuthorsError, CreationDateError
from llm_handler.mock_llm_handler import MockLLMHandler
# Fixed LLM reply used by test__process_element, where it is passed to
# MockLLMHandler as the single chat_completion entry. It is a JSON object with
# an "authors" list and a "publish_date" string.
DOCUMENT_METADATA_EXTRACTION_RESPONSE = '''
{
"authors": ["BofA Global Research", "Michael Hartnett", "Elyas Galou", "Anya Shelekhin", "Myung-Jee Jung"],
"publish_date": "2023-04-13"
}
'''
class TestOpenAIDocumentMetadataExtractor(unittest.TestCase):
    """Unit tests for ``OpenAIDocumentMetadataExtractor``.

    ``_validate_text`` is exercised with a payload that lacks ``authors``, a
    payload with a malformed ``publish_date`` and a well-formed payload.
    ``_process_element`` is exercised with a ``MockLLMHandler`` constructed
    from ``DOCUMENT_METADATA_EXTRACTION_RESPONSE``.
    """

    @classmethod
    def setUpClass(cls) -> None:
        """Build the fixtures shared by all tests in this class."""
        cls.test_pdf_path = "extraction_pipeline/test_data/test.pdf"

        # Input document: author and publish-date fields are still empty.
        cls.start_document_d = DocumentD(
            file_path=cls.test_pdf_path,
            authors="",
            publish_date="",
        )

        # Document that test__process_element expects as the first output.
        cls.final_document_d = DocumentD(
            file_path=cls.test_pdf_path,
            authors="BofA Global Research, Michael Hartnett, Elyas Galou, Anya Shelekhin, Myung-Jee Jung",
            publish_date="2023-04-13",
        )

        # Extractor built with its default constructor arguments; the
        # _validate_text tests call it directly.
        cls.openai_publish_details_extractor = OpenAIDocumentMetadataExtractor()

    def test__validate_text_missing_publishers(self) -> None:
        """A payload without an ``authors`` key is rejected with ``AuthorsError``."""
        payload = {"publish_date": "2023-12-13"}
        extractor = self.openai_publish_details_extractor

        with self.assertRaises(AuthorsError):
            extractor._validate_text(payload)  # type: ignore

    def test__validate_text_invalid_date(self) -> None:
        """A ``publish_date`` of ``"2-13"`` is rejected with ``CreationDateError``."""
        payload = {
            "authors": [
                "BofA Global Research",
                "Michael Hartnett",
                "Elyas Galou",
                "Anya Shelekhin",
                "Myung-Jee Jung",
            ],
            "publish_date": "2-13",
        }
        extractor = self.openai_publish_details_extractor

        with self.assertRaises(CreationDateError):
            extractor._validate_text(payload)  # type: ignore

    def test__validate_text_valid(self) -> None:
        """A complete payload passes validation; the test only fails if it raises."""
        payload = {
            "authors": [
                "BofA Global Research",
                "Michael Hartnett",
                "Elyas Galou",
                "Anya Shelekhin",
                "Myung-Jee Jung",
            ],
            "publish_date": "2023-04-13",
        }
        extractor = self.openai_publish_details_extractor

        extractor._validate_text(payload)  # type: ignore

    def test__process_element(self) -> None:
        """The first item produced by ``_process_element`` equals ``final_document_d``."""
        mock_handler = MockLLMHandler(chat_completion=[DOCUMENT_METADATA_EXTRACTION_RESPONSE])
        extractor = OpenAIDocumentMetadataExtractor(mock_handler)

        # Materialise everything the extractor produces before inspecting it.
        results = list(extractor._process_element(self.start_document_d))

        self.assertEqual(results[0], self.final_document_d)
# Script entry point: turn on INFO-level logging before handing control to the
# unittest runner, so log output from the code under test is visible.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    unittest.main()