Spaces:
Sleeping
Sleeping
File size: 2,822 Bytes
9041389 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 |
import logging
import unittest
import uuid
from typing import List
from domain.chunk_d import DocumentD
from extraction_pipeline.document_metadata_extractor.openai_document_metadata_extractor import OpenAIDocumentMetadataExtractor, AuthorsError, CreationDateError
from llm_handler.mock_llm_handler import MockLLMHandler
# Canned, JSON-formatted chat-completion text handed to MockLLMHandler in
# test__process_element. Its authors and publish_date correspond to the
# expected document built in setUpClass (authors joined with ", ").
DOCUMENT_METADATA_EXTRACTION_RESPONSE = '''
{
"authors": ["BofA Global Research", "Michael Hartnett", "Elyas Galou", "Anya Shelekhin", "Myung-Jee Jung"],
"publish_date": "2023-04-13"
}
'''
class TestOpenAIDocumentMetadataExtractor(unittest.TestCase):
    """Unit tests for ``OpenAIDocumentMetadataExtractor``.

    Covers the private ``_validate_text`` check (missing authors, malformed
    date, well-formed payload) and ``_process_element`` driven by a mocked
    LLM handler.
    """

    @classmethod
    def setUpClass(cls) -> None:
        """Create the fixtures shared by every test in this class."""
        pdf_path = "extraction_pipeline/test_data/test.pdf"
        cls.test_pdf_path = pdf_path
        # Document as handed to the extractor: metadata fields still blank.
        cls.start_document_d = DocumentD(file_path=pdf_path, authors="", publish_date="")
        # Document the extractor is expected to produce for the canned response.
        cls.final_document_d = DocumentD(
            file_path=pdf_path,
            authors="BofA Global Research, Michael Hartnett, Elyas Galou, Anya Shelekhin, Myung-Jee Jung",
            publish_date="2023-04-13",
        )
        # Extractor built with default arguments; used by the validation tests.
        cls.openai_publish_details_extractor = OpenAIDocumentMetadataExtractor()

    @staticmethod
    def _metadata_with_date(publish_date):
        """Return a fresh metadata dict with the standard authors and *publish_date*."""
        return {
            "authors": [
                "BofA Global Research",
                "Michael Hartnett",
                "Elyas Galou",
                "Anya Shelekhin",
                "Myung-Jee Jung",
            ],
            "publish_date": publish_date,
        }

    def test__validate_text_missing_publishers(self):
        """A payload lacking the ``authors`` key must raise ``AuthorsError``."""
        payload = {"publish_date": "2023-12-13"}
        extractor = self.openai_publish_details_extractor
        with self.assertRaises(AuthorsError):
            extractor._validate_text(payload)  # type: ignore

    def test__validate_text_invalid_date(self):
        """A ``publish_date`` of ``"2-13"`` must raise ``CreationDateError``."""
        payload = self._metadata_with_date("2-13")
        extractor = self.openai_publish_details_extractor
        with self.assertRaises(CreationDateError):
            extractor._validate_text(payload)  # type: ignore

    def test__validate_text_valid(self):
        """A complete payload validates; the test passes if nothing is raised."""
        payload = self._metadata_with_date("2023-04-13")
        self.openai_publish_details_extractor._validate_text(payload)  # type: ignore

    def test__process_element(self):
        """``_process_element`` yields the expected document as its first item."""
        # The mock handler is primed with the canned chat-completion response.
        mock_handler = MockLLMHandler(chat_completion=[DOCUMENT_METADATA_EXTRACTION_RESPONSE])
        extractor = OpenAIDocumentMetadataExtractor(mock_handler)
        results = list(extractor._process_element(self.start_document_d))
        self.assertEqual(results[0], self.final_document_d)
# Script entry point: enable INFO-level logging, then hand control to the
# unittest runner so the module can be executed directly.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    unittest.main()
|