import dataclasses import json from typing import ClassVar, Dict, Iterable, Iterator, List, Optional, Tuple, Union import pymupdf from openai.types.chat.chat_completion_message_param import ChatCompletionMessageParam from domain.chunk_d import DocumentD from extraction_pipeline.document_metadata_extractor.document_metadata_extractor import ( DocumentMetadataExtractor,) from extraction_pipeline.document_metadata_extractor.prompts import ( DOCUMENT_METADATA_PROMPT,) from llm_handler.llm_interface import LLMInterface from llm_handler.openai_handler import ChatModelVersion, OpenAIHandler from utils.dates import parse_date class CreationDateError(Exception): pass class AuthorsError(Exception): pass class OpenAIDocumentMetadataExtractor(DocumentMetadataExtractor): _handler: LLMInterface _MODEL_VERSION: ChatModelVersion = ChatModelVersion.GPT_4_O _AUTHORS_KEY: ClassVar[str] = "authors" _PUBLISH_DATE_KEY: ClassVar[str] = "publish_date" _TEMPARATURE: ClassVar[float] = 0.2 def __init__(self, openai_handler: Optional[LLMInterface] = None, model_version: Optional[ChatModelVersion] = None): self._handler = openai_handler or OpenAIHandler() self._model_version = model_version or self._MODEL_VERSION def _validate_text(self, completion_text: Dict[str, Union[str, List[str]]]): if not completion_text.get(self._AUTHORS_KEY): raise AuthorsError("No authors found.") if not completion_text.get(self._PUBLISH_DATE_KEY): raise CreationDateError("No creation date found.") publish_date_str: str = str(completion_text.get(self._PUBLISH_DATE_KEY, "")) try: parse_date(publish_date_str) except ValueError as e: raise CreationDateError( f"Failed to parse publish date '{publish_date_str}': {e}") from e def _process_element(self, element: DocumentD) -> Iterable[DocumentD]: pdf_document_pages: Iterator = pymupdf.open(element.file_path).pages() first_page_text: str = next(pdf_document_pages).get_text() messages: List[ChatCompletionMessageParam] = [{ "role": "system", "content": DOCUMENT_METADATA_PROMPT }, { "role": "user", "content": f"Input:\n{first_page_text}" }] completion_text_raw = self._handler.get_chat_completion( messages=messages, model=self._model_version, temperature=self._TEMPARATURE, response_format={"type": "json_object"}) completion_text: Dict[str, Union[str, List[str]]] = dict(json.loads(completion_text_raw)) self._validate_text(completion_text) authors: str = ", ".join(completion_text.get(self._AUTHORS_KEY, [])) publish_date: str = str(completion_text.get(self._PUBLISH_DATE_KEY, "")) yield dataclasses.replace(element, authors=authors, publish_date=publish_date)