Spaces:
Sleeping
Sleeping
from __future__ import annotations | |
import dataclasses | |
import uuid | |
from typing import Union | |
import hashlib | |
import proto.chunk_pb2 as chunk_pb2 | |
from domain.domain_protocol import DomainProtocol | |
@dataclasses.dataclass
class DocumentD(DomainProtocol[chunk_pb2.Document]):
    """Domain-side counterpart of the ``chunk_pb2.Document`` message.

    Holds the three string fields that are copied to and from the proto.
    Declared as a dataclass so that the keyword construction used in
    ``_from_proto`` works.
    """

    file_path: str
    authors: str
    publish_date: str

    # NOTE(review): if DomainProtocol declares ``id`` as a property, this
    # needs ``@property`` as well -- confirm against the base class.
    def id(self) -> str:
        """Return the SHA-256 hex digest of this document's serialized proto.

        Two documents with identical field values therefore share an id.
        """
        serialized = self.to_proto().SerializeToString()
        return hashlib.sha256(serialized).hexdigest()

    @classmethod
    def _from_proto(cls, proto: chunk_pb2.Document) -> DocumentD:
        """Build a ``DocumentD`` from a ``chunk_pb2.Document`` message.

        Must be a classmethod: ``ChunkD._from_proto`` invokes it on the class
        itself, passing only the proto.
        """
        return cls(
            file_path=proto.file_path,
            authors=proto.authors,
            publish_date=proto.publish_date,
        )

    def to_proto(self) -> chunk_pb2.Document:
        """Return a ``chunk_pb2.Document`` populated from this instance."""
        return chunk_pb2.Document(
            file_path=self.file_path,
            authors=self.authors,
            publish_date=self.publish_date,
        )
@dataclasses.dataclass
class ChunkD(DomainProtocol[chunk_pb2.Chunk]):
    """Domain-side counterpart of the ``chunk_pb2.Chunk`` message.

    ``parent_reference`` is either a ``DocumentD`` (required when
    ``chunk_type`` is ``CHUNK_TYPE_PAGE``) or the ``uuid.UUID`` of a parent
    chunk (required for every other chunk type). ``__post_init__`` enforces
    that pairing at construction time.
    """

    chunk_text: str
    chunk_type: chunk_pb2.ChunkType
    chunk_index: int
    parent_reference: Union[uuid.UUID, DocumentD]
    # A fresh random UUID is generated when the caller does not supply one.
    chunk_id: uuid.UUID = dataclasses.field(default_factory=uuid.uuid4)

    # NOTE(review): if DomainProtocol declares ``id`` as a property, this
    # needs ``@property`` as well -- confirm against the base class.
    def id(self) -> str:
        """Return ``chunk_id`` rendered as a string."""
        return str(self.chunk_id)

    def __post_init__(self) -> None:
        """Validate that ``parent_reference`` matches ``chunk_type``.

        Raises:
            ValueError: if a page chunk's parent is not a ``DocumentD``, or a
                non-page chunk's parent is not a ``uuid.UUID``.
        """
        if self.chunk_type == chunk_pb2.ChunkType.CHUNK_TYPE_PAGE:
            if not isinstance(self.parent_reference, DocumentD):
                raise ValueError(
                    f"Chunk (id: {self.chunk_id}) with type {self.chunk_type} must have a DocumentD parent_reference."
                )
        elif not isinstance(self.parent_reference, uuid.UUID):
            raise ValueError(
                f"Chunk (id: {self.chunk_id}) with type {self.chunk_type} must have a uuid.UUID parent_reference."
            )

    @classmethod
    def _from_proto(cls, proto: chunk_pb2.Chunk) -> ChunkD:
        """Build a ``ChunkD`` from a ``chunk_pb2.Chunk`` message.

        The parent is taken from ``parent_chunk_id`` when that field is set,
        otherwise from ``document``.

        Raises:
            ValueError: if neither parent field is set, if an id string is not
                a valid UUID, or if ``__post_init__`` validation fails.
        """
        parent_reference: Union[uuid.UUID, DocumentD]
        if proto.HasField('parent_chunk_id'):
            parent_reference = uuid.UUID(proto.parent_chunk_id)
        elif proto.HasField('document'):
            parent_reference = DocumentD._from_proto(proto.document)
        else:
            raise ValueError(
                f"Chunk proto (id: {proto.chunk_id}) has no 'parent_chunk_id' or 'document' field.")

        return cls(
            chunk_id=uuid.UUID(proto.chunk_id),
            parent_reference=parent_reference,
            chunk_text=proto.chunk_text,
            chunk_type=proto.chunk_type,
            chunk_index=proto.chunk_index,
        )

    def to_proto(self) -> chunk_pb2.Chunk:
        """Return a ``chunk_pb2.Chunk`` populated from this instance.

        Raises:
            ValueError: if ``parent_reference`` is neither a ``uuid.UUID`` nor
                a ``DocumentD`` (only possible if it was reassigned after
                construction, since ``__post_init__`` checks it).
        """
        chunk_proto = chunk_pb2.Chunk(
            chunk_id=str(self.chunk_id),
            chunk_text=self.chunk_text,
            chunk_type=self.chunk_type,
            chunk_index=self.chunk_index,
        )

        parent = self.parent_reference
        if isinstance(parent, uuid.UUID):
            chunk_proto.parent_chunk_id = str(parent)
        elif isinstance(parent, DocumentD):
            # Message-typed fields cannot be assigned directly; copy into place.
            chunk_proto.document.CopyFrom(parent.to_proto())
        else:
            raise ValueError(
                f"Chunk (id: {self.chunk_id}) parent_reference is of unknown type: {type(self.parent_reference)}"
            )
        return chunk_proto