Spaces:
Sleeping
Sleeping
File size: 3,459 Bytes
9041389 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 |
from __future__ import annotations
import dataclasses
import uuid
from typing import Union
import hashlib
import proto.chunk_pb2 as chunk_pb2
from domain.domain_protocol import DomainProtocol
@dataclasses.dataclass(frozen=True)
class DocumentD(DomainProtocol[chunk_pb2.Document]):
    """Immutable domain counterpart of the ``chunk_pb2.Document`` message.

    The three attributes are copied one-to-one to and from the proto fields
    of the same name.
    """

    file_path: str
    authors: str
    publish_date: str

    @property
    def id(self) -> str:
        """Return the SHA-256 hex digest of this document's serialized proto.

        The ID is derived purely from the field values, so two documents with
        equal fields share the same ID.
        """
        # Protobuf serialization is not guaranteed to be deterministic unless
        # explicitly requested; ask for it so the content hash stays stable.
        serialized = self.to_proto().SerializeToString(deterministic=True)
        return hashlib.sha256(serialized).hexdigest()

    @classmethod
    def _from_proto(cls, proto: chunk_pb2.Document) -> DocumentD:
        """Build a ``DocumentD`` from the fields of a ``chunk_pb2.Document``."""
        return cls(
            file_path=proto.file_path,
            authors=proto.authors,
            publish_date=proto.publish_date,
        )

    def to_proto(self) -> chunk_pb2.Document:
        """Return a new ``chunk_pb2.Document`` populated from this object."""
        return chunk_pb2.Document(
            file_path=self.file_path,
            authors=self.authors,
            publish_date=self.publish_date,
        )
@dataclasses.dataclass(frozen=True)
class ChunkD(DomainProtocol[chunk_pb2.Chunk]):
    """Immutable domain counterpart of the ``chunk_pb2.Chunk`` message.

    ``parent_reference`` is either a ``DocumentD`` (serialized into the
    proto's ``document`` field) or a ``uuid.UUID`` (serialized into the
    proto's ``parent_chunk_id`` field). Which one is allowed depends on
    ``chunk_type`` and is enforced in ``__post_init__``.
    """

    @property
    def id(self) -> str:
        """Return ``chunk_id`` as a string."""
        return str(self.chunk_id)

    chunk_text: str
    chunk_type: chunk_pb2.ChunkType
    chunk_index: int
    parent_reference: Union[uuid.UUID, DocumentD]
    # A fresh random UUID is generated when the caller does not supply one.
    chunk_id: uuid.UUID = dataclasses.field(default_factory=uuid.uuid4)

    def __post_init__(self) -> None:
        """Validate that ``parent_reference`` matches ``chunk_type``.

        Raises:
            ValueError: If a ``CHUNK_TYPE_PAGE`` chunk does not reference a
                ``DocumentD``, or a chunk of any other type does not
                reference a ``uuid.UUID``.
        """
        if self.chunk_type == chunk_pb2.ChunkType.CHUNK_TYPE_PAGE:
            if not isinstance(self.parent_reference, DocumentD):
                raise ValueError(
                    f"Chunk (id: {self.chunk_id}) with type {self.chunk_type} must have a DocumentD parent_reference."
                )
        elif not isinstance(self.parent_reference, uuid.UUID):
            raise ValueError(
                f"Chunk (id: {self.chunk_id}) with type {self.chunk_type} must have a uuid.UUID parent_reference."
            )

    @classmethod
    def _from_proto(cls, proto: chunk_pb2.Chunk) -> ChunkD:
        """Build a ``ChunkD`` from a ``chunk_pb2.Chunk``.

        Raises:
            ValueError: If neither ``parent_chunk_id`` nor ``document`` is set
                on the proto, or if an ID string is not a valid UUID.
        """
        # Resolve the parent first so the object is constructed in one place.
        parent_reference: Union[uuid.UUID, DocumentD]
        if proto.HasField('parent_chunk_id'):
            parent_reference = uuid.UUID(proto.parent_chunk_id)
        elif proto.HasField('document'):
            parent_reference = DocumentD._from_proto(proto.document)
        else:
            raise ValueError(
                f"Chunk proto (id: {proto.chunk_id}) has no 'parent_chunk_id' or 'document' field.")

        return cls(
            chunk_id=uuid.UUID(proto.chunk_id),
            parent_reference=parent_reference,
            chunk_text=proto.chunk_text,
            chunk_type=proto.chunk_type,
            chunk_index=proto.chunk_index,
        )

    def to_proto(self) -> chunk_pb2.Chunk:
        """Return a new ``chunk_pb2.Chunk`` populated from this object.

        Raises:
            ValueError: If ``parent_reference`` is neither a ``uuid.UUID`` nor
                a ``DocumentD``.
        """
        chunk_proto = chunk_pb2.Chunk()
        chunk_proto.chunk_id = str(self.chunk_id)
        chunk_proto.chunk_text = self.chunk_text
        chunk_proto.chunk_type = self.chunk_type
        chunk_proto.chunk_index = self.chunk_index

        parent = self.parent_reference
        if isinstance(parent, uuid.UUID):
            chunk_proto.parent_chunk_id = str(parent)
        elif isinstance(parent, DocumentD):
            # Message-typed fields cannot be assigned directly; copy into them.
            chunk_proto.document.CopyFrom(parent.to_proto())
        else:
            # Defensive: __post_init__ already rejects other parent types.
            raise ValueError(
                f"Chunk (id: {self.chunk_id}) parent_reference is of unknown type: {type(self.parent_reference)}"
            )
        return chunk_proto
|