File size: 3,459 Bytes
9041389
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
from __future__ import annotations
import dataclasses
import uuid
from typing import Union
import hashlib

import proto.chunk_pb2 as chunk_pb2
from domain.domain_protocol import DomainProtocol


@dataclasses.dataclass(frozen=True)
class DocumentD(DomainProtocol[chunk_pb2.Document]):
    """Immutable domain-side counterpart of the ``chunk_pb2.Document`` message.

    All three attributes are plain strings that are copied one-to-one to and
    from the proto fields of the same name.
    """

    file_path: str
    authors: str
    publish_date: str

    @property
    def id(self) -> str:
        """Hex-encoded SHA-256 digest of this document's serialized proto form.

        The id is derived from the field values, so two documents with equal
        fields yield the same id.
        """
        # NOTE(review): the digest is only as stable as the protobuf wire
        # encoding of these fields — confirm that is acceptable if ids are
        # persisted across library versions.
        wire_bytes = self.to_proto().SerializeToString()
        digest = hashlib.sha256(wire_bytes)
        return digest.hexdigest()

    @classmethod
    def _from_proto(cls, proto: chunk_pb2.Document) -> DocumentD:
        """Build a ``DocumentD`` from the matching fields of ``proto``."""
        field_values = {
            "file_path": proto.file_path,
            "authors": proto.authors,
            "publish_date": proto.publish_date,
        }
        return cls(**field_values)

    def to_proto(self) -> chunk_pb2.Document:
        """Return a new ``chunk_pb2.Document`` populated from this instance."""
        message = chunk_pb2.Document(
            file_path=self.file_path,
            authors=self.authors,
            publish_date=self.publish_date,
        )
        return message


@dataclasses.dataclass(frozen=True)
class ChunkD(DomainProtocol[chunk_pb2.Chunk]):
    """Immutable domain-side counterpart of the ``chunk_pb2.Chunk`` message.

    A chunk refers to its parent through ``parent_reference``:

    * chunks whose type is ``CHUNK_TYPE_PAGE`` must carry the owning
      ``DocumentD``;
    * chunks of every other type must carry the ``uuid.UUID`` of their parent
      chunk.

    ``__post_init__`` enforces this pairing at construction time.
    """

    chunk_text: str
    chunk_type: chunk_pb2.ChunkType
    chunk_index: int
    parent_reference: Union[uuid.UUID, DocumentD]
    # A random UUID4 is generated when the caller does not supply an id.
    chunk_id: uuid.UUID = dataclasses.field(default_factory=uuid.uuid4)

    @property
    def id(self) -> str:
        """String form of ``chunk_id``."""
        return str(self.chunk_id)

    def __post_init__(self):
        """Validate that ``parent_reference`` matches ``chunk_type``.

        Raises:
            ValueError: if a page chunk does not reference a ``DocumentD``, or
                a non-page chunk does not reference a ``uuid.UUID``.
        """
        if self.chunk_type == chunk_pb2.ChunkType.CHUNK_TYPE_PAGE:
            if not isinstance(self.parent_reference, DocumentD):
                raise ValueError(
                    f"Chunk (id: {self.chunk_id}) with type {self.chunk_type} must have a DocumentD parent_reference."
                )
        elif not isinstance(self.parent_reference, uuid.UUID):
            raise ValueError(
                f"Chunk (id: {self.chunk_id}) with type {self.chunk_type} must have a uuid.UUID parent_reference."
            )

    @classmethod
    def _from_proto(cls, proto: chunk_pb2.Chunk) -> ChunkD:
        """Build a ``ChunkD`` from a ``chunk_pb2.Chunk`` message.

        The parent is taken from ``parent_chunk_id`` when that field is set,
        otherwise from ``document``.

        Raises:
            ValueError: if neither parent field is set, if ``chunk_id`` or
                ``parent_chunk_id`` is not a valid UUID string, or if the
                resulting parent does not match the chunk type (raised by
                ``__post_init__``).
        """
        # NOTE(review): HasField needs presence tracking, so both fields are
        # presumably members of a oneof or declared optional — confirm in the
        # .proto definition.
        parent_reference: Union[uuid.UUID, DocumentD]
        if proto.HasField('parent_chunk_id'):
            parent_reference = uuid.UUID(proto.parent_chunk_id)
        elif proto.HasField('document'):
            parent_reference = DocumentD._from_proto(proto.document)
        else:
            # The message names the fields actually inspected above.
            raise ValueError(
                f"Chunk proto (id: {proto.chunk_id}) has no 'parent_chunk_id' or 'document' field.")

        return cls(chunk_id=uuid.UUID(proto.chunk_id),
                   parent_reference=parent_reference,
                   chunk_text=proto.chunk_text,
                   chunk_type=proto.chunk_type,
                   chunk_index=proto.chunk_index)

    def to_proto(self) -> chunk_pb2.Chunk:
        """Return a new ``chunk_pb2.Chunk`` populated from this instance.

        A UUID parent is written to ``parent_chunk_id`` as a string; a
        ``DocumentD`` parent is copied into the ``document`` sub-message.

        Raises:
            ValueError: if ``parent_reference`` is neither a ``uuid.UUID`` nor
                a ``DocumentD``.
        """
        chunk_proto = chunk_pb2.Chunk()
        chunk_proto.chunk_id = str(self.chunk_id)
        chunk_proto.chunk_text = self.chunk_text
        chunk_proto.chunk_type = self.chunk_type
        chunk_proto.chunk_index = self.chunk_index
        if isinstance(self.parent_reference, uuid.UUID):
            chunk_proto.parent_chunk_id = str(self.parent_reference)
        elif isinstance(self.parent_reference, DocumentD):
            # Sub-message fields cannot be assigned directly; copy into place.
            chunk_proto.document.CopyFrom(self.parent_reference.to_proto())
        else:
            # Defensive: __post_init__ already rejects other types.
            raise ValueError(
                f"Chunk (id: {self.chunk_id}) parent_reference is of unknown type: {type(self.parent_reference)}"
            )
        return chunk_proto