File size: 3,021 Bytes
64772a4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import base64
from typing import Any, Dict, Optional, Sequence, Union

from aleph_alpha_client.prompt import Image, Prompt, PromptItem, Text, Tokens, _to_json


class Document:
    """
    A document that can be either a docx document or text/image prompts.

    A document carries up to three payloads: ``docx`` (a base64 string),
    ``prompt`` (a sequence of prompt items) or ``text`` (a plain string).
    When more than one is set, serialization uses the first one that is not
    ``None`` in the order ``docx``, ``prompt``, ``text``.
    """

    def __init__(
        self,
        docx: Optional[str] = None,
        prompt: Optional[Sequence[Union[str, Text, Image, Tokens]]] = None,
        text: Optional[str] = None,
    ):
        # We use a base64 representation for docx documents, because we want to embed the file
        # into a prompt sent in JSON.
        self.docx = docx
        self.prompt = prompt
        self.text = text

    @classmethod
    def from_docx_bytes(cls, bytes: bytes) -> "Document":
        """
        Pass a docx file in bytes and prepare it to be used as a document

        The bytes are base64-encoded and stored as a string in ``docx``.
        """
        docx_base64 = base64.b64encode(bytes).decode()
        return cls(docx=docx_base64)

    @classmethod
    def from_docx_file(cls, path: str) -> "Document":
        """
        Load a docx file from disk and prepare it to be used as a document

        Examples:
            >>> docx_file = "./tests/sample.docx"
            >>> document = Document.from_docx_file(docx_file)
        """
        with open(path, "rb") as f:
            docx_bytes = f.read()
        return cls.from_docx_bytes(docx_bytes)

    @classmethod
    def from_prompt(
        cls, prompt: Union[Prompt, Sequence[Union[str, Text, Image, Tokens]]]
    ) -> "Document":
        """
        Pass a prompt that can contain multiple strings and Image prompts and prepare it to be used as a document

        A ``Prompt`` instance contributes its ``items``; any other sequence is
        stored as given.
        """
        if isinstance(prompt, Prompt):
            return cls(prompt=prompt.items)
        return cls(prompt=prompt)

    @classmethod
    def from_text(cls, text: str) -> "Document":
        """
        Pass a single text and prepare it to be used as a document

        Example:
            >>> prompt = "This is an example."
            >>> document = Document.from_text(prompt)
        """
        return cls(text=text)

    def _to_serializable_document(self) -> Dict[str, Any]:
        """
        A dict if serialized to JSON is suitable as a document element

        Returns:
            ``{"docx": ...}``, ``{"prompt": [...]}`` or ``{"text": ...}``,
            depending on which payload is set (checked in that order).

        Raises:
            NotImplementedError: if none of ``docx``, ``prompt`` and ``text``
                is set.
        """

        def to_prompt_item(item: Union[str, Image, Text, Tokens]) -> PromptItem:
            # document still uses a plain piece of text for text-prompts
            # -> convert to Text-instance
            return Text.from_text(item) if isinstance(item, str) else item

        if self.docx is not None:
            # Serialize docx to Document JSON format
            return {"docx": self.docx}

        if self.prompt is not None:
            # A bare ``str`` is itself a sequence of one-character strings, so
            # iterating it would emit one text item per character. Treat it as
            # a single text item instead.
            items = [self.prompt] if isinstance(self.prompt, str) else self.prompt
            # Serialize prompt to Document JSON format
            return {"prompt": [_to_json(to_prompt_item(item)) for item in items]}

        if self.text is not None:
            return {"text": self.text}

        raise NotImplementedError("unsupported document type")