Spaces:
Build error
Build error
File size: 3,021 Bytes
64772a4 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 |
import base64
from typing import Any, Dict, Optional, Sequence, Union
from aleph_alpha_client.prompt import Image, Prompt, PromptItem, Text, Tokens, _to_json
class Document:
    """
    A document that can be either a docx document or text/image prompts.

    Exactly one of the three sources is expected to be set. If several are
    given, serialization prefers ``docx``, then ``prompt``, then ``text``.
    Use the ``from_*`` classmethods to build an instance from a single source.
    """

    def __init__(
        self,
        docx: Optional[str] = None,
        prompt: Optional[Sequence[Union[str, Text, Image, Tokens]]] = None,
        text: Optional[str] = None,
    ):
        """
        Store the document source.

        Parameters:
            docx: Base64 encoded content of a docx file.
            prompt: Sequence of prompt items; plain strings are converted to
                ``Text`` items when the document is serialized.
            text: A single plain text.
        """
        # We use a base64 representation for docx documents, because we want
        # to embed the file into a prompt sent in JSON.
        self.docx = docx
        self.prompt = prompt
        self.text = text

    @classmethod
    def from_docx_bytes(cls, bytes: bytes) -> "Document":
        """
        Pass a docx file in bytes and prepare it to be used as a document

        The bytes are base64 encoded and stored as an ASCII string.
        """
        # NOTE: the parameter name shadows the builtin ``bytes``; it is kept
        # unchanged so that existing keyword callers continue to work.
        return cls(docx=base64.b64encode(bytes).decode())

    @classmethod
    def from_docx_file(cls, path: str) -> "Document":
        """
        Load a docx file from disk and prepare it to be used as a document

        Examples:
            >>> docx_file = "./tests/sample.docx"
            >>> document = Document.from_docx_file(docx_file)
        """
        # Binary mode: the content is base64 encoded as-is, never decoded as text.
        with open(path, "rb") as docx_file:
            return cls.from_docx_bytes(docx_file.read())

    @classmethod
    def from_prompt(
        cls,
        prompt: Union[Prompt, str, Sequence[Union[str, Text, Image, Tokens]]],
    ) -> "Document":
        """
        Pass a prompt that can contain multiple strings and Image prompts and
        prepare it to be used as a document

        Accepts a ``Prompt`` (its ``items`` are used), a sequence of prompt
        items, or a single string, which is treated as one text item.
        """
        if isinstance(prompt, str):
            # A bare string is itself a sequence of one-character strings;
            # without this guard it would be serialized as one text item per
            # character instead of a single text item.
            return cls(prompt=[prompt])
        if isinstance(prompt, Prompt):
            return cls(prompt=prompt.items)
        return cls(prompt=prompt)

    @classmethod
    def from_text(cls, text: str) -> "Document":
        """
        Pass a single text and prepare it to be used as a document

        Example:
            >>> prompt = "This is an example."
            >>> document = Document.from_text(prompt)
        """
        return cls(text=text)

    def _to_serializable_document(self) -> Dict[str, Any]:
        """
        A dict if serialized to JSON is suitable as a document element

        Returns:
            ``{"docx": ...}``, ``{"prompt": [...]}`` or ``{"text": ...}``,
            depending on the first source that is set (checked in that order).

        Raises:
            NotImplementedError: If none of docx, prompt or text is set.
        """
        if self.docx is not None:
            # Serialize docx to Document JSON format
            return {"docx": self.docx}

        if self.prompt is not None:
            # Serialize prompt to Document JSON format.
            # The document still allows plain strings for text prompts, so
            # those are converted to Text instances before serialization.
            return {
                "prompt": [
                    _to_json(Text.from_text(item) if isinstance(item, str) else item)
                    for item in self.prompt
                ]
            }

        if self.text is not None:
            return {"text": self.text}

        raise NotImplementedError("unsupported document type")
|