|
"""Base schema for readers.""" |
|
from dataclasses import dataclass |
|
|
|
from langchain.docstore.document import Document as LCDocument |
|
from application.parser.schema.schema import BaseDocument |
|
|
|
|
|
@dataclass
class Document(BaseDocument):
    """Generic interface for a data document.

    This document connects to data sources and can be converted to and
    from the LangChain document representation.

    NOTE(review): the ``text`` and ``extra_info`` attributes used below are
    presumably declared on ``BaseDocument`` -- confirm against that class.
    """

    def __post_init__(self) -> None:
        """Validate the instance after dataclass initialisation.

        Raises:
            ValueError: If ``text`` is ``None``.
        """
        if self.text is not None:
            return
        raise ValueError("text field not set.")

    @classmethod
    def get_type(cls) -> str:
        """Return the type name of this document class."""
        return "Document"

    def to_langchain_format(self) -> LCDocument:
        """Build a LangChain document from this document.

        ``text`` becomes ``page_content``; ``extra_info`` becomes
        ``metadata``, with an empty dict substituted when ``extra_info``
        is falsy (``None`` or empty).
        """
        info = self.extra_info
        if not info:
            info = {}
        return LCDocument(page_content=self.text, metadata=info)

    @classmethod
    def from_langchain_format(cls, doc: LCDocument) -> "Document":
        """Build an instance of this class from a LangChain document.

        ``page_content`` is used as ``text`` and ``metadata`` is passed
        through as ``extra_info``.
        """
        content = doc.page_content
        meta = doc.metadata
        return cls(text=content, extra_info=meta)
|
|