Spaces:
Runtime error
Runtime error
"""Abstract interface for document loader implementations.""" | |
from abc import ABC, abstractmethod | |
from typing import Iterator, List, Optional | |
from langchain_core.documents import Document | |
from langchain.document_loaders.blob_loaders import Blob | |
from langchain.text_splitter import RecursiveCharacterTextSplitter, TextSplitter | |
class BaseLoader(ABC): | |
"""Interface for Document Loader. | |
Implementations should implement the lazy-loading method using generators | |
to avoid loading all Documents into memory at once. | |
The `load` method will remain as is for backwards compatibility, but its | |
implementation should be just `list(self.lazy_load())`. | |
""" | |
# Sub-classes should implement this method | |
# as return list(self.lazy_load()). | |
# This method returns a List which is materialized in memory. | |
def load(self) -> List[Document]: | |
"""Load data into Document objects.""" | |
def load_and_split( | |
self, text_splitter: Optional[TextSplitter] = None | |
) -> List[Document]: | |
"""Load Documents and split into chunks. Chunks are returned as Documents. | |
Args: | |
text_splitter: TextSplitter instance to use for splitting documents. | |
Defaults to RecursiveCharacterTextSplitter. | |
Returns: | |
List of Documents. | |
""" | |
if text_splitter is None: | |
_text_splitter: TextSplitter = RecursiveCharacterTextSplitter() | |
else: | |
_text_splitter = text_splitter | |
docs = self.load() | |
return _text_splitter.split_documents(docs) | |
# Attention: This method will be upgraded into an abstractmethod once it's | |
# implemented in all the existing subclasses. | |
def lazy_load( | |
self, | |
) -> Iterator[Document]: | |
"""A lazy loader for Documents.""" | |
raise NotImplementedError( | |
f"{self.__class__.__name__} does not implement lazy_load()" | |
) | |
class BaseBlobParser(ABC): | |
"""Abstract interface for blob parsers. | |
A blob parser provides a way to parse raw data stored in a blob into one | |
or more documents. | |
The parser can be composed with blob loaders, making it easy to reuse | |
a parser independent of how the blob was originally loaded. | |
""" | |
def lazy_parse(self, blob: Blob) -> Iterator[Document]: | |
"""Lazy parsing interface. | |
Subclasses are required to implement this method. | |
Args: | |
blob: Blob instance | |
Returns: | |
Generator of documents | |
""" | |
def parse(self, blob: Blob) -> List[Document]: | |
"""Eagerly parse the blob into a document or documents. | |
This is a convenience method for interactive development environment. | |
Production applications should favor the lazy_parse method instead. | |
Subclasses should generally not over-ride this parse method. | |
Args: | |
blob: Blob instance | |
Returns: | |
List of documents | |
""" | |
return list(self.lazy_parse(blob)) | |