# Page-header residue captured with this file ("Spaces: Runtime error");
# it is not part of the module.
import base64
import warnings
from typing import Any, Dict, Iterator, List, Optional

import requests
from langchain_core.pydantic_v1 import BaseModel, root_validator, validator
from typing_extensions import NotRequired, TypedDict

from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseBlobParser, BaseLoader
from langchain.document_loaders.blob_loaders import Blob
from langchain.text_splitter import TextSplitter
from langchain.utils import get_from_dict_or_env
# Default embaas endpoint used to extract text from raw document bytes.
EMBAAS_DOC_API_URL = "https://api.embaas.io/v1/document/extract-text/bytes/"
class EmbaasDocumentExtractionParameters(TypedDict): | |
"""Parameters for the embaas document extraction API.""" | |
mime_type: NotRequired[str] | |
"""The mime type of the document.""" | |
file_extension: NotRequired[str] | |
"""The file extension of the document.""" | |
file_name: NotRequired[str] | |
"""The file name of the document.""" | |
should_chunk: NotRequired[bool] | |
"""Whether to chunk the document into pages.""" | |
chunk_size: NotRequired[int] | |
"""The maximum size of the text chunks.""" | |
chunk_overlap: NotRequired[int] | |
"""The maximum overlap allowed between chunks.""" | |
chunk_splitter: NotRequired[str] | |
"""The text splitter class name for creating chunks.""" | |
separators: NotRequired[List[str]] | |
"""The separators for chunks.""" | |
should_embed: NotRequired[bool] | |
"""Whether to create embeddings for the document in the response.""" | |
model: NotRequired[str] | |
"""The model to pass to the Embaas document extraction API.""" | |
instruction: NotRequired[str] | |
"""The instruction to pass to the Embaas document extraction API.""" | |
class EmbaasDocumentExtractionPayload(EmbaasDocumentExtractionParameters):
    """Payload for the Embaas document extraction API.

    Extends the optional extraction parameters with the one required key,
    ``bytes``. The key name is part of the request body and therefore must
    keep shadowing the builtin of the same name.
    """

    bytes: str
    """The base64 encoded bytes of the document to extract text from."""
class BaseEmbaasLoader(BaseModel):
    """Base loader for `Embaas` document extraction API."""

    embaas_api_key: Optional[str] = None
    """The API key for the Embaas document extraction API."""
    api_url: str = EMBAAS_DOC_API_URL
    """The URL of the Embaas document extraction API."""
    params: EmbaasDocumentExtractionParameters = EmbaasDocumentExtractionParameters()
    """Additional parameters to pass to the Embaas document extraction API."""

    # Registered as a pre root validator so it actually runs at construction
    # time; without the decorator the key is never looked up and stays None.
    @root_validator(pre=True)
    def validate_environment(cls, values: Dict) -> Dict:
        """Resolve the API key from the given values or the environment.

        Looks up ``embaas_api_key`` in ``values`` and falls back to the
        ``EMBAAS_API_KEY`` environment variable via ``get_from_dict_or_env``.

        Args:
            values: The raw field values passed to the constructor.

        Returns:
            ``values`` with ``embaas_api_key`` set to the resolved key.
        """
        # NOTE(review): get_from_dict_or_env presumably raises when the key is
        # found in neither place -- confirm against langchain.utils.
        embaas_api_key = get_from_dict_or_env(
            values, "embaas_api_key", "EMBAAS_API_KEY"
        )
        values["embaas_api_key"] = embaas_api_key
        return values
class EmbaasBlobLoader(BaseEmbaasLoader, BaseBlobParser):
    """Load `Embaas` blob.

    To use, you should have the
    environment variable ``EMBAAS_API_KEY`` set with your API key, or pass
    it as a named parameter to the constructor.

    Example:
        .. code-block:: python

            # Default parsing
            from langchain.document_loaders.blob_loaders import Blob
            from langchain.document_loaders.embaas import EmbaasBlobLoader
            loader = EmbaasBlobLoader()
            blob = Blob.from_path(path="example.mp3")
            documents = loader.parse(blob=blob)

            # Custom api parameters (create embeddings automatically)
            from langchain.document_loaders.embaas import EmbaasBlobLoader
            loader = EmbaasBlobLoader(
                params={
                    "should_embed": True,
                    "model": "e5-large-v2",
                    "chunk_size": 256,
                    "chunk_splitter": "CharacterTextSplitter"
                }
            )
            blob = Blob.from_path(path="example.pdf")
            documents = loader.parse(blob=blob)
    """

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Parses the blob lazily.

        Args:
            blob: The blob to parse.

        Yields:
            The documents built from the chunks in the API response.
        """
        yield from self._get_documents(blob=blob)

    @staticmethod
    def _api_response_to_documents(chunks: List[Dict[str, Any]]) -> List[Document]:
        """Convert the API response to a list of documents.

        Args:
            chunks: Chunk dicts from the response; each must carry ``"text"``
                and ``"metadata"`` and may carry ``"embedding"``.

        Returns:
            One ``Document`` per chunk, in the same order.
        """
        docs = []
        for chunk in chunks:
            # The chunk's own metadata dict is reused (and extended in place).
            metadata = chunk["metadata"]
            if chunk.get("embedding", None) is not None:
                # Embeddings travel with the document inside its metadata.
                metadata["embedding"] = chunk["embedding"]
            doc = Document(page_content=chunk["text"], metadata=metadata)
            docs.append(doc)

        return docs

    def _generate_payload(self, blob: Blob) -> EmbaasDocumentExtractionPayload:
        """Generates payload for the API request.

        Args:
            blob: The blob whose bytes are base64-encoded into the payload.

        Returns:
            The request payload: ``self.params`` plus the encoded ``bytes``.
        """
        base64_byte_str = base64.b64encode(blob.as_bytes()).decode()
        payload: EmbaasDocumentExtractionPayload = EmbaasDocumentExtractionPayload(
            bytes=base64_byte_str,
            # Workaround for mypy issue: https://github.com/python/mypy/issues/9408
            # type: ignore
            **self.params,
        )

        # An explicitly configured mime type wins over the blob's own.
        if blob.mimetype is not None and payload.get("mime_type", None) is None:
            payload["mime_type"] = blob.mimetype

        return payload

    def _handle_request(
        self, payload: EmbaasDocumentExtractionPayload
    ) -> List[Document]:
        """Sends a request to the embaas API and handles the response.

        Args:
            payload: The JSON body to post to ``self.api_url``.

        Returns:
            The documents built from ``response["data"]["chunks"]``.

        Raises:
            requests.exceptions.RequestException: If the request fails or the
                response status signals an error (``raise_for_status``).
        """
        headers = {
            "Authorization": f"Bearer {self.embaas_api_key}",
            "Content-Type": "application/json",
        }

        # NOTE(review): no timeout is passed, so a stalled connection blocks
        # indefinitely; consider making one configurable.
        response = requests.post(self.api_url, headers=headers, json=payload)
        response.raise_for_status()

        parsed_response = response.json()
        return EmbaasBlobLoader._api_response_to_documents(
            chunks=parsed_response["data"]["chunks"]
        )

    def _get_documents(self, blob: Blob) -> Iterator[Document]:
        """Get the documents from the blob.

        Args:
            blob: The blob to send to the extraction API.

        Yields:
            The documents returned for the blob.

        Raises:
            ValueError: If the request failed and the error response is
                missing, empty, not valid JSON, or carries a ``"message"``.
            requests.exceptions.RequestException: Re-raised unchanged when the
                error body is JSON without a ``"message"`` entry.
        """
        payload = self._generate_payload(blob=blob)

        try:
            documents = self._handle_request(payload=payload)
        except requests.exceptions.RequestException as e:
            generic_error = f"Error raised by Embaas document text extraction API: {e}"
            # `is None` on purpose: a Response is falsy for error statuses.
            if e.response is None or not e.response.text:
                raise ValueError(generic_error) from e

            try:
                parsed_response = e.response.json()
            except ValueError:
                # Error body is not JSON (json decode errors subclass
                # ValueError); report it like any other opaque failure
                # instead of leaking a decode error.
                raise ValueError(generic_error) from e

            if isinstance(parsed_response, dict) and "message" in parsed_response:
                raise ValueError(
                    f"Validation Error raised by Embaas document text extraction API:"
                    f" {parsed_response['message']}"
                ) from e
            raise

        yield from documents
class EmbaasLoader(BaseEmbaasLoader, BaseLoader):
    """Load from `Embaas`.

    To use, you should have the
    environment variable ``EMBAAS_API_KEY`` set with your API key, or pass
    it as a named parameter to the constructor.

    Example:
        .. code-block:: python

            # Default parsing
            from langchain.document_loaders.embaas import EmbaasLoader
            loader = EmbaasLoader(file_path="example.mp3")
            documents = loader.load()

            # Custom api parameters (create embeddings automatically)
            from langchain.document_loaders.embaas import EmbaasLoader
            loader = EmbaasLoader(
                file_path="example.pdf",
                params={
                    "should_embed": True,
                    "model": "e5-large-v2",
                    "chunk_size": 256,
                    "chunk_splitter": "CharacterTextSplitter"
                }
            )
            documents = loader.load()
    """

    file_path: str
    """The path to the file to load."""

    blob_loader: Optional[EmbaasBlobLoader]
    """The blob loader to use. If not provided, a default one will be created."""

    # `always=True` makes the validator run even when no blob_loader is
    # passed, which is what lets it supply the default loader.
    @validator("blob_loader", always=True)
    def validate_blob_loader(
        cls, v: Optional[EmbaasBlobLoader], values: Dict
    ) -> EmbaasBlobLoader:
        """Return the given blob loader, or build a default one.

        Args:
            v: The blob loader passed by the caller, if any.
            values: The already-validated fields; the default loader reuses
                ``embaas_api_key``, ``api_url`` and ``params`` from here.

        Returns:
            ``v`` when provided, otherwise a new ``EmbaasBlobLoader``.
        """
        return v or EmbaasBlobLoader(
            embaas_api_key=values["embaas_api_key"],
            api_url=values["api_url"],
            params=values["params"],
        )

    def lazy_load(self) -> Iterator[Document]:
        """Load the documents from the file path lazily.

        Yields:
            The documents parsed from the file at ``self.file_path``.

        Raises:
            ValueError: If no blob loader is set on the instance.
        """
        blob = Blob.from_path(path=self.file_path)

        # The validator should always have set this; an explicit check (rather
        # than `assert`) still holds under `python -O` and narrows for mypy.
        if self.blob_loader is None:
            raise ValueError("blob_loader must be set before loading documents.")
        yield from self.blob_loader.lazy_parse(blob=blob)

    def load(self) -> List[Document]:
        """Load all documents from the file path into a list."""
        return list(self.lazy_load())

    def load_and_split(
        self, text_splitter: Optional[TextSplitter] = None
    ) -> List[Document]:
        """Load the documents and split them with ``text_splitter``.

        Warns when ``should_embed`` is enabled in ``self.params``, then
        delegates to the parent implementation.

        Args:
            text_splitter: The splitter forwarded to the parent implementation.

        Returns:
            The documents returned by the parent ``load_and_split``.
        """
        if self.params.get("should_embed", False):
            warnings.warn(
                "Embeddings are not supported with load_and_split."
                " Use the API splitter to properly generate embeddings."
                " For more information see embaas.io docs."
            )
        return super().load_and_split(text_splitter=text_splitter)