File size: 1,465 Bytes
129cd69
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
import asyncio
import json
import uuid
from typing import Any, Sequence

from langchain_core.documents import BaseDocumentTransformer, Document

from langchain.tools.nuclia.tool import NucliaUnderstandingAPI


class NucliaTextTransformer(BaseDocumentTransformer):
    """Annotate documents with results from the Nuclia Understanding API.

    The Nuclia Understanding API splits into paragraphs and sentences,
    identifies entities, provides a summary of the text and generates
    embeddings for all sentences.

    Only the asynchronous entry point (``atransform_documents``) is
    implemented; the synchronous ``transform_documents`` always raises
    ``NotImplementedError``.
    """

    def __init__(self, nua: NucliaUnderstandingAPI):
        """Store the Nuclia Understanding API tool used to process text."""
        self.nua = nua

    def transform_documents(
        self, documents: Sequence[Document], **kwargs: Any
    ) -> Sequence[Document]:
        """Not supported synchronously.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError

    async def atransform_documents(
        self, documents: Sequence[Document], **kwargs: Any
    ) -> Sequence[Document]:
        """Push every document's text to Nuclia and attach the results.

        One ``"push"`` request is built per document, each with a freshly
        generated UUID as its ``"id"``, and all requests are awaited together
        with ``asyncio.gather``. Every response is decoded with ``json.loads``
        and the first entries of its ``"file_extracted_data"`` and
        ``"field_metadata"`` lists are stored under
        ``doc.metadata["nuclia"]``.

        Args:
            documents: Documents whose ``page_content`` is sent to Nuclia.
                They are modified in place.
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            The same ``documents`` object that was passed in.
        """
        pending = []
        for document in documents:
            request = {
                "action": "push",
                "id": str(uuid.uuid4()),
                "text": document.page_content,
                "path": None,
            }
            pending.append(self.nua.arun(request))

        # gather() returns results in the order the awaitables were passed,
        # so responses line up with ``documents`` for the zip below.
        responses = await asyncio.gather(*pending)

        for document, raw in zip(documents, responses):
            parsed = json.loads(raw)
            document.metadata["nuclia"] = {
                "file": parsed["file_extracted_data"][0],
                "metadata": parsed["field_metadata"][0],
            }
        return documents