File size: 5,264 Bytes
129cd69
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
"""Document Loader for ArcGIS FeatureLayers."""

from __future__ import annotations

import json
import re
import warnings
from datetime import datetime, timezone
from typing import TYPE_CHECKING, Any, Iterator, List, Optional, Union

from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader

if TYPE_CHECKING:
    import arcgis

# Placeholder stored as the layer/item description when the service supplies
# none (missing key or empty text).
_NOT_PROVIDED = "(Not Provided)"


class ArcGISLoader(BaseLoader):
    """Load records from an ArcGIS FeatureLayer.

    Every feature returned by the layer query becomes one ``Document``:
    ``page_content`` is the JSON-encoded attribute mapping of the feature and
    ``metadata`` describes the layer the feature was read from.
    """

    def __init__(
        self,
        layer: Union[str, arcgis.features.FeatureLayer],
        gis: Optional[arcgis.gis.GIS] = None,
        where: str = "1=1",
        out_fields: Optional[Union[List[str], str]] = None,
        return_geometry: bool = False,
        result_record_count: Optional[int] = None,
        lyr_desc: Optional[str] = None,
        **kwargs: Any,
    ):
        """Configure the loader and read the layer's descriptive properties.

        Args:
            layer: URL of a feature layer, or an already constructed
                ``arcgis.features.FeatureLayer``.
            gis: GIS connection to use. When omitted, ``arcgis.gis.GIS()`` is
                created with no arguments.
            where: ``where`` clause forwarded to the layer query.
            out_fields: Fields to request, either as a ready-made string or as
                a list of names (joined with commas). ``None`` requests ``"*"``.
            return_geometry: When true, the geometry of each feature is added
                to the document metadata under ``"geometry"``.
            result_record_count: Forwarded to the layer query. When it is not
                an ``int``, ``return_all_records`` is set to ``True``.
            lyr_desc: Layer description to use instead of the one stored in
                the layer properties.
            **kwargs: Extra query parameters; they are merged last and so
                override the parameters derived from the arguments above.

        Raises:
            ImportError: If the ``arcgis`` package is not installed.
        """
        try:
            import arcgis
        except ImportError as e:
            raise ImportError(
                "arcgis is required to use the ArcGIS Loader. "
                "Install it with pip or conda."
            ) from e

        # BeautifulSoup is optional: without it descriptions are kept verbatim.
        try:
            from bs4 import BeautifulSoup  # type: ignore

            self.BEAUTIFULSOUP = BeautifulSoup
        except ImportError:
            warnings.warn("BeautifulSoup not found. HTML will not be parsed.")
            self.BEAUTIFULSOUP = None

        self.gis = gis or arcgis.gis.GIS()

        if isinstance(layer, str):
            self.url = layer
            # Use the resolved connection so the layer and the loader agree on
            # which GIS they talk to, even when ``gis`` was not supplied.
            self.layer = arcgis.features.FeatureLayer(layer, gis=self.gis)
        else:
            self.url = layer.url
            self.layer = layer

        self.layer_properties = self._get_layer_properties(lyr_desc)

        self.where = where

        if out_fields is None:
            self.out_fields = "*"
        elif isinstance(out_fields, str):
            self.out_fields = out_fields
        else:
            self.out_fields = ",".join(out_fields)

        self.return_geometry = return_geometry

        self.result_record_count = result_record_count
        self.return_all_records = not isinstance(result_record_count, int)

        query_params = dict(
            where=self.where,
            out_fields=self.out_fields,
            return_geometry=self.return_geometry,
            return_all_records=self.return_all_records,
            result_record_count=self.result_record_count,
        )
        # Caller-supplied keyword arguments win over the derived defaults.
        query_params.update(kwargs)
        self.query_params = query_params

    def _strip_html(self, raw: Optional[str]) -> str:
        """Return ``raw`` as plain text, or the placeholder when it is empty.

        Markup is removed with BeautifulSoup when it is available; otherwise
        the text is returned unchanged. ``None`` and empty strings (before or
        after stripping) yield ``_NOT_PROVIDED``.
        """
        if not raw:
            return _NOT_PROVIDED
        if self.BEAUTIFULSOUP:
            # Name the parser explicitly: the stdlib one is always available
            # and this avoids BeautifulSoup guessing (and warning about it).
            text = self.BEAUTIFULSOUP(raw, "html.parser").text
        else:
            text = raw
        return text or _NOT_PROVIDED

    def _get_layer_properties(self, lyr_desc: Optional[str] = None) -> dict:
        """Get the layer properties from the FeatureLayer.

        Args:
            lyr_desc: Description to report for the layer. When ``None`` the
                ``"description"`` entry of the layer properties is used.

        Returns:
            A dict with the keys ``"layer_description"``,
            ``"item_description"`` and ``"layer_properties"``. Descriptions
            that are missing or empty are reported as ``_NOT_PROVIDED``.
        """
        import arcgis

        props = self.layer.properties

        if lyr_desc is None:
            # retrieve description from the FeatureLayer if not provided
            try:
                lyr_desc = self._strip_html(props["description"])
            except KeyError:
                lyr_desc = _NOT_PROVIDED

        try:
            item_id = props["serviceItemId"]
            # Fall back to the parent service (URL without the trailing layer
            # number) when the portal item cannot be retrieved.
            item = self.gis.content.get(item_id) or arcgis.features.FeatureLayer(
                re.sub(r"/\d+$", "", self.url),
                gis=self.gis,
            )
            try:
                raw_desc = item.description
            except AttributeError:
                # The fallback object keeps its description in ``properties``.
                raw_desc = item.properties.description
            item_desc = self._strip_html(raw_desc)
        except KeyError:
            item_desc = _NOT_PROVIDED

        return {
            "layer_description": lyr_desc,
            "item_description": item_desc,
            "layer_properties": props,
        }

    def lazy_load(self) -> Iterator[Document]:
        """Lazy load records from FeatureLayer.

        Runs the configured query once and yields one ``Document`` per
        returned feature.

        Yields:
            Documents whose ``page_content`` is the JSON dump of the feature
            attributes and whose ``metadata`` holds the access time, layer
            name, URL, descriptions, layer properties and, when
            ``return_geometry`` is set and available, the feature geometry.
        """
        query_response = self.layer.query(**self.query_params)

        # The data is fetched by the single query above, so stamp it once.
        # ``isoformat()`` of an aware UTC datetime already ends in "+00:00";
        # swap that for "Z" rather than appending a second designator.
        accessed = datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")
        layer_info = self.layer_properties

        for feature in query_response:
            record = feature.as_dict
            page_content = json.dumps(record["attributes"])

            # A fresh dict per document so callers can mutate one document's
            # metadata without affecting the others.
            metadata = {
                "accessed": accessed,
                "name": layer_info["layer_properties"]["name"],
                "url": self.url,
                "layer_description": layer_info["layer_description"],
                "item_description": layer_info["item_description"],
                "layer_properties": layer_info["layer_properties"],
            }

            if self.return_geometry:
                try:
                    metadata["geometry"] = record["geometry"]
                except KeyError:
                    warnings.warn(
                        "Geometry could not be retrieved from the feature layer."
                    )

            yield Document(page_content=page_content, metadata=metadata)

    def load(self) -> List[Document]:
        """Load all records from FeatureLayer.

        Returns:
            Every document produced by ``lazy_load``, collected into a list.
        """
        return list(self.lazy_load())