Spaces:
Runtime error
Runtime error
File size: 5,264 Bytes
129cd69 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 |
"""Document Loader for ArcGIS FeatureLayers."""
from __future__ import annotations
import json
import re
import warnings
from datetime import datetime, timezone
from typing import TYPE_CHECKING, Any, Iterator, List, Optional, Union
from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
if TYPE_CHECKING:
import arcgis
# Placeholder stored as the layer/item description when the service exposes
# no description (missing key) or the description is empty.
_NOT_PROVIDED = "(Not Provided)"
class ArcGISLoader(BaseLoader):
    """Load records from an ArcGIS FeatureLayer.

    Every feature returned by the layer query becomes one ``Document`` whose
    ``page_content`` is the JSON-encoded attribute dict of the feature and
    whose metadata describes the layer the feature came from.
    """

    def __init__(
        self,
        layer: Union[str, arcgis.features.FeatureLayer],
        gis: Optional[arcgis.gis.GIS] = None,
        where: str = "1=1",
        out_fields: Optional[Union[List[str], str]] = None,
        return_geometry: bool = False,
        result_record_count: Optional[int] = None,
        lyr_desc: Optional[str] = None,
        **kwargs: Any,
    ):
        """Configure the loader and read the layer's descriptive properties.

        Args:
            layer: Either the URL of a FeatureLayer or an already constructed
                ``arcgis.features.FeatureLayer``.
            gis: GIS connection used to look up the backing item. When
                omitted, ``arcgis.gis.GIS()`` is created with no arguments.
            where: ``where`` clause forwarded to ``FeatureLayer.query``.
            out_fields: Fields to request. A list is joined with commas, a
                string is passed through unchanged, and ``None`` becomes
                ``"*"``.
            return_geometry: Forwarded to the query; when true, each
                document's metadata also receives the feature geometry.
            result_record_count: Forwarded to the query. When it is not an
                ``int``, ``return_all_records=True`` is sent instead.
            lyr_desc: Layer description to use instead of the one published
                in the layer properties.
            **kwargs: Extra query parameters; they override the ones built
                from the arguments above.

        Raises:
            ImportError: If the ``arcgis`` package is not installed.
        """
        try:
            import arcgis
        except ImportError as e:
            raise ImportError(
                "arcgis is required to use the ArcGIS Loader. "
                "Install it with pip or conda."
            ) from e

        # BeautifulSoup is optional: without it descriptions are kept as the
        # raw (possibly HTML) text published by the service.
        try:
            from bs4 import BeautifulSoup  # type: ignore

            self.BEAUTIFULSOUP = BeautifulSoup
        except ImportError:
            warnings.warn("BeautifulSoup not found. HTML will not be parsed.")
            self.BEAUTIFULSOUP = None

        self.gis = gis or arcgis.gis.GIS()

        if isinstance(layer, str):
            self.url = layer
            self.layer = arcgis.features.FeatureLayer(layer, gis=gis)
        else:
            self.url = layer.url
            self.layer = layer

        # Needs self.gis, self.url, self.layer and self.BEAUTIFULSOUP, so it
        # must run after the assignments above.
        self.layer_properties = self._get_layer_properties(lyr_desc)

        self.where = where

        if isinstance(out_fields, str):
            self.out_fields = out_fields
        elif out_fields is None:
            self.out_fields = "*"
        else:
            self.out_fields = ",".join(out_fields)

        self.return_geometry = return_geometry
        self.result_record_count = result_record_count
        # Without an explicit integer limit, ask for every record.
        self.return_all_records = not isinstance(result_record_count, int)

        query_params = {
            "where": self.where,
            "out_fields": self.out_fields,
            "return_geometry": self.return_geometry,
            "return_all_records": self.return_all_records,
            "result_record_count": self.result_record_count,
        }
        # Caller-supplied keyword arguments win over the defaults above.
        query_params.update(kwargs)
        self.query_params = query_params

    def _clean_description(self, raw_desc: Any) -> Any:
        """Return a description with HTML stripped, or the placeholder.

        Empty and ``None`` values yield ``_NOT_PROVIDED`` without being
        handed to BeautifulSoup. When BeautifulSoup is unavailable the raw
        value is returned unchanged.
        """
        if not raw_desc:
            return _NOT_PROVIDED
        if self.BEAUTIFULSOUP:
            text = self.BEAUTIFULSOUP(raw_desc).text
        else:
            text = raw_desc
        # Markup that contains no text also counts as "not provided".
        return text or _NOT_PROVIDED

    def _get_layer_properties(self, lyr_desc: Optional[str] = None) -> dict:
        """Get the layer properties from the FeatureLayer.

        Args:
            lyr_desc: Description to use for the layer. When ``None``, the
                ``description`` entry of the layer properties is used.

        Returns:
            A dict with the keys ``layer_description``, ``item_description``
            and ``layer_properties`` (the layer's ``properties`` object).
        """
        import arcgis

        layer_number_pattern = re.compile(r"/\d+$")
        props = self.layer.properties

        if lyr_desc is None:
            # retrieve description from the FeatureLayer if not provided
            try:
                lyr_desc = self._clean_description(props["description"])
            except KeyError:
                lyr_desc = _NOT_PROVIDED

        try:
            item_id = props["serviceItemId"]
            # If the item lookup returns nothing, fall back to a FeatureLayer
            # built from the URL with its trailing "/<number>" removed.
            item = self.gis.content.get(item_id) or arcgis.features.FeatureLayer(
                layer_number_pattern.sub("", self.url),
            )
            try:
                raw_desc = item.description
            except AttributeError:
                # The fallback object has no ``description`` attribute of its
                # own; read it from its properties instead.
                raw_desc = item.properties.description
            item_desc = self._clean_description(raw_desc)
        except KeyError:
            item_desc = _NOT_PROVIDED

        return {
            "layer_description": lyr_desc,
            "item_description": item_desc,
            "layer_properties": props,
        }

    def lazy_load(self) -> Iterator[Document]:
        """Lazy load records from FeatureLayer.

        Yields:
            One ``Document`` per queried feature. ``page_content`` is the
            JSON dump of the feature attributes. The metadata holds
            ``accessed`` (UTC timestamp in ISO 8601 form with a ``Z``
            suffix), ``name``, ``url``, ``layer_description``,
            ``item_description``, ``layer_properties`` and, when
            ``return_geometry`` is set and available, ``geometry``.
        """
        query_response = self.layer.query(**self.query_params)

        # These values are identical for every feature of the layer, so
        # build them once instead of once per record.
        layer_info = self.layer_properties
        static_metadata = {
            "name": layer_info["layer_properties"]["name"],
            "url": self.url,
            "layer_description": layer_info["layer_description"],
            "item_description": layer_info["item_description"],
            "layer_properties": layer_info["layer_properties"],
        }

        for feature in (record.as_dict for record in query_response):
            page_content = json.dumps(feature["attributes"])

            # isoformat() on an aware UTC datetime already ends in "+00:00";
            # swap that offset for "Z" rather than appending a second
            # designator.
            accessed = datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")
            metadata = {"accessed": accessed, **static_metadata}

            if self.return_geometry:
                try:
                    metadata["geometry"] = feature["geometry"]
                except KeyError:
                    warnings.warn(
                        "Geometry could not be retrieved from the feature layer."
                    )

            yield Document(page_content=page_content, metadata=metadata)

    def load(self) -> List[Document]:
        """Load all records from FeatureLayer."""
        return list(self.lazy_load())
|