Carl Boettiger committed on
Commit ef3b477
1 Parent(s): 3cf30a6
Files changed (3)
  1. AK-HI-preprocess.py +272 -0
  2. pad-AK-HI-stats.parquet +3 -0
  3. preprocess.py +2 -3
AK-HI-preprocess.py ADDED
@@ -0,0 +1,272 @@
+ # +
+ import ibis
+ import ibis.selectors as s
+ from ibis import _
+ import fiona
+ import geopandas as gpd
+ import rioxarray
+ from shapely.geometry import box
+
+ vec_file = 'pad-AK-HI-stats.parquet'
+
+ # +
+ fgb = "https://data.source.coop/cboettig/pad-us-3/pad-us3-combined.fgb"
+ parquet = "https://data.source.coop/cboettig/pad-us-3/pad-us3-combined.parquet"
+ # gdb = "https://data.source.coop/cboettig/pad-us-3/PADUS3/PAD_US3_0.gdb" # original, all tables
+
+ con = ibis.duckdb.connect()
+ con.load_extension("spatial")
+ threads = 1
+
+ # or read the fgb version, much slower
+ # pad = con.read_geo(fgb)
+ # pad = con.read_parquet(parquet)
+ # Currently ibis doesn't detect that this is GeoParquet. We need a SQL escape-hatch to cast the geometry.
+
+ agency_name = con.read_parquet("https://huggingface.co/datasets/boettiger-lab/pad-us-3/resolve/main/parquet/pad-agency-name.parquet").select(manager_name_id = "Code", manager_name = "Dom")
+ agency_type = con.read_parquet("https://huggingface.co/datasets/boettiger-lab/pad-us-3/resolve/main/parquet/pad-agency-type.parquet").select(manager_type_id = "Code", manager_type = "Dom")
+ desig_type = con.read_parquet("https://huggingface.co/datasets/boettiger-lab/pad-us-3/resolve/main/parquet/pad-desgination-type.parquet").select(designation_type_id = "Code", designation_type = "Dom")
+ public_access = con.read_parquet("https://huggingface.co/datasets/boettiger-lab/pad-us-3/resolve/main/parquet/pad-public-access.parquet").select(public_access_id = "Code", public_access = "Dom")
+ state_name = con.read_parquet("https://huggingface.co/datasets/boettiger-lab/pad-us-3/resolve/main/parquet/pad-state-name.parquet").select(state = "Code", state_name = "Dom")
+ iucn = con.read_parquet("https://huggingface.co/datasets/boettiger-lab/pad-us-3/resolve/main/parquet/pad-iucn.parquet").select(iucn_code = "CODE", iucn_category = "DOM")
+
+ con.raw_sql(f"CREATE OR REPLACE VIEW pad AS SELECT *, st_geomfromwkb(geometry) as geom from read_parquet('{parquet}')")
+ pad = con.table("pad")
+ # -
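+ # st_geomfromwkb() parses the WKB blob into DuckDB's native GEOMETRY type,
+ # so the resulting `geom` column supports spatial operations downstream.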
+
+ # Get the CRS.
+ # fiona is not built with parquet support, so we must read this from the fgb;
+ # ideally duckdb's st_read_meta would do this from the parquet.
+ meta = fiona.open(fgb)
+ crs = meta.crs
+
+ # Now we can do all the usual SQL queries to subset the data.
+ focal_columns = ["row_n", "FeatClass", "Mang_Name",
+                  "Mang_Type", "Des_Tp", "Pub_Access",
+                  "GAP_Sts", "IUCN_Cat", "Unit_Nm",
+                  "State_Nm", "EsmtHldr", "Date_Est",
+                  "SHAPE_Area", "geom"]
+ (
+     pad
+     .mutate(row_n=ibis.row_number())
+     .filter(_.FeatClass.isin(["Easement", "Fee"]))
+     .filter(_.State_Nm.isin(["AK", "HI"]))
+     .select(focal_columns)
+     .rename(geometry="geom")
+     .rename(manager_name_id = "Mang_Name",
+             manager_type_id = "Mang_Type",
+             designation_type_id = "Des_Tp",
+             public_access_id = "Pub_Access",
+             category = "FeatClass",
+             iucn_code = "IUCN_Cat",
+             gap_code = "GAP_Sts",
+             state = "State_Nm",
+             easement_holder = "EsmtHldr",
+             date_established = "Date_Est",
+             area_square_meters = "SHAPE_Area",
+             area_name = "Unit_Nm")
+     .left_join(agency_name, "manager_name_id")
+     .left_join(agency_type, "manager_type_id")
+     .left_join(desig_type, "designation_type_id")
+     .left_join(public_access, "public_access_id")
+     .left_join(state_name, "state")
+     .left_join(iucn, "iucn_code")
+     .select(~s.contains("_right"))
+     # .select(~s.contains("_id"))
+     # If we keep the original GeoParquet WKB 'geometry' column, to_pandas() (or execute)
+     # gives us only a plain pandas DataFrame, and geopandas doesn't see the metadata.
+     # If we replace the geometry with the duckdb-native 'geometry' type, to_pandas()
+     # gives us a GeoDataFrame! But that requires reading into RAM.
+     .to_pandas()
+     .set_crs(crs)
+     .to_parquet(vec_file)
+ )
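+ # Note: ibis disambiguates overlapping join columns with a "_right" suffix;
+ # the s.contains("_right") selector above drops those duplicates.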
+
+ # +
+ import rasterio
+ from rasterstats import zonal_stats
+ import geopandas as gpd
+ import pandas as pd
+ from joblib import Parallel, delayed
+
+ def big_zonal_stats(vec_file, tif_file, stats, col_name, n_jobs, verbose = 10, timeout=10000):
+
+     # read in vector as geopandas, match CRS to raster
+     with rasterio.open(tif_file) as src:
+         raster_profile = src.profile
+     gdf = gpd.read_parquet(vec_file).to_crs(raster_profile['crs'])
+
+     # row_n is a global id, may refer to excluded polygons
+     # gdf["row_id"] = gdf.index + 1
+
+     # lambda fn to zonal_stats a slice:
+     def get_stats(geom_slice, tif_file, stats):
+         stats = zonal_stats(geom_slice.geometry, tif_file, stats = stats)
+         stats[0]['row_n'] = geom_slice.row_n
+         # print(geom_slice.row_n)
+         return stats[0]
+
+     # iteration (could be a list comprehension?)
+     jobs = []
+     for r in gdf.itertuples():
+         jobs.append(delayed(get_stats)(r, tif_file, stats))
+
+     # And here we go
+     output = Parallel(n_jobs=n_jobs, timeout=timeout, verbose=verbose)(jobs)
+
+     # reshape output
+     df = (
+         pd.DataFrame(output)
+         .rename(columns={'mean': col_name})
+         .merge(gdf, how='right', on = 'row_n')
+     )
+     gdf = gpd.GeoDataFrame(df, geometry="geometry")
+     return gdf
+
+ # -
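+ # Usage sketch (hypothetical raster path): compute a per-polygon zonal mean
+ # and write it back onto the vector file, as the cells below do:
+ # df = big_zonal_stats(vec_file, "example_cog.tif", stats=['mean'],
+ #                      col_name="example_mean", n_jobs=threads, verbose=0)
+ # gpd.GeoDataFrame(df, geometry="geometry").to_parquet(vec_file)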
+
+ tif_file = "/home/rstudio/boettiger-lab/us-pa-policy/hfp_2021_100m_v1-2_cog.tif"
+ threads = 1
+
+ # +
+ # import geopandas as gpd
+ # test = gpd.read_parquet("pad-processed.parquet")
+ # test.columns
+
+ # +
+ # %%time
+ tif_file = "/home/rstudio/boettiger-lab/us-pa-policy/hfp_2021_100m_v1-2_cog.tif"
+
+ df = big_zonal_stats(vec_file, tif_file, stats = ['mean'],
+                      col_name = "human_impact", n_jobs=1, verbose=0)
+ gpd.GeoDataFrame(df, geometry="geometry").to_parquet(vec_file)
+ # -
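+ # joblib runs jobs serially when n_jobs=1; raising `threads` (or passing
+ # n_jobs=-1 for all cores) parallelizes the per-polygon zonal_stats calls.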
+
+ # %%time
+ tif_file = '/home/rstudio/source.coop/cboettig/mobi/species-richness-all/SpeciesRichness_All.tif'
+ big_zonal_stats(vec_file, tif_file, stats = ['mean'], col_name = "richness", n_jobs=threads, verbose=0).to_parquet(vec_file)
+
+ # +
+ # %%time
+ tif_file = '/home/rstudio/source.coop/cboettig/mobi/range-size-rarity-all/RSR_All.tif'
+
+ df = big_zonal_stats(vec_file, tif_file, stats = ['mean'],
+                      col_name = "rsr", n_jobs=threads, verbose=0).to_parquet(vec_file)
+
+ # +
+ # %%time
+ tif_file = '/home/rstudio/source.coop/vizzuality/lg-land-carbon-data/deforest_carbon_100m_cog.tif'
+
+ df = big_zonal_stats(vec_file, tif_file, stats = ['mean'],
+                      col_name = "deforest_carbon", n_jobs=threads, verbose=0).to_parquet(vec_file)
+
+ # +
+ # %%time
+ tif_file = '/home/rstudio/source.coop/vizzuality/lg-land-carbon-data/natcrop_bii_100m_cog.tif'
+
+ df = big_zonal_stats(vec_file, tif_file, stats = ['mean'],
+                      col_name = "biodiversity_intactness_loss", n_jobs=threads, verbose=0).to_parquet(vec_file)
+
+ # +
+ # %%time
+ tif_file = '/home/rstudio/source.coop/vizzuality/lg-land-carbon-data/natcrop_fii_100m_cog.tif'
+
+ df = big_zonal_stats(vec_file, tif_file, stats = ['mean'],
+                      col_name = "forest_integrity_loss", n_jobs=threads, verbose=0).to_parquet(vec_file)
+
+ # +
+ # %%time
+ tif_file = '/home/rstudio/source.coop/vizzuality/lg-land-carbon-data/natcrop_expansion_100m_cog.tif'
+
+ df = big_zonal_stats(vec_file, tif_file, stats = ['mean'], col_name = "crop_expansion", n_jobs=threads, verbose=0)
+ gpd.GeoDataFrame(df, geometry="geometry").to_parquet(vec_file)
+
+ # +
+ # %%time
+ tif_file = '/home/rstudio/source.coop/vizzuality/lg-land-carbon-data/natcrop_reduction_100m_cog.tif'
+
+ df = big_zonal_stats(vec_file, tif_file, stats = ['mean'], col_name = "crop_reduction", n_jobs=threads, verbose=0).to_parquet(vec_file)
+
+ # +
+ # %%time
+ tif_file = '/home/rstudio/source.coop/cboettig/carbon/cogs/irrecoverable_c_total_2018.tif'
+
+ df = big_zonal_stats(vec_file, tif_file, stats = ['mean'], col_name = "irrecoverable_carbon", n_jobs=threads, verbose=0).to_parquet(vec_file)
+
+ # +
+ # %%time
+ tif_file = '/home/rstudio/source.coop/cboettig/carbon/cogs/manageable_c_total_2018.tif'
+
+ df = big_zonal_stats(vec_file, tif_file, stats = ['mean'], col_name = "manageable_carbon", n_jobs=threads, verbose=0).to_parquet(vec_file)
+
+ # +
+ # %%time
+ tif_file = '/home/rstudio/minio/shared-biodiversity/redlist/cog/combined_rwr_2022.tif'
+
+ df = big_zonal_stats(vec_file, tif_file, stats = ['mean'], col_name = "all_species_rwr", n_jobs=threads, verbose=0).to_parquet(vec_file)
+
+ # +
+ # %%time
+ tif_file = '/home/rstudio/minio/shared-biodiversity/redlist/cog/combined_sr_2022.tif'
+
+ df = big_zonal_stats(vec_file, tif_file, stats = ['mean'], col_name = "all_species_richness", n_jobs=threads, verbose=0).to_parquet(vec_file)
+
+ # +
226
+ columns = '''
227
+ area_name,
228
+ manager_name,
229
+ manager_name_id,
230
+ manager_type,
231
+ manager_type_id,
232
+ manager_group,
233
+ designation_type,
234
+ designation_type_id,
235
+ public_access,
236
+ category,
237
+ iucn_code,
238
+ iucn_category,
239
+ gap_code,
240
+ state,
241
+ state_name,
242
+ easement_holder,
243
+ date_established,
244
+ area_square_meters,
245
+ geometry,
246
+ all_species_richness,
247
+ all_species_rwr,
248
+ manageable_carbon,
249
+ irrecoverable_carbon,
250
+ crop_reduction,
251
+ crop_expansion,
252
+ deforest_carbon,
253
+ richness,
254
+ rsr,
255
+ forest_integrity_loss,
256
+ biodiversity_intactness_loss
257
+ '''
258
+
259
+ items = columns.split(',')
260
+ # Remove empty strings and whitespace
261
+ items = [item.strip() for item in items if item.strip()]
262
+ items
263
+ # -
+
+ import ibis
+ from ibis import _
+ df = ibis.read_parquet(vec_file).select(items).to_parquet(vec_file)
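+ # Note: to_parquet() executes the expression and writes the file, returning
+ # None, so `df` above is only a placeholder.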
+
+ import ibis
+ from ibis import _
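+ # Quick sanity check: re-open the finished stats file to confirm it parses.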
+ ibis.read_parquet("pad-AK-HI-stats.parquet")
pad-AK-HI-stats.parquet ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b1019bf85ac264c5ebe437ebfea942809bf9df6394837c54f315bc94b487c566
+ size 151708809
preprocess.py CHANGED
@@ -14,7 +14,7 @@ parquet = "https://data.source.coop/cboettig/pad-us-3/pad-us3-combined.parquet"
 
  con = ibis.duckdb.connect()
  con.load_extension("spatial")
- threads = 24
+ threads = -1
 
  # or read the fgb version, much slower
  # pad = con.read_geo(fgb)
@@ -283,7 +283,6 @@ manager_name,
  manager_name_id,
  manager_type,
  manager_type_id,
- manager_group,
  designation_type,
  designation_type_id,
  public_access,
@@ -319,7 +318,7 @@ items
  import ibis
  from ibis import _
  df = ibis.read_parquet("pad-stats.parquet").select(items)
- df.group_by(_.manager_group).aggregate(n = _.manager_group.count()).to_pandas()
+ df.group_by(_.manager_type).aggregate(n = _.manager_type.count()).to_pandas()
 
  # +
  ## create pad.duckdb