Carl Boettiger committed on
Commit
308c0d8
1 Parent(s): ef3b477
Files changed (1) hide show
  1. preprocess.py +31 -16
preprocess.py CHANGED
@@ -5,21 +5,28 @@ import fiona
5
  import geopandas as gpd
6
  import rioxarray
7
  from shapely.geometry import box
 
 
 
 
 
 
 
 
 
 
8
 
9
  # +
10
 
11
  fgb = "https://data.source.coop/cboettig/pad-us-3/pad-us3-combined.fgb"
12
  parquet = "https://data.source.coop/cboettig/pad-us-3/pad-us3-combined.parquet"
13
  # gdb = "https://data.source.coop/cboettig/pad-us-3/PADUS3/PAD_US3_0.gdb" # original, all tables
14
-
15
- con = ibis.duckdb.connect()
16
- con.load_extension("spatial")
17
- threads = -1
18
-
19
  # or read the fgb version, much slower
20
  # pad = con.read_geo(fgb)
21
  # pad = con.read_parquet(parquet)
22
  # Currently ibis doesn't detect that this is GeoParquet. We need a SQL escape-hatch to cast the geometry
 
 
23
  con.raw_sql(f"CREATE OR REPLACE VIEW pad AS SELECT *, st_geomfromwkb(geometry) as geom from read_parquet('{parquet}')")
24
  pad = con.table("pad")
25
  # -
@@ -30,12 +37,9 @@ pad = con.table("pad")
30
  meta = fiona.open(fgb)
31
  crs = meta.crs
32
 
33
- # +
34
  ## optional getting bounds
35
- cog = "https://data.source.coop/cboettig/mobi/species-richness-all/SpeciesRichness_All.tif"
36
-
37
  # extract bounds. (in this case these are already in the same projection actually so r.rio.bounds() would work)
38
- r = rioxarray.open_rasterio(cog)
39
  bounds = box(*r.rio.transform_bounds(crs))
40
 
41
  # +
@@ -89,18 +93,29 @@ pad_grouping = (
89
  )
90
  .mutate(bucket = case)
91
  .select(categorical_columns)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92
  )
93
 
94
  pad_grouping.to_parquet("pad-groupings.parquet")
95
  # -
96
 
97
- agency_name = con.read_parquet("https://huggingface.co/datasets/boettiger-lab/pad-us-3/resolve/main/parquet/pad-agency-name.parquet").select(manager_name_id = "Code", manager_name = "Dom")
98
- agency_type = con.read_parquet("https://huggingface.co/datasets/boettiger-lab/pad-us-3/resolve/main/parquet/pad-agency-type.parquet").select(manager_type_id = "Code", manager_type = "Dom")
99
- desig_type = con.read_parquet("https://huggingface.co/datasets/boettiger-lab/pad-us-3/resolve/main/parquet/pad-desgination-type.parquet").select(designation_type_id = "Code", designation_type = "Dom")
100
- public_access = con.read_parquet("https://huggingface.co/datasets/boettiger-lab/pad-us-3/resolve/main/parquet/pad-public-access.parquet").select(public_access_id = "Code", public_access = "Dom")
101
- state_name = con.read_parquet("https://huggingface.co/datasets/boettiger-lab/pad-us-3/resolve/main/parquet/pad-state-name.parquet").select(state = "Code", state_name = "Dom")
102
- iucn = con.read_parquet("https://huggingface.co/datasets/boettiger-lab/pad-us-3/resolve/main/parquet/pad-iucn.parquet").select(iucn_code = "CODE", iucn_category = "DOM")
103
-
104
  (pad_parquet
105
  .rename(manager_name_id = "Mang_Name",
106
  manager_type_id = "Mang_Type",
 
5
  import geopandas as gpd
6
  import rioxarray
7
  from shapely.geometry import box
8
+ con = ibis.duckdb.connect()
9
+ con.load_extension("spatial")
10
+ threads = -1
11
+
12
+ agency_name = con.read_parquet("https://huggingface.co/datasets/boettiger-lab/pad-us-3/resolve/main/parquet/pad-agency-name.parquet").select(manager_name_id = "Code", manager_name = "Dom")
13
+ agency_type = con.read_parquet("https://huggingface.co/datasets/boettiger-lab/pad-us-3/resolve/main/parquet/pad-agency-type.parquet").select(manager_type_id = "Code", manager_type = "Dom")
14
+ desig_type = con.read_parquet("https://huggingface.co/datasets/boettiger-lab/pad-us-3/resolve/main/parquet/pad-desgination-type.parquet").select(designation_type_id = "Code", designation_type = "Dom")
15
+ public_access = con.read_parquet("https://huggingface.co/datasets/boettiger-lab/pad-us-3/resolve/main/parquet/pad-public-access.parquet").select(public_access_id = "Code", public_access = "Dom")
16
+ state_name = con.read_parquet("https://huggingface.co/datasets/boettiger-lab/pad-us-3/resolve/main/parquet/pad-state-name.parquet").select(state = "Code", state_name = "Dom")
17
+ iucn = con.read_parquet("https://huggingface.co/datasets/boettiger-lab/pad-us-3/resolve/main/parquet/pad-iucn.parquet").select(iucn_code = "CODE", iucn_category = "DOM")
18
 
19
  # +
20
 
21
  fgb = "https://data.source.coop/cboettig/pad-us-3/pad-us3-combined.fgb"
22
  parquet = "https://data.source.coop/cboettig/pad-us-3/pad-us3-combined.parquet"
23
  # gdb = "https://data.source.coop/cboettig/pad-us-3/PADUS3/PAD_US3_0.gdb" # original, all tables
 
 
 
 
 
24
  # or read the fgb version, much slower
25
  # pad = con.read_geo(fgb)
26
  # pad = con.read_parquet(parquet)
27
  # Currently ibis doesn't detect that this is GeoParquet. We need a SQL escape-hatch to cast the geometry
28
+
29
+
30
  con.raw_sql(f"CREATE OR REPLACE VIEW pad AS SELECT *, st_geomfromwkb(geometry) as geom from read_parquet('{parquet}')")
31
  pad = con.table("pad")
32
  # -
 
37
  meta = fiona.open(fgb)
38
  crs = meta.crs
39
 
 
40
  ## optional getting bounds
 
 
41
  # extract bounds. (in this case these are already in the same projection actually so r.rio.bounds() would work)
42
+ r = rioxarray.open_rasterio("https://data.source.coop/cboettig/mobi/species-richness-all/SpeciesRichness_All.tif")
43
  bounds = box(*r.rio.transform_bounds(crs))
44
 
45
  # +
 
93
  )
94
  .mutate(bucket = case)
95
  .select(categorical_columns)
96
+ .rename(manager_name_id = "Mang_Name",
97
+ manager_type_id = "Mang_Type",
98
+ designation_type_id = "Des_Tp",
99
+ public_access_id = "Pub_Access",
100
+ category = "FeatClass",
101
+ iucn_code = "IUCN_Cat",
102
+ gap_code = "GAP_Sts",
103
+ state = "State_Nm",
104
+ easement_holder = "EsmtHldr",
105
+ date_established = "Date_Est",
106
+ area_name = "Unit_Nm")
107
+ .left_join(agency_name, "manager_name_id")
108
+ .left_join(agency_type, "manager_type_id")
109
+ .left_join(desig_type, "designation_type_id")
110
+ .left_join(public_access, "public_access_id")
111
+ .left_join(state_name, "state")
112
+ .left_join(iucn, "iucn_code")
113
+ .select(~s.contains("_right"))
114
  )
115
 
116
  pad_grouping.to_parquet("pad-groupings.parquet")
117
  # -
118
 
 
 
 
 
 
 
 
119
  (pad_parquet
120
  .rename(manager_name_id = "Mang_Name",
121
  manager_type_id = "Mang_Type",