|
|
import ibis |
|
|
from ibis import _ |
|
|
|
|
|
# Ecoregion names in the fixed order that ``get_ecoregion`` indexes into.
# Kept at module level (and immutable) so the sequence is built once rather
# than on every call.
_ECOREGIONS = (
    'Central_California_Coast',
    'Central_Valley_Coast_Ranges',
    'Colorado_Desert',
    'Great_Valley_North',
    'Great_Valley_South',
    'Klamath_Mountains',
    'Modoc_Plateau',
    'Mojave_Desert',
    'Mono',
    'Northern_California_Coast',
    'Northern_California_Coast_Ranges',
    'Northern_California_Interior_Coast_Ranges',
    'Northwestern_Basin_and_Range',
    'Sierra_Nevada',
    'Sierra_Nevada_Foothills',
    'Sonoran_Desert',
    'Southeastern_Great_Basin',
    'Southern_California_Coast',
    'Southern_California_Mountains_and_Valleys',
    'Southern_Cascades',
)


def get_ecoregion(index):
    """Return the ecoregion name stored at position ``index``.

    Args:
        index: Integer position in the 20-entry ecoregion sequence. Normal
            Python indexing rules apply, so negative values count from the
            end (``-1`` is ``'Southern_Cascades'``).

    Returns:
        The ecoregion name as a string.

    Raises:
        IndexError: If ``index`` falls outside the sequence.
    """
    return _ECOREGIONS[index]
|
|
|
|
|
# Attribute columns copied unchanged from the input layer onto every output row.
_SPLIT_PASSTHROUGH_COLUMNS = (
    "name",
    "manager",
    "manager_type",
    "county",
    "gap_code",
    "status",
    "land_tenure",
    "ecoregion",
    "access_type",
)


def _select_split_rows(rows, t3, sub_id, habitat_type, climate_zone, geom,
                       sqm_per_acre):
    """Project ``rows`` onto the output schema shared by all ``split_layer`` branches."""
    return rows.select(
        id=t3.id,
        sub_id=sub_id,
        habitat_type=habitat_type,
        climate_zone=climate_zone,
        # Bracket access so a column such as "name" is always looked up as a
        # column of ``t3``.
        **{column: t3[column] for column in _SPLIT_PASSTHROUGH_COLUMNS},
        geom=geom,
        acres=(geom.area() / sqm_per_acre).round(4),
    )


def split_layer(data3_url, con, overlap_url=None):
    """Split each feature of a layer by the habitat/climate overlap polygons.

    The result is the union of three row sets that share one schema
    (``id``, ``sub_id``, ``habitat_type``, ``climate_zone``, the attribute
    columns, ``geom``, ``acres``):

    * matched   -- one row per (feature, overlap polygon) pair whose
      geometries intersect. ``geom`` is their intersection and ``sub_id`` is
      ``<id>_<habitat letter><climate letter>``.
    * unmatched -- features that intersect no overlap polygon, kept whole,
      with ``habitat_type``/``climate_zone`` set to the string ``"None"`` and
      ``sub_id`` ``<id>_nk``.
    * residual  -- for features with at least one match, the part of the
      feature left after subtracting the union of its matched pieces (kept
      only when that geometry is valid and has positive area), labelled like
      the unmatched rows.

    Overlap rows whose ``habitat_type`` or ``climate_zone`` is not a key of
    the letter maps below are dropped by the inner joins.

    Args:
        data3_url: Location passed to ``con.read_parquet``; the file must
            provide ``id``, ``geom`` and the attribute columns listed in
            ``_SPLIT_PASSTHROUGH_COLUMNS``.
        con: Connection object exposing ``read_parquet`` and returning ibis
            tables (presumably a DuckDB backend with spatial support --
            confirm against the caller).
        overlap_url: Optional location of the habitat/climate overlap
            parquet. Defaults to the CWHR13/climate layer used previously,
            so existing two-argument calls behave as before.

    Returns:
        The ibis table expression ``matched.union(unmatched).union(residual)``.
    """
    if overlap_url is None:
        overlap_url = 's3://public-ca30x30/CA_Nature/2024/Preprocessing/v3/Habitat_and_Climate_zones/CWHR13_climate_dissolved_geoms_simplify10m_includesNA.parquet'

    # Divisor applied to geometry areas to obtain acres.
    # NOTE(review): assumes geometry areas come out in square metres -- confirm CRS.
    SQM_PER_ACRE = 4046.8564224

    overlap_table = con.read_parquet(overlap_url)
    t3 = con.read_parquet(data3_url).select(
        "id", *_SPLIT_PASSTHROUGH_COLUMNS, "geom"
    )

    habitat_letter_map = {
        "Agriculture": "a",
        "Barren/Other": "b",
        "Conifer Forest": "c",
        "Conifer Woodland": "d",
        "Desert Shrub": "e",
        "Desert Woodland": "f",
        "Hardwood Forest": "g",
        "Hardwood Woodland": "h",
        "Herbaceous": "i",
        "Shrub": "j",
        "Urban": "k",
        "Water": "l",
        "Wetland": "m",
        "None": "n",
    }

    climate_letter_map = {
        "Zone 1": "a",
        "Zone 2": "b",
        "Zone 3": "c",
        "Zone 4": "d",
        "Zone 5": "e",
        "Zone 6": "f",
        "Zone 7": "g",
        "Zone 8": "h",
        "Zone 9": "i",
        "Zone 10": "j",
        "None": "k",
    }

    habitat_letter_table = ibis.memtable(
        [{"habitat_type": k, "habitat_letter": v} for k, v in habitat_letter_map.items()]
    )
    climate_letter_table = ibis.memtable(
        [{"climate_zone": k, "climate_letter": v} for k, v in climate_letter_map.items()]
    )

    # Attach the one-letter habitat and climate codes to every overlap polygon.
    overlap_labeled = (
        overlap_table
        .inner_join(habitat_letter_table, "habitat_type")
        .inner_join(climate_letter_table, "climate_zone")
    )

    # Pair every feature with every overlap polygon, then keep the
    # intersecting pairs.
    joined = t3.cross_join(overlap_labeled)
    joined = joined.mutate(intersects=t3.geom.intersects(overlap_labeled.geom))
    matched = joined.filter(joined.intersects)

    # NOTE(review): if the backend's ``intersects`` is also true for
    # geometries that merely touch, such pairs produce zero-area pieces here
    # -- confirm whether they should be filtered out.
    # NOTE(review): a piece labelled habitat "None" + climate "None" gets the
    # same ``<id>_nk`` sub_id as that feature's residual row -- confirm this
    # is intended.
    piece = t3.geom.intersection(overlap_labeled.geom)
    matched = _select_split_rows(
        matched,
        t3,
        sub_id=(
            t3.id.cast("string")
            .concat("_")
            .concat(overlap_labeled.habitat_letter)
            .concat(overlap_labeled.climate_letter)
        ),
        habitat_type=overlap_labeled.habitat_type,
        climate_zone=overlap_labeled.climate_zone,
        geom=piece,
        sqm_per_acre=SQM_PER_ACRE,
    )

    # Suffix for rows with no habitat/climate label: "_nk".
    unlabeled_suffix = "_" + habitat_letter_map["None"] + climate_letter_map["None"]
    unlabeled_sub_id = t3.id.cast("string").concat(unlabeled_suffix)

    # Features that matched nothing: the left join leaves the right-hand id null.
    matched_ids = matched.select("id").distinct()
    unmatched = t3.left_join(matched_ids, "id").filter(matched_ids.id.isnull())
    unmatched = _select_split_rows(
        unmatched,
        t3,
        sub_id=unlabeled_sub_id,
        habitat_type=ibis.literal("None"),
        climate_zone=ibis.literal("None"),
        geom=t3.geom,
        sqm_per_acre=SQM_PER_ACRE,
    )

    # For features with matches, whatever the matched pieces do not cover.
    matched_geoms = matched.group_by("id").aggregate(
        matched_union=matched.geom.unary_union()
    )
    residuals = t3.inner_join(matched_geoms, "id").mutate(
        residual_geom=t3.geom.difference(matched_geoms.matched_union)
    )
    residuals = residuals.filter(
        (_.residual_geom.is_valid()) & ((_.residual_geom.area()) > 0)
    )
    residual_rows = _select_split_rows(
        residuals,
        t3,
        sub_id=unlabeled_sub_id,
        habitat_type=ibis.literal("None"),
        climate_zone=ibis.literal("None"),
        geom=_.residual_geom,
        sqm_per_acre=SQM_PER_ACRE,
    )

    result = matched.union(unmatched).union(residual_rows)
    return result
|
|
|
|
|
def check_results(con, url, save_url):
    """Compare a processed parquet file against its source and report differences.

    Prints the number of ``id`` values present in ``url`` but absent from
    ``save_url``, the difference between the summed ``acres`` columns
    (original minus new) and the ratio new/original.

    Args:
        con: Connection object exposing ``read_parquet``; the returned tables
            must support ``select``, ``distinct`` and ``execute``.
        url: Location of the original parquet file (needs ``id`` and ``acres``).
        save_url: Location of the processed parquet file (same columns).

    Returns:
        List of ids found in the original but not in the processed file,
        sorted when the ids are mutually comparable.
    """
    # Read each file once and reuse the table for both checks.
    original = con.read_parquet(url)
    processed = con.read_parquet(save_url)

    original_id = original.select('id').distinct().execute()['id']
    new_id = processed.select('id').distinct().execute()['id']
    missing = set(original_id) - set(new_id)
    try:
        # Deterministic order instead of arbitrary set order.
        missing_ids = sorted(missing)
    except TypeError:
        # Ids that cannot be ordered (e.g. mixed with None): keep set order.
        missing_ids = list(missing)
    print(f'# of missing IDs: {len(missing_ids)}')

    original_acres = original.select('acres').execute()['acres'].sum()
    new_acres = processed.select('acres').execute()['acres'].sum()
    acres_loss = original_acres - new_acres
    print(f'Acres loss: {acres_loss}\n')

    # A zero original total has no meaningful ratio; report NaN rather than
    # dividing by zero.
    ratio = new_acres / original_acres if original_acres else float('nan')
    print(f'Ratio: {ratio}\n')
    return missing_ids
|
|
|