{ "cells": [ { "cell_type": "code", "execution_count": null, "id": "3e5756d2-382b-49e9-93b5-2ecf6d0eb812", "metadata": {}, "outputs": [], "source": [ "import duckdb\n", "\n", "con = duckdb.connect()\n", "\n", "con.execute(\"SET s3_region='us-west-2';\")\n", "con.execute(\"LOAD spatial;\")\n", "con.execute(\"LOAD httpfs;\")\n", "\n", "query = \"\"\"\n", " COPY (\n", " SELECT * \n", " FROM read_parquet('s3://overturemaps-us-west-2/release/2024-09-18.0/theme=divisions/*/*')\n", " WHERE country = 'US' AND subtype IN ('locality', 'neighborhood')\n", " ) TO 'us_localities_neighborhoods.parquet' (FORMAT 'parquet');\n", "\"\"\"\n", "con.execute(query)\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "id": "25f62dd7-5539-438b-8f0a-1d85c9bc78ab", "metadata": {}, "outputs": [], "source": [ "import ibis\n", "from ibis import _\n", "\n", "conn = ibis.duckdb.connect(extensions=[\"spatial\"])\n", "\n", "df = (conn\n", " .read_parquet(\"us_localities_neighborhoods.parquet\")\n", " .cast({\"geometry\": \"geometry\"})\n", " .filter(_[\"type\"] == \"division\")\n", " .filter(_[\"subtype\"] == \"locality\")\n", " .mutate(name = _.names[\"primary\"])\n", " .mutate(state_id = _.region.replace(\"US-\", \"\")) \n", " .mutate(county = _.hierarchies[0][2]['name'] )\n", " .mutate(key_long = _.name + ibis.literal('-') + _.county + ibis.literal('-') + _.state_id)\n", " .select(\"key_long\",\"name\", \"county\",\"state_id\" ,\"geometry\")\n", " )\n", "\n", "\n", "## Dropping rows with same locality and state, with differing counties \n", "county_count = (\n", " df.group_by([\"name\", \"state_id\"])\n", " .aggregate(county_count=_.county.nunique()) # Count unique counties for each group\n", ") \n", "valid_names = county_count.filter(county_count.county_count == 1).select(\"name\", \"state_id\")\n", "df_filtered = df.join(valid_names, [\"name\", \"state_id\"], how=\"inner\")\n", "\n", "\n", "# if two records have the same name but different geometries, only keep the first one.\n", "df_first = (\n", " df_filtered.group_by(\"key_long\")\n", " .aggregate(\n", " name=df_filtered.name.first(),\n", " county=df_filtered.county.first(),\n", " state_id=df_filtered.state_id.first(),\n", " geometry=df_filtered.geometry.first()\n", " )\n", "\n", ")\n", "\n", "df_first.execute().to_parquet(\"us_localities.parquet\")\n" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.12" } }, "nbformat": 4, "nbformat_minor": 5 }