File size: 3,174 Bytes
c67e57e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3e5756d2-382b-49e9-93b5-2ecf6d0eb812",
   "metadata": {},
   "outputs": [],
   "source": [
    "import duckdb\n",
    "\n",
    "con = duckdb.connect()\n",
    "\n",
    "con.execute(\"SET s3_region='us-west-2';\")\n",
    "con.execute(\"LOAD spatial;\")\n",
    "con.execute(\"LOAD httpfs;\")\n",
    "\n",
    "query = \"\"\"\n",
    "    COPY (\n",
    "        SELECT * \n",
    "        FROM read_parquet('s3://overturemaps-us-west-2/release/2024-09-18.0/theme=divisions/*/*')\n",
    "        WHERE country = 'US' AND subtype IN ('locality', 'neighborhood')\n",
    "    ) TO 'us_localities_neighborhoods.parquet' (FORMAT 'parquet');\n",
    "\"\"\"\n",
    "con.execute(query)\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "25f62dd7-5539-438b-8f0a-1d85c9bc78ab",
   "metadata": {},
   "outputs": [],
   "source": [
    "import ibis\n",
    "from ibis import _\n",
    "\n",
    "conn = ibis.duckdb.connect(extensions=[\"spatial\"])\n",
    "\n",
    "df = (conn\n",
    "      .read_parquet(\"us_localities_neighborhoods.parquet\")\n",
    "      .cast({\"geometry\": \"geometry\"})\n",
    "      .filter(_[\"type\"] == \"division\")\n",
    "      .filter(_[\"subtype\"] == \"locality\")\n",
    "      .mutate(name = _.names[\"primary\"])\n",
    "      .mutate(state_id = _.region.replace(\"US-\", \"\")) \n",
    "      .mutate(county = _.hierarchies[0][2]['name'] )\n",
    "      .mutate(key_long = _.name + ibis.literal('-') + _.county + ibis.literal('-') + _.state_id)\n",
    "      .select(\"key_long\",\"name\", \"county\",\"state_id\" ,\"geometry\")\n",
    "     )\n",
    "\n",
    "\n",
    "## Dropping rows with same locality and state, with differing counties \n",
    "county_count = (\n",
    "    df.group_by([\"name\", \"state_id\"])\n",
    "    .aggregate(county_count=_.county.nunique())  # Count unique counties for each group\n",
    ") \n",
    "valid_names = county_count.filter(county_count.county_count == 1).select(\"name\", \"state_id\")\n",
    "df_filtered = df.join(valid_names, [\"name\", \"state_id\"], how=\"inner\")\n",
    "\n",
    "\n",
    "# if two records have the same name but different geometries, only keep the first one.\n",
    "df_first = (\n",
    "    df_filtered.group_by(\"key_long\")\n",
    "    .aggregate(\n",
    "        name=df_filtered.name.first(),\n",
    "        county=df_filtered.county.first(),\n",
    "        state_id=df_filtered.state_id.first(),\n",
    "        geometry=df_filtered.geometry.first()\n",
    "    )\n",
    "\n",
    ")\n",
    "\n",
    "df_first.execute().to_parquet(\"us_localities.parquet\")\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}