Spaces:
Sleeping
Sleeping
Scripts for map creation in the app
Browse files- utils/compare_old_coord.py +65 -0
- utils/get_coordinates.py +200 -0
- utils/process_data.py +61 -0
utils/compare_old_coord.py
ADDED
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import pandas as pd
|
3 |
+
import logging
|
4 |
+
|
5 |
+
# define logger
|
6 |
+
logging.basicConfig(
|
7 |
+
level=logging.INFO,
|
8 |
+
format="%(asctime)s [%(levelname)s] %(message)s",
|
9 |
+
handlers=[
|
10 |
+
logging.FileHandler("process_data.log"),
|
11 |
+
logging.StreamHandler(),
|
12 |
+
],
|
13 |
+
)
|
14 |
+
|
15 |
+
|
16 |
+
CITIES_ENRICHED_OLD = os.path.join("data", "cities_enriched_old.csv")
|
17 |
+
CITIES_ENRICHED_NEW = os.path.join("data", "cities_enriched.csv")
|
18 |
+
CITIES_ENRICHED_FINAL = os.path.join("data", "cities_enriched_final.csv")
|
19 |
+
MISSING = os.path.join("data", "missing.csv")
|
20 |
+
|
21 |
+
|
22 |
+
def load_data(path: str) -> pd.DataFrame:
|
23 |
+
df = pd.read_csv(path)
|
24 |
+
return df
|
25 |
+
|
26 |
+
|
27 |
+
def compare_cities(old: pd.DataFrame, new: pd.DataFrame) -> tuple:
|
28 |
+
old_cities = old["ORG"].unique()
|
29 |
+
new_cities = new["Kommune"].unique()
|
30 |
+
new_cities = set(new_cities) - set(old_cities)
|
31 |
+
deleted_cities = set(old_cities) - set(new_cities)
|
32 |
+
return new_cities, deleted_cities
|
33 |
+
|
34 |
+
|
35 |
+
def enrich_new(old, new) -> pd.DataFrame:
|
36 |
+
missing = new[new["Geometry"].apply(lambda x: (x == "[]") or x is None)]
|
37 |
+
for row in missing.itertuples():
|
38 |
+
old_city = old[old["ORG"] == row.Kommune]
|
39 |
+
old_city_code = old[old["Code"] == row.Code]
|
40 |
+
# print(type(old_city.Geometry.iloc[0]), old_city)
|
41 |
+
if len(old_city) > 0:
|
42 |
+
print(new.iloc[row.Index, 2])
|
43 |
+
new.iloc[row.Index, 2] = old_city["Geometry"].iloc[0]
|
44 |
+
elif len(old_city_code) > 0:
|
45 |
+
new.iloc[row.Index, 2] = old_city_code["Geometry"].iloc[0]
|
46 |
+
print(new.loc[[row.Index], ["Geometry"]])
|
47 |
+
return new
|
48 |
+
|
49 |
+
|
50 |
+
def report_missing(new):
|
51 |
+
missing = new[new["Geometry"].apply(lambda x: (x == "[]") or x is None)]
|
52 |
+
logging.info(f"Finally missing cities: {missing['Kommune'].unique()}")
|
53 |
+
return missing
|
54 |
+
|
55 |
+
|
56 |
+
if __name__ == "__main__":
|
57 |
+
old = load_data(CITIES_ENRICHED_OLD)
|
58 |
+
new = load_data(CITIES_ENRICHED_NEW)
|
59 |
+
new_cities, deleted_cities = compare_cities(old, new)
|
60 |
+
logging.info(f"New cities: {new_cities}")
|
61 |
+
logging.info(f"Deleted cities: {deleted_cities}")
|
62 |
+
new = enrich_new(old, new)
|
63 |
+
new.to_csv(CITIES_ENRICHED_FINAL, index=False)
|
64 |
+
missing = report_missing(new)
|
65 |
+
missing.to_csv(MISSING, index=False)
|
utils/get_coordinates.py
ADDED
@@ -0,0 +1,200 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
import os
|
3 |
+
import logging
|
4 |
+
import numpy as np
|
5 |
+
import ast
|
6 |
+
import math
|
7 |
+
from pathlib import Path
|
8 |
+
|
9 |
+
# define logger
|
10 |
+
logging.basicConfig(
|
11 |
+
level=logging.INFO,
|
12 |
+
format="%(asctime)s [%(levelname)s] %(message)s",
|
13 |
+
handlers=[
|
14 |
+
logging.FileHandler("process_data.log"),
|
15 |
+
logging.StreamHandler(),
|
16 |
+
],
|
17 |
+
)
|
18 |
+
|
19 |
+
|
20 |
+
CITIES_DATA = os.path.join("data", "raw", "2024_06_24_cities_0624_v5.csv")
|
21 |
+
DATA_ENRICHED = os.path.join("data", "cities_enriched.csv")
|
22 |
+
|
23 |
+
# meta data for kreis codes ( variable in coordinates table)
|
24 |
+
NAME_CODE_DATA = os.path.join("data", "raw", "name_kreiscode.csv")
|
25 |
+
CODES_KOMMUNEN = os.path.join("data", "raw", "Deutschlandatlas.csv")
|
26 |
+
# coordinates for Gemeinden
|
27 |
+
COORDINATES = os.path.join("data", "raw", "coordinates_plz_kreiscode.csv")
|
28 |
+
|
29 |
+
if not os.path.exists(os.path.join("data", "preprocessed")):
|
30 |
+
Path(os.path.join("data", "preprocessed")).mkdir(parents=True, exist_ok=True)
|
31 |
+
|
32 |
+
|
33 |
+
def load_cities(path: str) -> pd.DataFrame:
|
34 |
+
df = pd.read_csv(path)
|
35 |
+
df.drop(columns=["name"], inplace=True)
|
36 |
+
df.drop_duplicates(inplace=True)
|
37 |
+
return df
|
38 |
+
|
39 |
+
|
40 |
+
def create_code_mapper(path: str) -> dict:
|
41 |
+
name_code = pd.read_csv(
|
42 |
+
path, sep=";", encoding="latin_1", names=["Datum", "Code", "Name", "Fläche"]
|
43 |
+
)[7:13929]
|
44 |
+
# adds all Landkreise and gemeinden to the mapper
|
45 |
+
code_mapper = {
|
46 |
+
(key if type(key) != float else "0000"): value
|
47 |
+
for key, value in zip(name_code["Name"], name_code["Code"])
|
48 |
+
}
|
49 |
+
# adds all gemeindeverbände to the mapper
|
50 |
+
kommunen_code = pd.read_csv(CODES_KOMMUNEN, sep=";", encoding="latin_1")
|
51 |
+
code_mapper_update = {
|
52 |
+
key: value
|
53 |
+
for key, value in zip(kommunen_code["name"], kommunen_code["Gebietskennziffer"])
|
54 |
+
}
|
55 |
+
print(code_mapper_update)
|
56 |
+
code_mapper.update(code_mapper_update)
|
57 |
+
return code_mapper
|
58 |
+
|
59 |
+
|
60 |
+
def map_code(org_name, code_mapper):
|
61 |
+
# Split the org_name string into parts
|
62 |
+
parts = org_name.split()
|
63 |
+
# print(parts, type(parts[0]))
|
64 |
+
# Find a key in code_mapper that contains all parts of the split org_name
|
65 |
+
for key in code_mapper.keys():
|
66 |
+
# look first for whole name (cases like "Landkreis München" , "kreisfreie Stadt München")
|
67 |
+
if all(part in key for part in parts):
|
68 |
+
return code_mapper[key]
|
69 |
+
elif any(part in key for part in parts):
|
70 |
+
return code_mapper[key]
|
71 |
+
# Return None or a default value if no key matches all parts
|
72 |
+
return None
|
73 |
+
|
74 |
+
|
75 |
+
# main goal with this: identify Landkreise and their codes
|
76 |
+
def add_code(df: pd.DataFrame, code_mapper: dict) -> pd.DataFrame:
|
77 |
+
"""Add the (Kreis-/Gemeinde-)code to the dataframe based on the name of the (administrative) region."""
|
78 |
+
df["Code"] = df["Kommune"].apply(lambda x: map_code(x, code_mapper))
|
79 |
+
df["Code"] = df["Code"].apply(lambda x: int(x) if x is not None else None)
|
80 |
+
return df
|
81 |
+
|
82 |
+
|
83 |
+
def org_in_plzname(org_name, plz_name):
|
84 |
+
parts = org_name.split()
|
85 |
+
if any(part in plz_name for part in parts):
|
86 |
+
return True
|
87 |
+
else:
|
88 |
+
return False
|
89 |
+
|
90 |
+
|
91 |
+
def load_coordinates(path: str) -> pd.DataFrame:
|
92 |
+
return pd.read_csv(path, sep=";")
|
93 |
+
|
94 |
+
|
95 |
+
# maybe 2d coordinates instead of geometry
|
96 |
+
def merge_coordinates(df: pd.DataFrame, coordinates: pd.DataFrame) -> pd.DataFrame:
|
97 |
+
"""Merge the coordinates of the regions to the dataframe. Try to use
|
98 |
+
Kreiscode first, if it consists of 5 digits. Else, use the name of
|
99 |
+
the region.
|
100 |
+
"""
|
101 |
+
geometries = []
|
102 |
+
for row in df.itertuples():
|
103 |
+
# adds coordinates for Landkreise
|
104 |
+
if pd.notna(row.Code) and (
|
105 |
+
len(str(int(row.Code))) == 5 or len(str(int(row.Code))) == 4
|
106 |
+
):
|
107 |
+
coor = coordinates[coordinates["Kreis code"] == row.Code]
|
108 |
+
geometry = [co.geo_point_2d for co in coor.itertuples()]
|
109 |
+
geometries.append(geometry)
|
110 |
+
else:
|
111 |
+
coor = coordinates[
|
112 |
+
coordinates["PLZ Name (short)"].apply(
|
113 |
+
lambda x: org_in_plzname(row.Kommune, x)
|
114 |
+
)
|
115 |
+
]
|
116 |
+
# adds coordinates for Gemeindenamen in coordinates table
|
117 |
+
if len(coor) > 0:
|
118 |
+
geometry = [co.geo_point_2d for co in coor.itertuples()]
|
119 |
+
geometries.append(geometry)
|
120 |
+
# adds coordinates from infered kreis code if Gebietskennziffer available
|
121 |
+
elif row.Code and pd.notna(row.Code): # and not math.isnan(row.Code):
|
122 |
+
if len(str(int(row.Code))) < 4:
|
123 |
+
code_str = str(int(row.Code))
|
124 |
+
coor = coordinates[
|
125 |
+
coordinates["Kreis code"]
|
126 |
+
.astype(str)
|
127 |
+
.apply(lambda x: x[: len(code_str)])
|
128 |
+
== code_str
|
129 |
+
]
|
130 |
+
geometry = [co.geo_point_2d for co in coor.itertuples()]
|
131 |
+
geometries.append(geometry)
|
132 |
+
elif str(row.Code)[:2] in ["11", "12", "13", "14", "15", "16"]:
|
133 |
+
coor = coordinates[
|
134 |
+
coordinates["Kreis code"] == int(str(row.Code)[:5])
|
135 |
+
]
|
136 |
+
else:
|
137 |
+
coor = coordinates[
|
138 |
+
coordinates["Kreis code"] == int(str(row.Code)[:4])
|
139 |
+
]
|
140 |
+
geometry = [co.geo_point_2d for co in coor.itertuples()]
|
141 |
+
geometries.append(geometry)
|
142 |
+
else:
|
143 |
+
geometries.append([])
|
144 |
+
df["Geometry"] = geometries
|
145 |
+
return df
|
146 |
+
|
147 |
+
|
148 |
+
def aggregate_coordinates(geo_element: str) -> list:
|
149 |
+
# Convert the string representation of a list into an actual list
|
150 |
+
if geo_element == "[]" or geo_element == []:
|
151 |
+
return []
|
152 |
+
else:
|
153 |
+
actual_list = geo_element # ast.literal_eval(geo_element)
|
154 |
+
processed_list = [list(map(float, coord.split(", "))) for coord in actual_list]
|
155 |
+
# print(processed_list)
|
156 |
+
if len(processed_list) > 1:
|
157 |
+
coordinates = np.mean(np.array(processed_list), axis=0)
|
158 |
+
else:
|
159 |
+
coordinates = np.array(processed_list[0])
|
160 |
+
return coordinates
|
161 |
+
|
162 |
+
|
163 |
+
if __name__ == "__main__":
|
164 |
+
code_mapper = create_code_mapper(NAME_CODE_DATA)
|
165 |
+
logging.info("Code mapper created")
|
166 |
+
cities = load_cities(CITIES_DATA)
|
167 |
+
data = add_code(cities, code_mapper)
|
168 |
+
missing = data[data["Code"].isnull()]
|
169 |
+
logging.info(f"Missing values Gebietscode: {len(missing)}")
|
170 |
+
data.to_csv(
|
171 |
+
os.path.join("data", "preprocessed", "cities_enriched_with_code.csv"),
|
172 |
+
index=False,
|
173 |
+
)
|
174 |
+
# data = pd.read_csv(
|
175 |
+
# os.path.join("data", "preprocessed", "cities_enriched_with_code.csv"))
|
176 |
+
data["Code"] = data["Code"].apply(lambda x: int(x) if pd.notna(x) else None)
|
177 |
+
coordinates = load_coordinates(COORDINATES)
|
178 |
+
data = merge_coordinates(data, coordinates)
|
179 |
+
data.to_csv(
|
180 |
+
os.path.join("data", "preprocessed", "cities_enriched_with_coordinates.csv"),
|
181 |
+
index=False,
|
182 |
+
)
|
183 |
+
logging.info("Coordinates merged")
|
184 |
+
|
185 |
+
missing = data[
|
186 |
+
[
|
187 |
+
all([x, y])
|
188 |
+
for x, y in zip(
|
189 |
+
data["Geometry"].apply(lambda x: x == []), data["Code"].isnull()
|
190 |
+
)
|
191 |
+
]
|
192 |
+
]
|
193 |
+
missing_geometry = data[data["Geometry"].apply(lambda x: x == [])]
|
194 |
+
logging.info(f"Missing values: {len(missing)}")
|
195 |
+
logging.info(f"Missing geometry: {len(missing_geometry)}")
|
196 |
+
missing_geometry.to_csv(os.path.join("data", "missing_values.csv"), index=False)
|
197 |
+
|
198 |
+
# data = pd.read_csv(os.path.join("data", "cities_enriched_manually.csv"))
|
199 |
+
data["Geometry"] = data["Geometry"].apply(aggregate_coordinates)
|
200 |
+
data.to_csv(DATA_ENRICHED, index=False)
|
utils/process_data.py
ADDED
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import logging
|
3 |
+
import pandas as pd
|
4 |
+
|
5 |
+
|
6 |
+
# define logger
|
7 |
+
logging.basicConfig(
|
8 |
+
level=logging.INFO,
|
9 |
+
format="%(asctime)s [%(levelname)s] %(message)s",
|
10 |
+
handlers=[
|
11 |
+
logging.FileHandler("process_data.log"),
|
12 |
+
logging.StreamHandler(),
|
13 |
+
],
|
14 |
+
)
|
15 |
+
|
16 |
+
# change these to paths if you want to generate the map_data.csv separately from the app
|
17 |
+
DATA_RAW = os.path.join("data.json")
|
18 |
+
CITIES_ENRICHED = os.path.join("data", "cities_enriched_final.csv")
|
19 |
+
|
20 |
+
OUTPUT = os.path.join("data", "preprocessed", "map_data.csv")
|
21 |
+
|
22 |
+
|
23 |
+
def load_data(path: str = DATA_RAW) -> pd.DataFrame:
|
24 |
+
df = pd.read_json(path)
|
25 |
+
counts = df["ORG"].value_counts().reset_index()
|
26 |
+
counts.columns = ["ORG", "Count"]
|
27 |
+
return counts
|
28 |
+
|
29 |
+
|
30 |
+
def merge_geoemtry(data: pd.DataFrame, cities: pd.DataFrame) -> pd.DataFrame:
|
31 |
+
data = data.merge(cities, left_on="ORG", right_on="Kommune", how="left")
|
32 |
+
return data
|
33 |
+
|
34 |
+
|
35 |
+
def add_coor(data: pd.DataFrame):
|
36 |
+
# very experminetal, but works
|
37 |
+
if type(data["Geometry"].iloc[0]) == str:
|
38 |
+
data["Geometry"] = data["Geometry"].apply(
|
39 |
+
lambda x: [
|
40 |
+
float(item) if type(item) != float else None
|
41 |
+
for item in x.strip("[]").split()
|
42 |
+
]
|
43 |
+
)
|
44 |
+
# print(type(data["Geometry"].iloc[0]), data["Geometry"].iloc[0])
|
45 |
+
data["lat"] = data["Geometry"].apply(lambda x: float(x[0]) if x != [] else None)
|
46 |
+
data["lon"] = data["Geometry"].apply(lambda x: float(x[1]) if x != [] else None)
|
47 |
+
return data
|
48 |
+
|
49 |
+
|
50 |
+
if __name__ == "__main__":
|
51 |
+
extraction = load_data()
|
52 |
+
# extraction.to_csv(
|
53 |
+
# os.path.join("data", "preprocessed", "map_data.csv"), index=False)
|
54 |
+
logging.info("Extraction data loaded.")
|
55 |
+
extraction = merge_geoemtry(extraction, pd.read_csv(CITIES_ENRICHED))
|
56 |
+
logging.info("Data merged with Geometry from cities.csv.")
|
57 |
+
# extraction = extraction[extraction["Geometry"].notna()]
|
58 |
+
extraction_enriched = add_coor(extraction)
|
59 |
+
logging.info("Extra columns for lat/lon created from Geometry column.")
|
60 |
+
extraction_enriched.to_csv(OUTPUT, index=False)
|
61 |
+
logging.info("Data enriched and saved.")
|