Josephina committed on
Commit
d689310
1 Parent(s): 05c3095

scripts for map creation in app

Browse files
utils/compare_old_coord.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pandas as pd
3
+ import logging
4
+
5
+ # define logger
6
+ logging.basicConfig(
7
+ level=logging.INFO,
8
+ format="%(asctime)s [%(levelname)s] %(message)s",
9
+ handlers=[
10
+ logging.FileHandler("process_data.log"),
11
+ logging.StreamHandler(),
12
+ ],
13
+ )
14
+
15
+
16
+ CITIES_ENRICHED_OLD = os.path.join("data", "cities_enriched_old.csv")
17
+ CITIES_ENRICHED_NEW = os.path.join("data", "cities_enriched.csv")
18
+ CITIES_ENRICHED_FINAL = os.path.join("data", "cities_enriched_final.csv")
19
+ MISSING = os.path.join("data", "missing.csv")
20
+
21
+
22
def load_data(path: str) -> pd.DataFrame:
    """Read one of the enriched city CSV files into a DataFrame."""
    return pd.read_csv(path)
25
+
26
+
27
def compare_cities(old: pd.DataFrame, new: pd.DataFrame) -> tuple:
    """Return (added, removed) city-name sets between old and new data.

    Bug fix: `deleted_cities` is now diffed against the ORIGINAL set of new
    names. Previously `new_cities` was reassigned to the added-only set
    first, so the deleted diff compared against a disjoint set and reported
    every old city as deleted.
    """
    old_cities = set(old["ORG"].unique())
    new_cities = set(new["Kommune"].unique())
    added = new_cities - old_cities
    deleted = old_cities - new_cities
    return added, deleted
33
+
34
+
35
def enrich_new(old, new) -> pd.DataFrame:
    """Fill empty geometries in *new* from *old*, matched by name then code.

    Bug fixes:
    - read_csv yields NaN (not None) for empty cells, so the old
      `x is None` check never detected truly missing geometries; use
      pd.isna instead.
    - write through `.loc[label, "Geometry"]` rather than positional
      `.iloc[..., 2]`, which silently assumed Geometry is column 2.
    Debug print statements removed.
    """

    def _is_missing(value):
        # literal "[]" is the serialized empty list; pd.isna covers NaN/None
        return pd.isna(value) or value == "[]"

    missing = new[new["Geometry"].apply(_is_missing)]
    for row in missing.itertuples():
        # prefer a match on the city name, fall back to the region code
        by_name = old[old["ORG"] == row.Kommune]
        by_code = old[old["Code"] == row.Code]
        if len(by_name) > 0:
            new.loc[row.Index, "Geometry"] = by_name["Geometry"].iloc[0]
        elif len(by_code) > 0:
            new.loc[row.Index, "Geometry"] = by_code["Geometry"].iloc[0]
    return new
48
+
49
+
50
def report_missing(new):
    """Return the rows whose Geometry is still missing, and log their names.

    Bug fix: read_csv represents empty cells as NaN, not None, so the old
    `x is None` test never matched; pd.isna catches both NaN and None.
    """
    missing = new[new["Geometry"].apply(lambda x: pd.isna(x) or x == "[]")]
    logging.info(f"Finally missing cities: {missing['Kommune'].unique()}")
    return missing
54
+
55
+
56
if __name__ == "__main__":
    # load the previous and the freshly generated enrichment results
    old = load_data(CITIES_ENRICHED_OLD)
    new = load_data(CITIES_ENRICHED_NEW)
    # report which cities appeared/disappeared between the two versions
    new_cities, deleted_cities = compare_cities(old, new)
    logging.info(f"New cities: {new_cities}")
    logging.info(f"Deleted cities: {deleted_cities}")
    # back-fill missing geometries in the new data from the old data
    new = enrich_new(old, new)
    new.to_csv(CITIES_ENRICHED_FINAL, index=False)
    # persist whatever could not be resolved, for manual follow-up
    missing = report_missing(new)
    missing.to_csv(MISSING, index=False)
utils/get_coordinates.py ADDED
@@ -0,0 +1,200 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import os
3
+ import logging
4
+ import numpy as np
5
+ import ast
6
+ import math
7
+ from pathlib import Path
8
+
9
+ # define logger
10
+ logging.basicConfig(
11
+ level=logging.INFO,
12
+ format="%(asctime)s [%(levelname)s] %(message)s",
13
+ handlers=[
14
+ logging.FileHandler("process_data.log"),
15
+ logging.StreamHandler(),
16
+ ],
17
+ )
18
+
19
+
20
+ CITIES_DATA = os.path.join("data", "raw", "2024_06_24_cities_0624_v5.csv")
21
+ DATA_ENRICHED = os.path.join("data", "cities_enriched.csv")
22
+
23
+ # meta data for kreis codes ( variable in coordinates table)
24
+ NAME_CODE_DATA = os.path.join("data", "raw", "name_kreiscode.csv")
25
+ CODES_KOMMUNEN = os.path.join("data", "raw", "Deutschlandatlas.csv")
26
+ # coordinates for Gemeinden
27
+ COORDINATES = os.path.join("data", "raw", "coordinates_plz_kreiscode.csv")
28
+
29
+ if not os.path.exists(os.path.join("data", "preprocessed")):
30
+ Path(os.path.join("data", "preprocessed")).mkdir(parents=True, exist_ok=True)
31
+
32
+
33
def load_cities(path: str) -> pd.DataFrame:
    """Load the raw cities CSV, drop the 'name' column and de-duplicate rows."""
    cities = pd.read_csv(path)
    return cities.drop(columns=["name"]).drop_duplicates()
38
+
39
+
40
def create_code_mapper(path: str) -> dict:
    """Build a name -> (Kreis-/Gemeinde-)code mapping from the metadata CSVs.

    Reads the name/Kreiscode table at *path*, then extends the mapping with
    the Gemeindeverband codes from CODES_KOMMUNEN (later entries win).
    Leftover debug print removed; float check made idiomatic.
    """
    name_code = pd.read_csv(
        path, sep=";", encoding="latin_1", names=["Datum", "Code", "Name", "Fläche"]
    )[7:13929]  # NOTE(review): hard-coded row window skips header/footer rows — confirm against the file
    # adds all Landkreise and Gemeinden to the mapper; float keys are NaN
    # placeholders from empty Name cells and all collapse under "0000"
    code_mapper = {
        ("0000" if isinstance(key, float) else key): value
        for key, value in zip(name_code["Name"], name_code["Code"])
    }
    # adds all Gemeindeverbände to the mapper
    kommunen_code = pd.read_csv(CODES_KOMMUNEN, sep=";", encoding="latin_1")
    code_mapper.update(
        zip(kommunen_code["name"], kommunen_code["Gebietskennziffer"])
    )
    return code_mapper
58
+
59
+
60
def map_code(org_name, code_mapper):
    """Return the code whose mapper key best matches *org_name*.

    Bug fix: the original loop tested `all(...)` and `any(...)` on the SAME
    key inside one pass, so the first key matching ANY word won regardless
    of whether a later key matched the whole name — defeating the stated
    "look first for whole name" intent (e.g. "Landkreis München" could bind
    to "Landkreis Rosenheim"). Two passes restore the intended priority.
    Returns None when no key matches.
    """
    parts = org_name.split()
    # first pass: prefer a key containing every word
    # (cases like "Landkreis München" vs "kreisfreie Stadt München")
    for key in code_mapper:
        if all(part in key for part in parts):
            return code_mapper[key]
    # second pass: fall back to a key containing any word
    for key in code_mapper:
        if any(part in key for part in parts):
            return code_mapper[key]
    # no key matches any part of the name
    return None
73
+
74
+
75
+ # main goal with this: identify Landkreise and their codes
76
def add_code(df: pd.DataFrame, code_mapper: dict) -> pd.DataFrame:
    """Add the (Kreis-/Gemeinde-)code to the dataframe based on the name of the (administrative) region."""
    df["Code"] = df["Kommune"].apply(lambda x: map_code(x, code_mapper))
    # pd.notna also guards against NaN codes coming from the metadata CSV;
    # the previous `x is not None` check would crash on int(nan).  (The
    # __main__ block below already uses the pd.notna form for the same step.)
    df["Code"] = df["Code"].apply(lambda x: int(x) if pd.notna(x) else None)
    return df
81
+
82
+
83
def org_in_plzname(org_name, plz_name):
    """True if any whitespace-separated word of org_name occurs in plz_name."""
    words = org_name.split()
    return any(word in plz_name for word in words)
89
+
90
+
91
def load_coordinates(path: str) -> pd.DataFrame:
    """Read the semicolon-separated coordinates CSV."""
    frame = pd.read_csv(path, sep=";")
    return frame
93
+
94
+
95
+ # maybe 2d coordinates instead of geometry
96
def merge_coordinates(df: pd.DataFrame, coordinates: pd.DataFrame) -> pd.DataFrame:
    """Merge the coordinates of the regions to the dataframe. Try to use
    Kreiscode first, if it consists of 5 digits. Else, use the name of
    the region.
    """
    # One entry per row of df; assigned as the new "Geometry" column at the
    # end, so it must stay aligned with df's rows.
    geometries = []
    for row in df.itertuples():
        # adds coordinates for Landkreise: 4- or 5-digit codes are looked up
        # directly against the "Kreis code" column
        if pd.notna(row.Code) and (
            len(str(int(row.Code))) == 5 or len(str(int(row.Code))) == 4
        ):
            coor = coordinates[coordinates["Kreis code"] == row.Code]
            geometry = [co.geo_point_2d for co in coor.itertuples()]
            geometries.append(geometry)
        else:
            # fall back to a fuzzy match of the Kommune name against PLZ names
            coor = coordinates[
                coordinates["PLZ Name (short)"].apply(
                    lambda x: org_in_plzname(row.Kommune, x)
                )
            ]
            # adds coordinates for Gemeindenamen in coordinates table
            if len(coor) > 0:
                geometry = [co.geo_point_2d for co in coor.itertuples()]
                geometries.append(geometry)
            # adds coordinates from infered kreis code if Gebietskennziffer available
            elif row.Code and pd.notna(row.Code):  # and not math.isnan(row.Code):
                if len(str(int(row.Code))) < 4:
                    # short code: prefix-match it against stringified Kreis codes
                    code_str = str(int(row.Code))
                    coor = coordinates[
                        coordinates["Kreis code"]
                        .astype(str)
                        .apply(lambda x: x[: len(code_str)])
                        == code_str
                    ]
                    geometry = [co.geo_point_2d for co in coor.itertuples()]
                    geometries.append(geometry)
                elif str(row.Code)[:2] in ["11", "12", "13", "14", "15", "16"]:
                    # presumably codes for specific Länder get a 5-digit prefix
                    # lookup — TODO confirm the prefix semantics
                    coor = coordinates[
                        coordinates["Kreis code"] == int(str(row.Code)[:5])
                    ]
                else:
                    coor = coordinates[
                        coordinates["Kreis code"] == int(str(row.Code)[:4])
                    ]
                # NOTE(review): this append also runs after the `< 4` branch,
                # which already appended above — that path would push two
                # entries for one row and desync `geometries` from df.
                # Confirm whether short codes can actually reach this point.
                geometry = [co.geo_point_2d for co in coor.itertuples()]
                geometries.append(geometry)
            else:
                # no name match and no usable code: leave the row empty
                geometries.append([])
    df["Geometry"] = geometries
    return df
146
+
147
+
148
def aggregate_coordinates(geo_element: str) -> list:
    """Collapse a list of "lat, lon" strings to one point.

    Returns [] for empty input; a single parsed point as an ndarray; or the
    element-wise mean of all points when several are present.
    """
    if geo_element == "[]" or geo_element == []:
        return []
    # each entry looks like "48.1, 11.5" -> [48.1, 11.5]
    parsed = [[float(v) for v in entry.split(", ")] for entry in geo_element]
    if len(parsed) > 1:
        return np.mean(np.array(parsed), axis=0)
    return np.array(parsed[0])
161
+
162
+
163
if __name__ == "__main__":
    # build the name -> code lookup from the metadata CSVs
    code_mapper = create_code_mapper(NAME_CODE_DATA)
    logging.info("Code mapper created")
    cities = load_cities(CITIES_DATA)
    data = add_code(cities, code_mapper)
    missing = data[data["Code"].isnull()]
    logging.info(f"Missing values Gebietscode: {len(missing)}")
    # checkpoint: cities with their inferred codes
    data.to_csv(
        os.path.join("data", "preprocessed", "cities_enriched_with_code.csv"),
        index=False,
    )
    # data = pd.read_csv(
    #     os.path.join("data", "preprocessed", "cities_enriched_with_code.csv"))
    # normalize codes to plain ints (missing codes stay None) before lookup
    data["Code"] = data["Code"].apply(lambda x: int(x) if pd.notna(x) else None)
    coordinates = load_coordinates(COORDINATES)
    data = merge_coordinates(data, coordinates)
    # checkpoint: cities with their raw coordinate lists attached
    data.to_csv(
        os.path.join("data", "preprocessed", "cities_enriched_with_coordinates.csv"),
        index=False,
    )
    logging.info("Coordinates merged")

    # rows that ended up with neither a geometry nor a code
    missing = data[
        [
            all([x, y])
            for x, y in zip(
                data["Geometry"].apply(lambda x: x == []), data["Code"].isnull()
            )
        ]
    ]
    missing_geometry = data[data["Geometry"].apply(lambda x: x == [])]
    logging.info(f"Missing values: {len(missing)}")
    logging.info(f"Missing geometry: {len(missing_geometry)}")
    missing_geometry.to_csv(os.path.join("data", "missing_values.csv"), index=False)

    # data = pd.read_csv(os.path.join("data", "cities_enriched_manually.csv"))
    # collapse each coordinate list to a single (mean) [lat, lon] point
    data["Geometry"] = data["Geometry"].apply(aggregate_coordinates)
    data.to_csv(DATA_ENRICHED, index=False)
utils/process_data.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import logging
3
+ import pandas as pd
4
+
5
+
6
+ # define logger
7
+ logging.basicConfig(
8
+ level=logging.INFO,
9
+ format="%(asctime)s [%(levelname)s] %(message)s",
10
+ handlers=[
11
+ logging.FileHandler("process_data.log"),
12
+ logging.StreamHandler(),
13
+ ],
14
+ )
15
+
16
+ # change these to paths if you want to generate the map_data.csv separately from the app
17
+ DATA_RAW = os.path.join("data.json")
18
+ CITIES_ENRICHED = os.path.join("data", "cities_enriched_final.csv")
19
+
20
+ OUTPUT = os.path.join("data", "preprocessed", "map_data.csv")
21
+
22
+
23
def load_data(path: str = DATA_RAW) -> pd.DataFrame:
    """Count occurrences per ORG in the raw JSON export.

    Returns a DataFrame with columns ["ORG", "Count"].
    """
    raw = pd.read_json(path)
    counts = raw["ORG"].value_counts().reset_index()
    # normalize column names (value_counts output naming varies by pandas version)
    counts.columns = ["ORG", "Count"]
    return counts
28
+
29
+
30
def merge_geoemtry(data: pd.DataFrame, cities: pd.DataFrame) -> pd.DataFrame:
    """Left-join the city geometry onto the extraction counts (ORG <-> Kommune).

    NOTE(review): the name keeps the original "geoemtry" misspelling because
    callers reference it by that name.
    """
    merged = pd.merge(data, cities, how="left", left_on="ORG", right_on="Kommune")
    return merged
33
+
34
+
35
def add_coor(data: pd.DataFrame):
    """Derive 'lat'/'lon' columns from the 'Geometry' column.

    Geometry may arrive either as a list [lat, lon] or as its CSV string
    form "[48.1 11.5]" (space-separated); the string form is parsed first.
    Empty geometries yield None for both coordinates.
    """

    def _parse(text):
        # "[48.1 11.5]" -> [48.1, 11.5]; "[]" -> []
        return [
            float(token) if type(token) != float else None
            for token in text.strip("[]").split()
        ]

    # very experminetal, but works: detect the string form via the first cell
    if type(data["Geometry"].iloc[0]) == str:
        data["Geometry"] = data["Geometry"].apply(_parse)
    data["lat"] = data["Geometry"].apply(lambda g: float(g[0]) if g != [] else None)
    data["lon"] = data["Geometry"].apply(lambda g: float(g[1]) if g != [] else None)
    return data
48
+
49
+
50
if __name__ == "__main__":
    # count extractions per ORG from the raw JSON export
    extraction = load_data()
    # extraction.to_csv(
    #     os.path.join("data", "preprocessed", "map_data.csv"), index=False)
    logging.info("Extraction data loaded.")
    # attach the geometry column via a left join on ORG <-> Kommune
    extraction = merge_geoemtry(extraction, pd.read_csv(CITIES_ENRICHED))
    logging.info("Data merged with Geometry from cities.csv.")
    # extraction = extraction[extraction["Geometry"].notna()]
    # split Geometry into separate lat/lon columns for the map
    extraction_enriched = add_coor(extraction)
    logging.info("Extra columns for lat/lon created from Geometry column.")
    extraction_enriched.to_csv(OUTPUT, index=False)
    logging.info("Data enriched and saved.")