Spaces:
Running
Running
File size: 2,659 Bytes
d689310 861b8bf 908b9a8 d689310 861b8bf 908b9a8 d689310 908b9a8 d689310 908b9a8 d689310 908b9a8 d689310 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 |
import os
import logging
import pandas as pd
# Configure the root logger: INFO and above is written both to the
# "process_data.log" file and to the console stream.
_LOG_FORMAT = "%(asctime)s [%(levelname)s] %(message)s"
logging.basicConfig(
    format=_LOG_FORMAT,
    level=logging.INFO,
    handlers=[logging.FileHandler("process_data.log"), logging.StreamHandler()],
)
# Input and output locations. Point these at other paths if you want to
# generate map_data.csv separately from the app.
_DATA_DIR = "data"
DATA_RAW = "2024-08-21_musterdatenkatalog.json"
CITIES_ENRICHED = os.path.join(_DATA_DIR, "cities_enriched_manually.csv")
OUTPUT = os.path.join(_DATA_DIR, "preprocessed", "map_data.csv")
def load_data(path: str = DATA_RAW) -> pd.DataFrame:
    """Count how often each ``ORG`` value occurs in a JSON file.

    Args:
        path: Location of the JSON file to read; it must yield an ``ORG``
            column when loaded with ``pandas.read_json``.

    Returns:
        A frame with the columns ``ORG`` and ``Count``, one row per distinct
        ``ORG`` value, most frequent first.
    """
    records = pd.read_json(path)
    frequency = records["ORG"].value_counts()
    table = frequency.reset_index()
    # Normalise the column labels regardless of what reset_index named them.
    table.columns = ["ORG", "Count"]
    return table
def merge_geoemtry(data_in: pd.DataFrame, cities: pd.DataFrame) -> pd.DataFrame:
    """Attach city columns (including ``Geometry``) to the ``ORG`` counts.

    Rows are left-joined on ``ORG`` == ``Kommune``. Rows that end up without a
    ``Geometry`` get a second chance: if ``ORG`` equals a ``name`` in
    ``cities`` that has a non-missing ``Geometry``, that geometry is used
    (the first such row wins when a name occurs several times).

    Args:
        data_in: Frame with an ``ORG`` column; it is not modified.
        cities: Frame with ``Kommune``, ``name`` and ``Geometry`` columns.

    Returns:
        The merged frame; ``Geometry`` stays missing where no match exists.
    """
    data = data_in.merge(cities, left_on="ORG", right_on="Kommune", how="left")

    missing = data["Geometry"].isna()
    missing_count = int(missing.sum())
    if missing_count == 0:
        return data

    logging.warning("Missing %d geometries in the data.", missing_count)

    # Build the name -> geometry lookup once instead of rescanning ``cities``
    # for every unmatched row. Entries without a geometry are skipped so they
    # can neither shadow a usable duplicate nor be reported as a hit.
    usable = cities[cities["Geometry"].notna()]
    geometry_by_name = {}
    for name, geometry in zip(usable["name"], usable["Geometry"]):
        geometry_by_name.setdefault(name, geometry)

    for index, org in data.loc[missing, "ORG"].items():
        geometry = geometry_by_name.get(org)
        if geometry is not None:
            data.at[index, "Geometry"] = geometry
            logging.info("data found in citiesname.")
    return data
def add_coor(data: pd.DataFrame) -> pd.DataFrame:
    """Parse the ``Geometry`` column and add numeric ``lat``/``lon`` columns.

    String geometries such as ``"[52.5 13.4]"`` (comma separators are accepted
    as well) are replaced by the list of their string tokens. Values that are
    not strings are logged and left untouched. The first token becomes ``lat``
    and the second ``lon``; when the geometry is missing (``None``/``NaN``) or
    has no token at that position, the coordinate is ``NaN``.

    Args:
        data: Frame with a ``Geometry`` column. It is modified in place.

    Returns:
        The same ``data`` object with ``Geometry`` parsed and ``lat``/``lon``
        added as float columns.

    Raises:
        ValueError: If a coordinate token cannot be converted to ``float``.
    """

    def _tokenize(geometry):
        """Split a geometry string into tokens; pass other values through."""
        if isinstance(geometry, str):
            # Treat commas like whitespace so "[a, b]" parses like "[a b]".
            return geometry.strip("[]").replace(",", " ").split()
        logging.info("%s, %s", geometry, geometry)
        return geometry

    def _coordinate(geometry, position):
        """Return the token at *position* as float, or NaN if unavailable."""
        # Missing geometries arrive as NaN (a float), not None, so test for
        # "is a sequence" rather than "is not None" before indexing.
        if not pd.api.types.is_list_like(geometry) or len(geometry) <= position:
            return float("nan")
        return float(geometry[position])

    data["Geometry"] = data["Geometry"].map(_tokenize)
    data["lat"] = (
        data["Geometry"].map(lambda geometry: _coordinate(geometry, 0)).astype("float64")
    )
    data["lon"] = (
        data["Geometry"].map(lambda geometry: _coordinate(geometry, 1)).astype("float64")
    )
    return data
def main() -> None:
    """Build the map data CSV.

    Loads the ``ORG`` counts, merges them with the city table read from
    ``CITIES_ENRICHED``, derives ``lat``/``lon`` from ``Geometry`` and writes
    the result to ``OUTPUT``.
    """
    extraction = load_data()
    logging.info("Extraction data loaded.")

    extraction = merge_geoemtry(extraction, pd.read_csv(CITIES_ENRICHED))
    logging.info("Data merged with Geometry from cities.csv.")

    extraction_enriched = add_coor(extraction)
    logging.info("Extra columns for lat/lon created from Geometry column.")

    # to_csv does not create missing directories, so make sure the target
    # directory exists (skipped when OUTPUT is a bare file name).
    output_dir = os.path.dirname(OUTPUT)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    extraction_enriched.to_csv(OUTPUT, index=False)
    logging.info("Data enriched and saved.")


if __name__ == "__main__":
    main()
|