File size: 2,659 Bytes
d689310
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
861b8bf
908b9a8
d689310
 
 
 
 
 
 
 
 
 
 
861b8bf
 
908b9a8
 
 
 
 
 
 
 
 
 
d689310
 
 
 
 
908b9a8
 
 
 
d689310
908b9a8
 
d689310
908b9a8
 
 
 
 
 
d689310
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import os
import logging
import pandas as pd


# Root logger setup: records at INFO level and above are appended to
# "process_data.log" and also sent to a stream handler (console).
logging.basicConfig(
    format="%(asctime)s [%(levelname)s] %(message)s",
    level=logging.INFO,
    handlers=[logging.FileHandler("process_data.log"), logging.StreamHandler()],
)

# Input files. Point these at other locations if map_data.csv should be
# generated separately from the app.
DATA_RAW = "2024-08-21_musterdatenkatalog.json"
CITIES_ENRICHED = os.path.join("data", "cities_enriched_manually.csv")

# Destination of the generated CSV.
OUTPUT = os.path.join("data", "preprocessed", "map_data.csv")


def load_data(path: str = DATA_RAW) -> pd.DataFrame:
    """Count how often each ``ORG`` value occurs in a JSON file.

    Args:
        path: Location of a JSON file that ``pandas.read_json`` can parse
            into a frame containing an ``ORG`` column.

    Returns:
        A frame with one row per distinct ``ORG`` value and exactly two
        columns: ``ORG`` and ``Count`` (the number of occurrences).
    """
    records = pd.read_json(path)
    occurrences = records["ORG"].value_counts()
    # Name the index and the counts explicitly while turning them into columns.
    return occurrences.rename_axis("ORG").reset_index(name="Count")


def merge_geoemtry(data_in: pd.DataFrame, cities: pd.DataFrame) -> pd.DataFrame:
    """Attach the columns of ``cities`` to ``data_in``.

    Rows of ``data_in`` are left-joined to ``cities`` by matching ``ORG``
    against ``Kommune``. Rows that end up without a ``Geometry`` value get a
    second lookup: if their ``ORG`` equals an entry of ``cities["name"]``,
    the ``Geometry`` of the first such city is copied in.

    Args:
        data_in: Frame with an ``ORG`` column.
        cities: Frame with ``Kommune``, ``name`` and ``Geometry`` columns.

    Returns:
        The merged frame (left join, so no row of ``data_in`` is dropped).
        ``Geometry`` stays missing where neither lookup succeeds.
    """
    merged = data_in.merge(cities, how="left", left_on="ORG", right_on="Kommune")

    unmatched = merged["Geometry"].isna()
    missing_total = unmatched.sum()
    if not missing_total > 0:
        return merged

    logging.warning(f"Missing {missing_total} geometries in the data.")
    city_names = cities["name"].values
    # Only the rows without a geometry need the fallback lookup by name.
    for index, org in merged.loc[unmatched, "ORG"].items():
        if org not in city_names:
            continue
        same_name = cities["name"] == org
        merged.at[index, "Geometry"] = cities.loc[same_name, "Geometry"].values[0]
        logging.info("data found in citiesname.")
    return merged


def add_coor(data: pd.DataFrame):
    """Derive ``lat`` and ``lon`` columns from the ``Geometry`` column.

    String geometries such as ``"[52.5 13.4]"`` are replaced in place by the
    list of their whitespace-separated tokens (``["52.5", "13.4"]``). Values
    that are not strings are left untouched and logged. The first element of
    each geometry becomes ``lat`` and the second becomes ``lon``.

    Rows whose geometry is missing (``None``/``NaN``) or has fewer than two
    elements get a missing ``lat``/``lon`` instead of raising.

    Args:
        data: Frame with a ``Geometry`` column. It is modified in place.

    Returns:
        The same ``data`` object, with ``lat`` and ``lon`` columns added.

    Raises:
        ValueError: If a geometry element cannot be converted to ``float``.
    """

    def _coordinate(geometry, position: int):
        # pd.isna() on a list returns an array, so only ask it about scalars.
        # This catches None as well as the float NaN pandas uses for missing
        # cells, which the previous ``is not None`` check let through.
        if pd.api.types.is_scalar(geometry) and pd.isna(geometry):
            return None
        # Too few elements (e.g. an empty "[]" string): no coordinate.
        if len(geometry) <= position:
            return None
        return float(geometry[position])

    for row in data.itertuples():
        if isinstance(row.Geometry, str):
            # "[a b]" -> ["a", "b"]; tokens stay strings in the Geometry cell.
            data.at[row.Index, "Geometry"] = row.Geometry.strip("[]").split()
        else:
            logging.info(f"{row.Geometry}, {row.Geometry}")

    data["lat"] = data["Geometry"].apply(lambda geometry: _coordinate(geometry, 0))
    data["lon"] = data["Geometry"].apply(lambda geometry: _coordinate(geometry, 1))
    return data


if __name__ == "__main__":
    # Pipeline: count ORG values, attach city geometry, derive lat/lon and
    # write the result to OUTPUT.
    extraction = load_data()
    logging.info("Extraction data loaded.")
    extraction = merge_geoemtry(extraction, pd.read_csv(CITIES_ENRICHED))
    logging.info("Data merged with Geometry from cities.csv.")
    # Optional: drop rows without a geometry before computing coordinates.
    # extraction = extraction[extraction["Geometry"].notna()]
    extraction_enriched = add_coor(extraction)
    logging.info("Extra columns for lat/lon created from Geometry column.")
    # to_csv does not create missing parent directories, so make sure the
    # target folder exists before writing.
    output_dir = os.path.dirname(OUTPUT)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    extraction_enriched.to_csv(OUTPUT, index=False)
    logging.info("Data enriched and saved.")