Musterdatenkatalog / utils /process_data.py
Josephina's picture
scripts for map creation in app
d689310
raw
history blame
2.1 kB
import os
import logging
import pandas as pd
# Configure the root logger: records at INFO and above are written both to
# the file "process_data.log" and to the console stream.
logging.basicConfig(
    format="%(asctime)s [%(levelname)s] %(message)s",
    level=logging.INFO,
    handlers=[logging.FileHandler("process_data.log"), logging.StreamHandler()],
)
# Input and output locations. Adjust these paths if map_data.csv should be
# generated separately from the app.
DATA_RAW = "data.json"
CITIES_ENRICHED = os.path.join("data", "cities_enriched_final.csv")
OUTPUT = os.path.join("data", "preprocessed", "map_data.csv")
def load_data(path: str = DATA_RAW) -> pd.DataFrame:
    """Read the JSON file at ``path`` and count occurrences of each ``ORG`` value.

    Args:
        path: Location of the JSON file; it must provide an ``ORG`` column.

    Returns:
        A frame with two columns: ``ORG`` (the distinct values) and
        ``Count`` (how often each value occurs).
    """
    records = pd.read_json(path)
    org_counts = records["ORG"].value_counts()
    # Name the index and the values explicitly so the resulting columns are
    # always "ORG" and "Count".
    return org_counts.rename_axis("ORG").reset_index(name="Count")
def merge_geoemtry(data: pd.DataFrame, cities: pd.DataFrame) -> pd.DataFrame:
    """Left-join ``cities`` onto ``data``, matching ``ORG`` against ``Kommune``.

    Every row of ``data`` is kept; rows without a matching ``Kommune`` get
    missing values in the columns coming from ``cities``.
    """
    return pd.merge(
        data,
        cities,
        how="left",
        left_on="ORG",
        right_on="Kommune",
    )
def _parse_geometry(value):
    """Return one ``Geometry`` cell as a list of floats (``[]`` when missing)."""
    if isinstance(value, str):
        # Strings are expected to look like "[52.52 13.40]": drop the
        # surrounding brackets and split on whitespace.
        return [float(item) for item in value.strip("[]").split()]
    if value is None or (isinstance(value, float) and pd.isna(value)):
        # Rows without geometry (e.g. NaN cells) carry no coordinates.
        return []
    # Anything else is assumed to be an already-parsed sequence.
    return value


def add_coor(data: pd.DataFrame):
    """Parse the ``Geometry`` column and derive ``lat`` / ``lon`` columns.

    Each cell is handled individually, so string cells, missing cells and
    already-parsed sequences may be mixed freely in the same column. The
    frame is modified in place and also returned.

    Args:
        data: Frame with a ``Geometry`` column whose cells are bracketed,
            whitespace-separated number strings, sequences of numbers, or
            missing values.

    Returns:
        The same frame with ``Geometry`` converted to lists and two new
        columns: ``lat`` (first element) and ``lon`` (second element).
        Rows without the corresponding element get a missing value.
    """
    data["Geometry"] = data["Geometry"].apply(_parse_geometry)
    # Guard on the length so rows with no (or incomplete) geometry yield a
    # missing coordinate instead of raising.
    data["lat"] = data["Geometry"].apply(
        lambda x: float(x[0]) if len(x) > 0 else None
    )
    data["lon"] = data["Geometry"].apply(
        lambda x: float(x[1]) if len(x) > 1 else None
    )
    return data
if __name__ == "__main__":
    # Count the rows per ORG value in the raw extraction.
    extraction = load_data()
    logging.info("Extraction data loaded.")

    # Attach the city columns (including Geometry) to each ORG row.
    extraction = merge_geoemtry(extraction, pd.read_csv(CITIES_ENRICHED))
    logging.info("Data merged with Geometry from cities.csv.")

    extraction_enriched = add_coor(extraction)
    logging.info("Extra columns for lat/lon created from Geometry column.")

    # to_csv does not create missing directories, so make sure the target
    # folder exists before writing.
    output_dir = os.path.dirname(OUTPUT)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    extraction_enriched.to_csv(OUTPUT, index=False)
    logging.info("Data enriched and saved.")