import os
import logging

import pandas as pd

# define logger
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        logging.FileHandler("process_data.log"),
        logging.StreamHandler(),
    ],
)

# change these to paths if you want to generate the map_data.csv separately
# from the app
DATA_RAW = os.path.join("data.json")
CITIES_ENRICHED = os.path.join("data", "cities_enriched_final.csv")
OUTPUT = os.path.join("data", "preprocessed", "map_data.csv")


def load_data(path: str = DATA_RAW) -> pd.DataFrame:
    """Read the JSON file at *path* and count the occurrences of each ORG.

    Args:
        path: Location of a JSON file readable by ``pd.read_json`` that
            yields an ``ORG`` column.

    Returns:
        A frame with the columns ``ORG`` and ``Count``, ordered as
        ``value_counts`` orders them.
    """
    df = pd.read_json(path)
    counts = df["ORG"].value_counts().reset_index()
    # Name the columns explicitly instead of relying on reset_index's defaults.
    counts.columns = ["ORG", "Count"]
    return counts


def merge_geoemtry(data: pd.DataFrame, cities: pd.DataFrame) -> pd.DataFrame:
    """Left-join *cities* onto *data*, matching ``ORG`` against ``Kommune``.

    The (misspelled) function name is kept so existing callers keep working.
    Because this is a left join, rows of *data* without a matching
    ``Kommune`` are kept and get missing values in the columns coming from
    *cities*.

    Args:
        data: Frame with an ``ORG`` column.
        cities: Frame with a ``Kommune`` column.

    Returns:
        The merged frame.
    """
    return data.merge(cities, left_on="ORG", right_on="Kommune", how="left")


def _parse_geometry(value):
    """Turn one Geometry cell into a list of floats when it is a string.

    Strings such as ``"[55.6 12.5]"`` or ``"[55.6, 12.5]"`` are split on
    whitespace/commas. Anything that is not a string (already parsed values,
    or missing values such as NaN/None) is returned unchanged.
    """
    if isinstance(value, str):
        tokens = value.strip("[]").replace(",", " ").split()
        return [float(token) for token in tokens]
    return value


def _coordinate(geometry, index: int):
    """Return ``geometry[index]`` as a float, or None when it is unavailable."""
    try:
        return float(geometry[index])
    except (TypeError, IndexError):
        # TypeError: the cell is missing (NaN/None) and cannot be indexed.
        # IndexError: the geometry is empty or has too few values.
        return None


def add_coor(data: pd.DataFrame) -> pd.DataFrame:
    """Add ``lat`` and ``lon`` columns derived from the ``Geometry`` column.

    Every cell is handled individually, so a frame may mix string
    geometries, already parsed ones and missing values. Rows whose geometry
    is missing, empty or too short get an empty ``lat``/``lon`` instead of
    raising.

    Args:
        data: Frame with a ``Geometry`` column. It is modified in place.

    Returns:
        The same frame, with string geometries replaced by lists of floats
        and with the ``lat`` (first value) and ``lon`` (second value)
        columns added.

    Raises:
        ValueError: If a string geometry contains a non-numeric token.
    """
    data["Geometry"] = data["Geometry"].apply(_parse_geometry)
    data["lat"] = data["Geometry"].apply(lambda geom: _coordinate(geom, 0))
    data["lon"] = data["Geometry"].apply(lambda geom: _coordinate(geom, 1))
    return data


def main() -> None:
    """Build the map data CSV from the raw extraction and the cities file."""
    extraction = load_data()
    logging.info("Extraction data loaded.")

    extraction = merge_geoemtry(extraction, pd.read_csv(CITIES_ENRICHED))
    logging.info("Data merged with Geometry from cities.csv.")

    # Rows without a geometry are kept; they end up with empty lat/lon.
    extraction_enriched = add_coor(extraction)
    logging.info("Extra columns for lat/lon created from Geometry column.")

    # Make sure the target directory exists so to_csv does not fail.
    output_dir = os.path.dirname(OUTPUT)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    extraction_enriched.to_csv(OUTPUT, index=False)
    logging.info("Data enriched and saved.")


if __name__ == "__main__":
    main()