Musterdatenkatalog / utils /process_data.py
Josephina's picture
fixed bug with data path
861b8bf
raw
history blame
2.66 kB
import os
import logging
import pandas as pd
# Configure the root logger at import time: records at INFO level and above
# are written to "process_data.log" (relative to the working directory) and
# to a stream handler (stderr by default).
logging.basicConfig(
    handlers=[
        logging.FileHandler("process_data.log"),
        logging.StreamHandler(),
    ],
    format="%(asctime)s [%(levelname)s] %(message)s",
    level=logging.INFO,
)
# Input and output locations, resolved relative to the working directory.
# Point these at other paths if you want to generate map_data.csv
# separately from the app.
DATA_RAW = "2024-08-21_musterdatenkatalog.json"
CITIES_ENRICHED = os.path.join("data", "cities_enriched_manually.csv")
OUTPUT = os.path.join("data", "preprocessed", "map_data.csv")
def load_data(path: str = DATA_RAW) -> pd.DataFrame:
    """Count how often each ``ORG`` value occurs in the raw JSON data.

    Args:
        path: Location of a JSON file readable by ``pandas.read_json``. The
            resulting frame must contain an ``ORG`` column.

    Returns:
        A frame with one row per distinct ``ORG`` value and two columns,
        ``ORG`` and ``Count``, ordered as ``value_counts`` returns them
        (most frequent first).
    """
    catalogue = pd.read_json(path)
    occurrences = catalogue["ORG"].value_counts()
    # Name the index and the values explicitly instead of overwriting the
    # column labels after the fact.
    return occurrences.rename_axis("ORG").reset_index(name="Count")
def merge_geoemtry(data_in: pd.DataFrame, cities: pd.DataFrame) -> pd.DataFrame:
    """Attach the columns of ``cities`` (including ``Geometry``) to each row.

    The two frames are left-joined on ``data_in["ORG"] == cities["Kommune"]``.
    Rows whose ``Geometry`` is still missing after the join are looked up a
    second time by ``cities["name"]``; this fallback fills only the
    ``Geometry`` cell, the other city columns stay empty for those rows.

    Args:
        data_in: Frame with an ``ORG`` column. It is not modified.
        cities: Frame with ``Kommune``, ``name`` and ``Geometry`` columns.

    Returns:
        The merged frame. ``Geometry`` is left missing where neither lookup
        found a value.
    """
    data = data_in.merge(cities, left_on="ORG", right_on="Kommune", how="left")

    missing = data["Geometry"].isna()
    missing_count = int(missing.sum())
    if not missing_count:
        return data

    logging.warning(f"Missing {missing_count} geometries in the data.")

    # Build the name -> geometry lookup once instead of re-scanning `cities`
    # for every missing row. Entries without a geometry are dropped first so
    # that, for a duplicated name, the first *usable* geometry wins.
    usable = cities.dropna(subset=["name", "Geometry"]).drop_duplicates(
        subset="name"
    )
    geometry_by_name = dict(zip(usable["name"], usable["Geometry"]))

    for index, org in data.loc[missing, "ORG"].items():
        if org in geometry_by_name:
            data.at[index, "Geometry"] = geometry_by_name[org]
            logging.info("data found in citiesname.")

    # The count logged above was taken before the fallback; report what is
    # really left so unresolved rows can be fixed in the cities table.
    unresolved = data.loc[data["Geometry"].isna(), "ORG"].tolist()
    if unresolved:
        logging.warning(
            f"{len(unresolved)} geometries still missing after name lookup: "
            f"{unresolved}"
        )
    return data
def _geometry_component(tokens, position: int) -> float:
    """Return ``tokens[position]`` as a float, or NaN when it is unavailable."""
    try:
        token = tokens[position]
    except (TypeError, IndexError):
        # Not subscriptable (None / NaN) or too few tokens: no coordinate.
        return float("nan")
    return float(token)


def add_coor(data: pd.DataFrame):
    """Split the ``Geometry`` column into numeric ``lat`` and ``lon`` columns.

    String geometries such as ``"[51.5 7.25]"`` are replaced by their list of
    tokens (``["51.5", "7.25"]``); whitespace and commas both count as
    separators. Values that are not strings are kept as they are. The first
    token becomes ``lat`` and the second ``lon``. Rows without a usable
    geometry (missing value or fewer than two tokens) get NaN instead of
    raising.

    Args:
        data: Frame with a ``Geometry`` column. It is modified in place.

    Returns:
        The same ``data`` object with ``Geometry`` tokenised and the ``lat``
        and ``lon`` columns added.

    Raises:
        ValueError: If a token cannot be converted to ``float``.
    """
    tokenised = []
    for index, geometry in data["Geometry"].items():
        if isinstance(geometry, str):
            tokenised.append(geometry.strip("[]").replace(",", " ").split())
        else:
            logging.info(f"No geometry string in row {index}: {geometry!r}")
            tokenised.append(geometry)

    # dtype=object keeps each token list as a single cell value.
    data["Geometry"] = pd.Series(tokenised, index=data.index, dtype=object)
    data["lat"] = [_geometry_component(tokens, 0) for tokens in tokenised]
    data["lon"] = [_geometry_component(tokens, 1) for tokens in tokenised]
    return data
if __name__ == "__main__":
    # Script entry point: count ORG occurrences in the raw data, attach city
    # geometries, derive lat/lon columns and write the result to OUTPUT.
    extraction = load_data()
    logging.info("Extraction data loaded.")

    extraction = merge_geoemtry(extraction, pd.read_csv(CITIES_ENRICHED))
    logging.info("Data merged with Geometry from cities.csv.")

    # NOTE: rows whose Geometry is still missing are not filtered out before
    # add_coor() is called.
    extraction_enriched = add_coor(extraction)
    logging.info("Extra columns for lat/lon created from Geometry column.")

    # to_csv does not create missing parent directories, so make sure the
    # target folder exists before writing.
    output_dir = os.path.dirname(OUTPUT)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    extraction_enriched.to_csv(OUTPUT, index=False)
    logging.info("Data enriched and saved.")