# Musterdatenkatalog / utils / compare_old_coord.py
# Josephina's picture
# Refactored Musterdatenkatalog with APP (#1)
# 4eea983 verified
import os
import pandas as pd
import logging
# Configure the root logger: INFO and above, written to a log file and to a
# stream handler, each record prefixed with timestamp and level.
_log_targets = [
    logging.FileHandler("process_data.log"),
    logging.StreamHandler(),
]
logging.basicConfig(
    format="%(asctime)s [%(levelname)s] %(message)s",
    level=logging.INFO,
    handlers=_log_targets,
)
# CSV locations used by this script; every file lives in the "data" directory.
_DATA_DIR = "data"
CITIES_ENRICHED_OLD = os.path.join(_DATA_DIR, "cities_enriched_old.csv")
CITIES_ENRICHED_NEW = os.path.join(_DATA_DIR, "cities_enriched.csv")
CITIES_ENRICHED_FINAL = os.path.join(_DATA_DIR, "cities_enriched_final.csv")
MISSING = os.path.join(_DATA_DIR, "missing_final.csv")
def load_data(path: str) -> pd.DataFrame:
    """Read the CSV file at *path* and return its contents as a DataFrame."""
    return pd.read_csv(path)
def compare_cities(old: pd.DataFrame, new: pd.DataFrame) -> tuple:
    """Compare the "Kommune" names of two data sets.

    Args:
        old: Previous data set; must contain a "Kommune" column.
        new: Current data set; must contain a "Kommune" column.

    Returns:
        A ``(new_cities, deleted_cities)`` tuple of sets: names present only
        in *new*, and names present only in *old*.
    """
    old_names = set(old["Kommune"].unique())
    new_names = set(new["Kommune"].unique())
    # Both differences are derived from the untouched name sets. Reusing one
    # variable for the "added" result would corrupt the "deleted" computation.
    added = new_names - old_names
    deleted = old_names - new_names
    return added, deleted
def enrich_new(old, new) -> pd.DataFrame:
    """Fill missing geometries in *new* with geometries taken from *old*.

    A geometry counts as missing when it is the string "[]" or a null value
    (None/NaN; empty CSV cells are loaded as NaN). For each such row the
    replacement is looked up in *old* by "Kommune" first and by "Code" second;
    the first matching row wins. Old rows whose own geometry is missing are
    never used as a source.

    Args:
        old: Previous data set with "Kommune", "Code" and "Geometry" columns.
        new: Current data set with the same columns. Modified in place.

    Returns:
        The same *new* DataFrame object, with geometries filled where a match
        was found.
    """

    def missing_mask(frame: pd.DataFrame) -> pd.Series:
        # Boolean mask of rows whose geometry is null or the empty list "[]".
        geometry = frame["Geometry"]
        return geometry.isna() | (geometry == "[]")

    # Only old rows that actually carry a geometry are useful as a source.
    usable_old = old[~missing_mask(old)]

    for row in new[missing_mask(new)].itertuples():
        candidates = usable_old[usable_old["Kommune"] == row.Kommune]
        if candidates.empty:
            # Name changed or absent: fall back to matching on the code.
            candidates = usable_old[usable_old["Code"] == row.Code]
        if not candidates.empty:
            # Always write into "Geometry"; the code column must stay intact.
            new.at[row.Index, "Geometry"] = candidates["Geometry"].iloc[0]
    return new
def report_missing(new):
    """Log and return the rows of *new* that still have no geometry.

    A geometry counts as missing when it is the string "[]" or a null value
    (None/NaN; empty CSV cells are loaded as NaN, which an ``is None`` check
    would not catch).

    Args:
        new: DataFrame with "Kommune" and "Geometry" columns.

    Returns:
        The subset of *new* whose geometry is missing.
    """
    geometry = new["Geometry"]
    missing = new[geometry.isna() | (geometry == "[]")]
    logging.info("Finally missing cities: %s", missing["Kommune"].unique())
    return missing
def main() -> None:
    """Compare old and new city data, enrich the new data and write the results."""
    previous = load_data(CITIES_ENRICHED_OLD)
    current = load_data(CITIES_ENRICHED_NEW)

    # Report which municipalities appeared or disappeared between the files.
    added, removed = compare_cities(previous, current)
    logging.info(f"New cities: {added}")
    logging.info(f"Deleted cities: {removed}")

    # Fill in geometries from the old data, then persist the enriched set.
    current = enrich_new(previous, current)
    current.to_csv(CITIES_ENRICHED_FINAL, index=False)

    # Whatever is still without a geometry goes into a separate file.
    still_missing = report_missing(current)
    still_missing.to_csv(MISSING, index=False)


if __name__ == "__main__":
    main()