Spaces:
Sleeping
Sleeping
import os | |
import pandas as pd | |
import logging | |
# Configure the root logger once at import time: records at INFO level and
# above are written both to the file "process_data.log" and to the console
# stream, each prefixed with a timestamp and the level name.
logging.basicConfig(
    format="%(asctime)s [%(levelname)s] %(message)s",
    level=logging.INFO,
    handlers=[logging.FileHandler("process_data.log"), logging.StreamHandler()],
)
# CSV locations used by the script; every file lives in the "data" directory.
_DATA_DIR = "data"
# Inputs: the previous and the current enriched city tables.
CITIES_ENRICHED_OLD = os.path.join(_DATA_DIR, "cities_enriched_old.csv")
CITIES_ENRICHED_NEW = os.path.join(_DATA_DIR, "cities_enriched.csv")
# Outputs: the merged table and the rows that still lack a geometry.
CITIES_ENRICHED_FINAL = os.path.join(_DATA_DIR, "cities_enriched_final.csv")
MISSING = os.path.join(_DATA_DIR, "missing_final.csv")
def load_data(path: str) -> pd.DataFrame:
    """Read the CSV file at ``path`` with pandas' default settings.

    Args:
        path: Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        The parsed table as a DataFrame.
    """
    return pd.read_csv(path)
def compare_cities(old: pd.DataFrame, new: pd.DataFrame) -> tuple:
    """Report which ``Kommune`` values were added and which were removed.

    Args:
        old: Previous table; must have a ``Kommune`` column.
        new: Current table; must have a ``Kommune`` column.

    Returns:
        A ``(new_cities, deleted_cities)`` pair of sets: names present only
        in ``new``, and names present only in ``old``.
    """
    old_cities = set(old["Kommune"].unique())
    new_cities = set(new["Kommune"].unique())
    # Both differences are taken from the full name sets. Reusing the
    # "added" result as the right-hand side of the second difference would
    # report every old city as deleted.
    added_cities = new_cities - old_cities
    deleted_cities = old_cities - new_cities
    return added_cities, deleted_cities
def enrich_new(old, new) -> pd.DataFrame:
    """Fill missing geometries in ``new`` from matching rows of ``old``.

    A row of ``new`` counts as missing when its ``Geometry`` is the literal
    string ``"[]"`` or a null value (``None`` / ``NaN``). For each such row
    the geometry is copied from the first row of ``old`` with the same
    ``Kommune``; if no such row exists, from the first row of ``old`` with
    the same ``Code``. Rows without any match are left unchanged.

    Args:
        old: Table with ``Kommune``, ``Code`` and ``Geometry`` columns.
        new: Table with the same columns; it is updated in place.

    Returns:
        The same ``new`` object, after the in-place updates.
    """
    geometry = new["Geometry"]
    # Empty CSV cells are loaded as NaN, not None, so nulls are detected with
    # isna() rather than an identity check against None.
    missing = new[geometry.isna() | (geometry == "[]")]

    for row in missing.itertuples():
        by_name = old[old["Kommune"] == row.Kommune]
        if not by_name.empty:
            new.at[row.Index, "Geometry"] = by_name["Geometry"].iloc[0]
            continue

        by_code = old[old["Code"] == row.Code]
        if not by_code.empty:
            # The fallback match fills "Geometry" as well; writing the value
            # into "Code" would corrupt the code and leave the gap open.
            new.at[row.Index, "Geometry"] = by_code["Geometry"].iloc[0]
    return new
def report_missing(new):
    """Log and return the rows of ``new`` that have no geometry.

    A row counts as missing when its ``Geometry`` is the literal string
    ``"[]"`` or a null value (``None`` / ``NaN``).

    Args:
        new: Table with ``Kommune`` and ``Geometry`` columns.

    Returns:
        The subset of ``new`` whose geometry is missing.
    """
    geometry = new["Geometry"]
    # Empty CSV cells are loaded as NaN, not None, so nulls are detected with
    # isna() rather than an identity check against None.
    missing = new[geometry.isna() | (geometry == "[]")]
    logging.info("Finally missing cities: %s", missing["Kommune"].unique())
    return missing
if __name__ == "__main__":
    # Load the previous and the current enriched city tables.
    previous = load_data(CITIES_ENRICHED_OLD)
    current = load_data(CITIES_ENRICHED_NEW)

    # Log the two name sets returned by the comparison.
    added, removed = compare_cities(previous, current)
    logging.info(f"New cities: {added}")
    logging.info(f"Deleted cities: {removed}")

    # Enrich the current table from the previous one and save the result.
    enriched = enrich_new(previous, current)
    enriched.to_csv(CITIES_ENRICHED_FINAL, index=False)

    # Save the rows that are still reported as missing after enrichment.
    still_missing = report_missing(enriched)
    still_missing.to_csv(MISSING, index=False)