Spaces:
Sleeping
Sleeping
import os | |
import logging | |
import pandas as pd | |
# Root logger setup: INFO and above are written both to "process_data.log"
# (in the current working directory) and to the console stream.
_LOG_FORMAT = "%(asctime)s [%(levelname)s] %(message)s"

logging.basicConfig(
    format=_LOG_FORMAT,
    level=logging.INFO,
    handlers=[logging.FileHandler("process_data.log"), logging.StreamHandler()],
)
# Input and output locations. Adjust these paths if you want to generate
# map_data.csv separately from the app.
_DATA_DIR = "data"

DATA_RAW = "2024-08-21_musterdatenkatalog.json"
CITIES_ENRICHED = os.path.join(_DATA_DIR, "cities_enriched_manually.csv")
OUTPUT = os.path.join(_DATA_DIR, "preprocessed", "map_data.csv")
def load_data(path: str = DATA_RAW) -> pd.DataFrame:
    """Count how many records each organisation has in the raw JSON file.

    Args:
        path: Location of a JSON file readable by ``pandas.read_json`` that
            contains an ``ORG`` column.

    Returns:
        A frame with one row per distinct ``ORG`` value and two columns:
        ``ORG`` and ``Count`` (the number of records for that value).
    """
    records = pd.read_json(path)
    per_org = records["ORG"].value_counts()
    # Name the index and the values explicitly so the resulting column labels
    # do not depend on what value_counts() calls them.
    return per_org.rename_axis("ORG").reset_index(name="Count")
def merge_geoemtry(data_in: pd.DataFrame, cities: pd.DataFrame) -> pd.DataFrame:
    """Attach a ``Geometry`` value from ``cities`` to every row of ``data_in``.

    Rows are matched first by ``data_in["ORG"] == cities["Kommune"]`` (left
    join, so every input row is kept). Rows that still have no geometry are
    then looked up a second time by ``cities["name"]``. Rows that match
    neither column keep a missing ``Geometry``.

    Args:
        data_in: Frame with an ``ORG`` column.
        cities: Frame with ``Kommune``, ``name`` and ``Geometry`` columns.

    Returns:
        A new merged frame; ``data_in`` itself is not modified.
    """
    data = data_in.merge(cities, left_on="ORG", right_on="Kommune", how="left")

    # Compute the mask once and reuse it for the count and the fallback pass.
    missing = data["Geometry"].isna()
    n_missing = int(missing.sum())
    if n_missing == 0:
        return data

    logging.warning(f"Missing {n_missing} geometries in the data.")

    # Build the name -> geometry lookup once instead of scanning ``cities``
    # for every unmatched row. The first occurrence of a name wins, and rows
    # without a name can never be a match.
    fallback = (
        cities.dropna(subset=["name"])
        .drop_duplicates(subset="name")
        .set_index("name")["Geometry"]
    )
    for idx, org in data.loc[missing, "ORG"].items():
        if org in fallback.index:
            data.at[idx, "Geometry"] = fallback.at[org]
            logging.info("data found in citiesname.")

    return data
def _parse_geometry(value):
    """Split a bracketed coordinate string into its parts; log and pass through anything else."""
    if isinstance(value, str):
        # Commas are treated like whitespace so "[a, b]" parses the same as "[a b]".
        return value.strip("[]").replace(",", " ").split()
    logging.info(f"{value}, {value}")
    return value


def _coordinate_at(value, position: int):
    """Return ``value[position]`` as a float, or None when no coordinate is available."""
    try:
        return float(value[position])
    except (TypeError, IndexError):
        # TypeError: the geometry is missing (None / NaN are not subscriptable).
        # IndexError: the geometry has fewer components than requested.
        return None


def add_coor(data: pd.DataFrame):
    """Derive ``lat`` and ``lon`` columns from the ``Geometry`` column.

    String geometries such as ``"[52.5 13.4]"`` are replaced in the
    ``Geometry`` column by a list of their whitespace-separated parts. The
    first part becomes ``lat`` and the second ``lon`` (both as floats).
    Non-string geometries are logged and left untouched. Rows whose geometry
    is missing get a missing ``lat``/``lon`` instead of raising: after a left
    merge a missing geometry is a float NaN, not ``None``, so a plain
    ``is not None`` check is not enough.

    Args:
        data: Frame with a ``Geometry`` column. It is modified in place.

    Returns:
        The same ``data`` object, with ``Geometry`` parsed and ``lat``/``lon``
        columns added.

    Raises:
        ValueError: If a coordinate part cannot be converted to ``float``.
    """
    # NOTE(review): the pass-through log line prints the geometry twice;
    # the message is kept unchanged here.
    data["Geometry"] = data["Geometry"].apply(_parse_geometry)
    data["lat"] = data["Geometry"].apply(lambda geom: _coordinate_at(geom, 0))
    data["lon"] = data["Geometry"].apply(lambda geom: _coordinate_at(geom, 1))
    return data
if __name__ == "__main__":
    # Step 1: count records per organisation in the raw catalogue.
    extraction = load_data()
    logging.info("Extraction data loaded.")

    # Step 2: attach a geometry to each organisation. Rows without a
    # geometry are not filtered out here.
    extraction = merge_geoemtry(extraction, pd.read_csv(CITIES_ENRICHED))
    logging.info("Data merged with Geometry from cities.csv.")

    # Step 3: derive lat/lon columns from the Geometry column.
    extraction_enriched = add_coor(extraction)
    logging.info("Extra columns for lat/lon created from Geometry column.")

    # to_csv does not create missing parent directories, so make sure the
    # target folder exists. dirname is empty when OUTPUT is a bare filename.
    output_dir = os.path.dirname(OUTPUT)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    extraction_enriched.to_csv(OUTPUT, index=False)
    logging.info("Data enriched and saved.")