Spaces:
Sleeping
Sleeping
import os | |
import logging | |
import pandas as pd | |
# Configure root logging: records at INFO and above are sent to the file
# "process_data.log" and to a default StreamHandler.
logging.basicConfig(
    format="%(asctime)s [%(levelname)s] %(message)s",
    level=logging.INFO,
    handlers=[logging.FileHandler("process_data.log"), logging.StreamHandler()],
)

# Input/output locations. Point these at other paths if you want to generate
# map_data.csv separately from the app.
DATA_RAW = "data.json"
CITIES_ENRICHED = os.path.join("data", "cities_enriched_final.csv")
OUTPUT = os.path.join("data", "preprocessed", "map_data.csv")
def load_data(path: str = DATA_RAW) -> pd.DataFrame:
    """Read the JSON file at *path* and count how often each ``ORG`` value occurs.

    Returns a two-column frame: ``ORG`` (the distinct values) and ``Count``
    (their number of occurrences), in the order produced by ``value_counts``.
    """
    records = pd.read_json(path)
    occurrences = records["ORG"].value_counts()
    # Name the index and the values explicitly so the resulting columns are
    # always exactly "ORG" and "Count".
    return occurrences.rename_axis("ORG").reset_index(name="Count")
def merge_geoemtry(data: pd.DataFrame, cities: pd.DataFrame) -> pd.DataFrame:
    """Left-join *cities* onto *data*, matching ``data.ORG`` to ``cities.Kommune``.

    Every row of *data* is kept; rows without a matching ``Kommune`` get
    missing values in the columns that come from *cities*.
    """
    return pd.merge(
        data,
        cities,
        how="left",
        left_on="ORG",
        right_on="Kommune",
    )
def _parse_geometry(value) -> list:
    """Convert a single ``Geometry`` cell into a list of floats.

    Accepts a bracketed string such as ``"[55.6 12.5]"`` or ``"[55.6, 12.5]"``,
    an already-parsed sequence of numbers, or a missing value (``None``/NaN).
    Missing values and empty strings yield an empty list.
    """
    # Missing cells (e.g. rows left unmatched by a left merge) arrive as
    # None or float NaN; NaN is the only float that is not equal to itself.
    if value is None or (isinstance(value, float) and value != value):
        return []
    if isinstance(value, str):
        # Drop surrounding whitespace and brackets, then accept either
        # whitespace- or comma-separated numbers.
        tokens = value.strip().strip("[]").replace(",", " ").split()
        return [float(token) for token in tokens]
    return [float(item) for item in value]


def add_coor(data: pd.DataFrame) -> pd.DataFrame:
    """Add ``lat`` and ``lon`` columns derived from the ``Geometry`` column.

    Each ``Geometry`` cell is parsed individually, so string, list and
    missing cells may be mixed in the same column. The ``Geometry`` column is
    replaced by the parsed lists of floats; ``lat`` is the first coordinate
    and ``lon`` the second. Rows with no (or incomplete) geometry get a
    missing ``lat``/``lon`` instead of raising.

    The frame is modified in place and also returned.
    """
    coords = data["Geometry"].apply(_parse_geometry)
    data["Geometry"] = coords
    data["lat"] = coords.apply(lambda pair: pair[0] if pair else None)
    data["lon"] = coords.apply(lambda pair: pair[1] if len(pair) > 1 else None)
    return data
if __name__ == "__main__":
    # Pipeline: count ORG occurrences, attach the city data, derive lat/lon
    # columns, and write the result to OUTPUT.
    extraction = load_data()
    logging.info("Extraction data loaded.")

    # The merge is a left join, so ORGs without a matching city are kept and
    # simply have no Geometry.
    extraction = merge_geoemtry(extraction, pd.read_csv(CITIES_ENRICHED))
    logging.info("Data merged with Geometry from cities.csv.")

    extraction_enriched = add_coor(extraction)
    logging.info("Extra columns for lat/lon created from Geometry column.")

    # to_csv does not create missing parent directories, so make sure the
    # output folder exists before writing.
    output_dir = os.path.dirname(OUTPUT)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    extraction_enriched.to_csv(OUTPUT, index=False)
    logging.info("Data enriched and saved.")