# Datathon-2024/app/data_processing/aggregate_data.py
# Last commit 35f2f48 by Birgir: "added data to app"
import csv
from data_processing.age_distribution_by_id import get_age_distribution
from data_processing.get_smallAreaInfo import get_smallAreas
from data_processing.get_density import get_density
from data_processing.income_decile_by_id import get_income_decile
import os
import pandas as pd
import geopandas as gpd
def get_feature_df(years=None, population_year=None):
    """Build a DataFrame with one feature row per small area.

    The small areas come from ``get_smallAreas()``; age and income data are
    looked up per small-area id via ``get_age_distribution`` and
    ``get_income_decile`` using the CSV files under ``given_data``.

    Args:
        years: Years to request age and income data for. Defaults to
            ``[2023, 2024]``.
        population_year: Year whose age buckets are summed to obtain the
            population used for the density calculation. Defaults to the
            latest entry in ``years`` (2024 with the default years). If the
            year is missing from an area's age data, its population is 0.

    Returns:
        A ``pandas.DataFrame`` with the columns ``smallAreaId``, ``density``,
        ``income_distribution_per_year``, ``age_distribution``, ``geometry``
        and ``projected_dwellings``. ``density`` is ``None`` for areas where
        ``get_density`` raised ``ValueError``; ``projected_dwellings`` is
        always ``None`` (placeholder).
    """
    # Input files, resolved relative to the current working directory.
    csv_ibuafjoldi = os.path.join('given_data', 'ibuafjoldi.csv')
    csv_tekjutiundir = os.path.join('given_data', 'tekjutiundir.csv')

    years = [2023, 2024] if years is None else list(years)
    if population_year is None:
        # Latest requested year; stays None (-> population 0) if years is empty.
        population_year = max(years, default=None)

    # Small area id: id of the small area
    # Density: density of the small area, from get_density(geometry, population)
    # Income distribution: income data for the small area as returned by
    #   get_income_decile (per the original notes: keyed by year, values are
    #   decile buckets -- confirm against that helper)
    # Age distribution: age data for the small area, keyed by year, each year
    #   mapping age buckets to counts (per the original notes: 5-year buckets
    #   starting at 0-4 -- confirm against get_age_distribution)
    # Geometry: the small-area polygon as provided by get_smallAreas
    # Projected dwellings: not computed yet
    columns = [
        "smallAreaId",
        "density",
        "income_distribution_per_year",
        "age_distribution",
        "geometry",
        "projected_dwellings",
    ]

    # Each small area is a mapping with at least the keys "id" and "geometry".
    smsv_id_geom = get_smallAreas()
    smsv_ids = [smsv["id"] for smsv in smsv_id_geom]

    # Per-id lookups for the requested years.
    age_distribution = get_age_distribution(years, smsv_ids, csv_ibuafjoldi)
    income_distribution = get_income_decile(years, smsv_ids, csv_tekjutiundir)

    rows = []
    for smsv in smsv_id_geom:
        smsv_id = smsv["id"]
        geometry = smsv["geometry"]

        # Areas without data fall back to empty dicts rather than failing.
        age_dist = age_distribution.get(smsv_id, {})
        income_dist = income_distribution.get(smsv_id, {})

        # Total population = sum over all age buckets of the chosen year.
        # NOTE(review): assumes the year keys are ints like `population_year`;
        # if the helper returns string keys this silently yields 0.
        population = sum(age_dist.get(population_year, {}).values())

        # A failed density calculation must not abort the whole aggregation.
        try:
            density = get_density(geometry, population)
        except ValueError as e:
            print(f"Density calculation failed for {smsv_id}: {e}")
            density = None

        rows.append({
            "smallAreaId": smsv_id,
            "density": density,
            "income_distribution_per_year": income_dist,
            "age_distribution": age_dist,
            "geometry": geometry,
            "projected_dwellings": None,  # Placeholder for now
        })

    return pd.DataFrame(rows, columns=columns)