Spaces:
Sleeping
Sleeping
import io | |
import pandas as pd | |
import geopandas as gpd | |
from geopy.geocoders import Nominatim | |
geolocator = Nominatim(user_agent='Cities OECD') | |
import requests | |
import requests_cache | |
from retry_requests import retry | |
from fake_useragent import UserAgent | |
ua = UserAgent(browsers=['Chrome']) | |
cache_session = requests_cache.CachedSession('.cache', expire_after = 3600) | |
retry_session = retry(cache_session, retries = 5, backoff_factor = 0.2) | |
headers = { | |
'Sec-Fetch-Dest': 'empty', | |
'Sec-Fetch-Mode': 'cors', | |
'Sec-Fetch-Site': 'same-origin', | |
'Upgrade-Insecure-Requests': '1', | |
'Connection': 'keep-alive', | |
'Accept-Encoding': 'gzip, deflate, br', | |
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7', | |
'Accept-Language': 'en-US,en;q=0.9', | |
'Cache-Control': 'max-age=0', | |
'User-Agent': ua.random, | |
} | |
url_data = 'https://sdmx.oecd.org/archive/rest/data/OECD,DF_CITIES,/all?startPeriod=2018&dimensionAtObservation=AllDimensions&format=csvfilewithlabels' | |
def geocode_nan(row): | |
if pd.isnull(row['coordinates']): | |
geolocate = row['Metropolitan areas'] | |
location = geolocator.geocode(geolocate, timeout=10) | |
row['coordinates'] = (location.longitude, location.latitude) | |
else: | |
row['coordinates'] = row['coordinates'] | |
return row | |
def oecd_data(): | |
csv = retry_session.get(url_data, headers=headers) | |
df = pd.read_csv(io.StringIO(csv.text), usecols=['METRO_ID', 'Metropolitan areas', 'Variables', 'TIME_PERIOD', 'OBS_VALUE'], low_memory=False) | |
variables = ['Income', 'GDP'] | |
df = df[df['Variables'].str.contains('|'.join(variables))].copy() | |
df = df.sort_values(['METRO_ID', 'Variables', 'TIME_PERIOD'], ascending=[True, True, True]) | |
df = df.drop_duplicates(subset=['METRO_ID', 'Variables'], keep='last') | |
df_concat = df[['METRO_ID', 'Metropolitan areas']].sort_values('METRO_ID').drop_duplicates(subset='METRO_ID', keep='first').reset_index(drop=True) | |
df_concat = df_concat[df_concat['METRO_ID'].str.len() > 3].reset_index(drop=True).copy() | |
df_groups = df.groupby('Variables') | |
for group in df_groups: | |
group_name = group[0] | |
group = group[1][['METRO_ID', 'OBS_VALUE']] | |
group = group.rename(columns={'OBS_VALUE': group_name}) | |
df_concat = df_concat.merge(group, on='METRO_ID', how='left') | |
df_concat.to_csv('OECD_DF_CITIES_all.csv', encoding='utf-8', index=False) | |
url_shap = 'https://www.oecd.org/content/dam/oecd/en/data/datasets/oecd-definition-of-cities-and-functional-urban-areas/fuas%20(1).zip' | |
shape = retry_session.get(url_shap, headers=headers) | |
with open('OECD_FUAS_shape.zip', 'wb') as z: | |
z.write(shape.content) | |
df_fua = gpd.read_file('OECD_FUAS_shape.zip') | |
df_fua['centroid'] = df_fua.representative_point() | |
df_fua['coordinates'] = df_fua['centroid'].apply(lambda p: (p.x, p.y)) | |
df_fua['fuacode'] = df_fua['fuacode'].str.replace(r'F$', '', regex=True) | |
df_fua = df_fua[['fuacode', 'coordinates']] | |
df_fua = df_fua.rename(columns={'fuacode': 'METRO_ID'}) | |
df_merged = df_concat.merge(df_fua, on='METRO_ID', how='left') | |
#df_merged[['Longitude', 'Latitude']] = df_merged.apply(lambda x: geocode_nan(x) if pd.isnull(x['Longitude']) else [x['Longitude'], x['Latitude']], axis=1, result_type='expand') | |
df_merged = df_merged.apply(geocode_nan, axis=1) | |
df_merged.to_csv('OECD_DF_CITIES_coord.csv', encoding='utf-8', index=False) | |
return df_merged | |