Spaces:
Sleeping
Sleeping
File size: 3,548 Bytes
e5210b5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 |
import io
import pandas as pd
import geopandas as gpd
from geopy.geocoders import Nominatim
# Nominatim geocoder, used to backfill coordinates that are still missing
# after the FUA-shapefile merge (see geocode_nan below).
geolocator = Nominatim(user_agent='Cities OECD')
import requests
import requests_cache
from retry_requests import retry
from fake_useragent import UserAgent
# Random Chrome User-Agent string so the downloads look like a browser request.
ua = UserAgent(browsers=['Chrome'])
# HTTP session with a 1-hour on-disk response cache ('.cache')...
cache_session = requests_cache.CachedSession('.cache', expire_after = 3600)
# ...wrapped so each request is retried up to 5 times with exponential backoff.
retry_session = retry(cache_session, retries = 5, backoff_factor = 0.2)
# Browser-like request headers sent with every download.
headers = {
'Sec-Fetch-Dest': 'empty',
'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Site': 'same-origin',
'Upgrade-Insecure-Requests': '1',
'Connection': 'keep-alive',
'Accept-Encoding': 'gzip, deflate, br',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Language': 'en-US,en;q=0.9',
'Cache-Control': 'max-age=0',
'User-Agent': ua.random,
}
# OECD SDMX archive endpoint: DF_CITIES dataset, all series from 2018 onward,
# returned as a labelled CSV file.
url_data = 'https://sdmx.oecd.org/archive/rest/data/OECD,DF_CITIES,/all?startPeriod=2018&dimensionAtObservation=AllDimensions&format=csvfilewithlabels'
def geocode_nan(row):
    """Fill a missing 'coordinates' cell by geocoding the metro-area name.

    Intended for ``DataFrame.apply(..., axis=1)``: ``row`` is a Series with
    'Metropolitan areas' (place name) and 'coordinates' (either a
    ``(longitude, latitude)`` tuple or NaN after a left merge).  Returns the
    (possibly updated) row.
    """
    value = row['coordinates']
    # NOTE: pd.isnull() on a tuple returns an element-wise ndarray, which is
    # ambiguous in a boolean context and raised ValueError on every row that
    # already had coordinates.  Only scalar cells can be "missing" here.
    if not isinstance(value, tuple) and pd.isnull(value):
        location = geolocator.geocode(row['Metropolitan areas'], timeout=10)
        # geocode() returns None when the place cannot be resolved; leave the
        # cell missing in that case instead of raising AttributeError.
        if location is not None:
            row['coordinates'] = (location.longitude, location.latitude)
    return row
def oecd_data():
    """Build a per-metro-area table of the latest Income/GDP observations
    with functional-urban-area (FUA) coordinates.

    Downloads the OECD DF_CITIES dataset, keeps the most recent observation
    per (metro area, variable), pivots the variables into columns, then
    attaches ``(longitude, latitude)`` representative points from the OECD
    FUA shapefile, geocoding any areas the shapefile does not cover.

    Side effects: writes 'OECD_DF_CITIES_all.csv', 'OECD_FUAS_shape.zip'
    and 'OECD_DF_CITIES_coord.csv' to the working directory.

    Returns the final merged DataFrame.
    """
    csv = retry_session.get(url_data, headers=headers)
    df = pd.read_csv(
        io.StringIO(csv.text),
        usecols=['METRO_ID', 'Metropolitan areas', 'Variables', 'TIME_PERIOD', 'OBS_VALUE'],
        low_memory=False,
    )
    # Keep only the income/GDP indicator rows.
    variables = ['Income', 'GDP']
    df = df[df['Variables'].str.contains('|'.join(variables))].copy()
    # The latest TIME_PERIOD wins for each (metro, variable) pair.
    df = df.sort_values(['METRO_ID', 'Variables', 'TIME_PERIOD'])
    df = df.drop_duplicates(subset=['METRO_ID', 'Variables'], keep='last')
    # One row per metro area.  IDs of length <= 3 look like country-level
    # aggregates and are dropped -- TODO confirm against the dataset codes.
    df_concat = (
        df[['METRO_ID', 'Metropolitan areas']]
        .sort_values('METRO_ID')
        .drop_duplicates(subset='METRO_ID', keep='first')
        .reset_index(drop=True)
    )
    df_concat = df_concat[df_concat['METRO_ID'].str.len() > 3].reset_index(drop=True).copy()
    # Pivot: one column per variable value, left-merged on METRO_ID.
    for group_name, group_df in df.groupby('Variables'):
        group_df = group_df[['METRO_ID', 'OBS_VALUE']].rename(columns={'OBS_VALUE': group_name})
        df_concat = df_concat.merge(group_df, on='METRO_ID', how='left')
    df_concat.to_csv('OECD_DF_CITIES_all.csv', encoding='utf-8', index=False)

    # FUA boundaries: representative points give one (lon, lat) per area.
    url_shap = 'https://www.oecd.org/content/dam/oecd/en/data/datasets/oecd-definition-of-cities-and-functional-urban-areas/fuas%20(1).zip'
    shape = retry_session.get(url_shap, headers=headers)
    with open('OECD_FUAS_shape.zip', 'wb') as z:
        z.write(shape.content)
    df_fua = gpd.read_file('OECD_FUAS_shape.zip')
    df_fua['centroid'] = df_fua.representative_point()
    df_fua['coordinates'] = df_fua['centroid'].apply(lambda p: (p.x, p.y))
    # Shapefile codes carry a trailing 'F' that METRO_ID does not -- strip it
    # so the merge keys line up.
    df_fua['fuacode'] = df_fua['fuacode'].str.replace(r'F$', '', regex=True)
    df_fua = df_fua[['fuacode', 'coordinates']].rename(columns={'fuacode': 'METRO_ID'})
    df_merged = df_concat.merge(df_fua, on='METRO_ID', how='left')
    # Geocode any area the shapefile merge left without coordinates.
    df_merged = df_merged.apply(geocode_nan, axis=1)
    df_merged.to_csv('OECD_DF_CITIES_coord.csv', encoding='utf-8', index=False)
    return df_merged
|