|
import pandas as pd |
|
import geopandas as gpd |
|
from pyeuropeana import apis |
|
|
|
def fetch_and_process_data(query, world_gdf, rows=1000, additional_params=None):
    """
    Query the Europeana API and locate every returned object on the world map.

    Args:
        query (str): Query string to search for
        world_gdf (GeoDataFrame): World map GeoDataFrame for the spatial join;
            its 'geometry' and 'name' columns are used
        rows (int): Number of rows to retrieve (default 1000)
        additional_params (dict): Additional keyword arguments forwarded to
            ``apis.search`` (optional)

    Returns:
        tuple: (pandas.DataFrame, geopandas.GeoDataFrame)
            - First DataFrame: one row per returned item, with an added
              'object_location' column holding the matched country name, or
              "Unknown" when no country polygon contains the point
            - Second GeoDataFrame: copy of ``world_gdf`` with an added integer
              'object_count' column (number of located objects per country)

    Raises:
        ValueError: If the API response does not contain an 'items' entry.
    """
    list_valued_columns = [
        'edmPlaceLatitude',
        'edmPlaceLongitude',
        'country',
        'dataProvider',
        'dcCreator',
        'edmPreview',
    ]
    selected_columns = ['edmPlaceLatitude', 'edmPlaceLongitude', 'id'] + list_valued_columns[2:]

    def extract_single(x):
        # List-valued fields are collapsed to their first entry; an empty list
        # carries no value, so it becomes a missing value instead of leaking
        # a list object into the column.
        if isinstance(x, list):
            return x[0] if x else None
        return x

    response = apis.search(
        query=query,
        rows=rows,
        **(additional_params or {})
    )

    if "items" not in response:
        raise ValueError("API response does not contain 'items'")

    myquery_df = pd.DataFrame(response["items"], columns=selected_columns)

    for col in list_valued_columns:
        myquery_df[col] = myquery_df[col].apply(extract_single)

    # Anything that cannot be parsed as a number becomes NaN.
    for coord in ('edmPlaceLatitude', 'edmPlaceLongitude'):
        myquery_df[coord] = pd.to_numeric(myquery_df[coord], errors='coerce')

    # NOTE(review): points are built as EPSG:4326; world_gdf is presumably in
    # the same CRS -- confirm with callers, sjoin does not reproject.
    gdf = gpd.GeoDataFrame(
        myquery_df,
        geometry=gpd.points_from_xy(myquery_df.edmPlaceLongitude, myquery_df.edmPlaceLatitude),
        crs="EPSG:4326"
    )

    gdf_with_country = gpd.sjoin(gdf, world_gdf[['geometry', 'name']], how='left', predicate='within')

    # A left join emits one row per matching polygon, so a point lying in
    # overlapping polygons repeats its index label. Keep a single match per
    # object: duplicated labels would break the aligned assignment below and
    # would count the same object more than once.
    gdf_with_country = gdf_with_country[~gdf_with_country.index.duplicated(keep='first')]

    matched_names = gdf_with_country['name'].reindex(myquery_df.index)
    myquery_df['object_location'] = matched_names.fillna("Unknown")

    world_counts = world_gdf.copy()
    country_counts = gdf_with_country['name'].value_counts()
    # Countries without any located object get a count of 0.
    world_counts['object_count'] = world_counts['name'].map(country_counts).fillna(0).astype(int)

    return myquery_df, world_counts
|
|
|
def get_data(query, world_gdf, rows=1000):
    """
    Run a free-form Europeana query and return the processed results.

    Thin convenience wrapper: forwards its arguments unchanged to
    ``fetch_and_process_data`` without any additional API parameters.

    Args:
        query (str): Query string to search for
        world_gdf (GeoDataFrame): World map GeoDataFrame for spatial join
        rows (int): Number of rows to retrieve (default 1000)

    Returns:
        tuple: (pandas.DataFrame, geopandas.GeoDataFrame) exactly as returned
        by ``fetch_and_process_data``
    """
    processed = fetch_and_process_data(query=query, world_gdf=world_gdf, rows=rows)
    return processed
|
|
|
def get_provider_data(provider_name, world_gdf, rows=1000):
    """
    Query Europeana API for a specific data provider and return processed DataFrames.

    The search query is fixed to 'pl_wgs84_pos_lat:(*)' and the provider is
    applied as a 'qf' filter of the form DATA_PROVIDER:"<name>".

    Args:
        provider_name (str): Name of the data provider to query
        world_gdf (GeoDataFrame): World map GeoDataFrame for spatial join
        rows (int): Number of rows to retrieve (default 1000)

    Returns:
        tuple: (pandas.DataFrame, geopandas.GeoDataFrame) as returned by
        ``fetch_and_process_data``
    """
    query = 'pl_wgs84_pos_lat:(*)'

    # The name is embedded in a double-quoted phrase, so a literal backslash
    # or double quote inside it must be escaped; otherwise the quote would
    # terminate the phrase early and corrupt the filter.
    escaped_name = provider_name.replace('\\', '\\\\').replace('"', '\\"')
    additional_params = {'qf': f'DATA_PROVIDER:"{escaped_name}"'}

    return fetch_and_process_data(query, world_gdf, rows, additional_params)
|
|
|
|
|
def aggregate_location_counts(df):
    """
    Aggregate the data by object_location and get counts.

    Args:
        df (pandas.DataFrame): DataFrame containing 'object_location' column

    Returns:
        pandas.DataFrame: Two columns, 'object_location' and 'count', sorted by
        count in descending order. Locations with equal counts keep the
        relative order produced by ``value_counts``.
    """
    location_counts = df['object_location'].value_counts().reset_index()
    # Column labels produced by reset_index() differ between pandas versions,
    # so they are set explicitly.
    location_counts.columns = ['object_location', 'count']

    # A stable sort keeps tied locations in a deterministic order; the default
    # quicksort gives no such guarantee.
    return location_counts.sort_values('count', ascending=False, kind='stable')
|
|