File size: 4,376 Bytes
7ee9809 71d9b4a 7ee9809 71d9b4a f89fe21 71d9b4a f89fe21 71d9b4a f89fe21 71d9b4a f89fe21 69f7892 f89fe21 71d9b4a f89fe21 71d9b4a f89fe21 71d9b4a 1d0d64c 71d9b4a f89fe21 71d9b4a f89fe21 1d0d64c f89fe21 71d9b4a f89fe21 71d9b4a f89fe21 71d9b4a f89fe21 1d0d64c f89fe21 71d9b4a 2cd2cef 71d9b4a 69f7892 f89fe21 71d9b4a f89fe21 71d9b4a f89fe21 71d9b4a f89fe21 71d9b4a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 |
import pandas as pd
import geopandas as gpd
from pyeuropeana import apis
def fetch_and_process_data(query, world_gdf, rows=1000, additional_params=None):
    """
    Query the Europeana API and place every returned object on a world map.

    Args:
        query (str): Query string to search for
        world_gdf (GeoDataFrame): World map GeoDataFrame for the spatial join;
            its 'geometry' and 'name' columns are read
        rows (int): Number of rows to retrieve (default 1000)
        additional_params (dict): Extra keyword arguments forwarded to
            ``apis.search`` (optional)

    Returns:
        tuple: (pandas.DataFrame, geopandas.GeoDataFrame)
            - First DataFrame: one row per returned item, with an added
              'object_location' column holding the 'name' of the polygon that
              contains the item's point, or "Unknown" when none does
            - Second GeoDataFrame: copy of ``world_gdf`` with an added integer
              'object_count' column (objects located in each named polygon)

    Raises:
        ValueError: If the API response does not contain an 'items' entry
    """
    # Query the Europeana API
    response = apis.search(
        query=query,
        rows=rows,
        **(additional_params or {})
    )

    # Check if the response contains the expected data
    if "items" not in response:
        raise ValueError("API response does not contain 'items'")

    # Create the initial DataFrame; only these fields are kept from each item
    myquery_df = pd.DataFrame(
        response["items"],
        columns=['edmPlaceLatitude', 'edmPlaceLongitude', 'id', 'country', 'dataProvider', 'dcCreator', 'edmPreview']
    )

    # Take the first element of non-empty lists, return anything else unchanged
    def extract_single(x):
        return x[0] if isinstance(x, list) and len(x) > 0 else x

    # 'id' is deliberately not in this list and is left as returned
    for col in ['edmPlaceLatitude', 'edmPlaceLongitude', 'country', 'dataProvider', 'dcCreator', 'edmPreview']:
        myquery_df[col] = myquery_df[col].apply(extract_single)

    # Convert latitude and longitude to numbers; unparseable values become NaN
    myquery_df['edmPlaceLatitude'] = pd.to_numeric(myquery_df['edmPlaceLatitude'], errors='coerce')
    myquery_df['edmPlaceLongitude'] = pd.to_numeric(myquery_df['edmPlaceLongitude'], errors='coerce')

    # Create a GeoDataFrame of points from the coordinates.
    # NOTE(review): the points are declared as EPSG:4326 and are not
    # reprojected; world_gdf is presumably in the same CRS - confirm.
    gdf = gpd.GeoDataFrame(
        myquery_df,
        geometry=gpd.points_from_xy(myquery_df.edmPlaceLongitude, myquery_df.edmPlaceLatitude),
        crs="EPSG:4326"
    )

    # Perform spatial join
    gdf_with_country = gpd.sjoin(gdf, world_gdf[['geometry', 'name']], how='left', predicate='within')

    # A left join yields one row per (point, polygon) match, so a point lying
    # inside overlapping polygons repeats its index label. Keep only the first
    # match per point: otherwise the index-aligned assignment below fails on
    # duplicate labels and the object is counted once per overlapping polygon.
    gdf_with_country = gdf_with_country[~gdf_with_country.index.duplicated(keep='first')]

    # Add the matched polygon name to the original DataFrame (aligned on index);
    # points that fall within no polygon are labelled "Unknown"
    myquery_df['object_location'] = gdf_with_country['name'].fillna("Unknown")

    # Create world_counts directly from the spatial join result; unmatched
    # points have a NaN name and are therefore not counted for any polygon
    world_counts = world_gdf.copy()
    country_counts = gdf_with_country['name'].value_counts()
    world_counts['object_count'] = world_counts['name'].map(country_counts).fillna(0).astype(int)

    return myquery_df, world_counts
def get_data(query, world_gdf, rows=1000):
    """
    Run a general Europeana search and return the processed DataFrames.

    Convenience wrapper that forwards its arguments to
    ``fetch_and_process_data`` without any additional API parameters.

    Args:
        query (str): Query string to search for
        world_gdf (GeoDataFrame): World map GeoDataFrame for spatial join
        rows (int): Number of rows to retrieve (default 1000)

    Returns:
        tuple: (pandas.DataFrame, geopandas.GeoDataFrame), exactly as returned
        by ``fetch_and_process_data``
    """
    processed = fetch_and_process_data(query=query, world_gdf=world_gdf, rows=rows)
    return processed
def get_provider_data(provider_name, world_gdf, rows=1000):
    """
    Fetch and process Europeana records belonging to one data provider.

    The provider restriction is sent as a 'qf' parameter alongside a fixed
    query on the ``pl_wgs84_pos_lat`` field.

    Args:
        provider_name (str): Name of the data provider to query. It is placed
            verbatim between double quotes in the filter; no escaping is done
        world_gdf (GeoDataFrame): World map GeoDataFrame for spatial join
        rows (int): Number of rows to retrieve (default 1000)

    Returns:
        tuple: (pandas.DataFrame, geopandas.GeoDataFrame), exactly as returned
        by ``fetch_and_process_data``
    """
    # Presumably limits results to records that carry a latitude - confirm
    # against the Europeana search syntax.
    geo_query = 'pl_wgs84_pos_lat:(*)'
    provider_filter = f'DATA_PROVIDER:"{provider_name}"'
    return fetch_and_process_data(
        geo_query,
        world_gdf,
        rows,
        additional_params={'qf': provider_filter},
    )
def aggregate_location_counts(df):
    """
    Count how many rows fall under each distinct object_location.

    Args:
        df (pandas.DataFrame): DataFrame containing an 'object_location' column

    Returns:
        pandas.DataFrame: Two columns, 'object_location' and 'count', ordered
        from the most to the least frequent location
    """
    per_location = df['object_location'].value_counts()
    # Name the index and the values explicitly so the resulting column labels
    # do not depend on how value_counts labels its output.
    tally = per_location.rename_axis('object_location').reset_index(name='count')
    return tally.sort_values(by='count', ascending=False)
|