File size: 4,376 Bytes
7ee9809
 
71d9b4a
7ee9809
71d9b4a
f89fe21
71d9b4a
 
f89fe21
71d9b4a
f89fe21
 
71d9b4a
 
f89fe21
69f7892
 
 
f89fe21
 
71d9b4a
 
 
 
f89fe21
71d9b4a
 
 
 
 
f89fe21
71d9b4a
 
1d0d64c
71d9b4a
 
f89fe21
 
 
71d9b4a
f89fe21
1d0d64c
f89fe21
71d9b4a
f89fe21
 
 
71d9b4a
f89fe21
 
71d9b4a
 
f89fe21
 
 
 
1d0d64c
f89fe21
 
 
 
 
 
71d9b4a
2cd2cef
 
 
 
71d9b4a
69f7892
f89fe21
71d9b4a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f89fe21
 
 
71d9b4a
f89fe21
 
71d9b4a
f89fe21
 
 
 
 
71d9b4a
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import pandas as pd
import geopandas as gpd
from pyeuropeana import apis

def fetch_and_process_data(query, world_gdf, rows=1000, additional_params=None):
    """
    Query the Europeana API and place the returned objects on a world map.

    Args:
    query (str): Query string to search for
    world_gdf (GeoDataFrame): World map GeoDataFrame for spatial join; it is
        indexed with ['geometry', 'name'], so both columns must exist
    rows (int): Number of rows to retrieve (default 1000)
    additional_params (dict): Additional keyword arguments forwarded to
        apis.search (optional)

    Returns:
    tuple: (pandas.DataFrame, geopandas.GeoDataFrame)
        - First DataFrame: One row per returned item, with an extra
          'object_location' column holding the matching world_gdf 'name'
          (or "Unknown" when no polygon contains the point)
        - Second GeoDataFrame: Copy of world_gdf with an integer
          'object_count' column giving the number of objects per 'name'

    Raises:
    ValueError: If the API response does not contain 'items'
    """
    # Query the Europeana API
    response = apis.search(
        query=query,
        rows=rows,
        **(additional_params or {})
    )

    # Check if the response contains the expected data
    if "items" not in response:
        raise ValueError("API response does not contain 'items'")

    # Create initial DataFrame; keys missing from an item become NaN
    myquery_df = pd.DataFrame(
        response["items"],
        columns=['edmPlaceLatitude', 'edmPlaceLongitude', 'id', 'country', 'dataProvider', 'dcCreator', 'edmPreview']
    )

    # Function to extract single value from list or return original value
    def extract_single(x):
        if isinstance(x, list):
            # An empty list carries no value: use None so the cell is treated
            # as missing instead of keeping an unhashable [] in the frame.
            return x[0] if x else None
        return x

    # Apply extraction to relevant columns
    for col in ['edmPlaceLatitude', 'edmPlaceLongitude', 'country', 'dataProvider', 'dcCreator', 'edmPreview']:
        myquery_df[col] = myquery_df[col].apply(extract_single)

    # Convert latitude and longitude to float type (unparseable values -> NaN)
    myquery_df['edmPlaceLatitude'] = pd.to_numeric(myquery_df['edmPlaceLatitude'], errors='coerce')
    myquery_df['edmPlaceLongitude'] = pd.to_numeric(myquery_df['edmPlaceLongitude'], errors='coerce')

    # Create a GeoDataFrame from the DataFrame
    gdf = gpd.GeoDataFrame(
        myquery_df,
        geometry=gpd.points_from_xy(myquery_df.edmPlaceLongitude, myquery_df.edmPlaceLatitude),
        crs="EPSG:4326"
    )

    # Perform spatial join
    gdf_with_country = gpd.sjoin(gdf, world_gdf[['geometry', 'name']], how='left', predicate='within')

    # A point lying inside several (overlapping) polygons yields one joined row
    # per polygon, all sharing the same index label. Keep only the first match
    # per object: duplicate labels would otherwise make the column assignment
    # below fail, and each object should be counted once.
    matched_names = gdf_with_country['name']
    matched_names = matched_names[~matched_names.index.duplicated(keep='first')]

    # Add the new column to the original DataFrame
    myquery_df['object_location'] = matched_names

    # Fill NaN values (points that don't fall within any country) with "Unknown"
    myquery_df['object_location'] = myquery_df['object_location'].fillna("Unknown")

    # Create world_counts directly from the spatial join result
    world_counts = world_gdf.copy()
    country_counts = matched_names.value_counts()
    world_counts['object_count'] = world_counts['name'].map(country_counts).fillna(0).astype(int)

    return myquery_df, world_counts

def get_data(query, world_gdf, rows=1000):
    """
    Run a free-form Europeana query and return the processed result frames.

    This is a convenience wrapper around fetch_and_process_data that passes
    no additional API parameters.

    Args:
    query (str): Query string to search for
    world_gdf (GeoDataFrame): World map GeoDataFrame for spatial join
    rows (int): Number of rows to retrieve (default 1000)

    Returns:
    tuple: (pandas.DataFrame, geopandas.GeoDataFrame) exactly as returned by
        fetch_and_process_data
    """
    return fetch_and_process_data(
        query=query,
        world_gdf=world_gdf,
        rows=rows,
    )

def get_provider_data(provider_name, world_gdf, rows=1000):
    """
    Query Europeana API for a specific data provider and return processed DataFrames.

    The provider name is embedded in a quoted DATA_PROVIDER filter (the 'qf'
    parameter). Backslashes and double quotes in the name are escaped so that
    a name containing them cannot terminate the quoted phrase early.

    Args:
    provider_name (str): Name of the data provider to query
    world_gdf (GeoDataFrame): World map GeoDataFrame for spatial join
    rows (int): Number of rows to retrieve (default 1000)

    Returns:
    tuple: (pandas.DataFrame, geopandas.GeoDataFrame) exactly as returned by
        fetch_and_process_data
    """
    # Escape backslashes first so the backslashes added for quotes are not
    # themselves doubled.
    escaped_name = str(provider_name).replace('\\', '\\\\').replace('"', '\\"')

    # NOTE(review): presumably this wildcard restricts results to records that
    # carry a latitude value - confirm against the Europeana field reference.
    query = 'pl_wgs84_pos_lat:(*)'
    additional_params = {'qf': f'DATA_PROVIDER:"{escaped_name}"'}
    return fetch_and_process_data(query, world_gdf, rows, additional_params)


def aggregate_location_counts(df):
    """
    Count how many rows fall under each object_location value.

    Args:
    df (pandas.DataFrame): DataFrame containing 'object_location' column

    Returns:
    pandas.DataFrame: Two columns, 'object_location' and 'count', with one
        row per distinct location, ordered from most to least frequent
    """
    tally = df['object_location'].value_counts()
    # Name the index and the values explicitly so the resulting column labels
    # do not depend on what value_counts() calls them.
    counts_frame = tally.rename_axis('object_location').reset_index(name='count')
    return counts_frame.sort_values('count', ascending=False)