File size: 3,191 Bytes
6e89871
 
 
 
 
 
 
 
 
 
 
 
 
 
e3012f6
6e89871
 
 
 
 
 
 
75ce42f
e3012f6
 
6e89871
 
 
 
 
 
 
 
 
 
 
 
 
 
75ce42f
 
6e89871
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75ce42f
 
 
 
 
 
 
 
 
6e89871
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75ce42f
 
6e89871
 
75ce42f
6e89871
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
import os
import requests
import pandas as pd
import os
import time

from datetime import datetime
from tqdm import tqdm
from dotenv import load_dotenv

# Run python-dotenv's loader before the environment is read below.
load_dotenv()

# Default Bing subscription key taken from the environment; None when unset.
BING_API_KEY = os.environ.get("BING_API_KEY")

def get_actor_images(name: str, role: str = None, count: int = 50, api_key: str = BING_API_KEY):
    """Query the Bing Image Search API for face photos of an actor.

    Args:
        name: Actor name; it is wrapped in double quotes in the search query.
        role: Optional extra text appended to the query in parentheses.
        count: Value sent as the "count" query parameter.
        api_key: Bing subscription key. Defaults to the module-level
            BING_API_KEY read from the environment.

    Returns:
        The decoded JSON body of the API response.

    Raises:
        ValueError: If no API key is available.
        requests.HTTPError: If the API answers with an error status code.
        requests.Timeout: If the API does not answer within the timeout.
    """
    if api_key is None:
        raise ValueError("You must provide a Bing API key")

    # Authenticate with the key that was validated above, not the module
    # global, so that a key passed explicitly by the caller is honoured.
    headers = {
        "Ocp-Apim-Subscription-Key": api_key
    }

    query = f'"{name}"'
    if role:
        query = f"{query} ({role})"

    params = {
        "q": query,
        "count": count,
        "imageType": "Photo",
        "safeSearch": "Strict",
        "imageContent": "Face",
        "freshness": "Year"
    }

    response = requests.get(
        "https://api.bing.microsoft.com/v7.0/images/search",
        headers=headers,
        params=params,
        # Without a timeout a stalled connection would block the run forever.
        timeout=30
    )

    response.raise_for_status()
    return response.json()

def read_actors_list(max_actors: int = None, last_year_active: int = None, sort_by: str = None):
    """Load the actors table and optionally filter, order and truncate it.

    The steps are always applied in this order: filter on the last active
    year, sort, then keep the leading rows.

    Args:
        max_actors: If truthy, keep only the first `max_actors` rows.
        last_year_active: If truthy, keep only rows whose "lastYear" value is
            greater than or equal to this year.
        sort_by: If truthy, name of the column to sort by, in descending order.

    Returns:
        The resulting DataFrame read from "data/imdb_actors.csv".
    """
    actors = pd.read_csv("data/imdb_actors.csv")

    if last_year_active:
        recent_enough = actors["lastYear"] >= last_year_active
        actors = actors[recent_enough]

    if sort_by:
        actors = actors.sort_values(by=sort_by, ascending=False)

    if max_actors:
        actors = actors.head(max_actors)
    return actors

def store_all_actor_images_data(
    max_actors: int = None,
    images_per_actor: int = 10,
    last_year_active: int = None,
    output_file = None,
    max_api_calls_per_second: int = 3
):
    """Fetch Bing image search results for each actor and store them as csv.

    Actors whose "nconst" already appears in `output_file` are skipped, so an
    interrupted run can be resumed. The accumulated results are written back
    to `output_file` after every successful request.

    Args:
        max_actors: Passed to `read_actors_list` to cap the number of actors.
        images_per_actor: Number of images requested per actor.
        last_year_active: Passed to `read_actors_list` to filter actors.
        output_file: Path of the csv holding the results. When falsy, nothing
            is read from or written to disk.
        max_api_calls_per_second: Upper bound on the request rate. A value of
            zero or less disables throttling.

    Returns:
        The DataFrame with all collected image rows (previously stored rows
        included), or None when there is nothing stored and nothing fetched.
    """

    df = read_actors_list(max_actors, last_year_active)
    df_im = None
    if output_file:
        try:
            df_im = pd.read_csv(output_file)
        except (FileNotFoundError, pd.errors.EmptyDataError):
            # No usable progress file yet: start from scratch. Any other read
            # error propagates so existing data is never silently overwritten.
            df_im = None

    # Remove actors for which we already have images data
    if df_im is not None:
        df = df[~df["nconst"].isin(df_im["nconst"].unique())]

    # Pause between two requests; guard against a zero or negative rate.
    if max_api_calls_per_second > 0:
        pause = 1.0 / max_api_calls_per_second
    else:
        pause = 0.0

    print(f"Start retrieving images from Bing for {len(df)} actors")
    for _, row in tqdm(df.iterrows(), total=df.shape[0]):
        try:
            images_data = get_actor_images(
                name=row["primaryName"],
                count=images_per_actor
            )
        except Exception as e:
            print(e)
            # Throttle after failures as well, so that errors such as rate
            # limiting are not followed by an immediate burst of requests.
            time.sleep(pause)
            continue

        df_im_tmp = pd.DataFrame(images_data["value"])
        df_im_tmp["nconst"] = row["nconst"]
        df_im_tmp["resultPosition"] = list(range(0, len(df_im_tmp)))

        if df_im is not None:
            df_im = pd.concat([df_im, df_im_tmp])
        else:
            df_im = df_im_tmp

        # Store progress (only when a destination was given)
        if output_file:
            df_im.to_csv(output_file, index=False)

        # Limit the speed of requests to Bing Search
        time.sleep(pause)

    return df_im


if __name__ == "__main__":
    # Only actors whose last active year falls within the past five years.
    earliest_active_year = datetime.now().year - 5

    store_all_actor_images_data(
        max_actors=2000,
        images_per_actor=20,
        last_year_active=earliest_active_year,
        output_file="data/actors_images_new.csv",
        max_api_calls_per_second=100,
    )