# actors_matching / get_images_data.py
# nbeuchat's picture
# scripts for downloading actors data and extract embeddings
# 6e89871
# raw history blame
# No virus
# 3.04 kB
import os
import requests
import pandas as pd
import os
import time
from datetime import datetime
from tqdm import tqdm
from dotenv import load_dotenv
# Let python-dotenv populate the environment before the key is looked up.
load_dotenv()

# Bing Image Search subscription key; None when the variable is not set.
BING_API_KEY = os.environ.get("BING_API_KEY")
def get_actor_images(name: str, count: int = 50, api_key: str = BING_API_KEY):
    """Get a list of actor images from the Bing Image Search API.

    Args:
        name: Name of the actor. The search query sent to Bing is
            ``"<name>, actor or actress"``.
        count: Number of images requested from the API.
        api_key: Subscription key sent in the ``Ocp-Apim-Subscription-Key``
            header. Defaults to the module-level ``BING_API_KEY``.

    Returns:
        The decoded JSON body of the response when the API answers with
        HTTP 200, otherwise ``None``.

    Raises:
        ValueError: If ``api_key`` is ``None``.
    """
    if api_key is None:
        raise ValueError("You must provide a Bing API key")

    # Authenticate with the key given by the caller; the module-level
    # BING_API_KEY is only the default value of the parameter.
    headers = {"Ocp-Apim-Subscription-Key": api_key}

    params = {
        "q": f"{name}, actor or actress",
        "count": count,
        "imageType": "Photo",
        "safeSearch": "Strict",
        "imageContent": "Face",
        "freshness": "Year",
    }

    response = requests.get(
        "https://api.bing.microsoft.com/v7.0/images/search",
        headers=headers,
        params=params,
        # Without a timeout a stalled connection would block forever.
        timeout=30,
    )

    # Callers are expected to handle a missing result (None) themselves.
    if response.status_code != 200:
        return None
    return response.json()
def read_actors_list(
    max_actors: int = None,
    last_year_active: int = None,
    sort_by: str = None,
    actors_file: str = "data/imdb_actors.csv",
):
    """Read and filter the list of actors.

    The steps are applied in this order: filter on ``lastYear``, sort,
    then truncate.

    Args:
        max_actors: Keep at most this many rows (taken from the top after
            filtering and sorting). A falsy value (``None`` or ``0``)
            disables the limit.
        last_year_active: Keep only the rows whose ``lastYear`` column is
            greater than or equal to this value. A falsy value disables
            the filter.
        sort_by: Name of the column to sort on, in descending order. A
            falsy value leaves the row order untouched.
        actors_file: Path of the CSV file to read.

    Returns:
        A pandas DataFrame with the selected rows.
    """
    df = pd.read_csv(actors_file)

    if last_year_active:
        df = df[df["lastYear"] >= last_year_active]

    if sort_by:
        df = df.sort_values(sort_by, ascending=False)

    if max_actors:
        df = df.head(max_actors)

    return df
def store_all_actor_images_data(
    max_actors: int = None,
    images_per_actor: int = 10,
    last_year_active: int = None,
    output_file = None,
    max_api_calls_per_second: int = 3
):
    """Get images data for each actor from the Bing Image Search API and store the results as csv.

    When ``output_file`` already exists, its content is loaded first and the
    actors whose ``nconst`` is already present in it are skipped, so an
    interrupted run can be resumed. The accumulated results are written back
    to ``output_file`` after every actor.

    Args:
        max_actors: Forwarded to ``read_actors_list``.
        images_per_actor: Number of images requested per actor.
        last_year_active: Forwarded to ``read_actors_list``.
        output_file: Path of the CSV file used to load previous results and
            to store progress. When falsy, nothing is read or written.
        max_api_calls_per_second: Upper bound on the request rate; the loop
            sleeps ``1 / max_api_calls_per_second`` seconds after each call.

    Raises:
        ValueError: If ``max_api_calls_per_second`` is not positive.
    """
    # Fail before any API call instead of dividing by zero after the first one
    if max_api_calls_per_second <= 0:
        raise ValueError("max_api_calls_per_second must be a positive number")

    df = read_actors_list(max_actors, last_year_active)

    df_im = None
    if output_file:
        try:
            df_im = pd.read_csv(output_file)
        except (FileNotFoundError, pd.errors.EmptyDataError):
            # No usable previous results yet: start from scratch
            df_im = None

    # Remove actors for which we already have images data
    if df_im is not None:
        df = df[~df["nconst"].isin(df_im["nconst"].unique())]

    print(f"Start retrieving images from Bing for {len(df)} actors")
    delay = 1.0 / max_api_calls_per_second
    for _, row in tqdm(df.iterrows(), total=df.shape[0]):
        images_data = get_actor_images(
            name=row["primaryName"],
            count=images_per_actor
        )

        if images_data and "value" in images_data:
            df_im_tmp = pd.DataFrame(images_data["value"])
            df_im_tmp["nconst"] = row["nconst"]
            # Rank of each image in the results returned for this actor
            df_im_tmp["resultPosition"] = list(range(len(df_im_tmp)))

            if df_im is not None:
                df_im = pd.concat([df_im, df_im_tmp])
            else:
                df_im = df_im_tmp

            # Store progress
            if output_file:
                df_im.to_csv(output_file, index=False)
        else:
            # get_actor_images returns None on a non-200 response; skip this
            # actor instead of aborting the whole run. It stays absent from
            # the results, so a later run will try it again.
            tqdm.write(f"No images data received for {row['primaryName']}, skipping")

        # Limit the speed of requests to Bing Search
        time.sleep(delay)
if __name__ == "__main__":
    # Script entry point: the last-active-year threshold is the current
    # year minus five.
    earliest_last_year = datetime.now().year - 5
    store_all_actor_images_data(
        max_actors=1000,
        images_per_actor=20,
        last_year_active=earliest_last_year,
        output_file="data/actors_images.csv",
        max_api_calls_per_second=2,
    )