nbeuchat committed on
Commit
75ce42f
1 Parent(s): 0cecccf

fix image gathering

Browse files
data/actors_embeddings.csv CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8b2d6501a7fa59db2646f9d0438afe0e07358bd7d66eb00199227b3af2d1e26f
3
- size 54033196
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c9f1da52b8d6f8926a9aac335a4125f646359c5d5a882aea9ded679e4066f057
3
+ size 36828171
data/actors_images.csv CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:34e85e657e5e52e4467da41f1fce427bd07f22fefdac060e7eb136838a4e6d29
3
- size 19246721
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4e859801f01b0dd87938c23be5211a66244489b7cdcd784a5c4dc008f3964869
3
+ size 38713146
data/imdb_actors.csv CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:eaffa4fa57bad732d00ecd0c4567bebee05f2c0f6f86325cd4d4600e9ca51ff9
3
- size 4444297
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a95d36387eb646a14ea8038d3d02efbfa6d424d69d32a8b931ff8331d1951b97
3
+ size 7829655
get_images_data.py CHANGED
@@ -20,7 +20,7 @@ def get_actor_images(name: str, count: int = 50, api_key: str = BING_API_KEY):
20
  headers = {
21
  "Ocp-Apim-Subscription-Key": BING_API_KEY
22
  }
23
- query = f"{name}, actor or actress"
24
  params = {
25
  "q": query,
26
  "count": count,
@@ -35,8 +35,8 @@ def get_actor_images(name: str, count: int = 50, api_key: str = BING_API_KEY):
35
  params=params
36
  )
37
 
38
- if response.status_code == 200:
39
- return response.json()
40
 
41
  def read_actors_list(max_actors: int = None, last_year_active: int = None, sort_by: str = None):
42
  """Read and filter the list of actors"""
@@ -77,10 +77,15 @@ def store_all_actor_images_data(
77
 
78
  print(f"Start retrieving images from Bing for {len(df)} actors")
79
  for _, row in tqdm(df.iterrows(), total=df.shape[0]):
80
- images_data = get_actor_images(
81
- name=row["primaryName"],
82
- count=images_per_actor
83
- )
 
 
 
 
 
84
  df_im_tmp = pd.DataFrame(images_data["value"])
85
  df_im_tmp["nconst"] = row["nconst"]
86
  df_im_tmp["resultPosition"] = list(range(0, len(df_im_tmp)))
@@ -99,9 +104,9 @@ def store_all_actor_images_data(
99
 
100
  if __name__ == "__main__":
101
  store_all_actor_images_data(
102
- output_file="data/actors_images.csv",
103
- max_actors=1000,
104
  images_per_actor=20,
105
  last_year_active=datetime.now().year - 5,
106
- max_api_calls_per_second=2
107
  )
 
20
  headers = {
21
  "Ocp-Apim-Subscription-Key": BING_API_KEY
22
  }
23
+ query = f'"{name}"'
24
  params = {
25
  "q": query,
26
  "count": count,
 
35
  params=params
36
  )
37
 
38
+ response.raise_for_status()
39
+ return response.json()
40
 
41
  def read_actors_list(max_actors: int = None, last_year_active: int = None, sort_by: str = None):
42
  """Read and filter the list of actors"""
 
77
 
78
  print(f"Start retrieving images from Bing for {len(df)} actors")
79
  for _, row in tqdm(df.iterrows(), total=df.shape[0]):
80
+ try:
81
+ images_data = get_actor_images(
82
+ name=row["primaryName"],
83
+ count=images_per_actor
84
+ )
85
+ except Exception as e:
86
+ print(e)
87
+ continue
88
+
89
  df_im_tmp = pd.DataFrame(images_data["value"])
90
  df_im_tmp["nconst"] = row["nconst"]
91
  df_im_tmp["resultPosition"] = list(range(0, len(df_im_tmp)))
 
104
 
105
  if __name__ == "__main__":
106
  store_all_actor_images_data(
107
+ output_file="data/actors_images_new.csv",
108
+ max_actors=2000,
109
  images_per_actor=20,
110
  last_year_active=datetime.now().year - 5,
111
+ max_api_calls_per_second=100
112
  )
process_images.py CHANGED
@@ -21,7 +21,7 @@ def get_embeddings(url: str):
21
  print(e)
22
 
23
  def process_all_images(input_file, output_file):
24
- df = pd.read_csv(input_file)[["nconst","contentUrl"]]
25
 
26
  try:
27
  df_emb = pd.read_csv(output_file)
@@ -31,11 +31,13 @@ def process_all_images(input_file, output_file):
31
  df_emb = pd.DataFrame(columns=list(df.columns) + ["embeddings"])
32
 
33
  print(f"Start processing of {df.shape[0]} images")
34
- df = df.sample(frac=1) # shuffle so you get some images for everybody while it's running
 
35
  for i, row in tqdm(df.iterrows(), total=df.shape[0]):
36
  embeddings = get_embeddings(row["contentUrl"])
37
  new_row = row.copy()
38
  new_row["embeddings"] = embeddings
 
39
  df_emb = df_emb.append(new_row, ignore_index=True)
40
 
41
  if i % 5 == 0:
 
21
  print(e)
22
 
23
  def process_all_images(input_file, output_file):
24
+ df = pd.read_csv(input_file)[["nconst","contentUrl","resultPosition"]]
25
 
26
  try:
27
  df_emb = pd.read_csv(output_file)
 
31
  df_emb = pd.DataFrame(columns=list(df.columns) + ["embeddings"])
32
 
33
  print(f"Start processing of {df.shape[0]} images")
34
+ df = df.sort_values("resultPosition", ascending=True)
35
+ #df = df.sample(frac=1) # shuffle so you get some images for everybody while it's running
36
  for i, row in tqdm(df.iterrows(), total=df.shape[0]):
37
  embeddings = get_embeddings(row["contentUrl"])
38
  new_row = row.copy()
39
  new_row["embeddings"] = embeddings
40
+ new_row = new_row[["nconst", "contentUrl", "embeddings"]]
41
  df_emb = df_emb.append(new_row, ignore_index=True)
42
 
43
  if i % 5 == 0: