nbeuchat committed
Commit be3b0b4
1 Parent(s): b41b1f4

black py files
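
The changes below are formatting-only: each .py file is rewritten by the Black code formatter (two blank lines around top-level definitions, double quotes, trailing commas, and line-length wrapping), with no change in behavior. The diffs are consistent with a default Black invocation such as black app.py pipeline/ from the repository root; the exact command is not recorded in the commit, so that invocation is an assumption.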
app.py CHANGED
@@ -5,11 +5,12 @@ from pathlib import Path
 
 annoy_index, actors_mapping = load_annoy_index()
 
+
 def get_image_html(actor: dict):
     url = actor["url"]
     name = actor["name"]
     imdb_url = f"https://www.imdb.com/name/{actor['nconst']}/"
-    return f'''
+    return f"""
     <div style="position: relative; text-align: center; color: white;">
         <img src="{url}" alt="{name} matches the input image" style="height: 500px">
         <div style="padding: 0.2em; position: absolute; bottom: 16px; left: 16px; background-color: #aacccccc; font-size: 2em;">
@@ -17,20 +18,23 @@ def get_image_html(actor: dict):
             <p style="font-size:0.5em"><a href={imdb_url} target="_blank">Click to see on IMDb</></p>
         </div>
     </div>
-    '''
+    """
+
 
 def no_faces_found_html():
     return f"""<div>No faces found in the picture</div>"""
 
+
 def get_best_matches(image, n_matches: int):
     return analyze_image(image, annoy_index=annoy_index, n_matches=n_matches)
 
+
 def find_matching_actors(input_img, title, n_matches: int = 10):
     best_matches_list = get_best_matches(input_img, n_matches=n_matches)
 
     # TODO: allow looping through characters
     if best_matches_list:
-        best_matches = best_matches_list[0]
+        best_matches = best_matches_list[0]
 
         # TODO: Show how the initial image was parsed (ie: which person is displayed)
 
@@ -45,8 +49,9 @@ def find_matching_actors(input_img, title, n_matches: int = 10):
     # No matches
     return [no_faces_found_html()]
 
+
 iface = gr.Interface(
-    find_matching_actors,
+    find_matching_actors,
     title="Which actor or actress looks like you?",
     description="""Who is the best person to play a movie about you? Upload a picture and find out!
     Or maybe you'd like to know who would best interpret your favorite historical character?
@@ -54,19 +59,24 @@ iface = gr.Interface(
     and limitations of the tool!""",
     article=Path("README.md").read_text(),
     inputs=[
-        gr.inputs.Image(shape=(256, 256), label="Your image"),
-        gr.inputs.Textbox(label="Who's that?", placeholder="Optional, you can leave this blank"),
-        #gr.inputs.Slider(minimum=1, maximum=10, step=1, default=5, label="Number of matches"),
-    ],
+        gr.inputs.Image(shape=(256, 256), label="Your image"),
+        gr.inputs.Textbox(
+            label="Who's that?", placeholder="Optional, you can leave this blank"
+        ),
+        # gr.inputs.Slider(minimum=1, maximum=10, step=1, default=5, label="Number of matches"),
+    ],
     outputs=gr.outputs.Carousel(gr.outputs.HTML(), label="Matching actors & actresses"),
     examples=[
         ["images/example_rb_ginsburg.jpg", "RB Ginsburg in 1977"],
-        ["images/example_hannibal_barca.jpg", "Hannibal (the one with the elephants...)"],
+        [
+            "images/example_hannibal_barca.jpg",
+            "Hannibal (the one with the elephants...)",
+        ],
         ["images/example_frederick_douglass.jpg", "Frederik Douglass"],
         ["images/example_leonardo_davinci.jpg", "Leonoardo da Vinci"],
         ["images/example_joan_of_arc.jpg", "Jeanne d'Arc"],
         ["images/example_sun_tzu.jpg", "Sun Tzu"],
-    ]
+    ],
 )
 
 iface.launch()
pipeline/download_imdb_data.py CHANGED
@@ -4,10 +4,12 @@ import shutil
 from urllib.request import urlretrieve
 from tqdm import tqdm
 
+
 def download_large_file(url: str, output_file: str):
     if not os.path.exists(output_file):
         urlretrieve(url, output_file)
 
+
 def unzip_file(input_file):
     output_file = os.path.splitext(input_file)[0]
     if not os.path.exists(output_file):
@@ -16,17 +18,17 @@ def unzip_file(input_file)
             with open(output_file, "wb") as f_out:
                 shutil.copyfileobj(f_in, f_out)
 
+
 if __name__ == "__main__":
     imdb_url = "https://datasets.imdbws.com"
     filenames = [
-        "name.basics.tsv.gz",
-        "title.basics.tsv.gz",
+        "name.basics.tsv.gz",
+        "title.basics.tsv.gz",
         "title.ratings.tsv.gz",
-        "title.principals.tsv.gz"
+        "title.principals.tsv.gz",
     ]
     for filename in tqdm(filenames):
         url = f"{imdb_url}/{filename}"
         output_file = os.path.join("data", filename)
         download_large_file(url, output_file)
         unzip_file(output_file)
-
 
pipeline/get_images_data.py CHANGED
@@ -12,14 +12,15 @@ load_dotenv()
 
 BING_API_KEY = os.getenv("BING_API_KEY", None)
 
+
-def get_actor_images(name: str, role: str = None, count: int = 50, api_key: str = BING_API_KEY):
+def get_actor_images(
+    name: str, role: str = None, count: int = 50, api_key: str = BING_API_KEY
+):
     """Get a list of actor images from the Bing Image Search API"""
     if api_key is None:
         raise ValueError("You must provide a Bing API key")
 
-    headers = {
-        "Ocp-Apim-Subscription-Key": BING_API_KEY
-    }
+    headers = {"Ocp-Apim-Subscription-Key": BING_API_KEY}
     query = f'"{name}"'
     if role:
         query = f"{query} ({role})"
@@ -29,18 +30,21 @@ def get_actor_images(name: str, role: str = None, count: int = 50, api_key: str
         "imageType": "Photo",
         "safeSearch": "Strict",
         "imageContent": "Face",
-        "freshness": "Year"
+        "freshness": "Year",
     }
     response = requests.get(
         f"https://api.bing.microsoft.com/v7.0/images/search",
         headers=headers,
-        params=params
+        params=params,
     )
 
     response.raise_for_status()
     return response.json()
 
+
-def read_actors_list(max_actors: int = None, last_year_active: int = None, sort_by: str = None):
+def read_actors_list(
+    max_actors: int = None, last_year_active: int = None, sort_by: str = None
+):
     """Read and filter the list of actors"""
 
     df = pd.read_csv("data/imdb_actors.csv")
@@ -49,18 +53,19 @@ def read_actors_list(max_actors: int = None, last_year_active: int = None, sort_
 
     if sort_by:
         df = df.sort_values(sort_by, ascending=False)
-
+
     if max_actors:
         df = df.head(max_actors)
 
     return df
 
+
 def store_all_actor_images_data(
-    max_actors: int = None,
-    images_per_actor: int = 10,
-    last_year_active: int = None,
-    output_file = None,
-    max_api_calls_per_second: int = 3
+    max_actors: int = None,
+    images_per_actor: int = 10,
+    last_year_active: int = None,
+    output_file=None,
+    max_api_calls_per_second: int = 3,
 ):
     """Get images data for each actor from the Bing Image Search API and store the results as csv"""
 
@@ -69,7 +74,7 @@ def store_all_actor_images_data(
     if output_file:
         try:
            df_im = pd.read_csv(output_file)
-        except:
+        except:
            # file does not exists yet
            pass
 
@@ -81,12 +86,11 @@ def store_all_actor_images_data(
     for _, row in tqdm(df.iterrows(), total=df.shape[0]):
         try:
             images_data = get_actor_images(
-                name=row["primaryName"],
-                count=images_per_actor
+                name=row["primaryName"], count=images_per_actor
             )
         except Exception as e:
             print(e)
-            continue
+            continue
 
         df_im_tmp = pd.DataFrame(images_data["value"])
         df_im_tmp["nconst"] = row["nconst"]
@@ -96,7 +100,7 @@ def store_all_actor_images_data(
             df_im = pd.concat([df_im, df_im_tmp])
         else:
             df_im = df_im_tmp
-
+
         # Store progress
         df_im.to_csv(output_file, index=False)
 
@@ -106,9 +110,9 @@
 
 if __name__ == "__main__":
     store_all_actor_images_data(
-        output_file="data/actors_images_new.csv",
-        max_actors=2000,
+        output_file="data/actors_images_new.csv",
+        max_actors=2000,
         images_per_actor=20,
         last_year_active=datetime.now().year - 5,
-        max_api_calls_per_second=100
-    )
+        max_api_calls_per_second=100,
+    )
pipeline/process_images.py CHANGED
@@ -7,35 +7,37 @@ from time import time
 
 
 def get_image(url: str):
-    headers = {
-        "User-Agent": "Actors matching app 1.0"
-    }
+    headers = {"User-Agent": "Actors matching app 1.0"}
     response = requests.get(url, headers=headers)
     response.raise_for_status()
     img_file_object = BytesIO(response.content)
     return face_recognition.load_image_file(img_file_object)
 
+
 def get_embeddings(url: str):
     try:
         image = get_image(url)
-        embeddings = face_recognition.face_encodings(image, num_jitters=2, model="large")
+        embeddings = face_recognition.face_encodings(
+            image, num_jitters=2, model="large"
+        )
         return list(embeddings[0])
     except Exception as e:
         print(e)
 
+
 def process_all_images(input_file, output_file):
-    df = pd.read_csv(input_file)[["nconst","contentUrl","resultPosition"]]
-
+    df = pd.read_csv(input_file)[["nconst", "contentUrl", "resultPosition"]]
+
     try:
         df_emb = pd.read_csv(output_file)
         df = df[~df["contentUrl"].isin(df_emb["contentUrl"])]
-    except:
+    except:
         # file does not exists yet
         df_emb = pd.DataFrame(columns=list(df.columns) + ["embeddings"])
 
     print(f"Start processing of {df.shape[0]} images")
     df = df.sort_values("resultPosition", ascending=True)
-    #df = df.sample(frac=1) # shuffle so you get some images for everybody while it's running
+    # df = df.sample(frac=1) # shuffle so you get some images for everybody while it's running
     for i, row in tqdm(df.iterrows(), total=df.shape[0]):
         embeddings = get_embeddings(row["contentUrl"])
         new_row = row.copy()
@@ -49,9 +51,13 @@ def process_all_images(input_file, output_file):
     df_emb.to_csv(output_file, index=False)
     return df_emb
 
+
 def build_annoy_index():
     pass
 
+
 if __name__ == "__main__":
     output_file = "../data/actors_embeddings.csv"
-    df_embeddings = process_all_images(input_file="../data/actors_images.csv", output_file=output_file)
+    df_embeddings = process_all_images(
+        input_file="../data/actors_images.csv", output_file=output_file
+    )