Spaces:
Runtime error
Runtime error
black py files
Browse files- app.py +20 -10
- pipeline/download_imdb_data.py +6 -4
- pipeline/get_images_data.py +26 -22
- pipeline/process_images.py +15 -9
app.py
CHANGED
@@ -5,11 +5,12 @@ from pathlib import Path
|
|
5 |
|
6 |
annoy_index, actors_mapping = load_annoy_index()
|
7 |
|
|
|
8 |
def get_image_html(actor: dict):
|
9 |
url = actor["url"]
|
10 |
name = actor["name"]
|
11 |
imdb_url = f"https://www.imdb.com/name/{actor['nconst']}/"
|
12 |
-
return f
|
13 |
<div style="position: relative; text-align: center; color: white;">
|
14 |
<img src="{url}" alt="{name} matches the input image" style="height: 500px">
|
15 |
<div style="padding: 0.2em; position: absolute; bottom: 16px; left: 16px; background-color: #aacccccc; font-size: 2em;">
|
@@ -17,20 +18,23 @@ def get_image_html(actor: dict):
|
|
17 |
<p style="font-size:0.5em"><a href={imdb_url} target="_blank">Click to see on IMDb</></p>
|
18 |
</div>
|
19 |
</div>
|
20 |
-
|
|
|
21 |
|
22 |
def no_faces_found_html():
|
23 |
return f"""<div>No faces found in the picture</div>"""
|
24 |
|
|
|
25 |
def get_best_matches(image, n_matches: int):
|
26 |
return analyze_image(image, annoy_index=annoy_index, n_matches=n_matches)
|
27 |
|
|
|
28 |
def find_matching_actors(input_img, title, n_matches: int = 10):
|
29 |
best_matches_list = get_best_matches(input_img, n_matches=n_matches)
|
30 |
|
31 |
# TODO: allow looping through characters
|
32 |
if best_matches_list:
|
33 |
-
best_matches = best_matches_list[0]
|
34 |
|
35 |
# TODO: Show how the initial image was parsed (ie: which person is displayed)
|
36 |
|
@@ -45,8 +49,9 @@ def find_matching_actors(input_img, title, n_matches: int = 10):
|
|
45 |
# No matches
|
46 |
return [no_faces_found_html()]
|
47 |
|
|
|
48 |
iface = gr.Interface(
|
49 |
-
find_matching_actors,
|
50 |
title="Which actor or actress looks like you?",
|
51 |
description="""Who is the best person to play a movie about you? Upload a picture and find out!
|
52 |
Or maybe you'd like to know who would best interpret your favorite historical character?
|
@@ -54,19 +59,24 @@ iface = gr.Interface(
|
|
54 |
and limitations of the tool!""",
|
55 |
article=Path("README.md").read_text(),
|
56 |
inputs=[
|
57 |
-
gr.inputs.Image(shape=(256, 256), label="Your image"),
|
58 |
-
gr.inputs.Textbox(
|
59 |
-
|
60 |
-
|
|
|
|
|
61 |
outputs=gr.outputs.Carousel(gr.outputs.HTML(), label="Matching actors & actresses"),
|
62 |
examples=[
|
63 |
["images/example_rb_ginsburg.jpg", "RB Ginsburg in 1977"],
|
64 |
-
[
|
|
|
|
|
|
|
65 |
["images/example_frederick_douglass.jpg", "Frederik Douglass"],
|
66 |
["images/example_leonardo_davinci.jpg", "Leonoardo da Vinci"],
|
67 |
["images/example_joan_of_arc.jpg", "Jeanne d'Arc"],
|
68 |
["images/example_sun_tzu.jpg", "Sun Tzu"],
|
69 |
-
]
|
70 |
)
|
71 |
|
72 |
iface.launch()
|
|
|
5 |
|
6 |
annoy_index, actors_mapping = load_annoy_index()
|
7 |
|
8 |
+
|
9 |
def get_image_html(actor: dict):
|
10 |
url = actor["url"]
|
11 |
name = actor["name"]
|
12 |
imdb_url = f"https://www.imdb.com/name/{actor['nconst']}/"
|
13 |
+
return f"""
|
14 |
<div style="position: relative; text-align: center; color: white;">
|
15 |
<img src="{url}" alt="{name} matches the input image" style="height: 500px">
|
16 |
<div style="padding: 0.2em; position: absolute; bottom: 16px; left: 16px; background-color: #aacccccc; font-size: 2em;">
|
|
|
18 |
<p style="font-size:0.5em"><a href={imdb_url} target="_blank">Click to see on IMDb</></p>
|
19 |
</div>
|
20 |
</div>
|
21 |
+
"""
|
22 |
+
|
23 |
|
24 |
def no_faces_found_html():
|
25 |
return f"""<div>No faces found in the picture</div>"""
|
26 |
|
27 |
+
|
28 |
def get_best_matches(image, n_matches: int):
|
29 |
return analyze_image(image, annoy_index=annoy_index, n_matches=n_matches)
|
30 |
|
31 |
+
|
32 |
def find_matching_actors(input_img, title, n_matches: int = 10):
|
33 |
best_matches_list = get_best_matches(input_img, n_matches=n_matches)
|
34 |
|
35 |
# TODO: allow looping through characters
|
36 |
if best_matches_list:
|
37 |
+
best_matches = best_matches_list[0]
|
38 |
|
39 |
# TODO: Show how the initial image was parsed (ie: which person is displayed)
|
40 |
|
|
|
49 |
# No matches
|
50 |
return [no_faces_found_html()]
|
51 |
|
52 |
+
|
53 |
iface = gr.Interface(
|
54 |
+
find_matching_actors,
|
55 |
title="Which actor or actress looks like you?",
|
56 |
description="""Who is the best person to play a movie about you? Upload a picture and find out!
|
57 |
Or maybe you'd like to know who would best interpret your favorite historical character?
|
|
|
59 |
and limitations of the tool!""",
|
60 |
article=Path("README.md").read_text(),
|
61 |
inputs=[
|
62 |
+
gr.inputs.Image(shape=(256, 256), label="Your image"),
|
63 |
+
gr.inputs.Textbox(
|
64 |
+
label="Who's that?", placeholder="Optional, you can leave this blank"
|
65 |
+
),
|
66 |
+
# gr.inputs.Slider(minimum=1, maximum=10, step=1, default=5, label="Number of matches"),
|
67 |
+
],
|
68 |
outputs=gr.outputs.Carousel(gr.outputs.HTML(), label="Matching actors & actresses"),
|
69 |
examples=[
|
70 |
["images/example_rb_ginsburg.jpg", "RB Ginsburg in 1977"],
|
71 |
+
[
|
72 |
+
"images/example_hannibal_barca.jpg",
|
73 |
+
"Hannibal (the one with the elephants...)",
|
74 |
+
],
|
75 |
["images/example_frederick_douglass.jpg", "Frederik Douglass"],
|
76 |
["images/example_leonardo_davinci.jpg", "Leonoardo da Vinci"],
|
77 |
["images/example_joan_of_arc.jpg", "Jeanne d'Arc"],
|
78 |
["images/example_sun_tzu.jpg", "Sun Tzu"],
|
79 |
+
],
|
80 |
)
|
81 |
|
82 |
iface.launch()
|
pipeline/download_imdb_data.py
CHANGED
@@ -4,10 +4,12 @@ import shutil
|
|
4 |
from urllib.request import urlretrieve
|
5 |
from tqdm import tqdm
|
6 |
|
|
|
7 |
def download_large_file(url: str, output_file: str):
|
8 |
if not os.path.exists(output_file):
|
9 |
urlretrieve(url, output_file)
|
10 |
|
|
|
11 |
def unzip_file(input_file):
|
12 |
output_file = os.path.splitext(input_file)[0]
|
13 |
if not os.path.exists(output_file):
|
@@ -16,17 +18,17 @@ def unzip_file(input_file):
|
|
16 |
with open(output_file, "wb") as f_out:
|
17 |
shutil.copyfileobj(f_in, f_out)
|
18 |
|
|
|
19 |
if __name__ == "__main__":
|
20 |
imdb_url = "https://datasets.imdbws.com"
|
21 |
filenames = [
|
22 |
-
"name.basics.tsv.gz",
|
23 |
-
"title.basics.tsv.gz",
|
24 |
"title.ratings.tsv.gz",
|
25 |
-
"title.principals.tsv.gz"
|
26 |
]
|
27 |
for filename in tqdm(filenames):
|
28 |
url = f"{imdb_url}/{filename}"
|
29 |
output_file = os.path.join("data", filename)
|
30 |
download_large_file(url, output_file)
|
31 |
unzip_file(output_file)
|
32 |
-
|
|
|
4 |
from urllib.request import urlretrieve
|
5 |
from tqdm import tqdm
|
6 |
|
7 |
+
|
8 |
def download_large_file(url: str, output_file: str):
|
9 |
if not os.path.exists(output_file):
|
10 |
urlretrieve(url, output_file)
|
11 |
|
12 |
+
|
13 |
def unzip_file(input_file):
|
14 |
output_file = os.path.splitext(input_file)[0]
|
15 |
if not os.path.exists(output_file):
|
|
|
18 |
with open(output_file, "wb") as f_out:
|
19 |
shutil.copyfileobj(f_in, f_out)
|
20 |
|
21 |
+
|
22 |
if __name__ == "__main__":
|
23 |
imdb_url = "https://datasets.imdbws.com"
|
24 |
filenames = [
|
25 |
+
"name.basics.tsv.gz",
|
26 |
+
"title.basics.tsv.gz",
|
27 |
"title.ratings.tsv.gz",
|
28 |
+
"title.principals.tsv.gz",
|
29 |
]
|
30 |
for filename in tqdm(filenames):
|
31 |
url = f"{imdb_url}/{filename}"
|
32 |
output_file = os.path.join("data", filename)
|
33 |
download_large_file(url, output_file)
|
34 |
unzip_file(output_file)
|
|
pipeline/get_images_data.py
CHANGED
@@ -12,14 +12,15 @@ load_dotenv()
|
|
12 |
|
13 |
BING_API_KEY = os.getenv("BING_API_KEY", None)
|
14 |
|
15 |
-
|
|
|
|
|
|
|
16 |
"""Get a list of actor images from the Bing Image Search API"""
|
17 |
if api_key is None:
|
18 |
raise ValueError("You must provide a Bing API key")
|
19 |
|
20 |
-
headers = {
|
21 |
-
"Ocp-Apim-Subscription-Key": BING_API_KEY
|
22 |
-
}
|
23 |
query = f'"{name}"'
|
24 |
if role:
|
25 |
query = f"{query} ({role})"
|
@@ -29,18 +30,21 @@ def get_actor_images(name: str, role: str = None, count: int = 50, api_key: str
|
|
29 |
"imageType": "Photo",
|
30 |
"safeSearch": "Strict",
|
31 |
"imageContent": "Face",
|
32 |
-
"freshness": "Year"
|
33 |
}
|
34 |
response = requests.get(
|
35 |
f"https://api.bing.microsoft.com/v7.0/images/search",
|
36 |
headers=headers,
|
37 |
-
params=params
|
38 |
)
|
39 |
|
40 |
response.raise_for_status()
|
41 |
return response.json()
|
42 |
|
43 |
-
|
|
|
|
|
|
|
44 |
"""Read and filter the list of actors"""
|
45 |
|
46 |
df = pd.read_csv("data/imdb_actors.csv")
|
@@ -49,18 +53,19 @@ def read_actors_list(max_actors: int = None, last_year_active: int = None, sort_
|
|
49 |
|
50 |
if sort_by:
|
51 |
df = df.sort_values(sort_by, ascending=False)
|
52 |
-
|
53 |
if max_actors:
|
54 |
df = df.head(max_actors)
|
55 |
|
56 |
return df
|
57 |
|
|
|
58 |
def store_all_actor_images_data(
|
59 |
-
max_actors: int = None,
|
60 |
-
images_per_actor: int = 10,
|
61 |
-
last_year_active: int = None,
|
62 |
-
output_file
|
63 |
-
max_api_calls_per_second: int = 3
|
64 |
):
|
65 |
"""Get images data for each actor from the Bing Image Search API and store the results as csv"""
|
66 |
|
@@ -69,7 +74,7 @@ def store_all_actor_images_data(
|
|
69 |
if output_file:
|
70 |
try:
|
71 |
df_im = pd.read_csv(output_file)
|
72 |
-
except:
|
73 |
# file does not exists yet
|
74 |
pass
|
75 |
|
@@ -81,12 +86,11 @@ def store_all_actor_images_data(
|
|
81 |
for _, row in tqdm(df.iterrows(), total=df.shape[0]):
|
82 |
try:
|
83 |
images_data = get_actor_images(
|
84 |
-
name=row["primaryName"],
|
85 |
-
count=images_per_actor
|
86 |
)
|
87 |
except Exception as e:
|
88 |
print(e)
|
89 |
-
continue
|
90 |
|
91 |
df_im_tmp = pd.DataFrame(images_data["value"])
|
92 |
df_im_tmp["nconst"] = row["nconst"]
|
@@ -96,7 +100,7 @@ def store_all_actor_images_data(
|
|
96 |
df_im = pd.concat([df_im, df_im_tmp])
|
97 |
else:
|
98 |
df_im = df_im_tmp
|
99 |
-
|
100 |
# Store progress
|
101 |
df_im.to_csv(output_file, index=False)
|
102 |
|
@@ -106,9 +110,9 @@ def store_all_actor_images_data(
|
|
106 |
|
107 |
if __name__ == "__main__":
|
108 |
store_all_actor_images_data(
|
109 |
-
output_file="data/actors_images_new.csv",
|
110 |
-
max_actors=2000,
|
111 |
images_per_actor=20,
|
112 |
last_year_active=datetime.now().year - 5,
|
113 |
-
max_api_calls_per_second=100
|
114 |
-
)
|
|
|
12 |
|
13 |
BING_API_KEY = os.getenv("BING_API_KEY", None)
|
14 |
|
15 |
+
|
16 |
+
def get_actor_images(
|
17 |
+
name: str, role: str = None, count: int = 50, api_key: str = BING_API_KEY
|
18 |
+
):
|
19 |
"""Get a list of actor images from the Bing Image Search API"""
|
20 |
if api_key is None:
|
21 |
raise ValueError("You must provide a Bing API key")
|
22 |
|
23 |
+
headers = {"Ocp-Apim-Subscription-Key": BING_API_KEY}
|
|
|
|
|
24 |
query = f'"{name}"'
|
25 |
if role:
|
26 |
query = f"{query} ({role})"
|
|
|
30 |
"imageType": "Photo",
|
31 |
"safeSearch": "Strict",
|
32 |
"imageContent": "Face",
|
33 |
+
"freshness": "Year",
|
34 |
}
|
35 |
response = requests.get(
|
36 |
f"https://api.bing.microsoft.com/v7.0/images/search",
|
37 |
headers=headers,
|
38 |
+
params=params,
|
39 |
)
|
40 |
|
41 |
response.raise_for_status()
|
42 |
return response.json()
|
43 |
|
44 |
+
|
45 |
+
def read_actors_list(
|
46 |
+
max_actors: int = None, last_year_active: int = None, sort_by: str = None
|
47 |
+
):
|
48 |
"""Read and filter the list of actors"""
|
49 |
|
50 |
df = pd.read_csv("data/imdb_actors.csv")
|
|
|
53 |
|
54 |
if sort_by:
|
55 |
df = df.sort_values(sort_by, ascending=False)
|
56 |
+
|
57 |
if max_actors:
|
58 |
df = df.head(max_actors)
|
59 |
|
60 |
return df
|
61 |
|
62 |
+
|
63 |
def store_all_actor_images_data(
|
64 |
+
max_actors: int = None,
|
65 |
+
images_per_actor: int = 10,
|
66 |
+
last_year_active: int = None,
|
67 |
+
output_file=None,
|
68 |
+
max_api_calls_per_second: int = 3,
|
69 |
):
|
70 |
"""Get images data for each actor from the Bing Image Search API and store the results as csv"""
|
71 |
|
|
|
74 |
if output_file:
|
75 |
try:
|
76 |
df_im = pd.read_csv(output_file)
|
77 |
+
except:
|
78 |
# file does not exists yet
|
79 |
pass
|
80 |
|
|
|
86 |
for _, row in tqdm(df.iterrows(), total=df.shape[0]):
|
87 |
try:
|
88 |
images_data = get_actor_images(
|
89 |
+
name=row["primaryName"], count=images_per_actor
|
|
|
90 |
)
|
91 |
except Exception as e:
|
92 |
print(e)
|
93 |
+
continue
|
94 |
|
95 |
df_im_tmp = pd.DataFrame(images_data["value"])
|
96 |
df_im_tmp["nconst"] = row["nconst"]
|
|
|
100 |
df_im = pd.concat([df_im, df_im_tmp])
|
101 |
else:
|
102 |
df_im = df_im_tmp
|
103 |
+
|
104 |
# Store progress
|
105 |
df_im.to_csv(output_file, index=False)
|
106 |
|
|
|
110 |
|
111 |
if __name__ == "__main__":
|
112 |
store_all_actor_images_data(
|
113 |
+
output_file="data/actors_images_new.csv",
|
114 |
+
max_actors=2000,
|
115 |
images_per_actor=20,
|
116 |
last_year_active=datetime.now().year - 5,
|
117 |
+
max_api_calls_per_second=100,
|
118 |
+
)
|
pipeline/process_images.py
CHANGED
@@ -7,35 +7,37 @@ from time import time
|
|
7 |
|
8 |
|
9 |
def get_image(url: str):
|
10 |
-
headers = {
|
11 |
-
"User-Agent": "Actors matching app 1.0"
|
12 |
-
}
|
13 |
response = requests.get(url, headers=headers)
|
14 |
response.raise_for_status()
|
15 |
img_file_object = BytesIO(response.content)
|
16 |
return face_recognition.load_image_file(img_file_object)
|
17 |
|
|
|
18 |
def get_embeddings(url: str):
|
19 |
try:
|
20 |
image = get_image(url)
|
21 |
-
embeddings = face_recognition.face_encodings(
|
|
|
|
|
22 |
return list(embeddings[0])
|
23 |
except Exception as e:
|
24 |
print(e)
|
25 |
|
|
|
26 |
def process_all_images(input_file, output_file):
|
27 |
-
df = pd.read_csv(input_file)[["nconst","contentUrl","resultPosition"]]
|
28 |
-
|
29 |
try:
|
30 |
df_emb = pd.read_csv(output_file)
|
31 |
df = df[~df["contentUrl"].isin(df_emb["contentUrl"])]
|
32 |
-
except:
|
33 |
# file does not exists yet
|
34 |
df_emb = pd.DataFrame(columns=list(df.columns) + ["embeddings"])
|
35 |
|
36 |
print(f"Start processing of {df.shape[0]} images")
|
37 |
df = df.sort_values("resultPosition", ascending=True)
|
38 |
-
#df = df.sample(frac=1) # shuffle so you get some images for everybody while it's running
|
39 |
for i, row in tqdm(df.iterrows(), total=df.shape[0]):
|
40 |
embeddings = get_embeddings(row["contentUrl"])
|
41 |
new_row = row.copy()
|
@@ -49,9 +51,13 @@ def process_all_images(input_file, output_file):
|
|
49 |
df_emb.to_csv(output_file, index=False)
|
50 |
return df_emb
|
51 |
|
|
|
52 |
def build_annoy_index():
|
53 |
pass
|
54 |
|
|
|
55 |
if __name__ == "__main__":
|
56 |
output_file = "../data/actors_embeddings.csv"
|
57 |
-
df_embeddings = process_all_images(
|
|
|
|
|
|
7 |
|
8 |
|
9 |
def get_image(url: str):
|
10 |
+
headers = {"User-Agent": "Actors matching app 1.0"}
|
|
|
|
|
11 |
response = requests.get(url, headers=headers)
|
12 |
response.raise_for_status()
|
13 |
img_file_object = BytesIO(response.content)
|
14 |
return face_recognition.load_image_file(img_file_object)
|
15 |
|
16 |
+
|
17 |
def get_embeddings(url: str):
|
18 |
try:
|
19 |
image = get_image(url)
|
20 |
+
embeddings = face_recognition.face_encodings(
|
21 |
+
image, num_jitters=2, model="large"
|
22 |
+
)
|
23 |
return list(embeddings[0])
|
24 |
except Exception as e:
|
25 |
print(e)
|
26 |
|
27 |
+
|
28 |
def process_all_images(input_file, output_file):
|
29 |
+
df = pd.read_csv(input_file)[["nconst", "contentUrl", "resultPosition"]]
|
30 |
+
|
31 |
try:
|
32 |
df_emb = pd.read_csv(output_file)
|
33 |
df = df[~df["contentUrl"].isin(df_emb["contentUrl"])]
|
34 |
+
except:
|
35 |
# file does not exists yet
|
36 |
df_emb = pd.DataFrame(columns=list(df.columns) + ["embeddings"])
|
37 |
|
38 |
print(f"Start processing of {df.shape[0]} images")
|
39 |
df = df.sort_values("resultPosition", ascending=True)
|
40 |
+
# df = df.sample(frac=1) # shuffle so you get some images for everybody while it's running
|
41 |
for i, row in tqdm(df.iterrows(), total=df.shape[0]):
|
42 |
embeddings = get_embeddings(row["contentUrl"])
|
43 |
new_row = row.copy()
|
|
|
51 |
df_emb.to_csv(output_file, index=False)
|
52 |
return df_emb
|
53 |
|
54 |
+
|
55 |
def build_annoy_index():
|
56 |
pass
|
57 |
|
58 |
+
|
59 |
if __name__ == "__main__":
|
60 |
output_file = "../data/actors_embeddings.csv"
|
61 |
+
df_embeddings = process_all_images(
|
62 |
+
input_file="../data/actors_images.csv", output_file=output_file
|
63 |
+
)
|