nbeuchat committed
Commit be3b0b4
1 Parent(s): b41b1f4

black py files
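
The changes below are formatting-only: each .py file is rewritten by the Black code formatter (two blank lines around top-level definitions, double quotes, trailing commas, and line-length wrapping), with no change in behavior. The diffs are consistent with a default Black invocation such as black app.py pipeline/ from the repository root; the exact command is not recorded in the commit, so that invocation is an assumption.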
app.py CHANGED
@@ -5,11 +5,12 @@ from pathlib import Path
 
 annoy_index, actors_mapping = load_annoy_index()
 
+
 def get_image_html(actor: dict):
     url = actor["url"]
     name = actor["name"]
     imdb_url = f"https://www.imdb.com/name/{actor['nconst']}/"
-    return f'''
+    return f"""
     <div style="position: relative; text-align: center; color: white;">
         <img src="{url}" alt="{name} matches the input image" style="height: 500px">
         <div style="padding: 0.2em; position: absolute; bottom: 16px; left: 16px; background-color: #aacccccc; font-size: 2em;">
@@ -17,20 +18,23 @@ def get_image_html(actor: dict):
             <p style="font-size:0.5em"><a href={imdb_url} target="_blank">Click to see on IMDb</></p>
         </div>
     </div>
-    '''
+    """
+
 
 def no_faces_found_html():
     return f"""<div>No faces found in the picture</div>"""
 
+
 def get_best_matches(image, n_matches: int):
     return analyze_image(image, annoy_index=annoy_index, n_matches=n_matches)
 
+
 def find_matching_actors(input_img, title, n_matches: int = 10):
     best_matches_list = get_best_matches(input_img, n_matches=n_matches)
 
     # TODO: allow looping through characters
     if best_matches_list:
-        best_matches = best_matches_list[0]
+        best_matches = best_matches_list[0]
 
         # TODO: Show how the initial image was parsed (ie: which person is displayed)
 
@@ -45,8 +49,9 @@ def find_matching_actors(input_img, title, n_matches: int = 10):
     # No matches
     return [no_faces_found_html()]
 
+
 iface = gr.Interface(
-    find_matching_actors,
+    find_matching_actors,
     title="Which actor or actress looks like you?",
     description="""Who is the best person to play a movie about you? Upload a picture and find out!
     Or maybe you'd like to know who would best interpret your favorite historical character?
@@ -54,19 +59,24 @@ iface = gr.Interface(
     and limitations of the tool!""",
     article=Path("README.md").read_text(),
     inputs=[
-        gr.inputs.Image(shape=(256, 256), label="Your image"),
-        gr.inputs.Textbox(label="Who's that?", placeholder="Optional, you can leave this blank"),
-        #gr.inputs.Slider(minimum=1, maximum=10, step=1, default=5, label="Number of matches"),
-    ],
+        gr.inputs.Image(shape=(256, 256), label="Your image"),
+        gr.inputs.Textbox(
+            label="Who's that?", placeholder="Optional, you can leave this blank"
+        ),
+        # gr.inputs.Slider(minimum=1, maximum=10, step=1, default=5, label="Number of matches"),
+    ],
     outputs=gr.outputs.Carousel(gr.outputs.HTML(), label="Matching actors & actresses"),
     examples=[
         ["images/example_rb_ginsburg.jpg", "RB Ginsburg in 1977"],
-        ["images/example_hannibal_barca.jpg", "Hannibal (the one with the elephants...)"],
+        [
+            "images/example_hannibal_barca.jpg",
+            "Hannibal (the one with the elephants...)",
+        ],
         ["images/example_frederick_douglass.jpg", "Frederik Douglass"],
         ["images/example_leonardo_davinci.jpg", "Leonoardo da Vinci"],
         ["images/example_joan_of_arc.jpg", "Jeanne d'Arc"],
         ["images/example_sun_tzu.jpg", "Sun Tzu"],
-    ]
+    ],
 )
 
 iface.launch()
pipeline/download_imdb_data.py CHANGED
@@ -4,10 +4,12 @@ import shutil
 from urllib.request import urlretrieve
 from tqdm import tqdm
 
+
 def download_large_file(url: str, output_file: str):
     if not os.path.exists(output_file):
         urlretrieve(url, output_file)
 
+
 def unzip_file(input_file):
     output_file = os.path.splitext(input_file)[0]
     if not os.path.exists(output_file):
@@ -16,17 +18,17 @@ def unzip_file(input_file)
             with open(output_file, "wb") as f_out:
                 shutil.copyfileobj(f_in, f_out)
 
+
 if __name__ == "__main__":
     imdb_url = "https://datasets.imdbws.com"
     filenames = [
-        "name.basics.tsv.gz",
-        "title.basics.tsv.gz",
+        "name.basics.tsv.gz",
+        "title.basics.tsv.gz",
         "title.ratings.tsv.gz",
-        "title.principals.tsv.gz"
+        "title.principals.tsv.gz",
     ]
     for filename in tqdm(filenames):
         url = f"{imdb_url}/{filename}"
         output_file = os.path.join("data", filename)
         download_large_file(url, output_file)
         unzip_file(output_file)
-
 
pipeline/get_images_data.py CHANGED
@@ -12,14 +12,15 @@ load_dotenv()
 
 BING_API_KEY = os.getenv("BING_API_KEY", None)
 
+
-def get_actor_images(name: str, role: str = None, count: int = 50, api_key: str = BING_API_KEY):
+def get_actor_images(
+    name: str, role: str = None, count: int = 50, api_key: str = BING_API_KEY
+):
     """Get a list of actor images from the Bing Image Search API"""
     if api_key is None:
         raise ValueError("You must provide a Bing API key")
 
-    headers = {
-        "Ocp-Apim-Subscription-Key": BING_API_KEY
-    }
+    headers = {"Ocp-Apim-Subscription-Key": BING_API_KEY}
     query = f'"{name}"'
     if role:
         query = f"{query} ({role})"
@@ -29,18 +30,21 @@ def get_actor_images(name: str, role: str = None, count: int = 50, api_key: str
         "imageType": "Photo",
         "safeSearch": "Strict",
         "imageContent": "Face",
-        "freshness": "Year"
+        "freshness": "Year",
     }
     response = requests.get(
         f"https://api.bing.microsoft.com/v7.0/images/search",
         headers=headers,
-        params=params
+        params=params,
     )
 
     response.raise_for_status()
     return response.json()
 
+
-def read_actors_list(max_actors: int = None, last_year_active: int = None, sort_by: str = None):
+def read_actors_list(
+    max_actors: int = None, last_year_active: int = None, sort_by: str = None
+):
     """Read and filter the list of actors"""
 
     df = pd.read_csv("data/imdb_actors.csv")
@@ -49,18 +53,19 @@ def read_actors_list(max_actors: int = None, last_year_active: int = None, sort_
 
     if sort_by:
         df = df.sort_values(sort_by, ascending=False)
-
+
     if max_actors:
         df = df.head(max_actors)
 
     return df
 
+
 def store_all_actor_images_data(
-    max_actors: int = None,
-    images_per_actor: int = 10,
-    last_year_active: int = None,
-    output_file = None,
-    max_api_calls_per_second: int = 3
+    max_actors: int = None,
+    images_per_actor: int = 10,
+    last_year_active: int = None,
+    output_file=None,
+    max_api_calls_per_second: int = 3,
 ):
     """Get images data for each actor from the Bing Image Search API and store the results as csv"""
 
@@ -69,7 +74,7 @@ def store_all_actor_images_data(
     if output_file:
         try:
            df_im = pd.read_csv(output_file)
-        except:
+        except:
            # file does not exists yet
            pass
 
@@ -81,12 +86,11 @@ def store_all_actor_images_data(
     for _, row in tqdm(df.iterrows(), total=df.shape[0]):
         try:
             images_data = get_actor_images(
-                name=row["primaryName"],
-                count=images_per_actor
+                name=row["primaryName"], count=images_per_actor
             )
         except Exception as e:
             print(e)
-            continue
+            continue
 
         df_im_tmp = pd.DataFrame(images_data["value"])
         df_im_tmp["nconst"] = row["nconst"]
@@ -96,7 +100,7 @@ def store_all_actor_images_data(
             df_im = pd.concat([df_im, df_im_tmp])
         else:
             df_im = df_im_tmp
-
+
         # Store progress
         df_im.to_csv(output_file, index=False)
 
@@ -106,9 +110,9 @@
 
 if __name__ == "__main__":
     store_all_actor_images_data(
-        output_file="data/actors_images_new.csv",
-        max_actors=2000,
+        output_file="data/actors_images_new.csv",
+        max_actors=2000,
         images_per_actor=20,
         last_year_active=datetime.now().year - 5,
-        max_api_calls_per_second=100
-    )
+        max_api_calls_per_second=100,
+    )
pipeline/process_images.py CHANGED
@@ -7,35 +7,37 @@ from time import time
 
 
 def get_image(url: str):
-    headers = {
-        "User-Agent": "Actors matching app 1.0"
-    }
+    headers = {"User-Agent": "Actors matching app 1.0"}
     response = requests.get(url, headers=headers)
     response.raise_for_status()
     img_file_object = BytesIO(response.content)
     return face_recognition.load_image_file(img_file_object)
 
+
 def get_embeddings(url: str):
     try:
         image = get_image(url)
-        embeddings = face_recognition.face_encodings(image, num_jitters=2, model="large")
+        embeddings = face_recognition.face_encodings(
+            image, num_jitters=2, model="large"
+        )
         return list(embeddings[0])
     except Exception as e:
         print(e)
 
+
 def process_all_images(input_file, output_file):
-    df = pd.read_csv(input_file)[["nconst","contentUrl","resultPosition"]]
-
+    df = pd.read_csv(input_file)[["nconst", "contentUrl", "resultPosition"]]
+
     try:
         df_emb = pd.read_csv(output_file)
         df = df[~df["contentUrl"].isin(df_emb["contentUrl"])]
-    except:
+    except:
         # file does not exists yet
         df_emb = pd.DataFrame(columns=list(df.columns) + ["embeddings"])
 
     print(f"Start processing of {df.shape[0]} images")
     df = df.sort_values("resultPosition", ascending=True)
-    #df = df.sample(frac=1) # shuffle so you get some images for everybody while it's running
+    # df = df.sample(frac=1) # shuffle so you get some images for everybody while it's running
     for i, row in tqdm(df.iterrows(), total=df.shape[0]):
         embeddings = get_embeddings(row["contentUrl"])
         new_row = row.copy()
@@ -49,9 +51,13 @@ def process_all_images(input_file, output_file):
     df_emb.to_csv(output_file, index=False)
     return df_emb
 
+
 def build_annoy_index():
     pass
 
+
 if __name__ == "__main__":
     output_file = "../data/actors_embeddings.csv"
-    df_embeddings = process_all_images(input_file="../data/actors_images.csv", output_file=output_file)
+    df_embeddings = process_all_images(
+        input_file="../data/actors_images.csv", output_file=output_file
+    )