Spaces:

XAI
/

Cleaning-ImageNet-Hard

Paused

App Files Files Community

taesiri commited on Apr 26, 2023

Commit

15d5b2e

1 Parent(s): 0a8da33

backup

Browse files

Files changed (1) hide show

app.py +173 -104

app.py CHANGED Viewed

@@ -1,4 +1,3 @@
 import csv
 import json
 import os
@@ -20,28 +19,30 @@ import torchvision
 from huggingface_hub import HfApi, login, snapshot_download
 from PIL import Image
-session_token = os.environ.get("SessionToken")
-login(token=session_token)
 csv.field_size_limit(sys.maxsize)
 np.random.seed(int(time.time()))
-with open('./imagenet_hard_nearest_indices.pkl', 'rb') as f:
-    knn_results  = pickle.load(f)
 with open("imagenet-labels.json") as f:
     wnid_to_label = json.load(f)
-with open('id_to_label.json', 'r') as f:
     id_to_labels = json.load(f)
-bad_items = open('./ex2.txt', 'r').read().split('\n')
-bad_items = [x.split('.')[0] for x in bad_items]
-bad_items = [int(x) for x in bad_items if x != '']
 # download and extract folders
@@ -54,7 +55,9 @@ gdown.cached_download(
 # EXTRACT if needed
-if not os.path.exists("./imagenet_traning_samples") or not os.path.exists("./knn_cache_for_imagenet_hard"):
     torchvision.datasets.utils.extract_archive(
         from_path="data.zip",
         to_path="./",
@@ -64,11 +67,12 @@ if not os.path.exists("./imagenet_traning_samples") or not os.path.exists("./knn
 imagenet_hard = datasets.load_dataset("taesiri/imagenet-hard", split="validation")
-def update_snapshot():
     output_dir = snapshot_download(
-            repo_id="taesiri/imagenet_hard_review_data", allow_patterns="*.json", repo_type="dataset"
     )
-    total_size = len(imagenet_hard)
     files = glob(f"{output_dir}/*.json")
     df = pd.DataFrame()
@@ -83,42 +87,35 @@ def update_snapshot():
             rows.append(tdf)
     df = pd.DataFrame(rows, columns=columns)
-    return df, total_size
-# df = update_snapshot()
-NUMBER_OF_IMAGES = 1000
-# Function to sample 10 ids based on their usage count
-def sample_ids(df, total_size, sample_size):
-    id_counts = df['id'].value_counts().to_dict()
-    all_ids = bad_items
-    for id in all_ids:
-        if id not in id_counts:
-            id_counts[id] = 0
-    weights = [id_counts[id] for id in all_ids]
-    inverse_weights = [1 / (count + 1) for count in weights]
-    normalized_weights = [w / sum(inverse_weights) for w in inverse_weights]
-    sampled_ids = np.random.choice(all_ids, size=sample_size, replace=False, p=normalized_weights)
-    return sampled_ids
-def generate_dataset():
-    df, total_size = update_snapshot()
-    random_indices = sample_ids(df, total_size, NUMBER_OF_IMAGES)
     random_images = [imagenet_hard[int(i)]["image"] for i in random_indices]
     random_gt_ids = [imagenet_hard[int(i)]["label"] for i in random_indices]
-    random_gt_labels = [imagenet_hard[int(x)]["english_label"]  for x in random_indices]
     data = []
     for i, image in enumerate(random_images):
         data.append(
-            {
                 "id": random_indices[i],
                 "image": image,
                 "correct_label": random_gt_labels[i],
@@ -128,83 +125,86 @@ def generate_dataset():
     return data
 def string_to_image(text):
-    text = text.replace('_', ' ').lower().replace(', ', '\n')
     # Create a blank white square image
     img = np.ones((220, 75, 3))
-    # Create a figure and axis object
     fig, ax = plt.subplots(figsize=(6, 2.25))
-    # Plot the blank white image
     ax.imshow(img, extent=[0, 1, 0, 1])
-    # Set the text in the center
-    ax.text(0.5, 0.75, text, fontsize=18, ha='center', va='center')
-    # Remove the axis labels and ticks
     ax.set_xticks([])
     ax.set_yticks([])
     ax.set_xticklabels([])
     ax.set_yticklabels([])
-    # Remove the axis spines
     for spine in ax.spines.values():
         spine.set_visible(False)
-    # Return the figure
     return fig
-def label_dist_of_nns(qid):
-    with open('./trainingset_filenames.json', 'r') as f:
-        trainingset_filenames = json.load(f)
-    nns = knn_results[qid][:15]
-    labels = [wnid_to_label[trainingset_filenames[f"{x}"]] for x in nns]
-    label_counts = {x: labels.count(x) for x in set(labels)}
-    # sort by count
-    label_counts = {k: v for k, v in sorted(label_counts.items(), key=lambda item: item[1], reverse=True)}
-    # percetage
-    label_counts = {k: v/len(labels) for k, v in label_counts.items()}
-    return label_counts
-from glob import glob
-all_samples = glob('./imagenet_traning_samples/*.JPEG')
-qid_to_sample = {int(x.split('/')[-1].split('.')[0].split('_')[0]): x for x in all_samples}
 def get_training_samples(qid):
-    labels_id = imagenet_hard[int(qid)]['label']
     samples = [qid_to_sample[x] for x in labels_id]
     return samples
-knn_cache_path = "knn_cache_for_imagenet_hard"
-imagenet_training_samples_path = "imagenet_traning_samples"
 def load_sample(data, current_index):
     image_id = data[current_index]["id"]
     qimage = data[current_index]["image"]
     labels = data[current_index]["correct_label"]
     return qimage, labels
-    # return qimage, neighbors_image, training_samples_image
 def update_app(decision, data, current_index, history, username):
     if current_index == -1:
-        data = generate_dataset()
-    if current_index>=0 and current_index < NUMBER_OF_IMAGES-1:
         time_stamp = int(time.time())
         image_id = data[current_index]["id"]
-        # convert to percentage
         dicision_dict = {
             "id": int(image_id),
             "user_id": username,
@@ -228,23 +228,56 @@ def update_app(decision, data, current_index, history, username):
         os.remove(temp_filename)
-    elif current_index == NUMBER_OF_IMAGES-1:
-        return None, None, current_index, history, data, None
-    current_index += 1
-    qimage, labels = load_sample(data, current_index)
-    image_id = data[current_index]["id"]
-    training_samples_image = get_training_samples(image_id)
-    training_samples_image = [Image.open(x).convert('RGB') for x in training_samples_image]
-    # labels is a list of labels, conver it to a string
-    labels = ", ".join(labels)
-    label_plot = string_to_image(labels)
-    return qimage, label_plot, current_index, history, data, training_samples_image
-newcss = '''
 #query_image{
     height: auto !important;
 }
@@ -256,51 +289,87 @@ newcss = '''
 #sample_gallery {
     height: auto !important;
 }
-'''
 with gr.Blocks(css=newcss) as demo:
     data_gr = gr.State({})
     current_index = gr.State(-1)
     history = gr.State({})
     gr.Markdown("# Cleaning ImageNet-Hard!")
     random_str = "".join(
         random.choice(string.ascii_lowercase + string.digits) for _ in range(5)
     )
-    username = gr.Textbox(label="Username", value=f"user-{random_str}")
     with gr.Column():
         with gr.Row():
             accept_btn = gr.Button(value="Accept")
             myabe_btn = gr.Button(value="Not Sure!")
             reject_btn = gr.Button(value="Reject")
-        with gr.Row():
             query_image = gr.Image(type="pil", label="Query", elem_id="query_image")
             with gr.Column():
-                label_plot = gr.Plot(label='Is this a correct label for this image?', type='fig')
-                training_samples = gr.Gallery(type="pil", label="Training samples" , elem_id="sample_gallery")
-    # with gr.Column():
-    #     gr.Markdown("## Nearest Neighbors Analysis of the Query (ResNet-50)")
-    #     nn_labels = gr.Label(label="NN-Labels")
-    #     neighbors_image = gr.Image(type="pil", label="Nearest Neighbors", elem_id="nn_gallery")
     accept_btn.click(
         update_app,
         inputs=[accept_btn, data_gr, current_index, history, username],
-        outputs=[query_image, label_plot, current_index, history, data_gr, training_samples]
     )
     myabe_btn.click(
         update_app,
         inputs=[myabe_btn, data_gr, current_index, history, username],
-        outputs=[query_image, label_plot, current_index, history, data_gr, training_samples]
     )
     reject_btn.click(
         update_app,
         inputs=[reject_btn, data_gr, current_index, history, username],
-        outputs=[query_image, label_plot, current_index, history, data_gr, training_samples]
     )
 demo.launch()

 import csv
 import json
 import os
 from huggingface_hub import HfApi, login, snapshot_download
 from PIL import Image
+# session_token = os.environ.get("SessionToken")
+# login(token=session_token)
 csv.field_size_limit(sys.maxsize)
 np.random.seed(int(time.time()))
+with open("./imagenet_hard_nearest_indices.pkl", "rb") as f:
+    knn_results = pickle.load(f)
 with open("imagenet-labels.json") as f:
     wnid_to_label = json.load(f)
+with open("id_to_label.json", "r") as f:
     id_to_labels = json.load(f)
+imagenet_training_samples_path = "imagenet_traning_samples"
+bad_items = open("./ex2.txt", "r").read().split("\n")
+bad_items = [x.split(".")[0] for x in bad_items]
+bad_items = [int(x) for x in bad_items if x != ""]
+NUMBER_OF_IMAGES = 100  # len(bad_items)
 # download and extract folders
 # EXTRACT if needed
+if not os.path.exists("./imagenet_traning_samples") or not os.path.exists(
+    "./knn_cache_for_imagenet_hard"
+):
     torchvision.datasets.utils.extract_archive(
         from_path="data.zip",
         to_path="./",
 imagenet_hard = datasets.load_dataset("taesiri/imagenet-hard", split="validation")
+def update_snapshot(username):
     output_dir = snapshot_download(
+        repo_id="taesiri/imagenet_hard_review_data",
+        allow_patterns="*.json",
+        repo_type="dataset",
     )
     files = glob(f"{output_dir}/*.json")
     df = pd.DataFrame()
             rows.append(tdf)
     df = pd.DataFrame(rows, columns=columns)
+    df = df[df["user_id"] == username]
+    return df
+def generate_dataset(username):
+    global NUMBER_OF_IMAGES
+    df = update_snapshot(username)
+    all_images = set(bad_items)
+    answered = set(df.id)
+    remaining = list(all_images - answered)
+    if len(remaining) < NUMBER_OF_IMAGES and len(remaining) > 0:
+        NUMBER_OF_IMAGES = len(remaining)
+        random_indices = list(remaining)
+    elif len(remaining) == 0:
+        return []
+    else:
+        random_indices = np.random.choice(remaining, NUMBER_OF_IMAGES, replace=False)
     random_images = [imagenet_hard[int(i)]["image"] for i in random_indices]
     random_gt_ids = [imagenet_hard[int(i)]["label"] for i in random_indices]
+    random_gt_labels = [imagenet_hard[int(x)]["english_label"] for x in random_indices]
     data = []
     for i, image in enumerate(random_images):
         data.append(
+            {
                 "id": random_indices[i],
                 "image": image,
                 "correct_label": random_gt_labels[i],
     return data
 def string_to_image(text):
+    text = text.replace("_", " ").lower().replace(", ", "\n")
     # Create a blank white square image
     img = np.ones((220, 75, 3))
     fig, ax = plt.subplots(figsize=(6, 2.25))
     ax.imshow(img, extent=[0, 1, 0, 1])
+    ax.text(0.5, 0.75, text, fontsize=18, ha="center", va="center")
     ax.set_xticks([])
     ax.set_yticks([])
     ax.set_xticklabels([])
     ax.set_yticklabels([])
     for spine in ax.spines.values():
         spine.set_visible(False)
     return fig
+all_samples = glob("./imagenet_traning_samples/*.JPEG")
+qid_to_sample = {
+    int(x.split("/")[-1].split(".")[0].split("_")[0]): x for x in all_samples
+}
+# user-e3z5b
 def get_training_samples(qid):
+    labels_id = imagenet_hard[int(qid)]["label"]
     samples = [qid_to_sample[x] for x in labels_id]
     return samples
 def load_sample(data, current_index):
     image_id = data[current_index]["id"]
     qimage = data[current_index]["image"]
     labels = data[current_index]["correct_label"]
     return qimage, labels
+def preprocessing(data, current_index, history, username):
+    data = generate_dataset(username)
+    if len(data) == 0:
+        fake_plot = string_to_image("No more images to review")
+        empty_image = Image.new("RGB", (224, 224))
+        return (
+            empty_image,
+            fake_plot,
+            current_index,
+            history,
+            data,
+            None,
+        )
+    current_index = 0
+    qimage, labels = load_sample(data, current_index)
+    image_id = data[current_index]["id"]
+    training_samples_image = get_training_samples(image_id)
+    training_samples_image = [
+        Image.open(x).convert("RGB") for x in training_samples_image
+    ]
+    # labels is a list of labels, conver it to a string
+    labels = ", ".join(labels)
+    label_plot = string_to_image(labels)
+    return qimage, label_plot, current_index, history, data, training_samples_image
 def update_app(decision, data, current_index, history, username):
+    global NUMBER_OF_IMAGES
     if current_index == -1:
+        return
+    if current_index == NUMBER_OF_IMAGES - 1:
         time_stamp = int(time.time())
         image_id = data[current_index]["id"]
+        # convert to percentage
         dicision_dict = {
             "id": int(image_id),
             "user_id": username,
         os.remove(temp_filename)
+        fake_plot = string_to_image("Thank you for your time!")
+        empty_image = Image.new("RGB", (224, 224))
+        return empty_image, fake_plot, current_index, history, data, None
+    if current_index >= 0 and current_index < NUMBER_OF_IMAGES - 1:
+        time_stamp = int(time.time())
+        image_id = data[current_index]["id"]
+        # convert to percentage
+        dicision_dict = {
+            "id": int(image_id),
+            "user_id": username,
+            "time": time_stamp,
+            "decision": decision,
+        }
+        # upload the decision to the server
+        temp_filename = f"results_{username}_{time_stamp}.json"
+        # convert decision_dict to json and save it on the disk
+        with open(temp_filename, "w") as f:
+            json.dump(dicision_dict, f)
+        api = HfApi()
+        api.upload_file(
+            path_or_fileobj=temp_filename,
+            path_in_repo=temp_filename,
+            repo_id="taesiri/imagenet_hard_review_data",
+            repo_type="dataset",
+        )
+        os.remove(temp_filename)
+        # Load the Next Image
+        current_index += 1
+        qimage, labels = load_sample(data, current_index)
+        image_id = data[current_index]["id"]
+        training_samples_image = get_training_samples(image_id)
+        training_samples_image = [
+            Image.open(x).convert("RGB") for x in training_samples_image
+        ]
+        # labels is a list of labels, conver it to a string
+        labels = ", ".join(labels)
+        label_plot = string_to_image(labels)
+        return qimage, label_plot, current_index, history, data, training_samples_image
+newcss = """
 #query_image{
     height: auto !important;
 }
 #sample_gallery {
     height: auto !important;
 }
+"""
 with gr.Blocks(css=newcss) as demo:
     data_gr = gr.State({})
     current_index = gr.State(-1)
     history = gr.State({})
     gr.Markdown("# Cleaning ImageNet-Hard!")
     random_str = "".join(
         random.choice(string.ascii_lowercase + string.digits) for _ in range(5)
     )
+    with gr.Row():
+        username = gr.Textbox(label="Username", value=f"user-{random_str}")
+        prepare_btn = gr.Button(value="Load Samples")
     with gr.Column():
         with gr.Row():
             accept_btn = gr.Button(value="Accept")
             myabe_btn = gr.Button(value="Not Sure!")
             reject_btn = gr.Button(value="Reject")
+        with gr.Row():
             query_image = gr.Image(type="pil", label="Query", elem_id="query_image")
             with gr.Column():
+                label_plot = gr.Plot(
+                    label="Is this a correct label for this image?", type="fig"
+                )
+                training_samples = gr.Gallery(
+                    type="pil", label="Training samples", elem_id="sample_gallery"
+                )
     accept_btn.click(
         update_app,
         inputs=[accept_btn, data_gr, current_index, history, username],
+        outputs=[
+            query_image,
+            label_plot,
+            current_index,
+            history,
+            data_gr,
+            training_samples,
+        ],
     )
     myabe_btn.click(
         update_app,
         inputs=[myabe_btn, data_gr, current_index, history, username],
+        outputs=[
+            query_image,
+            label_plot,
+            current_index,
+            history,
+            data_gr,
+            training_samples,
+        ],
     )
     reject_btn.click(
         update_app,
         inputs=[reject_btn, data_gr, current_index, history, username],
+        outputs=[
+            query_image,
+            label_plot,
+            current_index,
+            history,
+            data_gr,
+            training_samples,
+        ],
+    )
+    prepare_btn.click(
+        preprocessing,
+        inputs=[data_gr, current_index, history, username],
+        outputs=[
+            query_image,
+            label_plot,
+            current_index,
+            history,
+            data_gr,
+            training_samples,
+        ],
     )
 demo.launch()