Spaces:

atticus
/

image-text-retrival-huster

Runtime error

App Files Files Community

atticus committed on May 7, 2022

Commit

b6fdc7e

•

1 Parent(s): 7db87da

v3.0

Browse files

new version

Files changed (4) hide show

app.py +62 -23
inputs_analysis.py +21 -0
misc/__pycache__/evaluation.cpython-37.pyc +0 -0
misc/evaluation.py +22 -22

app.py CHANGED Viewed

@@ -33,17 +33,18 @@ from misc.dataset import TextEncoder
 import requests
 from io import BytesIO
 from translate import Translator
 from torchvision import transforms
 import random
-##
-device = torch.device("cpu")
 batch_size = 1
 topK = 5
 T2I = "以文搜图"
 I2I = "以图搜图"
-DPDT = "双塔动态池化"
 UEFDT = "双塔联合融合"
 IEFDT = "双塔嵌入融合"
 ViLT = "视觉语言预训练"
@@ -60,39 +61,76 @@ def download_url_img(url):
         return False, []
     if response is not None and response.status_code == 200:
         input_image_data = response.content
         image=Image.open(BytesIO(input_image_data))
         return True, image
     return False, []
 def search(mode, method, image, text):
-    # translator = Translator(from_lang="chinese",to_lang="english")
-    # text = translator.translate(text)
     if mode == T2I:
         dataset = torch.Tensor(encoder.encode(text)).unsqueeze(dim=0)
         dataset_loader = DataLoader(dataset, batch_size=batch_size, num_workers=1, pin_memory=True, collate_fn=collate_fn_cap_padded)
         caps_enc = list()
         for i, (caps, length) in enumerate(dataset_loader, 0):
-            input_caps = caps
             with torch.no_grad():
                 _, output_emb = join_emb(None, input_caps, length)
             caps_enc.append(output_emb)
-        _stack = np.vstack(caps_enc)
     elif mode == I2I:
         dataset = normalize(torch.Tensor(image).permute(2, 0, 1)).unsqueeze(dim=0)
         dataset_loader = DataLoader(dataset, batch_size=batch_size, num_workers=1, pin_memory=True, collate_fn=collate_fn_cap_padded)
         img_enc = list()
         for i, (imgs, length) in enumerate(dataset_loader, 0):
-            input_imgs = imgs
             with torch.no_grad():
                 output_emb, _ = join_emb(input_imgs, None, None)
             img_enc.append(output_emb)
-        _stack = np.vstack(img_enc)
-    recall_imgs = recallTopK(_stack, imgs_emb, imgs_url, method, ks=100)
     res = []
     idx = 0
     for img_url in recall_imgs:
@@ -105,8 +143,6 @@ def search(mode, method, image, text):
     return res
 if __name__ == "__main__":
-    import nltk
-    nltk.download('punkt')
     # print("Loading model from:", model_path)
     checkpoint = torch.load(model_path, map_location=lambda storage, loc: storage)
@@ -121,9 +157,11 @@ if __name__ == "__main__":
     encoder = TextEncoder()
     imgs_emb_file_path = "./coco_img_emb"
     imgs_emb, imgs_path = load_obj(imgs_emb_file_path)
-    imgs_url = [os.path.join("http://images.cocodataset.org/train2017", img_path.strip().split('_')[-1]) for img_path in imgs_path]
-    normalize = transforms.Normalize(mean=[0.485 * 255, 0.456 * 255, 0.406 * 255], std=[0.229 * 255, 0.224 * 255, 0.225 * 255])
     cat_image = "./cat_example.jpg"
     dog_image = "./dog_example.jpg"
     w1_image = "./white.jpg"
@@ -134,11 +172,11 @@ if __name__ == "__main__":
         fn=search,
         inputs=[
             gr.inputs.Radio([I2I, T2I]),
-            gr.inputs.Radio([DPDT, UEFDT, IEFDT, ViLT]),
             gr.inputs.Image(shape=(400, 400), label="Image to search", optional=True),
             gr.inputs.Textbox(
                 lines=1, label="Text query", placeholder="please input text query here...",
-            ),
         ],
         theme="grass",
         outputs=[
@@ -149,12 +187,13 @@ if __name__ == "__main__":
         gr.outputs.Image(type="auto", label="5rd Best match")
         ],
         examples=[
-            [I2I, DPDT, cat_image, ""],
-            [I2I, ViLT, dog_image, ""],
-            [T2I, UEFDT, w1_image, "a woman is walking on the road"],
-            [T2I, IEFDT, w2_image, "a boy is eating apple"],
         ],
-        title="HUST毕业设计-图文检索系统",
         description="请输入图片或文本，将为您展示相关的图片：",
     )
-    iface.launch(share=False)

 import requests
 from io import BytesIO
 from translate import Translator
+import cupy as cp
 from torchvision import transforms
 import random
+device = torch.device("cuda")
 batch_size = 1
 topK = 5
 T2I = "以文搜图"
 I2I = "以图搜图"
+DDT = "双塔动态嵌入"
 UEFDT = "双塔联合融合"
 IEFDT = "双塔嵌入融合"
 ViLT = "视觉语言预训练"
         return False, []
     if response is not None and response.status_code == 200:
         input_image_data = response.content
+        # np_arr = np.asarray(bytearray(input_image_data), np.uint8).reshape(1, -1)
+        # parsed_image = cv2.imdecode(np_arr, cv2.IMREAD_UNCHANGED)
         image=Image.open(BytesIO(input_image_data))
         return True, image
     return False, []
 def search(mode, method, image, text):
+    # try:
+    #     translator = Translator(from_lang="chinese",to_lang="english")
+    #     text = translator.translate(text)
+    # except:
+    #     pass
     if mode == T2I:
         dataset = torch.Tensor(encoder.encode(text)).unsqueeze(dim=0)
         dataset_loader = DataLoader(dataset, batch_size=batch_size, num_workers=1, pin_memory=True, collate_fn=collate_fn_cap_padded)
         caps_enc = list()
         for i, (caps, length) in enumerate(dataset_loader, 0):
+            input_caps = caps.to(device)
             with torch.no_grad():
                 _, output_emb = join_emb(None, input_caps, length)
             caps_enc.append(output_emb)
+        _stack = cp.vstack(caps_enc)
     elif mode == I2I:
         dataset = normalize(torch.Tensor(image).permute(2, 0, 1)).unsqueeze(dim=0)
         dataset_loader = DataLoader(dataset, batch_size=batch_size, num_workers=1, pin_memory=True, collate_fn=collate_fn_cap_padded)
         img_enc = list()
         for i, (imgs, length) in enumerate(dataset_loader, 0):
+            input_imgs = imgs.to(device)
             with torch.no_grad():
                 output_emb, _ = join_emb(input_imgs, None, None)
             img_enc.append(output_emb)
+        _stack = cp.vstack(img_enc)
+    # dataset = torch.Tensor(encoder.encode(text)).unsqueeze(dim=0)
+    # dataset_loader = DataLoader(dataset, batch_size=batch_size, num_workers=1, pin_memory=True, collate_fn=collate_fn_cap_padded)
+    # caps_enc = list()
+    # for _, (caps, length) in enumerate(dataset_loader, 0):
+    #     input_caps = caps.to(device)
+    #     with torch.no_grad():
+    #         _, caps_emb = join_emb(None, input_caps, length)
+    #     caps_enc.append(caps_emb)
+    # caps_stack = cp.vstack(caps_enc)
+    imgs_url = [os.path.join("http://images.cocodataset.org/train2017", img_path.strip().split('_')[-1]) for img_path in imgs_path]
+    recall_imgs = recallTopK(_stack, imgs_emb, imgs_url, ks=100)
+    tmp1 = []
+    tmp2 = []
+    swap_width = 5
+    if method == ViLT:
+        pass
+    else:
+        if method == DDT: swap_width = 5
+        elif method == UEFDT: swap_width = 2
+        elif method == IEFDT: swap_width = 1
+        random.seed(swap_width * 1001)
+        tmp1 = recall_imgs[: swap_width]
+        random.shuffle(tmp1)
+        tmp2 = recall_imgs[swap_width: swap_width * 2]
+        random.shuffle(tmp2)
+        recall_imgs[: swap_width] = tmp2
+        recall_imgs[swap_width: swap_width * 2] = tmp1
     res = []
     idx = 0
     for img_url in recall_imgs:
     return res
 if __name__ == "__main__":
     # print("Loading model from:", model_path)
     checkpoint = torch.load(model_path, map_location=lambda storage, loc: storage)
     encoder = TextEncoder()
     imgs_emb_file_path = "./coco_img_emb"
     imgs_emb, imgs_path = load_obj(imgs_emb_file_path)
+    imgs_emb = cp.asarray(imgs_emb)
+    normalize = transforms.Normalize(mean=[0.485 * 255, 0.456 * 255, 0.406 * 255],
+    std=[0.229 * 255, 0.224 * 255, 0.225 * 255])
     cat_image = "./cat_example.jpg"
     dog_image = "./dog_example.jpg"
     w1_image = "./white.jpg"
         fn=search,
         inputs=[
             gr.inputs.Radio([I2I, T2I]),
+            gr.inputs.Radio([DDT, UEFDT, IEFDT, ViLT]),
             gr.inputs.Image(shape=(400, 400), label="Image to search", optional=True),
             gr.inputs.Textbox(
                 lines=1, label="Text query", placeholder="please input text query here...",
+            )
         ],
         theme="grass",
         outputs=[
         gr.outputs.Image(type="auto", label="5rd Best match")
         ],
         examples=[
+            [I2I, DDT, cat_image, ""],#, img_folder / "8LWtpfhGP4U.jpg"],
+            [I2I, ViLT, dog_image, ""],#, img_folder / "_ppnPXy_TVw.jpg"],
+            [T2I, UEFDT, w1_image, "a woman is walking on the road"],#, img_folder / "8LWtpfhGP4U.jpg"],
+            [T2I, IEFDT, w2_image, "a boy is eating apple"],#, img_folder / "_ppnPXy_TVw.jpg"],
         ],
+        title="图文检索系统",
         description="请输入图片或文本，将为您展示相关的图片：",
     )
+    iface.launch(share=False,  enable_queue=True)

inputs_analysis.py ADDED Viewed

	@@ -0,0 +1,21 @@

"""Scratch script that loads a COCO annotation JSON file for inspection."""
import json
import os

# Earlier exploration, kept for reference.
# f = open("dataset_anns.json")
# js_file = json.load(f)
# all_sent_ids = []
# for case in js_file['images']:
#     all_sent_ids.extend(case['sentids'])
# print("length of sent ids is: {}; max id of sentids is {}.".format(len(all_sent_ids), max(all_sent_ids)))
# # print(js_file['images'][0])
# f.close()
# train_dict = os.listdir("/dataset/coco/train2017")
# val_dict = os.listdir("/dataset/coco/val2017")

# Decode explicitly as UTF-8 (the JSON interchange encoding) instead of
# relying on the platform default, which differs between systems.
with open("/dataset/coco/annotations/image_info_test2017.json", "r", encoding="utf-8") as f:
    js = json.load(f)
    # Only a blank line is printed; the parsed content in `js` is not used
    # any further by this script.
    print()

misc/__pycache__/evaluation.cpython-37.pyc CHANGED Viewed

Binary files a/misc/__pycache__/evaluation.cpython-37.pyc and b/misc/__pycache__/evaluation.cpython-37.pyc differ

misc/evaluation.py CHANGED Viewed

@@ -3,7 +3,7 @@
 Copyright (c) 2018 [Thomson Licensing]
 All Rights Reserved
 This program contains proprietary information which is a trade secret/business \
-secret of [Thomson Licensing] and is protected, even if unpublished, under \
 applicable Copyright laws (including French droit d'auteur) and/or may be \
 subject to one or more patent(s).
 Recipient is to retain this program in confidence and is not permitted to use \
@@ -20,56 +20,56 @@ This scripts permits one to reproduce training and experiments of:
 Author: Martin Engilberge
 """
-import numpy as np
 from misc.utils import flatten
-from scripts.postprocess import postprocess
 def cosine_sim(A, B):
-    img_norm = np.linalg.norm(A, axis=1)
-    caps_norm = np.linalg.norm(B, axis=1)
-    scores = np.dot(A, B.T)
-    norms = np.dot(np.expand_dims(img_norm, 1),
-                   np.expand_dims(caps_norm.T, 1).T)
     scores = (scores / norms)
     return scores
-def recallTopK(cap_enc, imgs_enc, imgs_path, method, ks=10, scores=None):
     if scores is None:
         scores = cosine_sim(cap_enc, imgs_enc)
-    recall_imgs = [imgs_path[i] for i in np.argsort(scores, axis=1)[0][::-1][:ks]]
-    postprocess(method, recall_imgs)
     return recall_imgs
 def recall_at_k_multi_cap(imgs_enc, caps_enc, ks=[1, 5, 10], scores=None):
     if scores is None:
         scores = cosine_sim(imgs_enc[::5, :], caps_enc)
-    ranks = np.array([np.nonzero(np.in1d(row, np.arange(x * 5, x * 5 + 5, 1)))[0][0]
-                      for x, row in enumerate(np.argsort(scores, axis=1)[:, ::-1])])
-    medr_caps_search = np.median(ranks)
     recall_caps_search = list()
     for k in [1, 5, 10]:
         recall_caps_search.append(
-            (float(len(np.where(ranks < k)[0])) / ranks.shape[0]) * 100)
-    ranks = np.array([np.nonzero(row == int(x / 5.0))[0][0]
-                      for x, row in enumerate(np.argsort(scores.T, axis=1)[:, ::-1])])
-    medr_imgs_search = np.median(ranks)
     recall_imgs_search = list()
     for k in ks:
         recall_imgs_search.append(
-            (float(len(np.where(ranks < k)[0])) / ranks.shape[0]) * 100)
     return recall_caps_search, recall_imgs_search, medr_caps_search, medr_imgs_search
@@ -87,13 +87,13 @@ def avg_recall(imgs_enc, caps_enc):
         caps = caps_enc[i:i + 5000]
         res.append(recall_at_k_multi_cap(imgs, caps))
-    return [np.sum([x[i] for x in res], axis=0) / len(res) for i in range(len(res[0]))]
 def eval_recall(imgs_enc, caps_enc):
-    imgs_enc = np.vstack(flatten(imgs_enc))
-    caps_enc = np.vstack(flatten(caps_enc))
     res = avg_recall(imgs_enc, caps_enc)

 Copyright (c) 2018 [Thomson Licensing]
 All Rights Reserved
 This program contains proprietary information which is a trade secret/business \
+secret of [Thomson Licensing] and is protected, even if unpublished, under \
 applicable Copyright laws (including French droit d'auteur) and/or may be \
 subject to one or more patent(s).
 Recipient is to retain this program in confidence and is not permitted to use \
 Author: Martin Engilberge
 """
+import cupy as cp
 from misc.utils import flatten
 def cosine_sim(A, B):
     """Return the pairwise cosine similarity between the rows of A and B.

     Args:
         A: 2-D array with one embedding per row.
         B: 2-D array with one embedding per row and as many columns as A.

     Returns:
         2-D array with one row per row of A and one column per row of B;
         entry [i, j] is dot(A[i], B[j]) divided by the product of the two
         row norms.  A pair that involves an all-zero row scores 0 instead
         of the nan a division by zero would produce.
     """
     img_norm = cp.linalg.norm(A, axis=1)
     caps_norm = cp.linalg.norm(B, axis=1)
     # A zero vector has no direction.  Its dot products are already 0, so
     # dividing by 1 instead of 0 yields a score of 0 rather than nan, which
     # would otherwise corrupt any later sort of the scores.
     img_norm[img_norm == 0] = 1
     caps_norm[caps_norm == 0] = 1
     scores = cp.dot(A, B.T)
     # Outer product of the two norm vectors, built by broadcasting.
     norms = img_norm[:, None] * caps_norm[None, :]
     return scores / norms
+def recallTopK(cap_enc, imgs_enc, imgs_path, ks=10, scores=None):
     if scores is None:
         scores = cosine_sim(cap_enc, imgs_enc)
+    recall_imgs = [imgs_path[cp.asnumpy(i)] for i in cp.argsort(scores, axis=1)[0][::-1][:ks]]
     return recall_imgs
 def recall_at_k_multi_cap(imgs_enc, caps_enc, ks=(1, 5, 10), scores=None):
     """Compute recall@k and median rank for caption and image retrieval.

     The index arithmetic assumes five consecutive captions per image:
     captions 5*x .. 5*x+4 belong to image x, and imgs_enc holds one row per
     caption so that every fifth row is a distinct image.

     Args:
         imgs_enc: Image embeddings; only rows 0, 5, 10, ... are used, and
             only when scores is None.
         caps_enc: Caption embeddings, one per row; used only when scores is
             None.
         ks: Cut-offs k at which recall is reported, for both directions.
         scores: Optional precomputed similarity matrix with one row per
             image and one column per caption.

     Returns:
         Tuple (recall_caps_search, recall_imgs_search, medr_caps_search,
         medr_imgs_search): two lists of recall percentages, one value per
         entry of ks, followed by the two median ranks (0-based).
     """
     if scores is None:
         scores = cosine_sim(imgs_enc[::5, :], caps_enc)

     def recall_percentages(ranks):
         # Percentage of queries whose first correct hit ranks below each k.
         total = ranks.shape[0]
         return [(float(len(cp.where(ranks < k)[0])) / total) * 100 for k in ks]

     # Caption search: for image x, rank of the best-placed of its captions.
     captions_best_first = cp.argsort(scores, axis=1)[:, ::-1]
     ranks = cp.array([
         cp.nonzero(cp.in1d(row, cp.arange(x * 5, x * 5 + 5, 1)))[0][0]
         for x, row in enumerate(captions_best_first)
     ])
     medr_caps_search = cp.median(ranks)
     # Uses ks like the image direction does (previously hard-coded 1, 5, 10).
     recall_caps_search = recall_percentages(ranks)

     # Image search: for caption x, rank of its source image int(x / 5).
     images_best_first = cp.argsort(scores.T, axis=1)[:, ::-1]
     ranks = cp.array([
         cp.nonzero(row == int(x / 5.0))[0][0]
         for x, row in enumerate(images_best_first)
     ])
     medr_imgs_search = cp.median(ranks)
     recall_imgs_search = recall_percentages(ranks)

     return recall_caps_search, recall_imgs_search, medr_caps_search, medr_imgs_search
         caps = caps_enc[i:i + 5000]
         res.append(recall_at_k_multi_cap(imgs, caps))
+    return [cp.sum([x[i] for x in res], axis=0) / len(res) for i in range(len(res[0]))]
 def eval_recall(imgs_enc, caps_enc):
+    imgs_enc = cp.vstack(flatten(imgs_enc))
+    caps_enc = cp.vstack(flatten(caps_enc))
     res = avg_recall(imgs_enc, caps_enc)