|
|
|
''' |
|
This scripts performs kNN search on inferenced image and text features (on single-GPU) and outputs image-to-text retrieval prediction file for evaluation. |
|
''' |
|
|
|
import argparse |
|
import numpy |
|
from tqdm import tqdm |
|
import json |
|
|
|
import numpy as np |
|
import torch |
|
|
|
def parse_args(): |
|
parser = argparse.ArgumentParser() |
|
parser.add_argument( |
|
'--image-feats', |
|
type=str, |
|
required=True, |
|
help="Specify the path of image features." |
|
) |
|
parser.add_argument( |
|
'--text-feats', |
|
type=str, |
|
required=True, |
|
help="Specify the path of text features." |
|
) |
|
parser.add_argument( |
|
'--top-k', |
|
type=int, |
|
default=10, |
|
help="Specify the k value of top-k predictions." |
|
) |
|
parser.add_argument( |
|
'--eval-batch-size', |
|
type=int, |
|
default=32768, |
|
help="Specify the image-side batch size when computing the inner products, default to 8192" |
|
) |
|
parser.add_argument( |
|
'--output', |
|
type=str, |
|
required=True, |
|
help="Specify the output jsonl prediction filepath." |
|
) |
|
return parser.parse_args() |
|
|
|
if __name__ == "__main__": |
|
args = parse_args() |
|
|
|
|
|
print("Params:") |
|
for name in sorted(vars(args)): |
|
val = getattr(args, name) |
|
print(f" {name}: {val}") |
|
|
|
print("Begin to load text features...") |
|
text_ids = [] |
|
text_feats = [] |
|
with open(args.text_feats, "r") as fin: |
|
for line in tqdm(fin): |
|
obj = json.loads(line.strip()) |
|
text_ids.append(obj['text_id']) |
|
text_feats.append(obj['feature']) |
|
text_feats_array = np.array(text_feats, dtype=np.float32) |
|
print("Finished loading text features.") |
|
|
|
print("Begin to compute top-{} predictions for images...".format(args.top_k)) |
|
with open(args.output, "w") as fout: |
|
with open(args.image_feats, "r") as fin: |
|
for line in tqdm(fin): |
|
obj = json.loads(line.strip()) |
|
image_id = obj['image_id'] |
|
image_feat = obj['feature'] |
|
score_tuples = [] |
|
image_feat_tensor = torch.tensor([image_feat], dtype=torch.float).cuda() |
|
idx = 0 |
|
while idx < len(text_ids): |
|
text_feats_tensor = torch.from_numpy(text_feats_array[idx : min(idx + args.eval_batch_size, len(text_ids))]).cuda() |
|
batch_scores = image_feat_tensor @ text_feats_tensor.t() |
|
for text_id, score in zip(text_ids[idx : min(idx + args.eval_batch_size, len(text_ids))], batch_scores.squeeze(0).tolist()): |
|
score_tuples.append((text_id, score)) |
|
idx += args.eval_batch_size |
|
top_k_predictions = sorted(score_tuples, key=lambda x:x[1], reverse=True)[:args.top_k] |
|
fout.write("{}\n".format(json.dumps({"image_id": image_id, "text_ids": [entry[0] for entry in top_k_predictions]}))) |
|
|
|
print("Top-{} predictions are saved in {}".format(args.top_k, args.output)) |
|
print("Done!") |
|
|