new: initial revision (copied from main repo)
- app.py +14 -0
- dashboard_image2image.py +88 -0
- dashboard_text2image.py +81 -0
- demo-image-encoder.py +69 -0
- requirements.txt +2 -0
app.py
ADDED
@@ -0,0 +1,14 @@
+import dashboard_text2image
+import dashboard_image2image
+
+import streamlit as st
+
+PAGES = {
+    "Text to Image": dashboard_text2image,
+    "Image to Image": dashboard_image2image
+}
+st.sidebar.title("Navigation")
+
+selection = st.sidebar.radio("Go to", list(PAGES.keys()))
+page = PAGES[selection]
+page.app()
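
app.py wires the two dashboards into a single Streamlit app: each entry in PAGES is a module that exposes a no-argument app() function, and the sidebar radio selection decides which one is called. As a hedged illustration of that contract (the module name below is hypothetical and not part of this commit), a third page would only need to provide the following:

# dashboard_example.py -- hypothetical extra page, shown only to illustrate the
# protocol app.py expects: a module-level app() function that renders the page.
import streamlit as st

def app():
    st.title("Example Page")
    st.write("Register this module in PAGES inside app.py to make it selectable.")
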
dashboard_image2image.py
ADDED
@@ -0,0 +1,88 @@
+import matplotlib.pyplot as plt
+import nmslib
+import numpy as np
+import os
+import streamlit as st
+
+from PIL import Image
+from transformers import CLIPProcessor, FlaxCLIPModel
+
+
+BASELINE_MODEL = "openai/clip-vit-base-patch32"
+# MODEL_PATH = "/home/shared/models/clip-rsicd/bs128x8-lr5e-6-adam/ckpt-1"
+MODEL_PATH = "flax-community/clip-rsicd"
+
+# IMAGE_VECTOR_FILE = "/home/shared/data/vectors/test-baseline.tsv"
+# IMAGE_VECTOR_FILE = "/home/shared/data/vectors/test-bs128x8-lr5e-6-adam-ckpt-1.tsv"
+IMAGE_VECTOR_FILE = "./vectors/test-bs128x8-lr5e-6-adam-ckpt-1.tsv"
+
+# IMAGES_DIR = "/home/shared/data/rsicd_images"
+IMAGES_DIR = "./images"
+
+
+@st.cache(allow_output_mutation=True)
+def load_index():
+    filenames, image_vecs = [], []
+    fvec = open(IMAGE_VECTOR_FILE, "r")
+    for line in fvec:
+        cols = line.strip().split('\t')
+        filename = cols[0]
+        image_vec = np.array([float(x) for x in cols[1].split(',')])
+        filenames.append(filename)
+        image_vecs.append(image_vec)
+    V = np.array(image_vecs)
+    index = nmslib.init(method='hnsw', space='cosinesimil')
+    index.addDataPointBatch(V)
+    index.createIndex({'post': 2}, print_progress=True)
+    return filenames, index
+
+
+@st.cache(allow_output_mutation=True)
+def load_model():
+    model = FlaxCLIPModel.from_pretrained(MODEL_PATH)
+    processor = CLIPProcessor.from_pretrained(BASELINE_MODEL)
+    return model, processor
+
+
+def app():
+    filenames, index = load_index()
+    model, processor = load_model()
+
+    st.title("Image to Image Retrieval")
+    st.markdown("""
+        The CLIP model from OpenAI is trained in a self-supervised manner using
+        contrastive learning to project images and caption text onto a common
+        embedding space. We have fine-tuned the model using the RSICD dataset
+        (10k images and ~50k captions from the remote sensing domain).
+
+        This demo shows the image to image retrieval capabilities of this model, i.e.,
+        given an image file name as a query (we suggest copy pasting the file name
+        from the result of a text to image query), we use our fine-tuned CLIP model
+        to project the query image to the image/caption embedding space and search
+        for nearby images (by cosine similarity) in this space.
+
+        Our fine-tuned CLIP model was previously used to generate image vectors for
+        our demo, and NMSLib was used for fast vector access.
+    """)
+
+    image_file = st.text_input("Image Query (filename):")
+    if st.button("Query"):
+        image = Image.fromarray(plt.imread(os.path.join(IMAGES_DIR, image_file)))
+        inputs = processor(images=image, return_tensors="jax", padding=True)
+        query_vec = model.get_image_features(**inputs)
+        query_vec = np.asarray(query_vec)
+        ids, distances = index.knnQuery(query_vec, k=11)
+        result_filenames = [filenames[id] for id in ids]
+        images, captions = [], []
+        for result_filename, score in zip(result_filenames, distances):
+            if result_filename == image_file:
+                continue
+            images.append(
+                plt.imread(os.path.join(IMAGES_DIR, result_filename)))
+            captions.append("{:s} (score: {:.3f})".format(result_filename, 1.0 - score))
+        images = images[0:10]
+        captions = captions[0:10]
+        st.image(images[0:3], caption=captions[0:3])
+        st.image(images[3:6], caption=captions[3:6])
+        st.image(images[6:9], caption=captions[6:9])
+        st.image(images[9:], caption=captions[9:])
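
Both dashboards (this one and dashboard_text2image.py below) report their results as "1.0 - score" because NMSLib's 'cosinesimil' space returns a cosine distance, so similarity is one minus the returned value. A minimal, self-contained sketch of that behavior, using random vectors purely for illustration and not part of the commit:

# Illustration only: nmslib's 'cosinesimil' space returns cosine distance,
# which the dashboards convert back to similarity with 1.0 - score.
import nmslib
import numpy as np

rng = np.random.default_rng(0)
V = rng.normal(size=(100, 512))   # stand-in for the precomputed image vectors

index = nmslib.init(method='hnsw', space='cosinesimil')
index.addDataPointBatch(V)
index.createIndex({'post': 2}, print_progress=False)

ids, distances = index.knnQuery(V[0], k=3)   # query with a vector we know is indexed
for i, d in zip(ids, distances):
    print("id={:d}  cosine similarity={:.3f}".format(i, 1.0 - d))
# the top hit is the query vector itself, with similarity close to 1.0
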
dashboard_text2image.py
ADDED
@@ -0,0 +1,81 @@
+import matplotlib.pyplot as plt
+import nmslib
+import numpy as np
+import os
+import streamlit as st
+
+from transformers import CLIPProcessor, FlaxCLIPModel
+
+
+BASELINE_MODEL = "openai/clip-vit-base-patch32"
+# MODEL_PATH = "/home/shared/models/clip-rsicd/bs128x8-lr5e-6-adam/ckpt-1"
+MODEL_PATH = "flax-community/clip-rsicd"
+
+# IMAGE_VECTOR_FILE = "/home/shared/data/vectors/test-baseline.tsv"
+# IMAGE_VECTOR_FILE = "/home/shared/data/vectors/test-bs128x8-lr5e-6-adam-ckpt-1.tsv"
+IMAGE_VECTOR_FILE = "./vectors/test-bs128x8-lr5e-6-adam-ckpt-1.tsv"
+
+# IMAGES_DIR = "/home/shared/data/rsicd_images"
+IMAGES_DIR = "./images"
+
+
+@st.cache(allow_output_mutation=True)
+def load_index():
+    filenames, image_vecs = [], []
+    fvec = open(IMAGE_VECTOR_FILE, "r")
+    for line in fvec:
+        cols = line.strip().split('\t')
+        filename = cols[0]
+        image_vec = np.array([float(x) for x in cols[1].split(',')])
+        filenames.append(filename)
+        image_vecs.append(image_vec)
+    V = np.array(image_vecs)
+    index = nmslib.init(method='hnsw', space='cosinesimil')
+    index.addDataPointBatch(V)
+    index.createIndex({'post': 2}, print_progress=True)
+    return filenames, index
+
+
+@st.cache(allow_output_mutation=True)
+def load_model():
+    model = FlaxCLIPModel.from_pretrained(MODEL_PATH)
+    processor = CLIPProcessor.from_pretrained(BASELINE_MODEL)
+    return model, processor
+
+
+def app():
+    filenames, index = load_index()
+    model, processor = load_model()
+
+    st.title("Text to Image Retrieval")
+    st.markdown("""
+        The CLIP model from OpenAI is trained in a self-supervised manner using
+        contrastive learning to project images and caption text onto a common
+        embedding space. We have fine-tuned the model using the RSICD dataset
+        (10k images and ~50k captions from the remote sensing domain).
+
+        This demo shows the text to image retrieval capabilities of this model, i.e.,
+        given a text query, we use our fine-tuned CLIP model to project the text query
+        to the image/caption embedding space and search for nearby images (by
+        cosine similarity) in this space.
+
+        Our fine-tuned CLIP model was previously used to generate image vectors for
+        our demo, and NMSLib was used for fast vector access.
+    """)
+
+    query = st.text_input("Text Query:")
+    if st.button("Query"):
+        inputs = processor(text=[query], images=None, return_tensors="jax", padding=True)
+        query_vec = model.get_text_features(**inputs)
+        query_vec = np.asarray(query_vec)
+        ids, distances = index.knnQuery(query_vec, k=10)
+        result_filenames = [filenames[id] for id in ids]
+        images, captions = [], []
+        for result_filename, score in zip(result_filenames, distances):
+            images.append(
+                plt.imread(os.path.join(IMAGES_DIR, result_filename)))
+            captions.append("{:s} (score: {:.3f})".format(result_filename, 1.0 - score))
+        st.image(images[0:3], caption=captions[0:3])
+        st.image(images[3:6], caption=captions[3:6])
+        st.image(images[6:9], caption=captions[6:9])
+        st.image(images[9:], caption=captions[9:])
|
demo-image-encoder.py
ADDED
@@ -0,0 +1,69 @@
+import argparse
+import jax
+import jax.numpy as jnp
+import json
+import matplotlib.pyplot as plt
+import numpy as np
+import requests
+import os
+
+from PIL import Image
+from transformers import CLIPProcessor, FlaxCLIPModel
+
+
+def encode_image(image_file, model, processor):
+    image = Image.fromarray(plt.imread(os.path.join(IMAGES_DIR, image_file)))
+    inputs = processor(images=image, return_tensors="jax")
+    image_vec = model.get_image_features(**inputs)
+    return np.array(image_vec).reshape(-1)
+
+
+DATA_DIR = "/home/shared/data"
+IMAGES_DIR = os.path.join(DATA_DIR, "rsicd_images")
+CAPTIONS_FILE = os.path.join(DATA_DIR, "dataset_rsicd.json")
+VECTORS_DIR = os.path.join(DATA_DIR, "vectors")
+BASELINE_MODEL = "openai/clip-vit-base-patch32"
+
+parser = argparse.ArgumentParser()
+parser.add_argument("model_dir", help="Path to model to use for encoding")
+args = parser.parse_args()
+
+print("Loading image list...", end="")
+image2captions = {}
+with open(CAPTIONS_FILE, "r") as fcap:
+    data = json.loads(fcap.read())
+    for image in data["images"]:
+        if image["split"] == "test":
+            filename = image["filename"]
+            sentences = []
+            for sentence in image["sentences"]:
+                sentences.append(sentence["raw"])
+            image2captions[filename] = sentences
+
+print("{:d} images".format(len(image2captions)))
+
+
+print("Loading model...")
+if args.model_dir == "baseline":
+    model = FlaxCLIPModel.from_pretrained(BASELINE_MODEL)
+else:
+    model = FlaxCLIPModel.from_pretrained(args.model_dir)
+processor = CLIPProcessor.from_pretrained(BASELINE_MODEL)
+
+
+model_basename = "-".join(args.model_dir.split("/")[-2:])
+vector_file = os.path.join(VECTORS_DIR, "test-{:s}.tsv".format(model_basename))
+print("Vectors written to {:s}".format(vector_file))
+num_written = 0
+fvec = open(vector_file, "w")
+for image_file in image2captions.keys():
+    if num_written % 100 == 0:
+        print("{:d} images processed".format(num_written))
+    image_vec = encode_image(image_file, model, processor)
+    image_vec_s = ",".join(["{:.7e}".format(x) for x in image_vec])
+    fvec.write("{:s}\t{:s}\n".format(image_file, image_vec_s))
+    num_written += 1
+
+print("{:d} images processed, COMPLETE".format(num_written))
+fvec.close()
+
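
demo-image-encoder.py is the offline counterpart of the dashboards: it writes one TSV row per test image (filename, a tab, then the comma-separated vector in 7-digit scientific notation), which is exactly what load_index() parses at startup. A small round-trip sketch of that contract, with a hypothetical filename and made-up values, not part of the commit:

# Round-trip sketch of the TSV contract between demo-image-encoder.py (writer)
# and load_index() in the dashboards (reader). Filename and values are made up.
import numpy as np

image_file = "airport_1.jpg"                       # hypothetical RSICD filename
image_vec = np.array([1.2345600e-02, -1.2345678e+00, 5.0000000e-01])

# writer side, as in demo-image-encoder.py
row = "{:s}\t{:s}\n".format(
    image_file, ",".join(["{:.7e}".format(x) for x in image_vec]))

# reader side, as in load_index()
cols = row.strip().split("\t")
parsed = np.array([float(x) for x in cols[1].split(",")])

assert cols[0] == image_file
assert np.allclose(parsed, image_vec)
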
requirements.txt
ADDED
@@ -0,0 +1,2 @@
+streamlit==0.84.1
+nmslib==2.1.1