new: initial revision (copied from main repo)
- app.py +14 -0
- dashboard_image2image.py +88 -0
- dashboard_text2image.py +81 -0
- demo-image-encoder.py +69 -0
- requirements.txt +2 -0
app.py
ADDED
@@ -0,0 +1,14 @@
+import dashboard_text2image
+import dashboard_image2image
+
+import streamlit as st
+
+PAGES = {
+    "Text to Image": dashboard_text2image,
+    "Image to Image": dashboard_image2image
+}
+st.sidebar.title("Navigation")
+
+selection = st.sidebar.radio("Go to", list(PAGES.keys()))
+page = PAGES[selection]
+page.app()
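
app.py wires the two dashboards into a single Streamlit app: each entry in PAGES is a module that exposes a no-argument app() function, and the sidebar radio selection decides which one is called. As a hedged illustration of that contract (the module name below is hypothetical and not part of this commit), a third page would only need to provide the following:

# dashboard_example.py -- hypothetical extra page, shown only to illustrate the
# protocol app.py expects: a module-level app() function that renders the page.
import streamlit as st

def app():
    st.title("Example Page")
    st.write("Register this module in PAGES inside app.py to make it selectable.")
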
dashboard_image2image.py
ADDED
@@ -0,0 +1,88 @@
+import matplotlib.pyplot as plt
+import nmslib
+import numpy as np
+import os
+import streamlit as st
+
+from PIL import Image
+from transformers import CLIPProcessor, FlaxCLIPModel
+
+
+BASELINE_MODEL = "openai/clip-vit-base-patch32"
+# MODEL_PATH = "/home/shared/models/clip-rsicd/bs128x8-lr5e-6-adam/ckpt-1"
+MODEL_PATH = "flax-community/clip-rsicd"
+
+# IMAGE_VECTOR_FILE = "/home/shared/data/vectors/test-baseline.tsv"
+# IMAGE_VECTOR_FILE = "/home/shared/data/vectors/test-bs128x8-lr5e-6-adam-ckpt-1.tsv"
+IMAGE_VECTOR_FILE = "./vectors/test-bs128x8-lr5e-6-adam-ckpt-1.tsv"
+
+# IMAGES_DIR = "/home/shared/data/rsicd_images"
+IMAGES_DIR = "./images"
+
+
+@st.cache(allow_output_mutation=True)
+def load_index():
+    filenames, image_vecs = [], []
+    fvec = open(IMAGE_VECTOR_FILE, "r")
+    for line in fvec:
+        cols = line.strip().split('\t')
+        filename = cols[0]
+        image_vec = np.array([float(x) for x in cols[1].split(',')])
+        filenames.append(filename)
+        image_vecs.append(image_vec)
+    V = np.array(image_vecs)
+    index = nmslib.init(method='hnsw', space='cosinesimil')
+    index.addDataPointBatch(V)
+    index.createIndex({'post': 2}, print_progress=True)
+    return filenames, index
+
+
+@st.cache(allow_output_mutation=True)
+def load_model():
+    model = FlaxCLIPModel.from_pretrained(MODEL_PATH)
+    processor = CLIPProcessor.from_pretrained(BASELINE_MODEL)
+    return model, processor
+
+
+def app():
+    filenames, index = load_index()
+    model, processor = load_model()
+
+    st.title("Image to Image Retrieval")
+    st.markdown("""
+        The CLIP model from OpenAI is trained in a self-supervised manner using
+        contrastive learning to project images and caption text onto a common
+        embedding space. We have fine-tuned the model using the RSICD dataset
+        (10k images and ~50k captions from the remote sensing domain).
+
+        This demo shows the image to image retrieval capabilities of this model, i.e.,
+        given an image file name as a query (we suggest copy pasting the file name
+        from the result of a text to image query), we use our fine-tuned CLIP model
+        to project the query image to the image/caption embedding space and search
+        for nearby images (by cosine similarity) in this space.
+
+        Our fine-tuned CLIP model was previously used to generate image vectors for
+        our demo, and NMSLib was used for fast vector access.
+    """)
+
+    image_file = st.text_input("Image Query (filename):")
+    if st.button("Query"):
+        image = Image.fromarray(plt.imread(os.path.join(IMAGES_DIR, image_file)))
+        inputs = processor(images=image, return_tensors="jax", padding=True)
+        query_vec = model.get_image_features(**inputs)
+        query_vec = np.asarray(query_vec)
+        ids, distances = index.knnQuery(query_vec, k=11)
+        result_filenames = [filenames[id] for id in ids]
+        images, captions = [], []
+        for result_filename, score in zip(result_filenames, distances):
+            if result_filename == image_file:
+                continue
+            images.append(
+                plt.imread(os.path.join(IMAGES_DIR, result_filename)))
+            captions.append("{:s} (score: {:.3f})".format(result_filename, 1.0 - score))
+        images = images[0:10]
+        captions = captions[0:10]
+        st.image(images[0:3], caption=captions[0:3])
+        st.image(images[3:6], caption=captions[3:6])
+        st.image(images[6:9], caption=captions[6:9])
+        st.image(images[9:], caption=captions[9:])
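
Both dashboards (this one and dashboard_text2image.py below) report their results as "1.0 - score" because NMSLib's 'cosinesimil' space returns a cosine distance, so similarity is one minus the returned value. A minimal, self-contained sketch of that behavior, using random vectors purely for illustration and not part of the commit:

# Illustration only: nmslib's 'cosinesimil' space returns cosine distance,
# which the dashboards convert back to similarity with 1.0 - score.
import nmslib
import numpy as np

rng = np.random.default_rng(0)
V = rng.normal(size=(100, 512))   # stand-in for the precomputed image vectors

index = nmslib.init(method='hnsw', space='cosinesimil')
index.addDataPointBatch(V)
index.createIndex({'post': 2}, print_progress=False)

ids, distances = index.knnQuery(V[0], k=3)   # query with a vector we know is indexed
for i, d in zip(ids, distances):
    print("id={:d}  cosine similarity={:.3f}".format(i, 1.0 - d))
# the top hit is the query vector itself, with similarity close to 1.0
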
dashboard_text2image.py
ADDED
@@ -0,0 +1,81 @@
+import matplotlib.pyplot as plt
+import nmslib
+import numpy as np
+import os
+import streamlit as st
+
+from transformers import CLIPProcessor, FlaxCLIPModel
+
+
+BASELINE_MODEL = "openai/clip-vit-base-patch32"
+# MODEL_PATH = "/home/shared/models/clip-rsicd/bs128x8-lr5e-6-adam/ckpt-1"
+MODEL_PATH = "flax-community/clip-rsicd"
+
+# IMAGE_VECTOR_FILE = "/home/shared/data/vectors/test-baseline.tsv"
+# IMAGE_VECTOR_FILE = "/home/shared/data/vectors/test-bs128x8-lr5e-6-adam-ckpt-1.tsv"
+IMAGE_VECTOR_FILE = "./vectors/test-bs128x8-lr5e-6-adam-ckpt-1.tsv"
+
+# IMAGES_DIR = "/home/shared/data/rsicd_images"
+IMAGES_DIR = "./images"
+
+
+@st.cache(allow_output_mutation=True)
+def load_index():
+    filenames, image_vecs = [], []
+    fvec = open(IMAGE_VECTOR_FILE, "r")
+    for line in fvec:
+        cols = line.strip().split('\t')
+        filename = cols[0]
+        image_vec = np.array([float(x) for x in cols[1].split(',')])
+        filenames.append(filename)
+        image_vecs.append(image_vec)
+    V = np.array(image_vecs)
+    index = nmslib.init(method='hnsw', space='cosinesimil')
+    index.addDataPointBatch(V)
+    index.createIndex({'post': 2}, print_progress=True)
+    return filenames, index
+
+
+@st.cache(allow_output_mutation=True)
+def load_model():
+    model = FlaxCLIPModel.from_pretrained(MODEL_PATH)
+    processor = CLIPProcessor.from_pretrained(BASELINE_MODEL)
+    return model, processor
+
+
+def app():
+    filenames, index = load_index()
+    model, processor = load_model()
+
+    st.title("Text to Image Retrieval")
+    st.markdown("""
+        The CLIP model from OpenAI is trained in a self-supervised manner using
+        contrastive learning to project images and caption text onto a common
+        embedding space. We have fine-tuned the model using the RSICD dataset
+        (10k images and ~50k captions from the remote sensing domain).
+
+        This demo shows the text to image retrieval capabilities of this model, i.e.,
+        given a text query, we use our fine-tuned CLIP model to project the text query
+        to the image/caption embedding space and search for nearby images (by
+        cosine similarity) in this space.
+
+        Our fine-tuned CLIP model was previously used to generate image vectors for
+        our demo, and NMSLib was used for fast vector access.
+    """)
+
+    query = st.text_input("Text Query:")
+    if st.button("Query"):
+        inputs = processor(text=[query], images=None, return_tensors="jax", padding=True)
+        query_vec = model.get_text_features(**inputs)
+        query_vec = np.asarray(query_vec)
+        ids, distances = index.knnQuery(query_vec, k=10)
+        result_filenames = [filenames[id] for id in ids]
+        images, captions = [], []
+        for result_filename, score in zip(result_filenames, distances):
+            images.append(
+                plt.imread(os.path.join(IMAGES_DIR, result_filename)))
+            captions.append("{:s} (score: {:.3f})".format(result_filename, 1.0 - score))
+        st.image(images[0:3], caption=captions[0:3])
+        st.image(images[3:6], caption=captions[3:6])
+        st.image(images[6:9], caption=captions[6:9])
+        st.image(images[9:], caption=captions[9:])
|
demo-image-encoder.py
ADDED
@@ -0,0 +1,69 @@
+import argparse
+import jax
+import jax.numpy as jnp
+import json
+import matplotlib.pyplot as plt
+import numpy as np
+import requests
+import os
+
+from PIL import Image
+from transformers import CLIPProcessor, FlaxCLIPModel
+
+
+def encode_image(image_file, model, processor):
+    image = Image.fromarray(plt.imread(os.path.join(IMAGES_DIR, image_file)))
+    inputs = processor(images=image, return_tensors="jax")
+    image_vec = model.get_image_features(**inputs)
+    return np.array(image_vec).reshape(-1)
+
+
+DATA_DIR = "/home/shared/data"
+IMAGES_DIR = os.path.join(DATA_DIR, "rsicd_images")
+CAPTIONS_FILE = os.path.join(DATA_DIR, "dataset_rsicd.json")
+VECTORS_DIR = os.path.join(DATA_DIR, "vectors")
+BASELINE_MODEL = "openai/clip-vit-base-patch32"
+
+parser = argparse.ArgumentParser()
+parser.add_argument("model_dir", help="Path to model to use for encoding")
+args = parser.parse_args()
+
+print("Loading image list...", end="")
+image2captions = {}
+with open(CAPTIONS_FILE, "r") as fcap:
+    data = json.loads(fcap.read())
+    for image in data["images"]:
+        if image["split"] == "test":
+            filename = image["filename"]
+            sentences = []
+            for sentence in image["sentences"]:
+                sentences.append(sentence["raw"])
+            image2captions[filename] = sentences
+
+print("{:d} images".format(len(image2captions)))
+
+
+print("Loading model...")
+if args.model_dir == "baseline":
+    model = FlaxCLIPModel.from_pretrained(BASELINE_MODEL)
+else:
+    model = FlaxCLIPModel.from_pretrained(args.model_dir)
+processor = CLIPProcessor.from_pretrained(BASELINE_MODEL)
+
+
+model_basename = "-".join(args.model_dir.split("/")[-2:])
+vector_file = os.path.join(VECTORS_DIR, "test-{:s}.tsv".format(model_basename))
+print("Vectors written to {:s}".format(vector_file))
+num_written = 0
+fvec = open(vector_file, "w")
+for image_file in image2captions.keys():
+    if num_written % 100 == 0:
+        print("{:d} images processed".format(num_written))
+    image_vec = encode_image(image_file, model, processor)
+    image_vec_s = ",".join(["{:.7e}".format(x) for x in image_vec])
+    fvec.write("{:s}\t{:s}\n".format(image_file, image_vec_s))
+    num_written += 1
+
+print("{:d} images processed, COMPLETE".format(num_written))
+fvec.close()
+
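
demo-image-encoder.py is the offline counterpart of the dashboards: it writes one TSV row per test image (filename, a tab, then the comma-separated vector in 7-digit scientific notation), which is exactly what load_index() parses at startup. A small round-trip sketch of that contract, with a hypothetical filename and made-up values, not part of the commit:

# Round-trip sketch of the TSV contract between demo-image-encoder.py (writer)
# and load_index() in the dashboards (reader). Filename and values are made up.
import numpy as np

image_file = "airport_1.jpg"                       # hypothetical RSICD filename
image_vec = np.array([1.2345600e-02, -1.2345678e+00, 5.0000000e-01])

# writer side, as in demo-image-encoder.py
row = "{:s}\t{:s}\n".format(
    image_file, ",".join(["{:.7e}".format(x) for x in image_vec]))

# reader side, as in load_index()
cols = row.strip().split("\t")
parsed = np.array([float(x) for x in cols[1].split(",")])

assert cols[0] == image_file
assert np.allclose(parsed, image_vec)
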
requirements.txt
ADDED
@@ -0,0 +1,2 @@
+streamlit==0.84.1
+nmslib==2.1.1