"""Gradio demo: cosine similarity between an image and a text using CLIP."""

import logging
from typing import Union

import gradio as gr
from PIL import Image
from sentence_transformers import SentenceTransformer, util

logger = logging.getLogger(__name__)

# Define the model once at import time so every request reuses it.
model_sentence = SentenceTransformer('clip-ViT-B-32')


def clip_sim_preds(img: Image.Image, text: str) -> Union[float, str]:
    """Return the cosine similarity of an image/text pair.

    The image and the text are each encoded with the module-level
    ``model_sentence`` model, and the cosine similarity between the two
    embeddings is computed.

    Args:
        img: The image to compare; the Gradio input below is configured to
            deliver it as a PIL image in RGB mode.
        text: The text description to compare against the image.

    Returns:
        The similarity score as a plain Python number, or the string
        ``"error"`` if encoding or scoring failed (the failure is logged
        with its traceback).
    """
    try:
        # Encode the image.
        img_emb = model_sentence.encode(img)

        # Encode the text description (wrapped in a list: a batch of one).
        text_emb = model_sentence.encode([text])

        # Compute the cosine similarity between the two embeddings.
        cos_scores = util.cos_sim(img_emb, text_emb)

        # One image vs. one text yields a single score; unwrap it.
        return cos_scores.item()
    except Exception:
        # Deliberate best-effort: keep the UI responsive and show "error",
        # but record the cause instead of silently discarding it. Unlike a
        # bare ``except:``, this lets KeyboardInterrupt/SystemExit through.
        logger.exception("Failed to compute image/text similarity")
        return "error"


# Define the app: takes an uploaded image and a corresponding text,
# computes and returns their cosine similarity.
demo = gr.Interface(
    clip_sim_preds,
    inputs=[
        gr.inputs.Image(
            invert_colors=False,
            image_mode="RGB",
            type="pil",
            source="upload",
            label=None,
            optional=False,
        ),
        gr.inputs.Textbox(
            lines=1,
            placeholder=None,
            default="two cats with black stripes on a purple blanket, tv remotes, green collar",
            label="Text",
            optional=False,
        ),
    ],
    outputs=[gr.outputs.Textbox(type="auto", label="Cosine similarity")],
    theme="huggingface",
    title="Clip Cosine similarity",
    description="Cosine similarity of image/text pair using a multimodal clip model",
    allow_flagging=False,
)

demo.launch(debug=True)