import gradio as gr
import requests
from PIL import Image
from sentence_transformers import SentenceTransformer, util

# define model
# Loaded once at import time so every request reuses the same CLIP weights.
model_sentence = SentenceTransformer('clip-ViT-B-32')


# functions
def download_images(url):
    """Download the image at *url* and return it converted to RGB.

    The URL is fetched exactly once (streamed, 3.5 s timeout); the HTTP
    status code is read from that same response, and the connection is
    released as soon as the image has been decoded.

    Args:
        url: Address of the image to download.

    Returns:
        A 3-tuple ``(image, status_code, flag)``:

        * on success: the RGB ``PIL.Image``, the HTTP status code and ``0``;
        * on any failure: ``("error url", "", -1)``.
    """
    # Bound before the request so the error report below always has a value,
    # even when the request itself fails (timeout, DNS error, invalid URL).
    status_code = None
    try:
        # request image; the context manager closes the streamed connection
        with requests.get(url, stream=True, timeout=3.5) as response:
            status_code = response.status_code
            # read in image
            image = Image.open(response.raw)
            # convert all images to rgb -> case png is in rgba format.
            # convert() forces the pixel data to be read, so it has to
            # happen while the stream is still open.
            rgb_im = image.convert('RGB')
        # return temp image, status code and flag
        return rgb_im, status_code, 0
    except Exception:
        # Best-effort boundary: any download/decoding problem is reported
        # to the caller through the error flag instead of being raised.
        print("error", status_code)
        # error flag
        return "error url", "", -1


def clip_sim_preds(url, text):
    """Compute the CLIP cosine similarity of an image (by URL) and a text.

    Steps:
        1. Download the image via ``download_images``.
        2. Encode the image and the text with the module-level CLIP model.
        3. Return the cosine similarity of the two embeddings.

    Args:
        url: Address of the image.
        text: Description to compare against the image.

    Returns:
        The similarity as a plain Python number, or the string ``"error"``
        when the image could not be downloaded, or ``"error clip_si"`` when
        encoding / scoring failed.
    """
    # call download images
    image, _status_code, flag = download_images(url)

    # if the download failed, indicate this to the UI and stop early
    if flag != 0:
        return "error"

    try:
        # Encode an image:
        img_emb = model_sentence.encode(image)
        # Encode text descriptions
        text_emb = model_sentence.encode([text])
        # Compute cosine similarities
        cos_scores = util.cos_sim(img_emb, text_emb)
        # return the predicted similarity as a scalar
        return cos_scores.item()
    except Exception:
        return "error clip_si"


# Text shown below the interface (passed to gr.Interface as `article`).
article = "\n\nAlternative\n\n"

# define app
# takes in url of an image and a corresponding text, computes and returns cosine similarity
# NOTE(review): gr.inputs / gr.outputs look like the legacy Gradio component
# namespaces -- confirm against the pinned Gradio version before upgrading.
gr.Interface(
    clip_sim_preds,
    inputs=[
        gr.inputs.Textbox(
            lines=1,
            placeholder=None,
            default="http://images.cocodataset.org/val2017/000000039769.jpg",
            label="URL",
            optional=False,
        ),
        gr.inputs.Textbox(
            lines=1,
            placeholder=None,
            default="two cats with black stripes on a purple blanket, tv remotes, green collar",
            label="Text",
            optional=False,
        ),
    ],
    outputs=[gr.outputs.Textbox(type="auto", label="Cosine similarity")],
    theme="huggingface",
    title="Clip Cosine similarity",
    description="Clip cosine similarity of an image/text pair",
    article=article,
    allow_flagging=False,
).launch(debug=True)