1 import sys
2
3 import gradio as gr
4 import jax
5 from huggingface_hub import snapshot_download
6 from PIL import Image
7 from transformers import AutoTokenizer
8
# Local directory returned by snapshot_download for the
# "flax-community/clip-spanish" repository.
LOCAL_PATH = snapshot_download("flax-community/clip-spanish")
# Make that directory importable. The two imports below presumably resolve to
# modules shipped inside the snapshot (TODO confirm), which is why they have
# to come after this line instead of sitting with the imports at the top.
sys.path.append(LOCAL_PATH)

from modeling_hybrid_clip import FlaxHybridCLIP
from test_on_image import prepare_image, prepare_text
14
15
def save_file_to_disk(uplaoded_file):
    """Write an in-memory image array to a JPEG file and return its path.

    Every call writes to its own freshly created temporary file instead of a
    single shared path, so overlapping calls cannot overwrite one another's
    image before it is read back.

    Args:
        uplaoded_file: Image data in a form accepted by ``Image.fromarray``.

    Returns:
        Path of the JPEG file that was written. The file is not removed
        automatically; the caller may delete it once it is no longer needed.

    Raises:
        Whatever ``Image.fromarray`` or ``Image.save`` raise for unsupported
        input; the temporary file is removed before the error propagates.
    """
    # Imported locally so the function carries its own stdlib dependencies.
    import os
    import tempfile

    im = Image.fromarray(uplaoded_file)
    fd, temp_file = tempfile.mkstemp(suffix=".jpeg")
    # Only the unique path is needed; the image is saved by path below, and
    # the ".jpeg" suffix is what selects the output format.
    os.close(fd)
    try:
        im.save(temp_file)
    except Exception:
        # Do not leave an empty or partial file behind when encoding fails.
        os.remove(temp_file)
        raise
    return temp_file
23
24
def run_inference(image_path, text, model, tokenizer):
    """Score one image/text pair with the given model.

    Args:
        image_path: Path handed to ``prepare_image`` together with ``model``.
        text: Caption handed to ``prepare_text`` together with ``tokenizer``.
        model: Callable model; invoked in non-training mode with dict output.
        tokenizer: Tokenizer forwarded to ``prepare_text``.

    Returns:
        The sigmoid of the ``[0][0]`` entry of the model's
        ``logits_per_image`` output.
    """
    pixels = prepare_image(image_path, model)
    encoded = prepare_text(text, tokenizer)

    input_ids = encoded["input_ids"]
    attention_mask = encoded["attention_mask"]
    outputs = model(
        input_ids,
        pixels,
        attention_mask=attention_mask,
        train=False,
        return_dict=True,
    )

    # Squash the image-to-text logits and keep the first entry, which is the
    # one belonging to the single image/caption pair passed in.
    return jax.nn.sigmoid(outputs["logits_per_image"])[0][0]
38
39
def load_tokenizer_and_model():
    """Load the tokenizer and the CLIP model used for scoring.

    Returns:
        A ``(tokenizer, model)`` tuple. The tokenizer is loaded from the
        ``bertin-project/bertin-roberta-base-spanish`` checkpoint and the
        model is a ``FlaxHybridCLIP`` loaded from ``LOCAL_PATH``.
    """
    tokenizer_checkpoint = "bertin-project/bertin-roberta-base-spanish"
    text_tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)
    # The model weights come from the locally available snapshot directory.
    clip_model = FlaxHybridCLIP.from_pretrained(LOCAL_PATH)
    return text_tokenizer, clip_model
47
48
# Loaded once at import time; score_image_caption_pair reads these
# module-level names on every call instead of reloading them.
tokenizer, model = load_tokenizer_and_model()
50
51
def score_image_caption_pair(uploaded_file, text_input):
    """Score an uploaded image against a caption for the Gradio interface.

    Args:
        uploaded_file: Image data forwarded to ``save_file_to_disk``.
        text_input: Caption text to compare with the image.

    Returns:
        A two-item tuple: a ``{"Score": value}`` mapping and the same value
        formatted with two decimal places.
    """
    image_path = save_file_to_disk(uploaded_file)
    # Uses the module-level model and tokenizer.
    raw_score = run_inference(image_path, text_input, model, tokenizer)
    score = raw_score.tolist()
    return {"Score": score}, f"{score:.2f}"
57
58
# Image input component, configured with a 299x299 shape.
image = gr.inputs.Image(shape=(299, 299))

# Two inputs (image, caption text) feed score_image_caption_pair; its two
# return values are shown as a label and as plain text.
iface = gr.Interface(
    fn=score_image_caption_pair,
    inputs=[image, "text"],
    outputs=["label", "text"],
)

# Starts the app as soon as this module is executed.
iface.launch()
64