"""Streamlit "Caption Scoring" app.

Downloads the ``flax-community/clip-spanish`` snapshot, loads the hybrid CLIP
model together with a tokenizer, and displays a score for an uploaded image
paired with a user-typed caption.
"""
import os
import sys

import jax
import streamlit as st
import transformers
from huggingface_hub import snapshot_download
from transformers import AutoTokenizer

# The two imports below are expected to resolve from the downloaded snapshot
# directory, so it has to be on sys.path before they run.
LOCAL_PATH = snapshot_download("flax-community/clip-spanish")
sys.path.append(LOCAL_PATH)

from modeling_hybrid_clip import FlaxHybridCLIP  # noqa: E402
from test_on_image import prepare_image, prepare_text  # noqa: E402


def save_file_to_disk(uplaoded_file):
    """Write an uploaded file's bytes under ``/tmp`` and return the path.

    Args:
        uplaoded_file: Object exposing ``.name`` and ``.getbuffer()`` (the
            value returned by ``st.file_uploader``). The parameter's spelling
            is kept as-is so existing keyword callers keep working.

    Returns:
        Path of the file that was written.
    """
    # The name is supplied by the client; keep only its final component so it
    # cannot redirect the write outside of /tmp.
    safe_name = os.path.basename(uplaoded_file.name)
    temp_file = os.path.join("/tmp", safe_name)
    with open(temp_file, "wb") as f:
        # Read from the argument itself rather than from a same-looking
        # module-level global.
        f.write(uplaoded_file.getbuffer())
    return temp_file


@st.cache(
    hash_funcs={
        # Hash these objects with id() instead of Streamlit's default hashing.
        transformers.models.roberta.tokenization_roberta_fast.RobertaTokenizerFast: id,
        FlaxHybridCLIP: id,
    },
    show_spinner=False,
)
def load_tokenizer_and_model():
    """Load the tokenizer and the CLIP model; cached via ``st.cache``.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    # load the saved model
    tokenizer = AutoTokenizer.from_pretrained(
        "bertin-project/bertin-roberta-base-spanish"
    )
    model = FlaxHybridCLIP.from_pretrained(LOCAL_PATH)
    return tokenizer, model


def run_inference(image_path, text, model, tokenizer):
    """Score one image/caption pair with the model.

    Args:
        image_path: Path of the image file, forwarded to ``prepare_image``.
        text: Caption, forwarded to ``prepare_text``.
        model: Model returned by ``load_tokenizer_and_model``.
        tokenizer: Tokenizer returned by ``load_tokenizer_and_model``.

    Returns:
        The sigmoid of element ``[0][0]`` of the model's ``logits_per_image``.
    """
    pixel_values = prepare_image(image_path, model)
    input_text = prepare_text(text, tokenizer)
    model_output = model(
        input_text["input_ids"],
        pixel_values,
        attention_mask=input_text["attention_mask"],
        train=False,
        return_dict=True,
    )
    logits = model_output["logits_per_image"]
    score = jax.nn.sigmoid(logits)[0][0]
    return score


tokenizer, model = load_tokenizer_and_model()

st.title("Caption Scoring")
uploaded_file = st.file_uploader("Choose an image...", type=["png", "jpg"])
text_input = st.text_input("Type a caption")

# Only score once both an image and a non-empty caption are available.
if uploaded_file is not None and text_input:
    local_image_path = None
    try:
        local_image_path = save_file_to_disk(uploaded_file)
        score = run_inference(
            local_image_path, text_input, model, tokenizer
        ).tolist()
        st.image(
            uploaded_file,
            caption=text_input,
            width=None,
            use_column_width=None,
            clamp=False,
            channels="RGB",
            output_format="auto",
        )
        st.write(f"## Score: {score:.2f}")
    finally:
        # Remove the temporary copy even if inference or rendering failed.
        if local_image_path:
            os.remove(local_image_path)