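# NOTE(review): the next three lines look like page residue from a web file
# viewer rather than program text; kept verbatim as comments.
# edugp's picture
# User BERTIN model
# 089d2a3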
import os
import sys
import jax
import streamlit as st
import transformers
from huggingface_hub import snapshot_download
from transformers import AutoTokenizer
# Fetch a local snapshot of the "flax-community/clip-spanish" Hub repository.
# The returned directory path is reused later as the model checkpoint location.
LOCAL_PATH = snapshot_download("flax-community/clip-spanish")
# Put the snapshot directory on the import path before the two imports below.
# NOTE(review): presumably modeling_hybrid_clip.py and test_on_image.py ship
# inside that snapshot, which is why the append must come first -- confirm.
sys.path.append(LOCAL_PATH)
from modeling_hybrid_clip import FlaxHybridCLIP
from test_on_image import prepare_image, prepare_text
def save_file_to_disk(uplaoded_file):
    """Write an uploaded file's contents under ``/tmp`` and return the path.

    Args:
        uplaoded_file: Upload object exposing a ``name`` attribute and a
            ``getbuffer()`` method that returns its bytes-like contents.
            The parameter keeps its original spelling so that any existing
            keyword callers continue to work.

    Returns:
        The path of the file that was written.

    Raises:
        OSError: If the destination cannot be opened or written.
    """
    # Keep only the final path component: the name is supplied by the client,
    # and an absolute or nested name would otherwise escape "/tmp".
    safe_name = os.path.basename(uplaoded_file.name)
    temp_file = os.path.join("/tmp", safe_name)
    with open(temp_file, "wb") as f:
        # Read from the argument itself rather than from a same-looking
        # module-level global, so the function works for any caller.
        f.write(uplaoded_file.getbuffer())
    return temp_file
@st.cache(
    show_spinner=False,
    hash_funcs={
        # Both object types are hashed with the builtin ``id`` instead of
        # Streamlit's default hashing.
        transformers.models.roberta.tokenization_roberta_fast.RobertaTokenizerFast: id,
        FlaxHybridCLIP: id,
    },
)
def load_tokenizer_and_model():
    """Load the text tokenizer and the hybrid CLIP model.

    The tokenizer comes from the "bertin-project/bertin-roberta-base-spanish"
    checkpoint; the model weights are read from ``LOCAL_PATH``. The call is
    wrapped in ``st.cache`` with the spinner disabled.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    text_tokenizer = AutoTokenizer.from_pretrained(
        "bertin-project/bertin-roberta-base-spanish"
    )
    clip_model = FlaxHybridCLIP.from_pretrained(LOCAL_PATH)
    return text_tokenizer, clip_model
def run_inference(image_path, text, model, tokenizer):
    """Score how well ``text`` matches the image stored at ``image_path``.

    Args:
        image_path: Path of the image file, forwarded to ``prepare_image``.
        text: Caption text, forwarded to ``prepare_text``.
        model: Callable model; also passed to ``prepare_image``.
        tokenizer: Tokenizer passed to ``prepare_text``.

    Returns:
        Element ``[0][0]`` of the sigmoid of the model's
        ``"logits_per_image"`` output.
    """
    image_features = prepare_image(image_path, model)
    encoded_caption = prepare_text(text, tokenizer)

    # Inference-mode forward pass with dict-style outputs.
    outputs = model(
        encoded_caption["input_ids"],
        image_features,
        attention_mask=encoded_caption["attention_mask"],
        train=False,
        return_dict=True,
    )

    # Squash the logits into (0, 1) and keep the first entry.
    # NOTE(review): assumes a single image/caption pair so [0][0] is the only
    # entry -- confirm against prepare_image / prepare_text.
    probabilities = jax.nn.sigmoid(outputs["logits_per_image"])
    return probabilities[0][0]
# Obtain the tokenizer/model pair used for every scoring request.
tokenizer, model = load_tokenizer_and_model()

# Page layout: a title followed by the two user inputs.
st.title("Caption Scoring")
uploaded_file = st.file_uploader("Choose an image...", type=["png", "jpg"])
text_input = st.text_input("Type a caption")

# Score only once an image has been uploaded and the caption is non-empty.
if uploaded_file is not None and text_input:
    # The upload is written to disk before entering the try block: if saving
    # raises, no path was produced, so there is nothing to clean up.
    local_image_path = save_file_to_disk(uploaded_file)
    try:
        raw_score = run_inference(local_image_path, text_input, model, tokenizer)
        score = raw_score.tolist()

        # Echo the uploaded image with the caption, then the formatted score.
        st.image(
            uploaded_file,
            caption=text_input,
            width=None,
            use_column_width=None,
            clamp=False,
            channels="RGB",
            output_format="auto",
        )
        st.write(f"## Score: {score:.2f}")
    finally:
        # Always delete the temporary copy, even if inference or rendering fails.
        os.remove(local_image_path)