Spaces:

flax-community
/

spanish-image-captioning

Runtime error

File size: 5,263 Bytes

3a2e60d
 
 
 
 
bea24f7
3a2e60d
84c69e6
3a2e60d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50811dd
1a320a6
 
3a2e60d
1a320a6
3a2e60d
 
 
 
 
 
 
1480d4a
053cee9
3a2e60d
 
3cc7516
3a2e60d
 
44fd978
3a2e60d
 
3cc7516
3a2e60d
 
 
 
 
3d09c0a
1a320a6
 
053cee9
 
 
bea24f7
 
3a2e60d
 
44fd978
3a2e60d
 
 
 
 
 
 
 
d065a7f
faa7c0d
ee0d592
faa7c0d
3a2e60d
 
 
 
 
 
 
 
bea24f7
 
 
 
d9e8d80
3a2e60d
 
 
 
 
 
 
 
 
 
 
bbf274a
97b0cf1
 
 
3a2e60d
97b0cf1
 
 
44fd978
c220e56
547e7ab
 
 
3a2e60d
bbf274a
 
 
 
 
3a2e60d
bea24f7
3a2e60d
89ea184
8678313
3d09c0a
3a2e60d
 
 
bbf274a
3a2e60d
 
 
bbf274a
3a2e60d

from io import BytesIO
import streamlit as st
import pandas as pd
import os
import numpy as np
from streamlit import caching
from PIL import Image
from model.flax_clip_vision_marian.modeling_clip_vision_marian import (
    FlaxCLIPVisionMarianMT,
)
from transformers import MarianTokenizer
from utils import (
    get_transformed_image,
)
import matplotlib.pyplot as plt
from mtranslate import translate


from session import _get_state

state = _get_state()


@st.cache
def load_model(ckpt):
    """Load the image-captioning model from a checkpoint.

    The function is decorated with ``st.cache`` so Streamlit can reuse the
    result for the same ``ckpt`` instead of loading it again.

    Args:
        ckpt: Checkpoint location, forwarded unchanged to
            ``FlaxCLIPVisionMarianMT.from_pretrained``.

    Returns:
        The ``FlaxCLIPVisionMarianMT`` instance built from ``ckpt``.
    """
    model = FlaxCLIPVisionMarianMT.from_pretrained(ckpt)
    return model


# Shared tokenizer, loaded once at import time; generate_sequence() uses it
# to turn generated token ids back into text.
tokenizer = MarianTokenizer.from_pretrained(
    "Helsinki-NLP/opus-mt-en-es"
)

@st.cache
def generate_sequence(pixel_values, num_beams, temperature, top_p, do_sample, top_k, max_length):
    """Generate caption text for an already-preprocessed image.

    Args:
        pixel_values: Preprocessed image input; passed to the model's
            ``generate`` as ``input_ids``.
        num_beams: Number of beams for beam search. Ignored when
            ``do_sample`` is true (see the note in the body).
        temperature: Sampling temperature, forwarded to ``generate``.
        top_p: Nucleus-sampling threshold, forwarded to ``generate``.
        do_sample: Whether to sample instead of using beam search.
        top_k: Top-k filtering size, forwarded to ``generate``.
        max_length: Maximum length of the generated sequence.

    Returns:
        The list of decoded strings produced by ``tokenizer.batch_decode``,
        with special tokens removed.
    """
    if do_sample:
        # The sidebar presents sampling as an alternative to beam search, and
        # transformers' Flax `generate` does not implement beam sampling
        # (do_sample together with num_beams > 1), so sample with one beam.
        # NOTE(review): assumes the project model uses the stock Flax
        # `generate` -- confirm against FlaxCLIPVisionMarianMT.
        num_beams = 1

    generated = state.model.generate(
        input_ids=pixel_values,
        max_length=max_length,
        num_beams=num_beams,
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        do_sample=do_sample,
    )
    # The first element of the generate output is what gets decoded
    # (presumably its `sequences` field -- confirm against the model class).
    return tokenizer.batch_decode(generated[0], skip_special_tokens=True)

def read_markdown(path, parent="./sections/"):
    """Return the full text of a markdown file.

    Args:
        path: File name (or relative path) of the markdown file.
        parent: Directory that ``path`` is joined onto.

    Returns:
        The file content as a single string.

    Raises:
        OSError: If the file cannot be opened.
    """
    # Decode explicitly as UTF-8: the platform default encoding varies and
    # would garble or reject accented characters on some systems.
    with open(os.path.join(parent, path), encoding="utf-8") as f:
        return f.read()


# Model checkpoints the app can load; only the first entry is used below.
checkpoints = ["./ckpt/ckpt-23999"]  # TODO: Maybe add more checkpoints?
# Seeded examples: a tab-separated table read for its `image_file` and
# `caption` columns.
dummy_data = pd.read_csv("references.tsv", sep="\t")

# Page-level settings, followed by the visible title and author line.
_app_title = "Spanish Image Captioning"
_page_settings = {
    "page_title": _app_title,
    "layout": "wide",
    "initial_sidebar_state": "collapsed",
    "page_icon": "./misc/csi-logo.png",
}
st.set_page_config(**_page_settings)

st.title(_app_title)
st.write(
    "[Bhavitvya Malik](https://huggingface.co/bhavitvyamalik), [Gunjan Chhablani](https://huggingface.co/gchhablani)"
)

# Sidebar widgets whose values are passed to generate_sequence().
st.sidebar.title("Generation Parameters")
max_length = st.sidebar.number_input(
    "Max Length",
    min_value=16,
    max_value=128,
    value=64,
    step=1,
    help="The maximum length of sequence to be generated.",
)
do_sample = st.sidebar.checkbox(
    "Sample",
    value=False,
    help="Sample from the model instead of using beam search.",
)
top_k = st.sidebar.number_input(
    "Top K",
    min_value=10,
    max_value=200,
    value=50,
    step=1,
    help="The number of highest probability vocabulary tokens to keep for top-k-filtering.",
)
num_beams = st.sidebar.number_input(
    "Number of Beams",
    min_value=2,
    max_value=10,
    value=4,
    step=1,
    help="Number of beams to be used in beam search.",
)
# A sampling temperature must be strictly positive, so 0.0 is not offered.
# The options are built as exact Python floats (0.1 ... 1.0) so the default
# `value=1.0` is always found in the list.
temperature = st.sidebar.select_slider(
    "Temperature",
    options=[round(0.1 * tenth, 1) for tenth in range(1, 11)],
    value=1.0,
    help="The value used to module the next token probabilities.",
    format_func=lambda x: f"{x:.2f}",
)
top_p = st.sidebar.select_slider(
    "Top-P",
    options=list(np.arange(0.0, 1.1, step=0.1)),
    value=1.0,
    help="Nucleus Sampling : If set to float < 1, only the most probable tokens with probabilities that add up to :obj:`top_p` or higher are kept for generation.",
    format_func=lambda x: f"{x:.2f}",
)
# Manual escape hatch: clear Streamlit's cache on demand.
if st.sidebar.button("Clear All Cache"):
    caching.clear_cache()

# Header row: logo in the first (narrower) column, intro text in the second.
image_col, intro_col = st.beta_columns([3, 8])
image_col.image("./misc/sic-logo.png", use_column_width="always")
intro_col.write(read_markdown("intro.md"))

with st.beta_expander("Usage"):
    st.markdown(read_markdown("usage.md"))

# The article is assembled from markdown sections rendered in a fixed order,
# with the methodology figure placed before the pretraining section.
with st.beta_expander("Article"):
    for _section_file in ("abstract.md", "caveats.md"):
        st.write(read_markdown(_section_file))

    st.write("## Methodology")
    st.image("./misc/Spanish-IC.png")
    st.markdown(read_markdown("pretraining.md"))

    # checkpoints.md is not rendered at the moment.
    for _section_file in (
        "challenges.md",
        "social_impact.md",
        "references.md",
        "acknowledgements.md",
    ):
        st.write(read_markdown(_section_file))


def _store_example(frame, row):
    """Copy one example row into the session state.

    Reads the ``image_file`` and ``caption`` cells of ``frame`` at ``row``,
    stores the file name, the caption stripped of leading/trailing ``-`` and
    space characters, and the image read from the ``images`` directory.
    """
    state.image_file = frame.loc[row, "image_file"]
    state.caption = frame.loc[row, "caption"].strip("- ")
    state.image = plt.imread(os.path.join("images", state.image_file))


# Load the model only if the state object does not hold one yet.
if state.model is None:
    with st.spinner("Loading model..."):
        state.model = load_model(checkpoints[0])

# Row of the seeded examples shown before the user picks anything.
first_index = 40
# Init Session State
if state.image_file is None:
    _store_example(dummy_data, first_index)

new_col1, new_col2 = st.beta_columns([5,5])

if new_col1.button("Get a random example", help="Get a random example from one of the seeded examples."):
    # reset_index() makes the single sampled row addressable as row 0.
    sample = dummy_data.sample(1).reset_index()
    _store_example(sample, 0)

# Preprocess the current image for the model; the original is what is shown.
transformed_image = get_transformed_image(state.image)
# Display Image
new_col1.image(state.image, use_column_width="always")

# Display Reference Caption
with new_col1.beta_expander("Reference Caption"):
    st.write("**Reference Caption**: " + state.caption)
    st.markdown(
        f"""**English Translation**: {translate(state.caption, 'en')}"""
    )


# [''] is the "nothing generated on this run" sentinel checked below.
sequence = ['']
if new_col2.button("Generate Caption", help="Generate a caption in the Spanish."):
    with st.spinner("Generating Sequence..."):
        sequence = generate_sequence(transformed_image, num_beams, temperature, top_p, do_sample, top_k, max_length)

if sequence != ['']:
    new_col2.write(
        "**Generated Caption**: " + sequence[0]
    )

    # Request English explicitly, exactly as for the reference caption:
    # without a target language mtranslate falls back to "auto", which does
    # not guarantee an English result.
    new_col2.write(
        "**English Translation**: " + translate(sequence[0], 'en')
    )