gchhablani committed
Commit 2c8f495
1 Parent(s): 405f2d4

Add mask filling app

Files changed (6)
  1. app.py +8 -3
  2. apps/mlm.py +49 -49
  3. apps/utils.py +1 -0
  4. apps/vqa.py +44 -42
  5. multiapp.py +10 -3
  6. resize_images.py +10 -3
app.py CHANGED
@@ -1,13 +1,17 @@
 from apps import mlm, vqa
 import os
 import streamlit as st
+from session import _get_state
 from multiapp import MultiApp
 
+
 def read_markdown(path, parent="./sections/"):
     with open(os.path.join(parent, path)) as f:
         return f.read()
 
+
 def main():
+    state = _get_state()
     st.set_page_config(
         page_title="Multilingual VQA",
         layout="wide",
@@ -30,7 +34,7 @@ def main():
     st.write(read_markdown("abstract.md"))
     st.write(read_markdown("caveats.md"))
     st.write("## Methodology")
-    col1, col2 = st.beta_columns([1,1])
+    col1, col2 = st.beta_columns([1, 1])
     col1.image(
         "./misc/article/Multilingual-VQA.png",
         caption="Masked LM model for Image-text Pretraining.",
@@ -43,10 +47,11 @@ def main():
     st.write(read_markdown("checkpoints.md"))
     st.write(read_markdown("acknowledgements.md"))
 
-    app = MultiApp()
+    app = MultiApp(state)
    app.add_app("Visual Question Answering", vqa.app)
     app.add_app("Mask Filling", mlm.app)
     app.run()
+    state.sync()
 
 if __name__ == "__main__":
-    main()
+    main()
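The commit threads one session-state object through the whole app: `_get_state()` (from the existing `session.py` helper) returns an attribute-style store that survives Streamlit reruns, each page function now receives it as an argument, and `state.sync()` persists it at the end of the script. A minimal sketch of that pattern, assuming only the behaviour visible in this diff (unset attributes read as None, arbitrary attributes can be assigned, `sync()` is called once per run); `counter_page` is a hypothetical page used only for illustration:

import streamlit as st
from session import _get_state

def counter_page(state):
    # Unset attributes are assumed to read as None on the first run.
    if state.count is None:
        state.count = 0
    if st.button("Increment"):
        state.count += 1
    st.write("Count:", state.count)

def main():
    state = _get_state()   # persistent, attribute-style session store
    counter_page(state)    # page functions receive the shared state
    state.sync()           # persist attribute values for the next rerun

if __name__ == "__main__":
    main()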
apps/mlm.py CHANGED
@@ -1,11 +1,9 @@
-
 from .utils import (
     get_text_attributes,
     get_top_5_predictions,
     get_transformed_image,
     plotly_express_horizontal_bar_plot,
-    translate_labels,
-    bert_tokenizer
+    bert_tokenizer,
 )
 
 import streamlit as st
@@ -13,97 +11,99 @@ import numpy as np
 import pandas as pd
 import os
 import matplotlib.pyplot as plt
-
-from session import _get_state
+from mtranslate import translate
 
 
 from .model.flax_clip_vision_bert.modeling_clip_vision_bert import (
     FlaxCLIPVisionBertForMaskedLM,
 )
 
+
 def softmax(logits):
     return np.exp(logits) / np.sum(np.exp(logits), axis=0)
 
-def app():
-    state = _get_state()
+def app(state):
+    mlm_state = state
 
-    @st.cache(persist=False)
+    # @st.cache(persist=False) # TODO: Make this work with mlm_state. Currently not supported.
     def predict(transformed_image, caption_inputs):
-        outputs = state.model(pixel_values=transformed_image, **caption_inputs)
-        indices = np.where(caption_inputs['input_ids']==bert_tokenizer.mask_token_id)
-        preds = outputs.logits[indices][0]
-        sorted_indices = np.argsort(preds)[::-1] # Get reverse sorted scores
-        top_5_indices = sorted_indices[:5]
-        top_5_tokens = bert_tokenizer.convert_ids_to_tokens(top_5_indices)
-        top_5_scores = np.array(preds[top_5_indices])
-        return top_5_tokens, top_5_scores
-
-
-    @st.cache(persist=False)
+        outputs = model(pixel_values=transformed_image, **caption_inputs)
+        indices = np.where(caption_inputs["input_ids"] == bert_tokenizer.mask_token_id)[
+            1
+        ][0]
+        preds = outputs.logits[0][indices]
+        scores = np.array(preds)
+        return scores
+
+    # @st.cache(persist=False)
     def load_model(ckpt):
         return FlaxCLIPVisionBertForMaskedLM.from_pretrained(ckpt)
 
-    mlm_checkpoints = ['flax-community/clip-vision-bert-cc12m-70k']
+    mlm_checkpoints = ["flax-community/clip-vision-bert-cc12m-70k"]
     dummy_data = pd.read_csv("cc12m_data/vqa_val.tsv", sep="\t")
 
     first_index = 20
-    # Init Session State
-    if state.image_file is None:
-        state.image_file = dummy_data.loc[first_index, "image_file"]
+    # Init Session mlm_state
+    if mlm_state.mlm_image_file is None:
+        mlm_state.mlm_image_file = dummy_data.loc[first_index, "image_file"]
         caption = dummy_data.loc[first_index, "caption"].strip("- ")
-        ids = bert_tokenizer(caption)
-        ids[np.random.randint(0, len(ids))] = bert_tokenizer.mask_token_id
-        state.caption = bert_tokenizer.decode(ids)
-        state.caption_lang_id = dummy_data.loc[first_index, "lang_id"]
+        ids = bert_tokenizer.encode(caption)
+        ids[np.random.randint(1, len(ids) - 1)] = bert_tokenizer.mask_token_id
+        mlm_state.caption = bert_tokenizer.decode(ids[1:-1])
+        mlm_state.caption_lang_id = dummy_data.loc[first_index, "lang_id"]
 
-    image_path = os.path.join("cc12m_data/images_vqa", state.image_file)
+    image_path = os.path.join("cc12m_data/images_vqa", mlm_state.mlm_image_file)
     image = plt.imread(image_path)
-    state.image = image
+    mlm_state.mlm_image = image
 
-    if state.model is None:
-        # Display Top-5 Predictions
-        with st.spinner("Loading model..."):
-            state.model = load_model(mlm_checkpoints[0])
+    # if model is None:
+    # Display Top-5 Predictions
+    with st.spinner("Loading model..."):
+        model = load_model(mlm_checkpoints[0])
 
     if st.button(
         "Get a random example",
         help="Get a random example from the 100 `seeded` image-text pairs.",
     ):
         sample = dummy_data.sample(1).reset_index()
-        state.image_file = sample.loc[0, "image_file"]
+        mlm_state.mlm_image_file = sample.loc[0, "image_file"]
         caption = sample.loc[0, "caption"].strip("- ")
-        ids = bert_tokenizer(caption)
-        ids[np.random.randint(0, len(ids))] = bert_tokenizer.mask_token_id
-        state.caption = bert_tokenizer.decode(ids)
-        state.caption_lang_id = sample.loc[0, "lang_id"]
+        ids = bert_tokenizer.encode(caption)
+        ids[np.random.randint(1, len(ids) - 1)] = bert_tokenizer.mask_token_id
+        mlm_state.caption = bert_tokenizer.decode(ids[1:-1])
+        mlm_state.caption_lang_id = sample.loc[0, "lang_id"]
 
-        image_path = os.path.join("cc12m_data/images_vqa", state.image_file)
+        image_path = os.path.join("cc12m_data/images_vqa", mlm_state.mlm_image_file)
         image = plt.imread(image_path)
-        state.image = image
+        mlm_state.mlm_image = image
 
-    transformed_image = get_transformed_image(state.image)
+    transformed_image = get_transformed_image(mlm_state.mlm_image)
 
     new_col1, new_col2 = st.beta_columns([5, 5])
 
     # Display Image
-    new_col1.image(state.image, use_column_width="always")
-
+    new_col1.image(mlm_state.mlm_image, use_column_width="always")
 
     # Display caption
     new_col2.write("Write your text with exactly one [MASK] token.")
     caption = new_col2.text_input(
         label="Text",
-        value=state.caption,
+        value=mlm_state.caption,
         help="Type your masked caption regarding the image above in one of the four languages.",
     )
 
+    new_col2.markdown(
+        f"""**English Translation**: {caption if mlm_state.caption_lang_id == "en" else translate(caption, 'en')}"""
+    )
     caption_inputs = get_text_attributes(caption)
 
     # Display Top-5 Predictions
-
     with st.spinner("Predicting..."):
-        logits = predict(transformed_image, dict(caption_inputs))
-        logits = softmax(logits)
-        labels, values = get_top_5_predictions(logits)
+        scores = predict(transformed_image, dict(caption_inputs))
+        scores = softmax(scores)
+        labels, values = get_top_5_predictions(scores)
+        # newer_col1, newer_col2 = st.beta_columns([6,4])
         fig = plotly_express_horizontal_bar_plot(values, labels)
-    st.plotly_chart(fig, use_container_width=True)
+        st.dataframe(pd.DataFrame({"Tokens":labels, "English Translation": list(map(lambda x: translate(x),labels))}).T)
+        st.plotly_chart(fig, use_container_width=True)
+
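For reference, the mask-filling path in the new predict() can be summarised outside Streamlit. This is a sketch, not part of the commit: it assumes model logits of shape (batch, sequence, vocab) and `input_ids` of shape (1, sequence), and reproduces the steps visible above (locate the single [MASK] position, softmax the scores at that position, keep the top five tokens):

import numpy as np
from transformers import BertTokenizerFast

bert_tokenizer = BertTokenizerFast.from_pretrained("bert-base-multilingual-uncased")

def top_5_mask_fills(logits, input_ids):
    # Column index of the single [MASK] token in the first (and only) sequence.
    mask_pos = np.where(input_ids == bert_tokenizer.mask_token_id)[1][0]
    scores = logits[0][mask_pos]
    probs = np.exp(scores) / np.sum(np.exp(scores), axis=0)  # softmax, as in the app
    top_5 = np.argsort(probs)[::-1][:5]                      # reverse-sorted scores
    return bert_tokenizer.convert_ids_to_tokens(top_5.tolist()), probs[top_5]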
apps/utils.py CHANGED
@@ -40,6 +40,7 @@ def get_transformed_image(image):
 
 bert_tokenizer = BertTokenizerFast.from_pretrained("bert-base-multilingual-uncased")
 
+
 def get_text_attributes(text):
     return bert_tokenizer([text], return_token_type_ids=True, return_tensors="np")
 
apps/vqa.py CHANGED
@@ -1,4 +1,3 @@
-
 from .utils import (
     get_text_attributes,
     get_top_5_predictions,
@@ -15,29 +14,33 @@ import matplotlib.pyplot as plt
 import json
 
 from mtranslate import translate
-from session import _get_state
 
 
 from .model.flax_clip_vision_bert.modeling_clip_vision_bert import (
     FlaxCLIPVisionBertForSequenceClassification,
 )
 
+
 def softmax(logits):
     return np.exp(logits) / np.sum(np.exp(logits), axis=0)
 
-def app():
-    state = _get_state()
 
-    @st.cache(persist=True)
-    def predict(transformed_image, question_inputs):
-        return np.array(state.model(pixel_values=transformed_image, **question_inputs)[0][0])
+def app(state):
+    vqa_state = state
 
+    # @st.cache(persist=False)
+    def predict(transformed_image, question_inputs):
+        return np.array(
+            model(pixel_values=transformed_image, **question_inputs)[0][0]
+        )
 
-    @st.cache(persist=True)
+    # @st.cache(persist=False)
     def load_model(ckpt):
         return FlaxCLIPVisionBertForSequenceClassification.from_pretrained(ckpt)
 
-    vqa_checkpoints = ["flax-community/clip-vision-bert-vqa-ft-6k"] # TODO: Maybe add more checkpoints?
+    vqa_checkpoints = [
+        "flax-community/clip-vision-bert-vqa-ft-6k"
+    ]  # TODO: Maybe add more checkpoints?
     dummy_data = pd.read_csv("dummy_vqa_multilingual.tsv", sep="\t")
     code_to_name = {
         "en": "English",
@@ -46,77 +49,76 @@ def app():
         "es": "Spanish",
     }
 
-
     with open("answer_reverse_mapping.json") as f:
         answer_reverse_mapping = json.load(f)
 
     first_index = 20
-    # Init Session State
-    if state.image_file is None:
-        state.image_file = dummy_data.loc[first_index, "image_file"]
-        state.question = dummy_data.loc[first_index, "question"].strip("- ")
-        state.answer_label = dummy_data.loc[first_index, "answer_label"]
-        state.question_lang_id = dummy_data.loc[first_index, "lang_id"]
-        state.answer_lang_id = dummy_data.loc[first_index, "lang_id"]
-
-    image_path = os.path.join("resized_images", state.image_file)
+    # Init Session vqa_state
+    if vqa_state.vqa_image_file is None:
+        vqa_state.vqa_image_file = dummy_data.loc[first_index, "image_file"]
+        vqa_state.question = dummy_data.loc[first_index, "question"].strip("- ")
+        vqa_state.answer_label = dummy_data.loc[first_index, "answer_label"]
+        vqa_state.question_lang_id = dummy_data.loc[first_index, "lang_id"]
+        vqa_state.answer_lang_id = dummy_data.loc[first_index, "lang_id"]
+
+    image_path = os.path.join("resized_images", vqa_state.vqa_image_file)
     image = plt.imread(image_path)
-    state.image = image
+    vqa_state.vqa_image = image
 
-    if state.model is None:
-        # Display Top-5 Predictions
-        with st.spinner("Loading model..."):
-            state.model = load_model(vqa_checkpoints[0])
+    # if model is None:
+
+    # Display Top-5 Predictions
+    with st.spinner("Loading model..."):
+        model = load_model(vqa_checkpoints[0])
 
     if st.button(
         "Get a random example",
         help="Get a random example from the 100 `seeded` image-text pairs.",
     ):
         sample = dummy_data.sample(1).reset_index()
-        state.image_file = sample.loc[0, "image_file"]
-        state.question = sample.loc[0, "question"].strip("- ")
-        state.answer_label = sample.loc[0, "answer_label"]
-        state.question_lang_id = sample.loc[0, "lang_id"]
-        state.answer_lang_id = sample.loc[0, "lang_id"]
+        vqa_state.vqa_image_file = sample.loc[0, "image_file"]
+        vqa_state.question = sample.loc[0, "question"].strip("- ")
+        vqa_state.answer_label = sample.loc[0, "answer_label"]
+        vqa_state.question_lang_id = sample.loc[0, "lang_id"]
+        vqa_state.answer_lang_id = sample.loc[0, "lang_id"]
 
-        image_path = os.path.join("resized_images", state.image_file)
+        image_path = os.path.join("resized_images", vqa_state.vqa_image_file)
         image = plt.imread(image_path)
-        state.image = image
+        vqa_state.vqa_image = image
 
-    transformed_image = get_transformed_image(state.image)
+    transformed_image = get_transformed_image(vqa_state.vqa_image)
 
     new_col1, new_col2 = st.beta_columns([5, 5])
 
     # Display Image
-    new_col1.image(state.image, use_column_width="always")
-
+    new_col1.image(vqa_state.vqa_image, use_column_width="always")
 
     # Display Question
     question = new_col2.text_input(
         label="Question",
-        value=state.question,
+        value=vqa_state.question,
         help="Type your question regarding the image above in one of the four languages.",
     )
     new_col2.markdown(
-        f"""**English Translation**: {question if state.question_lang_id == "en" else translate(question, 'en')}"""
+        f"""**English Translation**: {question if vqa_state.question_lang_id == "en" else translate(question, 'en')}"""
     )
 
     question_inputs = get_text_attributes(question)
 
     # Select Language
     options = ["en", "de", "es", "fr"]
-    state.answer_lang_id = new_col2.selectbox(
+    vqa_state.answer_lang_id = new_col2.selectbox(
         "Answer Language",
-        index=options.index(state.answer_lang_id),
+        index=options.index(vqa_state.answer_lang_id),
         options=options,
         format_func=lambda x: code_to_name[x],
         help="The language to be used to show the top-5 labels.",
     )
 
-    actual_answer = answer_reverse_mapping[str(state.answer_label)]
+    actual_answer = answer_reverse_mapping[str(vqa_state.answer_label)]
     new_col2.markdown(
         "**Actual Answer**: "
-        + translate_labels([actual_answer], state.answer_lang_id)[0]
+        + translate_labels([actual_answer], vqa_state.answer_lang_id)[0]
         + " ("
         + actual_answer
        + ")"
@@ -126,6 +128,6 @@ def app():
         logits = predict(transformed_image, dict(question_inputs))
         logits = softmax(logits)
         labels, values = get_top_5_predictions(logits, answer_reverse_mapping)
-        translated_labels = translate_labels(labels, state.answer_lang_id)
+        translated_labels = translate_labels(labels, vqa_state.answer_lang_id)
         fig = plotly_express_horizontal_bar_plot(values, translated_labels)
-    st.plotly_chart(fig, use_container_width=True)
+        st.plotly_chart(fig, use_container_width=True)
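The VQA page follows the same shape: classification logits over the answer vocabulary are softmaxed, the top five indices are mapped to answer strings, and the labels are translated into the selected answer language. The real logic lives in get_top_5_predictions and translate_labels in apps/utils.py, which this commit does not show; the following is only a rough sketch of that step, assuming the mapping is keyed by stringified label ids as in answer_reverse_mapping[str(...)] above:

import numpy as np

def top_5_answers(logits, answer_reverse_mapping):
    probs = np.exp(logits) / np.sum(np.exp(logits), axis=0)   # softmax
    top_5 = np.argsort(probs)[::-1][:5]                        # best five label ids
    labels = [answer_reverse_mapping[str(i)] for i in top_5]   # ids -> answer strings
    return labels, probs[top_5]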
multiapp.py CHANGED
@@ -1,10 +1,17 @@
 import streamlit as st
+from session import _get_state
+
 class MultiApp:
-    def __init__(self):
+    def __init__(self, state):
         self.apps = []
+        self.state = state
+
     def add_app(self, title, func):
         self.apps.append({"title": title, "function": func})
+
     def run(self):
         st.sidebar.header("Tasks")
-        app = st.sidebar.radio("", self.apps, format_func=lambda app: app["title"])
-        app["function"]()
+        app = st.sidebar.radio(
+            "", self.apps, format_func=lambda app: app["title"]
+        )
+        app["function"](self.state)
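Pages register with MultiApp as (title, function) pairs; run() draws a sidebar radio of titles and calls the selected page function with the shared state. A usage sketch mirroring app.py (the `hello` page is hypothetical):

import streamlit as st
from session import _get_state
from multiapp import MultiApp

def hello(state):
    # Each page receives the shared session state object.
    st.write("Hello from a registered page.")

state = _get_state()
app = MultiApp(state)
app.add_app("Hello", hello)
app.run()
state.sync()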
resize_images.py CHANGED
@@ -7,7 +7,11 @@ def resize_images(path, new_path, num_pixels=300):
     if not os.path.exists(new_path):
         os.makedirs(new_path)
     for filename in os.listdir(path):
-        if not filename.startswith('.') and (filename.endswith('.jpg') or filename.endswith('.jpeg') or filename.endswith('.png')):
+        if not filename.startswith(".") and (
+            filename.endswith(".jpg")
+            or filename.endswith(".jpeg")
+            or filename.endswith(".png")
+        ):
             img = cv2.imread(os.path.join(path, filename))
             height, width, channels = img.shape
             if height > width:
@@ -16,8 +20,11 @@
             else:
                 new_width = num_pixels
                 new_height = int(height * new_width / width)
-            img = cv2.resize(img, (new_width, new_height), interpolation=cv2.INTER_CUBIC)
+            img = cv2.resize(
+                img, (new_width, new_height), interpolation=cv2.INTER_CUBIC
+            )
             cv2.imwrite(os.path.join(new_path, filename), img)
 
+
 # resize_images('./images/val2014', './resized_images/val2014')
-resize_images('./misc/article', './misc/article/resized', 500)
+resize_images("./misc/article", "./misc/article/resized", 500)