gchhablani committed
Commit 7a89f67 • 1 Parent(s): 0808df5

Update app

app.py CHANGED
@@ -74,7 +74,7 @@ st.write(
     "[Gunjan Chhablani](https://huggingface.co/gchhablani), [Bhavitvya Malik](https://huggingface.co/bhavitvyamalik)"
 )
 
-image_col, intro_col = st.beta_columns([2,8])
+image_col, intro_col = st.beta_columns([3,8])
 image_col.image("./misc/mvqa-logo-white.png", use_column_width='always')
 intro_col.write(read_markdown('intro.md'))
 with st.beta_expander("Usage"):
@@ -83,7 +83,7 @@ with st.beta_expander("Usage"):
 with st.beta_expander("Article"):
     st.write(read_markdown("abstract.md"))
     st.write(read_markdown("caveats.md"))
-    st.write("# Methodology")
+    st.write("## Methodology")
     st.image(
         "./misc/Multilingual-VQA.png", caption="Masked LM model for Image-text Pretraining."
     )
@@ -110,7 +110,7 @@ if state.image_file is None:
 
 col1, col2 = st.beta_columns([6, 4])
 
-if col2.button("Get a random example", help="Get a random example from the 100 "):
+if col2.button("Get a random example", help="Get a random example from the 100 `seeded` image-text pairs."):
     sample = dummy_data.sample(1).reset_index()
     state.image_file = sample.loc[0, "image_file"]
     state.question = sample.loc[0, "question"].strip("- ")
@@ -124,10 +124,11 @@ if col2.button("Get a random example", help="Get a random example from the 100 "
 
 col2.write("OR")
 
-uploaded_file = col2.file_uploader("Upload your image", type=["png", "jpg", "jpeg"])
+uploaded_file = col2.file_uploader("Upload your image", type=["png", "jpg", "jpeg"], help="Upload a file of your choosing.")
 if uploaded_file is not None:
-    state.image_file = os.path.join("images/val2014", uploaded_file.name)
-    state.image = np.array(Image.open(uploaded_file))
+    st.error("Uploading files does not work on HuggingFace spaces. This app only supports random examples for now.")
+    # state.image_file = os.path.join("images/val2014", uploaded_file.name)
+    # state.image = np.array(Image.open(uploaded_file))
 
 transformed_image = get_transformed_image(state.image)
 
@@ -136,7 +137,7 @@ col1.image(state.image, use_column_width="auto")
 
 new_col1, new_col2 = st.beta_columns([5,5])
 # Display Question
-question = new_col1.text_input(label="Question", value=state.question)
+question = new_col1.text_input(label="Question", value=state.question, help="Type your question regarding the image above in one of the four languages.")
 new_col1.markdown(
     f"""**English Translation**: {question if state.question_lang_id == "en" else translate(question, 'en')}"""
 )
@@ -150,9 +151,11 @@ state.answer_lang_id = new_col2.selectbox(
     index=options.index(state.answer_lang_id),
     options=options,
     format_func=lambda x: code_to_name[x],
+    help="The language to be used to show the top-5 labels."
 )
 
-new_col2.markdown("**Actual Answer in English**: " + answer_reverse_mapping[str(state.answer_label)])
+actual_answer = answer_reverse_mapping[str(state.answer_label)]
+new_col2.markdown("**Actual Answer**: " + translate_labels([actual_answer], state.answer_lang_id)[0]+" ("+actual_answer+")")
 
 # Display Top-5 Predictions
 with st.spinner("Loading model..."):
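The layout code above relies on Streamlit's pre-1.0 `st.beta_columns` / `st.beta_expander` APIs, which later releases renamed to `st.columns` / `st.expander`. A minimal, hypothetical sketch of the same layout against the stable API (assuming Streamlit ≥ 1.0; not part of this commit) looks like:

```python
import streamlit as st

# Sketch only: stable-API equivalents of the beta_ calls used in app.py.
image_col, intro_col = st.columns([3, 8])          # was st.beta_columns([3,8])
image_col.image("./misc/mvqa-logo-white.png", use_column_width="always")

with st.expander("Usage"):                         # was st.beta_expander("Usage")
    st.write("Usage notes go here.")               # placeholder content

col1, col2 = st.columns([6, 4])
if col2.button("Get a random example",
               help="Get a random example from the 100 `seeded` image-text pairs."):
    pass  # sampling logic unchanged from app.py
```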
sections/abstract.md CHANGED
@@ -1,2 +1,2 @@
-# Abstract
+## Abstract
 This project is focused on Mutilingual Visual Question Answering. Most of the existing datasets and models on this task work with English-only image-text pairs. Our intention here is to provide a Proof-of-Concept with our simple ViT+BERT model which can be trained on multilingual text checkpoints with pre-trained image encoders and made to perform well enough. Due to lack of good-quality multilingual data, we translate subsets of the Conceptual 12M dataset into English (already in English), French, German and Spanish using the Marian models. We achieved 0.49 accuracy on the multilingual validation set we created. With better captions, and hyperparameter-tuning, we expect to see higher performance.
sections/acknowledgements.md CHANGED
@@ -1,4 +1,4 @@
-# Acknowledgements
+## Acknowledgements
 We thank [Nilakshan Kunananthaseelan](https://huggingface.co/knilakshan20) for helping us whenever he could get a chance. We also thank [Abheesht Sharma](https://huggingface.co/abheesht) for helping in the discussions in the initial phases. [Luke Melas](https://github.com/lukemelas) helped us get the CC-12M data on our TPU-VMs and we are very grateful to him.
 
 This project would not be possible without the help of [Patrick](https://huggingface.co/patrickvonplaten) and [Suraj](https://huggingface.co/valhalla) who met with us and helped review our approach and guided us throughout the project.
sections/challenges.md CHANGED
@@ -1,4 +1,4 @@
-# Challenges and Technical Difficulties
+## Challenges and Technical Difficulties
 We faced challenges at every step of the way, despite having some example scripts and models ready by the 🤗 team in Flax.
 
 - The dataset we used - Conceptual 12M took 2-3 days to translate using MBart (since we didn't have Marian at the time). The major bottleneck was implementing the translation efficiently. We tried using `mtranslate` first but it turned out to be too slow, even with multiprocessing.
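The translation bottleneck described above comes down to running a seq2seq checkpoint over millions of captions in batches. A minimal sketch of batched translation with a MarianMT checkpoint from the Hub (illustrative only: the checkpoint name, batch size, and helper function are placeholders, not the project's actual pipeline, which initially used MBart50) could look like:

```python
from transformers import MarianMTModel, MarianTokenizer

# Hypothetical checkpoint; the project used language-specific Marian models.
model_name = "Helsinki-NLP/opus-mt-en-fr"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

def translate_batch(captions, batch_size=32):
    """Translate a list of English captions to French, batch by batch."""
    translations = []
    for i in range(0, len(captions), batch_size):
        batch = captions[i:i + batch_size]
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True)
        generated = model.generate(**inputs)
        translations.extend(tokenizer.batch_decode(generated, skip_special_tokens=True))
    return translations

print(translate_batch(["a dog playing in the park"]))
```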
sections/checkpoints.md CHANGED
@@ -1,4 +1,4 @@
-# Checkpoints
+## Checkpoints
 - Pre-trained checkpoint: [multilingual-vqa](https://huggingface.co/flax-community/multilingual-vqa)
 - Fine-tuned on 45k pretrained checkpoint: [multilingual-vqa-pt-45k-ft](https://huggingface.co/flax-community/multilingual-vqa-pt-45k-ft)
 - Fine-tuned on 45k pretrained checkpoint with AdaFactor (others use AdamW): [multilingual-vqa-pt-45k-ft-adf](https://huggingface.co/flax-community/multilingual-vqa-pt-45k-ft-adf)
sections/finetuning.md CHANGED
@@ -1,5 +1,9 @@
-## Fine-tuning
-### Dataset
+### Fine-tuning
+
+**Dataset**
+
 For fine-tuning, we use the [VQA 2.0](https://visualqa.org/) dataset - particularly, the `train` and `validation` sets. We translate all the questions into the four languages specified above using language-specific MarianMT models. This is because MarianMT models return better labels and are faster, hence, are better for fine-tuning. We get 4x the number of examples in each subset.
-### Model
+
+**Model**
+
 We use the `SequenceClassification` model as reference to create our own sequence classification model. In this, a classification layer is attached on top of the pre-trained BERT model in order to performance multi-class classification. 3129 answer labels are chosen, as is the convention for the English VQA task, which can be found [here](https://github.com/gchhablani/multilingual-vqa/blob/main/answer_mapping.json). These are the same labels used in fine-tuning of the VisualBERT models. The outputs shown here have been translated using the [`mtranslate`](https://github.com/mouuff/mtranslate) Google Translate API library. Then we use various pre-trained checkpoints and train the sequence classification model for various steps.
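The fine-tuning setup described above reduces to a classifier over 3129 answer labels sitting on top of the pooled multimodal representation. A minimal Flax sketch (illustrative only; `VQAClassificationHead` is a hypothetical name, not the project's actual `SequenceClassification`-style module) is:

```python
import jax
import jax.numpy as jnp
import flax.linen as nn

NUM_ANSWERS = 3129  # answer vocabulary size, following the English VQA convention


class VQAClassificationHead(nn.Module):
    """Dropout + dense layer mapping the pooled output to answer logits."""
    num_labels: int = NUM_ANSWERS
    dropout_rate: float = 0.1

    @nn.compact
    def __call__(self, pooled_output, deterministic: bool = True):
        x = nn.Dropout(rate=self.dropout_rate)(pooled_output, deterministic=deterministic)
        return nn.Dense(self.num_labels)(x)  # (batch, 3129) answer logits


head = VQAClassificationHead()
params = head.init(jax.random.PRNGKey(0), jnp.ones((2, 768)))  # 768 = BERT hidden size
```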
sections/pretraining.md CHANGED
@@ -1,6 +1,10 @@
-## Pretraining
+### Pretraining
 We follow an approach similar to [VisualBERT](https://arxiv.org/abs/1908.03557). Instead of using a FasterRCNN to get image features, we use a ViT encoder. The pre-training task is text-only MLM (Masked Language Modeling). We mask only the text tokens and try to predict the masked tokens. The VisualBERT authors also use a sentence-image matching task where two captions are matched against an image, but we skip this for the sake of simplicity.
-### Dataset
+
+**Dataset**
+
 The dataset we use for pre-training is a cleaned version of [Conceptual 12M](https://github.com/google-research-datasets/conceptual-12m). The dataset is downloaded and then broken images are removed which gives us about 10M images. Then we use the MBart50 `mbart-large-50-one-to-many-mmt` checkpoint to translate the dataset into four different languages - English, French, German, and Spanish, keeping 2.5 million examples of each language.
-### Model
+
+**Model**
+
 The model is shown in the image above. The `Dummy MLM Head` is actually combined with the MLM head but it never contributes to the MLM loss, hence the name (the predictions on these tokens are ignored). We create a custom model in Flax which integerates the ViT model inside BERT embeddings. We also use custom configs and modules in order to accomodate for these changes, and allow loading from BERT and ViT checkpoints. The image is fed to the ViT encoder and the text is fed to the word-embedding layers of BERT model. We use the `bert-base-multilingual-uncased` and `openai/clip-vit-base-patch32` checkpoints for BERT and ViT (actually CLIPVision) models, respectively. All our code is available on [GitHub](https://github.com/gchhablani/multilingual-vqa).
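To make the `Dummy MLM Head` remark concrete: the model produces predictions over both visual and text positions, but only the masked text tokens contribute to the loss. A minimal sketch of such a masked loss (assumed shapes and the usual `-100` ignore-label convention; not the project's exact training code) is:

```python
import jax.numpy as jnp
import optax

def masked_lm_loss(logits, labels):
    """MLM cross-entropy computed over text positions only.

    logits: (batch, seq_len, vocab_size) over [visual tokens + text tokens]
    labels: (batch, seq_len); -100 marks visual positions and unmasked text tokens
    """
    loss_mask = (labels != -100).astype(jnp.float32)
    safe_labels = jnp.where(labels == -100, 0, labels)  # avoid invalid gather indices
    per_token = optax.softmax_cross_entropy_with_integer_labels(logits, safe_labels)
    return jnp.sum(per_token * loss_mask) / jnp.maximum(jnp.sum(loss_mask), 1.0)
```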
sections/references.md CHANGED
@@ -1,4 +1,4 @@
-# References
+## References
 - [Conceptual 12M Dataset](https://github.com/google-research-datasets/conceptual-12m)
 
 - [VQA v2 Dataset](https://visualqa.org/challenge.html)
sections/social_impact.md CHANGED
@@ -1,2 +1,2 @@
-# Social Impact
+## Social Impact
 Multilingual Visual Question Answering has not received a lot of attention. There are very few multilingual VQA datasets, and that is what we wanted to address here. Our initial plan was to include 4 high-resource and 4 low-resource languages in our training data. However, the existing translations do not perform as well and we would have received poor labels, not to mention, with a longer training time. We hope to improve this in the future by using better translators (for e.g. Google Translate API) to get more multilingual data, especially in low-resource languages. Regardless, our aim with this project was to provide with a pipeline approach to deal with Multilingual visuo-linguistic pretraining and perform Multilingual Visual Question Answering.