gchhablani committed on
Commit: b3c9da2
1 Parent(s): 1e0fe04
apps/article.py CHANGED
@@ -1,26 +1,76 @@
import streamlit as st
from apps.utils import read_markdown
from streamlit_tensorboard import st_tensorboard
-
+ from toc import Toc
def app(state):
+     toc = Toc()
+     st.title("Table of contents")
    st.info("Welcome to our Multilingual-VQA demo. Please use the navigation sidebar to move to our demo, or scroll below to read all about our project. 🤗")
+     toc.placeholder()
+     toc.header("Introduction and Motivation")
    st.write(read_markdown("intro.md"))
-     st.write("## Methodology")
+     toc.subheader("Novel Contributions")
+     st.write(read_markdown("contributions.md"))
+     toc.header("Methodology")
+     toc.subheader("Pre-training")
    st.write(read_markdown("pretraining.md"))
    st.image(
        "./misc/article/Multilingual-VQA.png",
-         caption="Masked LM model for Image-text Pretraining.",
+         caption="Masked LM model for Image-text Pre-training.",
    )
    st.write("**Training Logs**")
    st_tensorboard(logdir='./logs/pretrain_logs', port=6006)
-
+     toc.subheader("Finetuning")
    st.write(read_markdown("finetuning.md"))
    st.write("**Training Logs**")
    st_tensorboard(logdir='./logs/finetune_logs', port=6007)
-
+     toc.header("Challenges and Technical Difficulties")
    st.write(read_markdown("challenges.md"))
+     toc.header("Limitations")
    st.write(read_markdown("limitations.md"))
+     toc.header("Conclusion, Future Work, and Social Impact")
+     toc.subheader("Conclusion")
+     st.write(read_markdown("conclusion.md"))
+     toc.subheader("Future Work")
+     st.write(read_markdown("future_work.md"))
+     toc.subheader("Social Impact")
    st.write(read_markdown("social_impact.md"))
+     toc.header("References")
    st.write(read_markdown("references.md"))
+     toc.header("Checkpoints")
    st.write(read_markdown("checkpoints.md"))
-     st.write(read_markdown("acknowledgements.md"))
+     toc.subheader("Other Checkpoints")
+     st.write(read_markdown("other_checkpoints.md"))
+     toc.header("Acknowledgements")
+     st.write(read_markdown("acknowledgements.md"))
+
+
+
+
+
+     toc.title("Title")
+
+     for a in range(10):
+         st.write("Blabla...")
+
+     toc.header("Header 1")
+
+     for a in range(10):
+         st.write("Blabla...")
+
+     toc.header("Header 2")
+
+     for a in range(10):
+         st.write("Blabla...")
+
+     toc.subheader("Subheader 1")
+
+     for a in range(10):
+         st.write("Blabla...")
+
+     toc.subheader("Subheader 2")
+
+     for a in range(10):
+         st.write("Blabla...")
+
+     toc.generate()
sections/acknowledgements.md CHANGED
@@ -1,4 +1,3 @@
- ## Acknowledgements
We thank [Nilakshan Kunananthaseelan](https://huggingface.co/knilakshan20) for helping us whenever he could get a chance. We also thank [Abheesht Sharma](https://huggingface.co/abheesht) for helping in the discussions in the initial phases. [Luke Melas](https://github.com/lukemelas) helped us get the CC-12M data on our TPU-VMs and we are very grateful to him.

This project would not be possible without the help of [Patrick](https://huggingface.co/patrickvonplaten) and [Suraj](https://huggingface.co/valhalla) who met with us and helped review our approach and guided us throughout the project.
sections/challenges.md CHANGED
@@ -1,4 +1,3 @@
- ## Challenges and Technical Difficulties
We faced challenges at every step of the way, despite the 🤗 team having some example scripts and models ready in Flax.

- The dataset we used, Conceptual 12M, took 2-3 days to translate using mBART (since we didn't have Marian at the time). The major bottleneck was implementing the translation efficiently. We tried using `mtranslate` first, but it turned out to be too slow, even with multiprocessing.
sections/checkpoints.md CHANGED
@@ -1,11 +1,3 @@
- ## Checkpoints
- Pre-trained checkpoint at 60k steps: [clip-vision-bert-cc12m-60k](https://huggingface.co/flax-community/clip-vision-bert-cc12m-60k)
- Pre-trained checkpoint at 70k steps: [clip-vision-bert-cc12m-70k](https://huggingface.co/flax-community/clip-vision-bert-cc12m-70k)
- Fine-tuned checkpoint at 6k steps on 60k pre-trained checkpoint: [clip-vision-bert-vqa-ft-6k](https://huggingface.co/flax-community/clip-vision-bert-vqa-ft-6k)
- ### Other checkpoints:
- - All pre-trained checkpoints: [multilingual-vqa](https://huggingface.co/flax-community/multilingual-vqa)
- - Fine-tuned checkpoints on 45k pre-trained checkpoint: [multilingual-vqa-pt-45k-ft](https://huggingface.co/flax-community/multilingual-vqa-pt-45k-ft)
- - Fine-tuned checkpoints on 45k pre-trained checkpoint with AdaFactor (others use AdamW): [multilingual-vqa-pt-45k-ft-adf](https://huggingface.co/flax-community/multilingual-vqa-pt-45k-ft-adf)
- - Fine-tuned checkpoints on 60k pre-trained checkpoint: [multilingual-vqa-pt-60k-ft](https://huggingface.co/flax-community/multilingual-vqa-pt-60k-ft)
- - Fine-tuned checkpoints on 70k pre-trained checkpoint: [multilingual-vqa-pt-60k-ft](https://huggingface.co/flax-community/multilingual-vqa-pt-70k-ft)
- - From scratch (without MLM pre-training) model: [multilingual-vqa-ft](https://huggingface.co/flax-community/multilingual-vqa-ft)
sections/conclusion.md ADDED
File without changes
sections/contributions.md ADDED
@@ -0,0 +1,6 @@
+ Our novel contributions include:
+ - A [multilingual variant of the Conceptual-12M dataset](https://huggingface.co/datasets/flax-community/conceptual-12m-mbart-50-multilingual) containing 2.5M image-text pairs each in four languages - English, French, German and Spanish, translated using the mBART-50 model.
+ - [Multilingual variants of the VQAv2 train and validation sets](https://huggingface.co/datasets/flax-community/multilingual-vqa) containing four times the original data in English, French, German and Spanish, translated using Marian models.
+ - [A fusion of the CLIP Vision Transformer and BERT](https://github.com/gchhablani/multilingual-vqa/tree/main/models/flax_clip_vision_bert) where BERT embeddings are concatenated with visual embeddings at the very beginning and passed through BERT's self-attention layers. This is based on the [VisualBERT](https://arxiv.org/abs/1908.03557) model.
+ - A [pre-trained checkpoint](https://huggingface.co/flax-community/clip-vision-bert-cc12m-70k) on our multilingual Conceptual-12M variant with **67.85%** validation accuracy.
+ - A [fine-tuned checkpoint](https://huggingface.co/flax-community/clip-vision-bert-vqa-ft-6k) on our multilingual variant of the VQAv2 dataset with **49.76%** validation accuracy.
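For readers skimming the diff, here is a minimal, illustrative sketch of the fusion idea described in the third bullet above. The shapes, the projection matrix, and the concatenation order are assumptions made for illustration only; the actual Flax implementation lives under `models/flax_clip_vision_bert`.

```python
# Illustrative only: hypothetical shapes showing how CLIP visual features can be
# projected into BERT's embedding space and concatenated with the text embeddings
# before the self-attention stack (VisualBERT-style fusion).
import numpy as np

batch, num_patches, clip_dim = 2, 50, 768      # assumed CLIP-ViT output shape
seq_len, bert_dim = 128, 768                   # assumed BERT hidden size

clip_features = np.random.randn(batch, num_patches, clip_dim)
text_embeddings = np.random.randn(batch, seq_len, bert_dim)

# A learned linear projection in the real model; a fixed random matrix here,
# just to make the shapes line up.
visual_projection = np.random.randn(clip_dim, bert_dim) / np.sqrt(clip_dim)
visual_embeddings = clip_features @ visual_projection

# Concatenate along the sequence dimension (the order shown is arbitrary); the
# fused sequence is then passed through BERT's self-attention layers.
fused = np.concatenate([visual_embeddings, text_embeddings], axis=1)
print(fused.shape)  # (2, 178, 768) -> (batch, num_patches + seq_len, hidden)
```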
sections/finetuning.md CHANGED
@@ -1,5 +1,3 @@
- ### Fine-tuning
-
**Dataset**

For fine-tuning, we use the [VQA 2.0](https://visualqa.org/) dataset - particularly, the `train` and `validation` sets. We translate all the questions into the four languages specified above using language-specific MarianMT models. This is because MarianMT models produce better translations and are faster, making them better suited for fine-tuning. We get 4x the number of examples in each subset.
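As a rough illustration of the translation step described above (the model name and inputs are examples, not the project's actual translation scripts), a language-specific MarianMT call with 🤗 Transformers looks like this:

```python
# Sketch of translating VQA questions with a MarianMT model via 🤗 Transformers.
# "Helsinki-NLP/opus-mt-en-fr" (English -> French) is just an example checkpoint.
from transformers import MarianMTModel, MarianTokenizer

model_name = "Helsinki-NLP/opus-mt-en-fr"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

questions = ["What color is the cat?", "How many people are in the image?"]
batch = tokenizer(questions, return_tensors="pt", padding=True)
generated = model.generate(**batch)
print([tokenizer.decode(t, skip_special_tokens=True) for t in generated])
```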
sections/future_work.md ADDED
File without changes
sections/intro.md CHANGED
@@ -16,12 +16,4 @@ While building a low-resource non-English VQA approach has several benefits of i

We follow the two-stage training approach, our pre-training task being text-only Masked Language Modeling (MLM). Our pre-training dataset comes from the Conceptual-12M dataset, where we use mBART-50 for translation. Our fine-tuning dataset is taken from the VQAv2 dataset and its translation is done using MarianMT models.

- Our checkpoints achieve a **validation accuracy of 0.69 on our MLM** task, while our fine-tuned model is able to achieve a **validation accuracy of 0.49 on our multilingual VQAv2 validation set**. With better captions, hyperparameter-tuning, and further training, we expect to see higher performance.
-
- ### Novel Contributions
- Our novel contributions include:
- - A [multilingual variant of the Conceptual-12M dataset](https://huggingface.co/datasets/flax-community/conceptual-12m-mbart-50-multilingual) containing 2.5M image-text pairs each in four languages - English, French, German and Spanish, translated using mBART-50 model.
- - [Multilingual variants of the VQAv2 train and validation sets](https://huggingface.co/datasets/flax-community/multilingual-vqa) containing four times the original data in English, French, German and Spanish, translated using Marian models.
- - [A fusion of CLIP Vision Transformer and BERT model](https://github.com/gchhablani/multilingual-vqa/tree/main/models/flax_clip_vision_bert) where BERT embeddings are concatenated with visual embeddings at the very beginning and passed through BERT self-attention layers. This is based on the [VisualBERT](https://arxiv.org/abs/1908.03557) model.
- - A [pre-trained checkpooint](https://huggingface.co/flax-community/clip-vision-bert-cc12m-70k) on our multilingual with **67.85%** validation accuracy.
- - A [fine-tuned checkpoint](https://huggingface.co/flax-community/clip-vision-bert-vqa-ft-6k) on our multilingual variant of the VQAv2 dataset with **49.76%** validation accuracy.
+ Our checkpoints achieve a **validation accuracy of 0.69 on our MLM** task, while our fine-tuned model is able to achieve a **validation accuracy of 0.49 on our multilingual VQAv2 validation set**. With better captions, hyperparameter-tuning, and further training, we expect to see higher performance.
sections/limitations.md CHANGED
@@ -1,2 +1 @@
- ## Limitations and Bias
- Our best fine-tuned model only achieves 0.49 accuracy on the multilingual validation data that we created. This could be because of not-so-great translation quality, sub-optimal hyperparameters, and a lack of ample training. In the future, we hope to improve this model by addressing these concerns.
sections/other_checkpoints.md ADDED
@@ -0,0 +1,6 @@
+ - All pre-trained checkpoints: [multilingual-vqa](https://huggingface.co/flax-community/multilingual-vqa)
+ - Fine-tuned checkpoints on the 45k pre-trained checkpoint: [multilingual-vqa-pt-45k-ft](https://huggingface.co/flax-community/multilingual-vqa-pt-45k-ft)
+ - Fine-tuned checkpoints on the 45k pre-trained checkpoint with AdaFactor (the others use AdamW): [multilingual-vqa-pt-45k-ft-adf](https://huggingface.co/flax-community/multilingual-vqa-pt-45k-ft-adf)
+ - Fine-tuned checkpoints on the 60k pre-trained checkpoint: [multilingual-vqa-pt-60k-ft](https://huggingface.co/flax-community/multilingual-vqa-pt-60k-ft)
+ - Fine-tuned checkpoints on the 70k pre-trained checkpoint: [multilingual-vqa-pt-70k-ft](https://huggingface.co/flax-community/multilingual-vqa-pt-70k-ft)
+ - From-scratch model (without MLM pre-training): [multilingual-vqa-ft](https://huggingface.co/flax-community/multilingual-vqa-ft)
sections/pretraining.md CHANGED
@@ -1,4 +1,3 @@
- ### Pretraining
We follow an approach similar to [VisualBERT](https://arxiv.org/abs/1908.03557). Instead of using a FasterRCNN to get image features, we use a CLIP Vision (ViT transformer) encoder. The pre-training task is text-only MLM (Masked Language Modeling). We mask only the text tokens and try to predict the masked tokens. The VisualBERT authors also use a sentence-image matching task where two captions are matched against an image, but we skip this for the sake of simplicity.

**Dataset**
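To make the text-only masking scheme above concrete, here is a small, hypothetical sketch; the token ids, masking ratio, and names are assumptions, not the project's actual data collator.

```python
# Hypothetical sketch: mask ~15% of the text tokens for MLM and ignore every
# visual position in the loss, so only masked text tokens are predicted.
# A real collator would also avoid masking special tokens like [CLS]/[SEP].
import numpy as np

rng = np.random.default_rng(0)
MASK_ID, IGNORE = 103, -100                    # assumed [MASK] id and ignore label
num_visual = 50                                # number of visual embedding positions
text_ids = np.array([2023, 2003, 1037, 4937])  # assumed text token ids

mask = rng.random(text_ids.shape) < 0.15       # choose ~15% of text tokens
masked_text = np.where(mask, MASK_ID, text_ids)

labels_text = np.where(mask, text_ids, IGNORE) # loss only on masked positions
labels = np.concatenate([np.full(num_visual, IGNORE), labels_text])

print(masked_text, labels[num_visual:])
```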
sections/references.md CHANGED
@@ -1,4 +1,3 @@
- ## References
- [Conceptual 12M Dataset](https://github.com/google-research-datasets/conceptual-12m)

- [VQA v2 Dataset](https://visualqa.org/challenge.html)
sections/social_impact.md CHANGED
@@ -1,2 +1 @@
- ## Social Impact
Multilingual Visual Question Answering has not received a lot of attention. There are very few multilingual VQA datasets, and that is what we wanted to address here. Our initial plan was to include 4 high-resource and 4 low-resource languages in our training data. However, the existing translation models do not perform as well on low-resource languages, so we would have received poor labels, not to mention a longer training time. We hope to improve this in the future by using better translators (e.g. the Google Translate API) to get more multilingual data, especially in low-resource languages. Regardless, our aim with this project was to provide a pipeline approach for multilingual visuo-linguistic pre-training and multilingual Visual Question Answering.
toc.py ADDED
@@ -0,0 +1,29 @@
+ import streamlit as st
+
+ class Toc:
+
+     def __init__(self):
+         self._items = []
+         self._placeholder = None
+
+     def title(self, text):
+         self._markdown(text, "h1")
+
+     def header(self, text):
+         self._markdown(text, "h2", " " * 2)
+
+     def subheader(self, text):
+         self._markdown(text, "h3", " " * 4)
+
+     def placeholder(self, sidebar=False):
+         self._placeholder = st.sidebar.empty() if sidebar else st.empty()
+
+     def generate(self):
+         if self._placeholder:
+             self._placeholder.markdown("\n".join(self._items), unsafe_allow_html=True)
+
+     def _markdown(self, text, level, space=""):
+         key = "".join(filter(str.isalnum, text)).lower()
+
+         st.markdown(f"<{level} id='{key}'>{text}</{level}>", unsafe_allow_html=True)
+         self._items.append(f"{space}* <a href='#{key}'>{text}</a>")
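For context when reading the `apps/article.py` changes above, here is a minimal, hypothetical standalone page showing how this class is meant to be used: reserve a placeholder slot, register headers while the page renders, then fill the slot with anchor links.

```python
# minimal_toc_demo.py -- hypothetical example; run with `streamlit run minimal_toc_demo.py`
import streamlit as st
from toc import Toc

toc = Toc()
st.title("Table of contents")
toc.placeholder()            # empty slot that will later hold the generated TOC

toc.header("Introduction")   # renders an <h2> with an anchor id and records a link
st.write("Some introductory text...")

toc.subheader("Details")     # renders an <h3>, indented under the last header in the TOC
st.write("More detailed text...")

toc.generate()               # fills the placeholder with the collected anchor links
```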