gchhablani committed • b3c9da2 • Parent(s): 1e0fe04
Add ToC

Files changed:
- apps/article.py +56 -6
- sections/acknowledgements.md +0 -1
- sections/challenges.md +0 -1
- sections/checkpoints.md +0 -8
- sections/conclusion.md +0 -0
- sections/contributions.md +6 -0
- sections/finetuning.md +0 -2
- sections/future_work.md +0 -0
- sections/intro.md +1 -9
- sections/limitations.md +0 -1
- sections/other_checkpoints.md +6 -0
- sections/pretraining.md +0 -1
- sections/references.md +0 -1
- sections/social_impact.md +0 -1
- toc.py +29 -0
apps/article.py
CHANGED
@@ -1,26 +1,76 @@
 import streamlit as st
 from apps.utils import read_markdown
 from streamlit_tensorboard import st_tensorboard
-
+from toc import Toc
 def app(state):
+    toc = Toc()
+    st.title("Table of contents")
     st.info("Welcome to our Multilingual-VQA demo. Please use the navigation sidebar to move to our demo, or scroll below to read all about our project. 🤗")
+    toc.placeholder()
+    toc.header("Introduction and Motivation")
     st.write(read_markdown("intro.md"))
-
+    toc.subheader("Novel Contributions")
+    st.write(read_markdown("contributions.md"))
+    toc.header("Methodology")
+    toc.subheader("Pre-training")
     st.write(read_markdown("pretraining.md"))
     st.image(
         "./misc/article/Multilingual-VQA.png",
-        caption="Masked LM model for Image-text
+        caption="Masked LM model for Image-text Pre-training.",
     )
     st.write("**Training Logs**")
     st_tensorboard(logdir='./logs/pretrain_logs', port=6006)
-
+    toc.subheader("Finetuning")
     st.write(read_markdown("finetuning.md"))
     st.write("**Training Logs**")
     st_tensorboard(logdir='./logs/finetune_logs', port=6007)
-
+    toc.header("Challenges and Technical Difficulties")
     st.write(read_markdown("challenges.md"))
+    toc.header("Limitations")
     st.write(read_markdown("limitations.md"))
+    toc.header("Conclusion, Future Work, and Social Impact")
+    toc.subheader("Conclusion")
+    st.write(read_markdown("conclusion.md"))
+    toc.subheader("Future Work")
+    st.write(read_markdown("future_work.md"))
+    toc.subheader("Social Impact")
     st.write(read_markdown("social_impact.md"))
+    toc.header("References")
     st.write(read_markdown("references.md"))
+    toc.header("Checkpoints")
     st.write(read_markdown("checkpoints.md"))
-
+    toc.subheader("Other Checkpoints")
+    st.write(read_markdown("other_checkpoints.md"))
+    toc.header("Acknowledgements")
+    st.write(read_markdown("acknowledgements.md"))
+
+
+
+
+
+    toc.title("Title")
+
+    for a in range(10):
+        st.write("Blabla...")
+
+    toc.header("Header 1")
+
+    for a in range(10):
+        st.write("Blabla...")
+
+    toc.header("Header 2")
+
+    for a in range(10):
+        st.write("Blabla...")
+
+    toc.subheader("Subheader 1")
+
+    for a in range(10):
+        st.write("Blabla...")
+
+    toc.subheader("Subheader 2")
+
+    for a in range(10):
+        st.write("Blabla...")
+
+    toc.generate()
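For readers unfamiliar with the pattern, here is a minimal, hypothetical sketch (not part of the commit) of how the rewritten apps/article.py uses the `Toc` helper added in toc.py below: an empty placeholder is reserved near the top of the page, each section registers itself as it renders, and `generate()` back-fills the placeholder with anchor links. The section texts below are placeholders, not the real markdown files.

```python
# Minimal sketch of the ToC pattern used in apps/article.py
# (assumes toc.py, shown further down, is importable).
import streamlit as st
from toc import Toc

toc = Toc()
st.title("Table of contents")
toc.placeholder()                          # reserve an empty slot; filled by generate()

toc.header("Introduction and Motivation")  # renders the heading and records a ToC entry
st.write("Introduction text goes here...")

toc.subheader("Novel Contributions")
st.write("Contributions text goes here...")

toc.generate()                             # back-fill the placeholder with anchor links
```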
sections/acknowledgements.md
CHANGED
@@ -1,4 +1,3 @@
-## Acknowledgements
 We thank [Nilakshan Kunananthaseelan](https://huggingface.co/knilakshan20) for helping us whenever he could get a chance. We also thank [Abheesht Sharma](https://huggingface.co/abheesht) for helping in the discussions in the initial phases. [Luke Melas](https://github.com/lukemelas) helped us get the CC-12M data on our TPU-VMs and we are very grateful to him.
 
 This project would not be possible without the help of [Patrick](https://huggingface.co/patrickvonplaten) and [Suraj](https://huggingface.co/valhalla) who met with us and helped review our approach and guided us throughout the project.
sections/challenges.md
CHANGED
@@ -1,4 +1,3 @@
-## Challenges and Technical Difficulties
 We faced challenges at every step of the way, despite having some example scripts and models ready by the 🤗 team in Flax.
 
 - The dataset we used - Conceptual 12M took 2-3 days to translate using MBart (since we didn't have Marian at the time). The major bottleneck was implementing the translation efficiently. We tried using `mtranslate` first but it turned out to be too slow, even with multiprocessing.
sections/checkpoints.md
CHANGED
@@ -1,11 +1,3 @@
-## Checkpoints
 - Pre-trained checkpoint at 60k steps: [clip-vision-bert-cc12m-60k](https://huggingface.co/flax-community/clip-vision-bert-cc12m-60k)
 - Pre-trained checkpoint at 70k steps: [clip-vision-bert-cc12m-70k](https://huggingface.co/flax-community/clip-vision-bert-cc12m-70k)
 - Fine-tuned checkpoint at 6k steps on 60k pre-trained checkpoint: [clip-vision-bert-vqa-ft-6k](https://huggingface.co/flax-community/clip-vision-bert-vqa-ft-6k)
-### Other checkpoints:
-- All pre-trained checkpoints: [multilingual-vqa](https://huggingface.co/flax-community/multilingual-vqa)
-- Fine-tuned checkpoints on 45k pre-trained checkpoint: [multilingual-vqa-pt-45k-ft](https://huggingface.co/flax-community/multilingual-vqa-pt-45k-ft)
-- Fine-tuned checkpoints on 45k pre-trained checkpoint with AdaFactor (others use AdamW): [multilingual-vqa-pt-45k-ft-adf](https://huggingface.co/flax-community/multilingual-vqa-pt-45k-ft-adf)
-- Fine-tuned checkpoints on 60k pre-trained checkpoint: [multilingual-vqa-pt-60k-ft](https://huggingface.co/flax-community/multilingual-vqa-pt-60k-ft)
-- Fine-tuned checkpoints on 70k pre-trained checkpoint: [multilingual-vqa-pt-60k-ft](https://huggingface.co/flax-community/multilingual-vqa-pt-70k-ft)
-- From scratch (without MLM pre-training) model: [multilingual-vqa-ft](https://huggingface.co/flax-community/multilingual-vqa-ft)
sections/conclusion.md
ADDED
File without changes
sections/contributions.md
ADDED
@@ -0,0 +1,6 @@
+Our novel contributions include:
+- A [multilingual variant of the Conceptual-12M dataset](https://huggingface.co/datasets/flax-community/conceptual-12m-mbart-50-multilingual) containing 2.5M image-text pairs each in four languages - English, French, German and Spanish, translated using mBART-50 model.
+- [Multilingual variants of the VQAv2 train and validation sets](https://huggingface.co/datasets/flax-community/multilingual-vqa) containing four times the original data in English, French, German and Spanish, translated using Marian models.
+- [A fusion of CLIP Vision Transformer and BERT model](https://github.com/gchhablani/multilingual-vqa/tree/main/models/flax_clip_vision_bert) where BERT embeddings are concatenated with visual embeddings at the very beginning and passed through BERT self-attention layers. This is based on the [VisualBERT](https://arxiv.org/abs/1908.03557) model.
+- A [pre-trained checkpooint](https://huggingface.co/flax-community/clip-vision-bert-cc12m-70k) on our multilingual with **67.85%** validation accuracy.
+- A [fine-tuned checkpoint](https://huggingface.co/flax-community/clip-vision-bert-vqa-ft-6k) on our multilingual variant of the VQAv2 dataset with **49.76%** validation accuracy.
sections/finetuning.md
CHANGED
@@ -1,5 +1,3 @@
-### Fine-tuning
-
 **Dataset**
 
 For fine-tuning, we use the [VQA 2.0](https://visualqa.org/) dataset - particularly, the `train` and `validation` sets. We translate all the questions into the four languages specified above using language-specific MarianMT models. This is because MarianMT models return better labels and are faster, hence, are better for fine-tuning. We get 4x the number of examples in each subset.
sections/future_work.md
ADDED
File without changes
sections/intro.md
CHANGED
@@ -16,12 +16,4 @@ While building a low-resource non-English VQA approach has several benefits of i
 
 We follow the two-staged training approach, our pre-training task being text-only Masked Language Modeling (MLM). Our pre-training dataset comes from Conceptual-12M dataset where we use mBART-50 for translation. Our fine-tuning dataset is taken from the VQAv2 dataset and its translation is done using MarianMT models.
 
-Our checkpoints achieve a **validation accuracy of 0.69 on our MLM** task, while our fine-tuned model is able to achieve a **validation accuracy of 0.49 on our multilingual VQAv2 validation set**. With better captions, hyperparameter-tuning, and further training, we expect to see higher performance.
-
-### Novel Contributions
-Our novel contributions include:
-- A [multilingual variant of the Conceptual-12M dataset](https://huggingface.co/datasets/flax-community/conceptual-12m-mbart-50-multilingual) containing 2.5M image-text pairs each in four languages - English, French, German and Spanish, translated using mBART-50 model.
-- [Multilingual variants of the VQAv2 train and validation sets](https://huggingface.co/datasets/flax-community/multilingual-vqa) containing four times the original data in English, French, German and Spanish, translated using Marian models.
-- [A fusion of CLIP Vision Transformer and BERT model](https://github.com/gchhablani/multilingual-vqa/tree/main/models/flax_clip_vision_bert) where BERT embeddings are concatenated with visual embeddings at the very beginning and passed through BERT self-attention layers. This is based on the [VisualBERT](https://arxiv.org/abs/1908.03557) model.
-- A [pre-trained checkpooint](https://huggingface.co/flax-community/clip-vision-bert-cc12m-70k) on our multilingual with **67.85%** validation accuracy.
-- A [fine-tuned checkpoint](https://huggingface.co/flax-community/clip-vision-bert-vqa-ft-6k) on our multilingual variant of the VQAv2 dataset with **49.76%** validation accuracy.
+Our checkpoints achieve a **validation accuracy of 0.69 on our MLM** task, while our fine-tuned model is able to achieve a **validation accuracy of 0.49 on our multilingual VQAv2 validation set**. With better captions, hyperparameter-tuning, and further training, we expect to see higher performance.
sections/limitations.md
CHANGED
@@ -1,2 +1 @@
-## Limitations and Bias
 - Our best fine-tuned model only achieves 0.49 accuracy on the multilingual validation data that we create. This could be because of not-so-great quality translations, sub-optimal hyperparameters and lack of ample training. In future, we hope to improve this model by addressing such concerns.
sections/other_checkpoints.md
ADDED
@@ -0,0 +1,6 @@
+- All pre-trained checkpoints: [multilingual-vqa](https://huggingface.co/flax-community/multilingual-vqa)
+- Fine-tuned checkpoints on 45k pre-trained checkpoint: [multilingual-vqa-pt-45k-ft](https://huggingface.co/flax-community/multilingual-vqa-pt-45k-ft)
+- Fine-tuned checkpoints on 45k pre-trained checkpoint with AdaFactor (others use AdamW): [multilingual-vqa-pt-45k-ft-adf](https://huggingface.co/flax-community/multilingual-vqa-pt-45k-ft-adf)
+- Fine-tuned checkpoints on 60k pre-trained checkpoint: [multilingual-vqa-pt-60k-ft](https://huggingface.co/flax-community/multilingual-vqa-pt-60k-ft)
+- Fine-tuned checkpoints on 70k pre-trained checkpoint: [multilingual-vqa-pt-60k-ft](https://huggingface.co/flax-community/multilingual-vqa-pt-70k-ft)
+- From scratch (without MLM pre-training) model: [multilingual-vqa-ft](https://huggingface.co/flax-community/multilingual-vqa-ft)
sections/pretraining.md
CHANGED
@@ -1,4 +1,3 @@
-### Pretraining
 We follow an approach similar to [VisualBERT](https://arxiv.org/abs/1908.03557). Instead of using a FasterRCNN to get image features, we use a CLIP Vision (ViT transformer) encoder. The pre-training task is text-only MLM (Masked Language Modeling). We mask only the text tokens and try to predict the masked tokens. The VisualBERT authors also use a sentence-image matching task where two captions are matched against an image, but we skip this for the sake of simplicity.
 
 **Dataset**
sections/references.md
CHANGED
@@ -1,4 +1,3 @@
-## References
 - [Conceptual 12M Dataset](https://github.com/google-research-datasets/conceptual-12m)
 
 - [VQA v2 Dataset](https://visualqa.org/challenge.html)
sections/social_impact.md
CHANGED
@@ -1,2 +1 @@
-## Social Impact
 Multilingual Visual Question Answering has not received a lot of attention. There are very few multilingual VQA datasets, and that is what we wanted to address here. Our initial plan was to include 4 high-resource and 4 low-resource languages in our training data. However, the existing translations do not perform as well and we would have received poor labels, not to mention, with a longer training time. We hope to improve this in the future by using better translators (for e.g. Google Translate API) to get more multilingual data, especially in low-resource languages. Regardless, our aim with this project was to provide with a pipeline approach to deal with Multilingual visuo-linguistic pretraining and perform Multilingual Visual Question Answering.
toc.py
ADDED
@@ -0,0 +1,29 @@
+import streamlit as st
+
+class Toc:
+
+    def __init__(self):
+        self._items = []
+        self._placeholder = None
+
+    def title(self, text):
+        self._markdown(text, "h1")
+
+    def header(self, text):
+        self._markdown(text, "h2", " " * 2)
+
+    def subheader(self, text):
+        self._markdown(text, "h3", " " * 4)
+
+    def placeholder(self, sidebar=False):
+        self._placeholder = st.sidebar.empty() if sidebar else st.empty()
+
+    def generate(self):
+        if self._placeholder:
+            self._placeholder.markdown("\n".join(self._items), unsafe_allow_html=True)
+
+    def _markdown(self, text, level, space=""):
+        key = "".join(filter(str.isalnum, text)).lower()
+
+        st.markdown(f"<{level} id='{key}'>{text}</{level}>", unsafe_allow_html=True)
+        self._items.append(f"{space}* <a href='#{key}'>{text}</a>")
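As an aside, a small illustration (not part of the commit) of how `Toc._markdown` ties headings to ToC entries: the anchor id keeps only the alphanumeric characters of the heading text, lowercased, so the emitted heading tag and the ToC link share the same HTML `id`.

```python
# Hypothetical illustration of the anchor-key scheme used by Toc._markdown;
# runs as plain Python, no Streamlit needed.
text = "Challenges and Technical Difficulties"
key = "".join(filter(str.isalnum, text)).lower()

heading_html = f"<h2 id='{key}'>{text}</h2>"
toc_entry = f"  * <a href='#{key}'>{text}</a>"

print(heading_html)  # <h2 id='challengesandtechnicaldifficulties'>Challenges and Technical Difficulties</h2>
print(toc_entry)     # clicking this link scrolls the page to the heading above
```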