Commit fb3c77c by gchhablani • 1 parent: 36c3aaa

Update layout
app.py CHANGED
@@ -1,14 +1,8 @@
 from apps import mlm, vqa
-import os
 import streamlit as st
 from session import _get_state
 from multiapp import MultiApp
-
-
-def read_markdown(path, parent="./sections/"):
-    with open(os.path.join(parent, path)) as f:
-        return f.read()
-
+from apps.utils import read_markdown
 
 def main():
     state = _get_state()
@@ -24,12 +18,6 @@ def main():
         "[Gunjan Chhablani](https://huggingface.co/gchhablani), [Bhavitvya Malik](https://huggingface.co/bhavitvyamalik)"
     )
 
-    image_col, intro_col = st.beta_columns([3, 8])
-    image_col.image("./misc/mvqa-logo-3-white.png", use_column_width="always")
-    intro_col.write(read_markdown("intro.md"))
-    with st.beta_expander("Usage"):
-        st.write(read_markdown("usage.md"))
-
     with st.beta_expander("Article"):
         st.write(read_markdown("abstract.md"))
         st.write(read_markdown("caveats.md"))
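Taken together, these hunks move the `read_markdown` helper out of `app.py` (it now lives in `apps/utils.py`; see that file's diff below) and drop the logo, intro, and `Usage` block from `main()`; the per-task `Usage` expanders move into `apps/mlm.py` and `apps/vqa.py` below. A minimal, self-contained sketch of the layout pattern that remains, with `read_markdown` inlined here purely for illustration:

```python
# Sketch of the post-change layout in app.py: markdown sections rendered
# inside collapsible expanders. read_markdown is inlined for illustration;
# in the repo it is imported from apps.utils. Assumes a Streamlit version
# that still exposes the st.beta_expander API, as used in this commit.
import os

import streamlit as st


def read_markdown(path, parent="./sections/"):
    # Read a markdown file from the sections/ directory relative to the CWD.
    with open(os.path.join(parent, path)) as f:
        return f.read()


with st.beta_expander("Article"):
    st.write(read_markdown("abstract.md"))
    st.write(read_markdown("caveats.md"))
```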
apps/mlm.py CHANGED
@@ -12,7 +12,7 @@ import pandas as pd
 import os
 import matplotlib.pyplot as plt
 from mtranslate import translate
-
+from .utils import read_markdown
 
 from .model.flax_clip_vision_bert.modeling_clip_vision_bert import (
     FlaxCLIPVisionBertForMaskedLM,
@@ -25,6 +25,10 @@ def softmax(logits):
 def app(state):
     mlm_state = state
 
+    with st.beta_expander("Usage"):
+        st.write(read_markdown("mlm_usage.md"))
+    st.write(read_markdown("mlm_intro.md"))
+
     # @st.cache(persist=False) # TODO: Make this work with mlm_state. Currently not supported.
     def predict(transformed_image, caption_inputs):
         outputs = mlm_state.mlm_model(pixel_values=transformed_image, **caption_inputs)
apps/utils.py CHANGED
@@ -1,5 +1,5 @@
 import json
-
+import os
 import numpy as np
 import plotly.express as px
 import torch
@@ -81,3 +81,8 @@ def plotly_express_horizontal_bar_plot(values, labels):
         orientation="h",
     )
     return fig
+
+
+def read_markdown(path, parent="./sections/"):
+    with open(os.path.join(parent, path)) as f:
+        return f.read()
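The relocated helper resolves its `parent="./sections/"` default against the current working directory, so it assumes the Streamlit app is launched from the repository root. A small usage sketch (`abstract.md` is one of the sections `app.py` actually renders):

```python
from apps.utils import read_markdown

# Equivalent to open("./sections/abstract.md").read(); the relative default
# means the app must be started from the repository root.
text = read_markdown("abstract.md")
print(text[:200])
```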
apps/vqa.py CHANGED
@@ -14,7 +14,7 @@ import matplotlib.pyplot as plt
 import json
 
 from mtranslate import translate
-
+from .utils import read_markdown
 
 from .model.flax_clip_vision_bert.modeling_clip_vision_bert import (
     FlaxCLIPVisionBertForSequenceClassification,
@@ -28,6 +28,10 @@ def softmax(logits):
 def app(state):
     vqa_state = state
 
+    with st.beta_expander("Usage"):
+        st.write(read_markdown("vqa_usage.md"))
+    st.write(read_markdown("vqa_intro.md"))
+
     # @st.cache(persist=False)
     def predict(transformed_image, question_inputs):
         return np.array(
multiapp.py CHANGED
@@ -10,6 +10,7 @@ class MultiApp:
         self.apps.append({"title": title, "function": func})
 
     def run(self):
+        logo = st.image("./misc/mvqa-logo-3-white.png")
        st.sidebar.header("Tasks")
        app = st.sidebar.radio(
            "", self.apps, format_func=lambda app: app["title"]
sections/mlm_intro.md ADDED
@@ -0,0 +1,5 @@
+This demo uses a [CLIP-Vision-Bert model checkpoint](https://huggingface.co/flax-community/clip-vision-bert-cc12m-70k) pre-trained using text-only Masked LM on approximately 10 million image-text pairs taken from the [Conceptual 12M dataset](https://github.com/google-research-datasets/conceptual-12m) translated using [MBart](https://huggingface.co/transformers/model_doc/mbart.html). The translations are performed in the following four languages: English, French, German and Spanish, giving 2.5M examples in each language.
+
+The model can be used for mask-filling as shown in this demo. The caption can be given in any of the following languages: English, French, German and Spanish.
+
+For more details, click on `Usage` or `Article` 🤗 above.
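Since the intro names the pre-trained checkpoint, here is a hedged loading sketch. It assumes the custom `FlaxCLIPVisionBertForMaskedLM` class imported in `apps/mlm.py` above supports the standard `from_pretrained` interface and that the import path mirrors that relative import; neither detail is confirmed by this commit.

```python
# Hedged sketch: load the checkpoint named in mlm_intro.md. The import path
# and the availability of from_pretrained on the custom class are assumptions.
from apps.model.flax_clip_vision_bert.modeling_clip_vision_bert import (
    FlaxCLIPVisionBertForMaskedLM,
)

model = FlaxCLIPVisionBertForMaskedLM.from_pretrained(
    "flax-community/clip-vision-bert-cc12m-70k"
)
```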
sections/mlm_usage.md ADDED
@@ -0,0 +1,7 @@
+- This demo loads the `FlaxCLIPVisionBertForMaskedLM` present in the `model` directory of this repository. The checkpoint is loaded from [`flax-community/clip-vision-bert-cc12m-70k`](https://huggingface.co/flax-community/clip-vision-bert-cc12m-70k), which is a checkpoint pre-trained for 70k steps. 100 random validation-set examples are provided in `cc12m_data/vqa_val.tsv`, with the corresponding images in the `cc12m_data/images_data` directory.
+
+- We provide an `English Translation` of the caption for users who are not well-acquainted with the other languages. This is done using `mtranslate` to keep things flexible; it needs an internet connection as it uses the Google Translate API.
+
+- The model predicts scores for tokens from the `bert-base-multilingual-uncased` vocabulary.
+
+- The top-5 predictions are displayed below, and their respective confidence scores are shown in the form of a bar plot.
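The last two bullets describe the output path: scores over the `bert-base-multilingual-uncased` vocabulary, reduced to the top five tokens and drawn as a horizontal bar plot (the `apps/utils.py` hunk above shows the tail of the plotting helper with `orientation="h"`). A self-contained sketch of that flow, with random logits standing in for the model output:

```python
# Sketch of the top-5 / bar-plot step described above. The `logits` array is
# fake; in the demo it would come from FlaxCLIPVisionBertForMaskedLM at the
# masked position.
import numpy as np
import plotly.express as px
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-uncased")

rng = np.random.default_rng(0)
logits = rng.normal(size=tokenizer.vocab_size)  # stand-in for mask-position logits

# Softmax to get confidence scores, then keep the five highest.
probs = np.exp(logits - logits.max())
probs /= probs.sum()
top5_ids = np.argsort(probs)[::-1][:5]
top5_tokens = tokenizer.convert_ids_to_tokens(top5_ids.tolist())
top5_scores = probs[top5_ids]

# Horizontal bar plot of the confidences, as in the demo's layout.
fig = px.bar(x=top5_scores, y=top5_tokens, orientation="h")
fig.show()
```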
sections/{intro.md → vqa_intro.md} RENAMED
@@ -1,5 +1,5 @@
-This demo uses a [ViTBert model checkpoint](https://huggingface.co/flax-community/multilingual-vqa-pt-60k-ft/tree/main/ckpt-5999) fine-tuned on a [MarianMT](https://huggingface.co/transformers/model_doc/marian.html)-translated version of the [VQA v2 dataset](https://visualqa.org/challenge.html). The fine-tuning is performed after pre-training using text-only Masked LM on approximately 10 million image-text pairs taken from the [Conceptual 12M dataset](https://github.com/google-research-datasets/conceptual-12m) translated using [MBart](https://huggingface.co/transformers/model_doc/mbart.html). The translations are performed in the following four languages: English, French, German and Spanish.
+This demo uses a [CLIP-Vision-Bert model checkpoint](https://huggingface.co/flax-community/clip-vision-bert-vqa-ft-6k) fine-tuned on a [MarianMT](https://huggingface.co/transformers/model_doc/marian.html)-translated version of the [VQA v2 dataset](https://visualqa.org/challenge.html). The fine-tuning is performed after pre-training using text-only Masked LM on approximately 10 million image-text pairs taken from the [Conceptual 12M dataset](https://github.com/google-research-datasets/conceptual-12m) translated using [MBart](https://huggingface.co/transformers/model_doc/mbart.html). The translations are performed in the following four languages: English, French, German and Spanish.
 
 The model predicts one out of 3129 classes in English which can be found [here](https://huggingface.co/spaces/flax-community/Multilingual-VQA/blob/main/answer_reverse_mapping.json), and then the translated versions are provided based on the language chosen as `Answer Language`. The question can be present or written in any of the following: English, French, German and Spanish.
 
-For more details, click on `Usage` or `Article` 🤗 below.
+For more details, click on `Usage` or `Article` 🤗 above.
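The renamed intro describes predicting one of 3129 English answer classes and then translating into the chosen `Answer Language`. A hedged sketch of that post-processing, assuming `answer_reverse_mapping.json` maps class indices (as strings) to English answers; the mapping's exact structure and the variable names here are assumptions:

```python
# Hedged sketch of the answer post-processing described in vqa_intro.md.
import json

import numpy as np
from mtranslate import translate

with open("answer_reverse_mapping.json") as f:
    # Assumed structure: {"0": "yes", "1": "no", ...}
    answer_reverse_mapping = json.load(f)

logits = np.random.default_rng(0).normal(size=3129)  # stand-in for class logits
predicted_class = int(np.argmax(logits))
english_answer = answer_reverse_mapping[str(predicted_class)]

# Translate into the language chosen as `Answer Language` (here: French).
translated_answer = translate(english_answer, "fr")
print(english_answer, "->", translated_answer)
```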
sections/{usage.md → vqa_usage.md} RENAMED
@@ -1,4 +1,4 @@
-- This demo loads the `FlaxCLIPVisionBertForSequenceClassificationModel` present in the `model` directory of this repository. The checkpoint is loaded from `ckpt/ckpt-60k-5999` which is pre-trained checkpoint with 60k steps and 5999 fine-tuning steps. 100 random validation set examples are present in the `dummy_vqa_multilingual.tsv` with respective images in the `images/val2014` directory.
+- This demo loads the `FlaxCLIPVisionBertForSequenceClassification` present in the `model` directory of this repository. The checkpoint is loaded from [`flax-community/clip-vision-bert-vqa-ft-6k`](https://huggingface.co/flax-community/clip-vision-bert-vqa-ft-6k), which is a checkpoint pre-trained for 60k steps and fine-tuned for 6k steps. 100 random validation-set examples are provided in `dummy_vqa_multilingual.tsv`, with the corresponding images in the `images/val2014` directory.
 
 - We provide `English Translation` of the question for users who are not well-acquainted with the other languages. This is done using `mtranslate` to keep things flexible enough and needs internet connection as it uses the Google Translate API.
 
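The first bullet points at the 100-example sample in `dummy_vqa_multilingual.tsv` with images under `images/val2014`. A small sketch of pulling one example for the demo; the column names `image_file` and `question` are hypothetical, since the TSV schema is not part of this commit:

```python
# Sketch: sample one validation example. Column names are hypothetical
# placeholders for whatever the TSV actually contains.
import pandas as pd
from PIL import Image

examples = pd.read_csv("dummy_vqa_multilingual.tsv", sep="\t")
row = examples.sample(n=1, random_state=0).iloc[0]

image = Image.open(f"images/val2014/{row['image_file']}")
print(row["question"], image.size)
```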