Jeney committed
Commit 253f1f9
1 Parent(s): af4825d

Upload folder using huggingface_hub

.gitattributes CHANGED
@@ -1,35 +1,27 @@
  *.7z filter=lfs diff=lfs merge=lfs -text
  *.arrow filter=lfs diff=lfs merge=lfs -text
  *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bin.* filter=lfs diff=lfs merge=lfs -text
  *.bz2 filter=lfs diff=lfs merge=lfs -text
- *.ckpt filter=lfs diff=lfs merge=lfs -text
  *.ftz filter=lfs diff=lfs merge=lfs -text
  *.gz filter=lfs diff=lfs merge=lfs -text
  *.h5 filter=lfs diff=lfs merge=lfs -text
  *.joblib filter=lfs diff=lfs merge=lfs -text
  *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
  *.model filter=lfs diff=lfs merge=lfs -text
  *.msgpack filter=lfs diff=lfs merge=lfs -text
- *.npy filter=lfs diff=lfs merge=lfs -text
- *.npz filter=lfs diff=lfs merge=lfs -text
  *.onnx filter=lfs diff=lfs merge=lfs -text
  *.ot filter=lfs diff=lfs merge=lfs -text
  *.parquet filter=lfs diff=lfs merge=lfs -text
  *.pb filter=lfs diff=lfs merge=lfs -text
- *.pickle filter=lfs diff=lfs merge=lfs -text
- *.pkl filter=lfs diff=lfs merge=lfs -text
  *.pt filter=lfs diff=lfs merge=lfs -text
  *.pth filter=lfs diff=lfs merge=lfs -text
  *.rar filter=lfs diff=lfs merge=lfs -text
- *.safetensors filter=lfs diff=lfs merge=lfs -text
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tar filter=lfs diff=lfs merge=lfs -text
  *.tflite filter=lfs diff=lfs merge=lfs -text
  *.tgz filter=lfs diff=lfs merge=lfs -text
- *.wasm filter=lfs diff=lfs merge=lfs -text
  *.xz filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
- *.zst filter=lfs diff=lfs merge=lfs -text
+ *.zstandard filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,3 +1,79 @@
  ---
+ tags:
+ - visual-question-answering
  license: apache-2.0
+ widget:
+ - text: "What's the animal doing?"
+   src: "https://huggingface.co/datasets/mishig/sample_images/resolve/main/tiger.jpg"
+ - text: "What is on top of the building?"
+   src: "https://huggingface.co/datasets/mishig/sample_images/resolve/main/palace.jpg"
  ---
+
+ # Vision-and-Language Transformer (ViLT), fine-tuned on VQAv2
+
+ Vision-and-Language Transformer (ViLT) model fine-tuned on [VQAv2](https://visualqa.org/). It was introduced in the paper [ViLT: Vision-and-Language Transformer
+ Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) by Kim et al. and first released in [this repository](https://github.com/dandelin/ViLT).
+
+ Disclaimer: The team releasing ViLT did not write a model card for this model, so this model card has been written by the Hugging Face team.
+
+ ## Intended uses & limitations
+
+ You can use the raw model for visual question answering.
+
+ ### How to use
+
+ Here is how to use this model in PyTorch:
+
+ ```python
+ from transformers import ViltProcessor, ViltForQuestionAnswering
+ import requests
+ from PIL import Image
+
+ # prepare image + question
+ url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+ image = Image.open(requests.get(url, stream=True).raw)
+ text = "How many cats are there?"
+
+ processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
+ model = ViltForQuestionAnswering.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
+
+ # prepare inputs
+ encoding = processor(image, text, return_tensors="pt")
+
+ # forward pass
+ outputs = model(**encoding)
+ logits = outputs.logits
+ idx = logits.argmax(-1).item()
+ print("Predicted answer:", model.config.id2label[idx])
+ ```
+
+ ## Training data
+
+ (to do)
+
+ ## Training procedure
+
+ ### Preprocessing
+
+ (to do)
+
+ ### Pretraining
+
+ (to do)
+
+ ## Evaluation results
+
+ (to do)
+
+ ### BibTeX entry and citation info
+
+ ```bibtex
+ @misc{kim2021vilt,
+       title={ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision},
+       author={Wonjae Kim and Bokyung Son and Ildoo Kim},
+       year={2021},
+       eprint={2102.03334},
+       archivePrefix={arXiv},
+       primaryClass={stat.ML}
+ }
+ ```
config.json ADDED
The diff for this file is too large to render. See raw diff
 
handler.py ADDED
@@ -0,0 +1,27 @@
+ from typing import Any, Dict, List
+
+ import torch
+ from transformers import ViltProcessor, ViltForQuestionAnswering
+
+
+ class EndpointHandler:
+     def __init__(self, path=""):
+         # load model and processor from the repository path
+         self.processor = ViltProcessor.from_pretrained(path)
+         self.model = ViltForQuestionAnswering.from_pretrained(path)
+         self.device = "cuda" if torch.cuda.is_available() else "cpu"
+         self.model.to(self.device)
+
+     def __call__(self, data: Dict[str, Any]) -> List[Dict[str, str]]:
+         # process input
+         image = data.pop("image", data)
+         text = data.pop("text", data)
+         parameters = data.pop("parameters", None)  # accepted but currently unused
+
+         # preprocess: encode the image-question pair and move it to the model device
+         encoding = self.processor(image, text, return_tensors="pt").to(self.device)
+         outputs = self.model(**encoding)
+         # postprocess the prediction: pick the highest-scoring answer label
+         logits = outputs.logits
+         idx = logits.argmax(-1).item()
+         return [{"answer": self.model.config.id2label[idx]}]
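For a quick local check of the handler above, a minimal sketch (not part of this commit) could look like the following. It assumes the repository is checked out locally as the current directory and that the payload carries a PIL image under `image` and the question string under `text`, which is what the handler passes straight to the processor.

```python
import requests
from PIL import Image

from handler import EndpointHandler

# illustrative inputs; the image URL is the example used in the model card
handler = EndpointHandler(path=".")  # assumes the model files live in the current directory
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

result = handler({"image": image, "text": "How many cats are there?"})
print(result)  # e.g. [{"answer": "2"}]
```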
preprocessor_config.json ADDED
@@ -0,0 +1,18 @@
+ {
+   "do_normalize": true,
+   "do_resize": true,
+   "feature_extractor_type": "ViltFeatureExtractor",
+   "image_mean": [
+     0.5,
+     0.5,
+     0.5
+   ],
+   "image_std": [
+     0.5,
+     0.5,
+     0.5
+   ],
+   "resample": 3,
+   "size": 384,
+   "size_divisor": 32
+ }
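As a rough sketch of what these settings do in practice (again not part of the commit, and assuming the repository is checked out locally as the current directory): loading the processor from this folder picks up `preprocessor_config.json`, resizes images so the shorter edge is about 384 with dimensions divisible by 32, and normalizes each channel with mean 0.5 and standard deviation 0.5.

```python
import requests
from PIL import Image
from transformers import ViltProcessor

# loading from "." reads preprocessor_config.json plus the tokenizer files in this repo
processor = ViltProcessor.from_pretrained(".")

url = "http://images.cocodataset.org/val2017/000000039769.jpg"  # example image from the model card
image = Image.open(requests.get(url, stream=True).raw)

encoding = processor(image, "How many cats are there?", return_tensors="pt")
print(encoding["pixel_values"].shape)  # resized, normalized pixel tensor of shape (1, 3, H, W)
```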
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4d5f3409947b0369487ece7c5868f0040ceb67d25735dbb4ac5e99e03bab3a19
+ size 470435927
special_tokens_map.json ADDED
@@ -0,0 +1 @@
+ {"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1 @@
+ {"do_lower_case": true, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null, "model_max_length": 40, "special_tokens_map_file": null, "name_or_path": "bert-base-uncased", "tokenizer_class": "BertTokenizer"}
vocab.txt ADDED
The diff for this file is too large to render. See raw diff