Image-Text-to-Text
MLX
Safetensors
English
idefics2
multimodal
vision
prince-canuma committed on
Commit
4499481
1 Parent(s): 40c9900

Upload 12 files

Browse files
README.md ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language:
3
+ - en
4
+ license: apache-2.0
5
+ tags:
6
+ - multimodal
7
+ - vision
8
+ - image-text-to-text
9
+ - mlx
10
+ datasets:
11
+ - HuggingFaceM4/OBELICS
12
+ - laion/laion-coco
13
+ - wikipedia
14
+ - facebook/pmd
15
+ - pixparse/idl-wds
16
+ - pixparse/pdfa-eng-wds
17
+ - wendlerc/RenderedText
18
+ - HuggingFaceM4/the_cauldron
19
+ - teknium/OpenHermes-2.5
20
+ - GAIR/lima
21
+ - databricks/databricks-dolly-15k
22
+ - meta-math/MetaMathQA
23
+ - TIGER-Lab/MathInstruct
24
+ - microsoft/orca-math-word-problems-200k
25
+ - camel-ai/math
26
+ - AtlasUnified/atlas-math-sets
27
+ - tiedong/goat
28
+ - Lin-Chen/ShareGPT4V
29
+ - jxu124/llava_conversation_58k
30
+ ---
31
+
32
+ # mlx-community/idefics2-8b-chatty-4bit
33
+ This model was converted to MLX format from [`HuggingFaceM4/idefics2-8b-chatty`](https://huggingface.co/HuggingFaceM4/idefics2-8b-chatty) using mlx-vlm version **0.0.4**.
34
+ Refer to the [original model card](https://huggingface.co/HuggingFaceM4/idefics2-8b-chatty) for more details on the model.
35
+ ## Use with mlx
36
+
37
+ ```bash
38
+ pip install -U mlx-vlm
39
+ ```
40
+
41
+ ```bash
42
+ python -m mlx_vlm.generate --model mlx-community/idefics2-8b-chatty-4bit --max-tokens 100 --temp 0.0
43
+ ```
added_tokens.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "<end_of_utterance>": 32002,
3
+ "<fake_token_around_image>": 32000,
4
+ "<image>": 32001
5
+ }
config.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Idefics2ForConditionalGeneration"
4
+ ],
5
+ "image_token_id": 32001,
6
+ "model_type": "idefics2",
7
+ "perceiver_config": {
8
+ "model_type": "idefics2"
9
+ },
10
+ "quantization": {
11
+ "group_size": 64,
12
+ "bits": 4
13
+ },
14
+ "text_config": {
15
+ "max_position_embeddings": 32768,
16
+ "model_type": "mistral",
17
+ "pad_token_id": 0,
18
+ "rms_norm_eps": 1e-05,
19
+ "vocab_size": 32003
20
+ },
21
+ "tie_word_embeddings": false,
22
+ "torch_dtype": "float32",
23
+ "transformers_version": "4.39.0.dev0",
24
+ "use_cache": true,
25
+ "vision_config": {
26
+ "hidden_size": 1152,
27
+ "image_size": 980,
28
+ "intermediate_size": 4304,
29
+ "model_type": "idefics2",
30
+ "num_attention_heads": 16,
31
+ "num_hidden_layers": 27,
32
+ "patch_size": 14
33
+ }
34
+ }
model-00001-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5f9d1b3b32b71f68d7ccf83162c13bec313b50234d047e35feb1f7c1675a087f
3
+ size 5356662816
model-00002-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3a8f5fb23aef125ca282c0dc2359ec19c8f6671d40dd83e545b65e88835127ee
3
+ size 159878808
model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff
 
preprocessor_config.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_convert_rgb": true,
3
+ "do_image_splitting": true,
4
+ "do_normalize": true,
5
+ "do_pad": true,
6
+ "do_rescale": true,
7
+ "do_resize": true,
8
+ "image_mean": [
9
+ 0.5,
10
+ 0.5,
11
+ 0.5
12
+ ],
13
+ "image_processor_type": "Idefics2ImageProcessor",
14
+ "image_std": [
15
+ 0.5,
16
+ 0.5,
17
+ 0.5
18
+ ],
19
+ "processor_class": "Idefics2Processor",
20
+ "resample": 2,
21
+ "rescale_factor": 0.00392156862745098,
22
+ "size": {
23
+ "longest_edge": 980,
24
+ "shortest_edge": 378
25
+ }
26
+ }
processor_config.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "image_seq_len": 64,
3
+ "processor_class": "Idefics2Processor"
4
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ {
4
+ "content": "<fake_token_around_image>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false
9
+ },
10
+ {
11
+ "content": "<image>",
12
+ "lstrip": false,
13
+ "normalized": false,
14
+ "rstrip": false,
15
+ "single_word": false
16
+ },
17
+ {
18
+ "content": "<end_of_utterance>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ ],
25
+ "bos_token": {
26
+ "content": "<s>",
27
+ "lstrip": false,
28
+ "normalized": false,
29
+ "rstrip": false,
30
+ "single_word": false
31
+ },
32
+ "eos_token": {
33
+ "content": "</s>",
34
+ "lstrip": false,
35
+ "normalized": false,
36
+ "rstrip": false,
37
+ "single_word": false
38
+ },
39
+ "pad_token": {
40
+ "content": "<unk>",
41
+ "lstrip": false,
42
+ "normalized": false,
43
+ "rstrip": false,
44
+ "single_word": false
45
+ },
46
+ "unk_token": {
47
+ "content": "<unk>",
48
+ "lstrip": false,
49
+ "normalized": false,
50
+ "rstrip": false,
51
+ "single_word": false
52
+ }
53
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dadfd56d766715c61d2ef780a525ab43b8e6da4de6865bda3d95fdef5e134055
3
+ size 493443
tokenizer_config.json ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "added_tokens_decoder": {
5
+ "0": {
6
+ "content": "<unk>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "1": {
14
+ "content": "<s>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "2": {
22
+ "content": "</s>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "32000": {
30
+ "content": "<fake_token_around_image>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "32001": {
38
+ "content": "<image>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "32002": {
46
+ "content": "<end_of_utterance>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ }
53
+ },
54
+ "additional_special_tokens": [
55
+ "<fake_token_around_image>",
56
+ "<image>",
57
+ "<end_of_utterance>"
58
+ ],
59
+ "bos_token": "<s>",
60
+ "clean_up_tokenization_spaces": false,
61
+ "eos_token": "</s>",
62
+ "model_max_length": 1000000000000000019884624838656,
63
+ "pad_token": "<unk>",
64
+ "processor_class": "Idefics2Processor",
65
+ "sp_model_kwargs": {},
66
+ "spaces_between_special_tokens": false,
67
+ "tokenizer_class": "LlamaTokenizer",
68
+ "unk_token": "<unk>",
69
+ "use_default_system_prompt": true
70
+ }