camenduru committed on
Commit
f8c04f4
1 Parent(s): 0974d61

thanks to Lin-Chen ❤

Browse files
Files changed (5) hide show
  1. .gitattributes +1 -0
  2. README.md +34 -0
  3. config.json +23 -0
  4. preprocessor_config.json +28 -0
  5. pytorch_model.bin +3 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ inference: false
4
+ ---
5
+
6
+ <br>
7
+ <br>
8
+
9
+ # ShareGPT4V Model Card
10
+
11
+ ## Model details
12
+
13
+ **Model type:**
14
+ This is the vision tower of ShareGPT4V-7B, fine-tuned on our [ShareGPT4V dataset](https://huggingface.co/datasets/Lin-Chen/ShareGPT4V).
15
+
16
+ **Model date:**
17
+ This vision tower was trained in Nov 2023.
18
+
19
+ **Paper or resources for more information:**
20
+ [[Project](https://ShareGPT4V.github.io/)] [[Paper](https://huggingface.co/papers/2311.12793)] [[Code](https://github.com/InternLM/InternLM-XComposer/tree/main/projects/ShareGPT4V)]
21
+
22
+ ## License
23
+ Llama 2 is licensed under the LLAMA 2 Community License,
24
+ Copyright (c) Meta Platforms, Inc. All Rights Reserved.
25
+
26
+ ## Intended use
27
+ **Primary intended uses:**
28
+ The primary use of this vision tower is research on large multimodal models and chatbots.
29
+
30
+ **Primary intended users:**
31
+ The primary intended users of the model are researchers and hobbyists in computer vision, natural language processing, machine learning, and artificial intelligence.
32
+
33
+ ## Training dataset
34
+ - 1.2M high-quality image-text pairs
config.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "ShareGPT4V-7B_Pretrained_vit-large336-l12",
3
+ "architectures": [
4
+ "CLIPVisionModel"
5
+ ],
6
+ "attention_dropout": 0.0,
7
+ "dropout": 0.0,
8
+ "hidden_act": "quick_gelu",
9
+ "hidden_size": 1024,
10
+ "image_size": 336,
11
+ "initializer_factor": 1.0,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 4096,
14
+ "layer_norm_eps": 1e-05,
15
+ "model_type": "clip_vision_model",
16
+ "num_attention_heads": 16,
17
+ "num_channels": 3,
18
+ "num_hidden_layers": 24,
19
+ "patch_size": 14,
20
+ "projection_dim": 768,
21
+ "torch_dtype": "bfloat16",
22
+ "transformers_version": "4.31.0"
23
+ }
preprocessor_config.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "crop_size": {
3
+ "height": 336,
4
+ "width": 336
5
+ },
6
+ "do_center_crop": true,
7
+ "do_convert_rgb": true,
8
+ "do_normalize": true,
9
+ "do_rescale": true,
10
+ "do_resize": true,
11
+ "feature_extractor_type": "CLIPFeatureExtractor",
12
+ "image_mean": [
13
+ 0.48145466,
14
+ 0.4578275,
15
+ 0.40821073
16
+ ],
17
+ "image_processor_type": "CLIPImageProcessor",
18
+ "image_std": [
19
+ 0.26862954,
20
+ 0.26130258,
21
+ 0.27577711
22
+ ],
23
+ "resample": 3,
24
+ "rescale_factor": 0.00392156862745098,
25
+ "size": {
26
+ "shortest_edge": 336
27
+ }
28
+ }
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e9a8a6740e800e3583e7ff04cceeb0a1a601fa3194d98e652e190922d4b8e309
3
+ size 607160538