Xueqing Wu committed
Commit 99e8fc6
1 Parent(s): e20ef71

download files from hub
.gitignore CHANGED
@@ -1,3 +1,3 @@
 __pycache__/
 *.pyc
-pretrained_models
+.idea/
Dockerfile CHANGED
@@ -31,27 +31,12 @@ RUN mkdir $HOME/.cache $HOME/.config \
     && rm ~/miniconda.sh \
     && conda clean -ya
 
-# From here are my stuff
-
-# Download models
-RUN pip install --no-cache-dir gdown && \
-    mkdir -p ./pretrained_models/GLIP/checkpoints && \
-    mkdir -p ./pretrained_models/GLIP/configs && \
-    mkdir -p ./pretrained_models/xvlm && \
-    wget -nc -q -P ./pretrained_models/GLIP/checkpoints https://huggingface.co/GLIPModel/GLIP/resolve/main/glip_large_model.pth && \
-    wget -nc -q -P ./pretrained_models/GLIP/configs https://raw.githubusercontent.com/microsoft/GLIP/main/configs/pretrain/glip_Swin_L.yaml && \
-    gdown "https://drive.google.com/u/0/uc?id=1bv6_pZOsXW53EhlwU0ZgSk03uzFI61pN" -O ./pretrained_models/xvlm/retrieval_mscoco_checkpoint_9.pth
-
 # Python packages
 RUN --mount=target=requirements.txt,source=requirements.txt \
     pip install --no-cache-dir torch torchvision && \
     pip install --no-cache-dir git+https://github.com/openai/CLIP.git && \
     pip install --no-cache-dir -r requirements.txt
 
-RUN python -c "from transformers import AutoModel; _ = AutoModel.from_pretrained('codellama/CodeLlama-7b-Python-hf')"
-RUN python -c "from transformers import AutoModel; _ = AutoModel.from_pretrained('VDebugger/VDebugger-critic-generalist-7B')"
-RUN python -c "from transformers import AutoModel; _ = AutoModel.from_pretrained('VDebugger/VDebugger-refiner-generalist-7B')"
-
 # Download GLIP dependencies, but unfortunately don't install yet...
 RUN git clone https://github.com/sachit-menon/GLIP
 
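This commit drops all build-time downloads: the GLIP checkpoint and config are no longer fetched with wget/gdown during `docker build`, and the CodeLlama and VDebugger weights are no longer pre-cached with `AutoModel.from_pretrained`, so those LLMs will be pulled from the Hub on first use instead. If cold-start latency matters, a minimal runtime warm-up sketch (an assumption on my part, not part of this commit; it relies only on `huggingface_hub`, which requirements.txt now installs) could look like:

```python
# Hypothetical warm-up script: pre-populate the local Hugging Face cache at
# container startup rather than at image-build time. snapshot_download only
# fetches files missing from the cache, so rerunning it on every start is cheap.
from huggingface_hub import snapshot_download

for repo_id in (
    "codellama/CodeLlama-7b-Python-hf",
    "VDebugger/VDebugger-critic-generalist-7B",
    "VDebugger/VDebugger-refiner-generalist-7B",
):
    snapshot_download(repo_id=repo_id)
```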
app.sh CHANGED
@@ -1,3 +1,4 @@
+python download_files_from_hub.py
 cd GLIP
 python setup.py clean --all build develop --user
 cd ../
download_files_from_hub.py ADDED
@@ -0,0 +1,6 @@
+from huggingface_hub import hf_hub_download
+
+hf_hub_download(repo_id="GLIPModel/GLIP", filename="glip_large_model.pth",
+                local_dir="./pretrained_models/GLIP/checkpoints")
+hf_hub_download(repo_id="VDebugger/xvlm_retrieval_mscoco", filename="retrieval_mscoco_checkpoint_9.pth",
+                local_dir="./pretrained_models/xvlm/")
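This new script replaces the removed wget/gdown steps; the X-VLM checkpoint now comes from the `VDebugger/xvlm_retrieval_mscoco` Hub repo rather than Google Drive. `hf_hub_download` returns the local file path and, in recent `huggingface_hub` releases, skips the transfer when `local_dir` already holds an up-to-date copy, so app.sh can safely run it on every start. A table-driven variant (a sketch, not what the commit ships) would keep the checkpoint list in one place:

```python
# Sketch: the same downloads as download_files_from_hub.py, table-driven.
from huggingface_hub import hf_hub_download

CHECKPOINTS = [  # (repo_id, filename, local_dir)
    ("GLIPModel/GLIP", "glip_large_model.pth",
     "./pretrained_models/GLIP/checkpoints"),
    ("VDebugger/xvlm_retrieval_mscoco", "retrieval_mscoco_checkpoint_9.pth",
     "./pretrained_models/xvlm/"),
]

for repo_id, filename, local_dir in CHECKPOINTS:
    path = hf_hub_download(repo_id=repo_id, filename=filename,
                           local_dir=local_dir)
    print(f"ready: {path}")
```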
pretrained_models/GLIP/configs/glip_Swin_L.yaml ADDED
@@ -0,0 +1,120 @@
+MODEL:
+  META_ARCHITECTURE: "GeneralizedVLRCNN"
+  WEIGHT: "swin_large_patch4_window12_384_22k.pth"
+  RPN_ONLY: True
+  RPN_ARCHITECTURE: "VLDYHEAD"
+
+  BACKBONE:
+    CONV_BODY: "SWINT-FPN-RETINANET"
+    OUT_CHANNELS: 256
+
+  SWINT:
+    EMBED_DIM: 192
+    DEPTHS: (2, 2, 18, 2)
+    NUM_HEADS: (6, 12, 24, 48)
+    WINDOW_SIZE: 12
+    OUT_CHANNELS: (192, 384, 768, 1536)
+    DROP_PATH_RATE: 0.4
+
+  LANGUAGE_BACKBONE:
+    FREEZE: False
+    MODEL_TYPE: "bert-base-uncased" # "roberta-base", "clip"
+    MASK_SPECIAL: False
+
+  RPN:
+    USE_FPN: True
+    ANCHOR_SIZES: (64, 128, 256, 512, 1024)
+    ANCHOR_STRIDE: (8, 16, 32, 64, 128)
+    ASPECT_RATIOS: (1.0,)
+    SCALES_PER_OCTAVE: 1
+
+  DYHEAD:
+    CHANNELS: 256
+    NUM_CONVS: 8
+    USE_GN: True
+    USE_DYRELU: True
+    USE_DFCONV: True
+    USE_DYFUSE: True
+    TOPK: 9 # topk for selecting candidate positive samples from each level
+    SCORE_AGG: "MEAN"
+    LOG_SCALE: 0.0
+
+    USE_CHECKPOINT: True
+    FUSE_CONFIG:
+      USE_FUSED_FEATURES_DOT_PRODUCT: True
+      EARLY_FUSE_ON: True
+      TYPE: "MHA-B"
+      USE_CLASSIFICATION_LOSS: False
+      USE_TOKEN_LOSS: False
+      USE_CONTRASTIVE_ALIGN_LOSS: False
+      CONTRASTIVE_HIDDEN_DIM: 64
+      USE_DOT_PRODUCT_TOKEN_LOSS: True
+      USE_LAYER_SCALE: True
+      CLAMP_MIN_FOR_UNDERFLOW: True
+      CLAMP_MAX_FOR_OVERFLOW: True
+      CLAMP_BERTATTN_MIN_FOR_UNDERFLOW: True
+      CLAMP_BERTATTN_MAX_FOR_OVERFLOW: True
+      CLAMP_DOT_PRODUCT: True
+
+DATASETS:
+
+  TRAIN: ("mixed_train_no_coco",) # Place holder dataset for now. To be updated in the next version
+  TEST: ("coco_2017_val", )
+
+  ONE_HOT: False
+  FLICKR_COPY: 8 # 0.15 * 8 = ~1.2M
+  MIXED_COPY: 4 # 0.6 * 4 = ~2.4M
+  OBJECT365_COPY: 2 # 1.4 * 2 = ~2.8M
+  VG_COPY: 3 # 0.4 * 3 = ~1.2M
+  IN_COPY: 2 # 0.67 * 2 = ~1.33M
+  OI_COPY: 1 # 2M * 1 = 2M
+
+  DISABLE_SHUFFLE: False
+  ADD_DET_PROMPT: False
+  RANDOM_SAMPLE_NEG: 85
+  CONTROL_PROB: (0.0, 0.0, 0.5, 0.0)
+  FURTHER_SCREEN: True
+  CAPTION_CONF: 0.5
+  CAPTION_NMS: -1.0
+  CAPTION_MIN_BOX: 1
+
+  SEPARATION_TOKENS: ". "
+
+  PACK_RANDOM_CAPTION_NUMBER: 20
+  NO_RANDOM_PACK_PROBABILITY: 0.4
+  RANDOM_PACK_PROB: 0.5
+  CAPTION_FORMAT_VERSION: "v2"
+
+INPUT:
+  PIXEL_MEAN: [ 103.530, 116.280, 123.675 ]
+  PIXEL_STD: [ 57.375, 57.120, 58.395 ]
+  MIN_SIZE_TRAIN: 800
+  MAX_SIZE_TRAIN: 1333
+  MIN_SIZE_TEST: 800
+  MAX_SIZE_TEST: 1333
+
+AUGMENT:
+  MULT_MIN_SIZE_TRAIN: (480,560,640,720,800)
+
+DATALOADER:
+  SIZE_DIVISIBILITY: 32
+
+SOLVER:
+  OPTIMIZER: ADAMW
+  BASE_LR: 0.0001
+  LANG_LR: 0.00001
+  WEIGHT_DECAY: 0.01
+  WEIGHT_DECAY_SCHEDULE: True
+  STEPS: (0.67, 0.89)
+  MAX_ITER: 1000000
+  IMS_PER_BATCH: 64
+  WARMUP_ITERS: 2000
+  WARMUP_FACTOR: 0.001
+
+  FIND_UNUSED_PARAMETERS: False
+
+  CLIP_GRADIENTS:
+    ENABLED: True
+    CLIP_TYPE: "full_model"
+    CLIP_VALUE: 1.0
+    NORM_TYPE: 2.0
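The GLIP Swin-L config that the Dockerfile previously fetched with wget is now committed directly under pretrained_models/. Assuming the Space loads it through GLIP's maskrcnn_benchmark-style YACS config (the package app.sh builds from the cloned GLIP repo), wiring this config to the downloaded checkpoint typically looks like the sketch below; the repo's actual inference code may differ:

```python
# Sketch, assuming the maskrcnn_benchmark package built from the GLIP clone.
from maskrcnn_benchmark.config import cfg

cfg.merge_from_file("./pretrained_models/GLIP/configs/glip_Swin_L.yaml")
# Point MODEL.WEIGHT at the checkpoint pulled by download_files_from_hub.py
# instead of the bare Swin backbone weight named in the YAML.
cfg.merge_from_list([
    "MODEL.WEIGHT",
    "./pretrained_models/GLIP/checkpoints/glip_large_model.pth",
])
cfg.freeze()
```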
requirements.txt CHANGED
@@ -255,4 +255,4 @@ xxhash
 yacs
 yarl
 gradio
-# huggingface_hub
+huggingface_hub