Xueqing Wu committed
Commit 99e8fc6
Parent(s): e20ef71

download files from hub
Files changed:
- .gitignore +1 -1
- Dockerfile +0 -15
- app.sh +1 -0
- download_files_from_hub.py +6 -0
- pretrained_models/GLIP/configs/glip_Swin_L.yaml +120 -0
- requirements.txt +1 -1
.gitignore CHANGED
@@ -1,3 +1,3 @@
 __pycache__/
 *.pyc
-
+.idea/
Dockerfile CHANGED
@@ -31,27 +31,12 @@ RUN mkdir $HOME/.cache $HOME/.config \
     && rm ~/miniconda.sh \
     && conda clean -ya
 
-# From here are my stuff
-
-# Download models
-RUN pip install --no-cache-dir gdown && \
-    mkdir -p ./pretrained_models/GLIP/checkpoints && \
-    mkdir -p ./pretrained_models/GLIP/configs && \
-    mkdir -p ./pretrained_models/xvlm && \
-    wget -nc -q -P ./pretrained_models/GLIP/checkpoints https://huggingface.co/GLIPModel/GLIP/resolve/main/glip_large_model.pth && \
-    wget -nc -q -P ./pretrained_models/GLIP/configs https://raw.githubusercontent.com/microsoft/GLIP/main/configs/pretrain/glip_Swin_L.yaml && \
-    gdown "https://drive.google.com/u/0/uc?id=1bv6_pZOsXW53EhlwU0ZgSk03uzFI61pN" -O ./pretrained_models/xvlm/retrieval_mscoco_checkpoint_9.pth
-
 # Python packages
 RUN --mount=target=requirements.txt,source=requirements.txt \
     pip install --no-cache-dir torch torchvision && \
     pip install --no-cache-dir git+https://github.com/openai/CLIP.git && \
     pip install --no-cache-dir -r requirements.txt
 
-RUN python -c "from transformers import AutoModel; _ = AutoModel.from_pretrained('codellama/CodeLlama-7b-Python-hf')"
-RUN python -c "from transformers import AutoModel; _ = AutoModel.from_pretrained('VDebugger/VDebugger-critic-generalist-7B')"
-RUN python -c "from transformers import AutoModel; _ = AutoModel.from_pretrained('VDebugger/VDebugger-refiner-generalist-7B')"
-
 # Download GLIP dependencies, but unfortunately don't install yet...
 RUN git clone https://github.com/sachit-menon/GLIP
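This hunk removes every build-time download from the image: the gdown/wget checkpoint fetches (replaced by download_files_from_hub.py below) and the three cache-warming RUN python -c lines, whose models will presumably now be fetched lazily on the first from_pretrained call at runtime. If eager pre-fetching were still wanted at startup, a minimal sketch using huggingface_hub's snapshot_download (which fills the local cache without instantiating a model) might look like this; it is an option, not something this commit adds:

from huggingface_hub import snapshot_download

# Hypothetical pre-fetch of the three repos the removed RUN lines warmed.
# snapshot_download only populates the local cache; unlike
# AutoModel.from_pretrained, it never loads weights into memory.
for repo_id in (
    "codellama/CodeLlama-7b-Python-hf",
    "VDebugger/VDebugger-critic-generalist-7B",
    "VDebugger/VDebugger-refiner-generalist-7B",
):
    snapshot_download(repo_id=repo_id)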
app.sh CHANGED
@@ -1,3 +1,4 @@
+python download_files_from_hub.py
 cd GLIP
 python setup.py clean --all build develop --user
 cd ../
download_files_from_hub.py ADDED
@@ -0,0 +1,6 @@
+from huggingface_hub import hf_hub_download
+
+hf_hub_download(repo_id="GLIPModel/GLIP", filename="glip_large_model.pth",
+                local_dir="./pretrained_models/GLIP/checkpoints")
+hf_hub_download(repo_id="VDebugger/xvlm_retrieval_mscoco", filename="retrieval_mscoco_checkpoint_9.pth",
+                local_dir="./pretrained_models/xvlm/")
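Worth noting: hf_hub_download returns the resolved local path and is idempotent, so running this script on every container start (as app.sh now does) is cheap after the first run. A table-driven sketch of the same two downloads, using only the API shown above (the CHECKPOINTS table is a hypothetical restructuring, not part of the commit):

from huggingface_hub import hf_hub_download

# (repo_id, filename, local_dir) for each checkpoint the app needs.
CHECKPOINTS = [
    ("GLIPModel/GLIP", "glip_large_model.pth",
     "./pretrained_models/GLIP/checkpoints"),
    ("VDebugger/xvlm_retrieval_mscoco", "retrieval_mscoco_checkpoint_9.pth",
     "./pretrained_models/xvlm/"),
]

for repo_id, filename, local_dir in CHECKPOINTS:
    # Returns the resolved local path; a second run reuses the
    # already-downloaded copy instead of re-fetching it.
    path = hf_hub_download(repo_id=repo_id, filename=filename,
                           local_dir=local_dir)
    print(f"ready: {path}")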
pretrained_models/GLIP/configs/glip_Swin_L.yaml ADDED
@@ -0,0 +1,120 @@
+MODEL:
+  META_ARCHITECTURE: "GeneralizedVLRCNN"
+  WEIGHT: "swin_large_patch4_window12_384_22k.pth"
+  RPN_ONLY: True
+  RPN_ARCHITECTURE: "VLDYHEAD"
+
+  BACKBONE:
+    CONV_BODY: "SWINT-FPN-RETINANET"
+    OUT_CHANNELS: 256
+
+  SWINT:
+    EMBED_DIM: 192
+    DEPTHS: (2, 2, 18, 2)
+    NUM_HEADS: (6, 12, 24, 48)
+    WINDOW_SIZE: 12
+    OUT_CHANNELS: (192, 384, 768, 1536)
+    DROP_PATH_RATE: 0.4
+
+  LANGUAGE_BACKBONE:
+    FREEZE: False
+    MODEL_TYPE: "bert-base-uncased" # "roberta-base", "clip"
+    MASK_SPECIAL: False
+
+  RPN:
+    USE_FPN: True
+    ANCHOR_SIZES: (64, 128, 256, 512, 1024)
+    ANCHOR_STRIDE: (8, 16, 32, 64, 128)
+    ASPECT_RATIOS: (1.0,)
+    SCALES_PER_OCTAVE: 1
+
+  DYHEAD:
+    CHANNELS: 256
+    NUM_CONVS: 8
+    USE_GN: True
+    USE_DYRELU: True
+    USE_DFCONV: True
+    USE_DYFUSE: True
+    TOPK: 9 # topk for selecting candidate positive samples from each level
+    SCORE_AGG: "MEAN"
+    LOG_SCALE: 0.0
+
+    USE_CHECKPOINT: True
+    FUSE_CONFIG:
+      USE_FUSED_FEATURES_DOT_PRODUCT: True
+      EARLY_FUSE_ON: True
+      TYPE: "MHA-B"
+      USE_CLASSIFICATION_LOSS: False
+      USE_TOKEN_LOSS: False
+      USE_CONTRASTIVE_ALIGN_LOSS: False
+      CONTRASTIVE_HIDDEN_DIM: 64
+      USE_DOT_PRODUCT_TOKEN_LOSS: True
+      USE_LAYER_SCALE: True
+      CLAMP_MIN_FOR_UNDERFLOW: True
+      CLAMP_MAX_FOR_OVERFLOW: True
+      CLAMP_BERTATTN_MIN_FOR_UNDERFLOW: True
+      CLAMP_BERTATTN_MAX_FOR_OVERFLOW: True
+      CLAMP_DOT_PRODUCT: True
+
+DATASETS:
+
+  TRAIN: ("mixed_train_no_coco",) # Place holder dataset for now. To be updated in the next version
+  TEST: ("coco_2017_val", )
+
+  ONE_HOT: False
+  FLICKR_COPY: 8 # 0.15 * 8 = ~1.2M
+  MIXED_COPY: 4 # 0.6 * 4 = ~2.4M
+  OBJECT365_COPY: 2 # 1.4 * 2 = ~2.8M
+  VG_COPY: 3 # 0.4 * 3 = ~1.2M
+  IN_COPY: 2 # 0.67 * 2 = ~1.33M
+  OI_COPY: 1 # 2M * 1 = 2M
+
+  DISABLE_SHUFFLE: False
+  ADD_DET_PROMPT: False
+  RANDOM_SAMPLE_NEG: 85
+  CONTROL_PROB: (0.0, 0.0, 0.5, 0.0)
+  FURTHER_SCREEN: True
+  CAPTION_CONF: 0.5
+  CAPTION_NMS: -1.0
+  CAPTION_MIN_BOX: 1
+
+  SEPARATION_TOKENS: ". "
+
+  PACK_RANDOM_CAPTION_NUMBER: 20
+  NO_RANDOM_PACK_PROBABILITY: 0.4
+  RANDOM_PACK_PROB: 0.5
+  CAPTION_FORMAT_VERSION: "v2"
+
+INPUT:
+  PIXEL_MEAN: [ 103.530, 116.280, 123.675 ]
+  PIXEL_STD: [ 57.375, 57.120, 58.395 ]
+  MIN_SIZE_TRAIN: 800
+  MAX_SIZE_TRAIN: 1333
+  MIN_SIZE_TEST: 800
+  MAX_SIZE_TEST: 1333
+
+AUGMENT:
+  MULT_MIN_SIZE_TRAIN: (480,560,640,720,800)
+
+DATALOADER:
+  SIZE_DIVISIBILITY: 32
+
+SOLVER:
+  OPTIMIZER: ADAMW
+  BASE_LR: 0.0001
+  LANG_LR: 0.00001
+  WEIGHT_DECAY: 0.01
+  WEIGHT_DECAY_SCHEDULE: True
+  STEPS: (0.67, 0.89)
+  MAX_ITER: 1000000
+  IMS_PER_BATCH: 64
+  WARMUP_ITERS: 2000
+  WARMUP_FACTOR: 0.001
+
+  FIND_UNUSED_PARAMETERS: False
+
+  CLIP_GRADIENTS:
+    ENABLED: True
+    CLIP_TYPE: "full_model"
+    CLIP_VALUE: 1.0
+    NORM_TYPE: 2.0
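The added file is GLIP's Swin-L pretraining config in yacs format; the MODEL.WEIGHT it names is overridden in practice by the checkpoint downloaded above. A minimal loading sketch, assuming the yacs-based cfg object the cloned GLIP repo exposes as maskrcnn_benchmark.config (an assumption about that repo's layout, not something this commit does):

from maskrcnn_benchmark.config import cfg  # provided by the cloned GLIP repo

# Merge the file, then point MODEL.WEIGHT at the downloaded checkpoint.
cfg.merge_from_file("pretrained_models/GLIP/configs/glip_Swin_L.yaml")
cfg.merge_from_list(["MODEL.WEIGHT",
                     "pretrained_models/GLIP/checkpoints/glip_large_model.pth"])
cfg.freeze()
print(cfg.MODEL.META_ARCHITECTURE)  # -> "GeneralizedVLRCNN"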
requirements.txt CHANGED
@@ -255,4 +255,4 @@ xxhash
 yacs
 yarl
 gradio
-
+huggingface_hub