Spaces:
Paused
Paused
Update start.sh
Browse files
start.sh
CHANGED
|
@@ -5,6 +5,14 @@ echo "======================================================="
|
|
| 5 |
echo " ADUC-SDR — Start (VINCIE/SeedVR, 8× L40S)"
|
| 6 |
echo "======================================================="
|
| 7 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
# ---------------------- Env base ----------------------
|
| 9 |
export CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0,1,2,3,4,5,6,7}"
|
| 10 |
export TORCH_DTYPE="${TORCH_DTYPE:-bfloat16}"
|
|
@@ -16,61 +24,75 @@ export ENABLE_MATH_SDP="${ENABLE_MATH_SDP:-0}"
|
|
| 16 |
export FLASH_ATTENTION_DISABLE="${FLASH_ATTENTION_DISABLE:-0}"
|
| 17 |
export XFORMERS_FORCE_DISABLE="${XFORMERS_FORCE_DISABLE:-1}"
|
| 18 |
|
| 19 |
-
# CUDA/NCCL/perf
|
| 20 |
-
export CUDA_MODULE_LOADING="
|
| 21 |
export CUDA_DEVICE_MAX_CONNECTIONS="${CUDA_DEVICE_MAX_CONNECTIONS:-32}"
|
| 22 |
-
export CUDA_DEVICE_ORDER="
|
| 23 |
-
export PYTORCH_CUDA_ALLOC_CONF="
|
| 24 |
export OMP_NUM_THREADS="${OMP_NUM_THREADS:-8}"
|
| 25 |
export MKL_NUM_THREADS="${MKL_NUM_THREADS:-8}"
|
| 26 |
-
export NCCL_DEBUG="
|
| 27 |
-
export NCCL_ASYNC_ERROR_HANDLING=
|
| 28 |
-
export NCCL_P2P_DISABLE=
|
| 29 |
-
export NCCL_IB_DISABLE=
|
| 30 |
-
export NCCL_SOCKET_IFNAME="
|
| 31 |
export NCCL_BLOCKING_WAIT=1
|
| 32 |
export TORCH_NCCL_BLOCKING_WAIT=1
|
| 33 |
-
export NCCL_TIMEOUT=
|
| 34 |
|
| 35 |
# HF caches
|
| 36 |
-
export HF_HOME
|
| 37 |
unset TRANSFORMERS_CACHE
|
| 38 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
|
| 40 |
-
|
| 41 |
-
|
|
|
|
|
|
|
| 42 |
if nvidia-smi >/dev/null 2>&1; then
|
| 43 |
-
if [ "${DISABLE_BUILDER:-0}"
|
| 44 |
-
echo "
|
| 45 |
chmod +x /app/builder.sh || true
|
| 46 |
-
#
|
| 47 |
-
SELF_HF_REPO_ID=
|
| 48 |
-
HF_UPLOAD_WHEELS=0
|
| 49 |
-
BUILDER_TIMEOUT_SEC=6000000
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
echo "⚠️ Builder excedeu tempo/retornou erro; prosseguindo com a aplicação."
|
| 53 |
-
}
|
| 54 |
else
|
| 55 |
-
echo "
|
| 56 |
fi
|
| 57 |
else
|
| 58 |
-
echo "
|
| 59 |
fi
|
| 60 |
|
| 61 |
-
|
| 62 |
-
# ---------------------- Banner ----------------------
|
| 63 |
./info.sh || true
|
|
|
|
|
|
|
|
|
|
|
|
|
| 64 |
|
| 65 |
|
| 66 |
-
ls -la /app && ls -R /app | head -n 2000
|
| 67 |
-
ls -la /data && ls -R /data | head -n 2000
|
| 68 |
-
|
| 69 |
|
| 70 |
-
echo "🚀 Subindo serviços..."
|
| 71 |
-
|
| 72 |
-
# Exemplo: subir UI mínima SD Img2Img (ajuste conforme seu app)
|
| 73 |
-
# python app_animatediff_min.py
|
| 74 |
|
|
|
|
|
|
|
|
|
|
| 75 |
# Ou subir VINCIE UI se for o caso
|
| 76 |
-
python app_vince.py
|
|
|
|
| 5 |
echo " ADUC-SDR — Start (VINCIE/SeedVR, 8× L40S)"
|
| 6 |
echo "======================================================="
|
| 7 |
|
| 8 |
+
# Diagnostic listing of /app: long listing first, then a recursive listing
# capped at 2000 lines. Best-effort only; a failure here must not stop startup.
{ ls -la /app && ls -R /app | head -n 2000; } || true
|
| 9 |
+
# Diagnostic listing of /data: long listing first, then a recursive listing
# capped at 2000 lines. Best-effort only; a failure here must not stop startup.
{ ls -la /data && ls -R /data | head -n 2000; } || true
|
| 10 |
+
# Diagnostic listing of the Hugging Face cache directory: long listing first,
# then a recursive listing capped at 2000 lines. The directory may not exist
# yet at this point, so the whole step is best-effort.
{ ls -la /app/.cache/huggingface && ls -R /app/.cache/huggingface | head -n 2000; } || true
|
| 11 |
+
# Diagnostic listing of /app/.cache: long listing first, then a recursive
# listing capped at 2000 lines. Best-effort only.
{ ls -la /app/.cache && ls -R /app/.cache | head -n 2000; } || true
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
|
| 16 |
# ---------------------- Env base ----------------------
|
| 17 |
export CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0,1,2,3,4,5,6,7}"
|
| 18 |
export TORCH_DTYPE="${TORCH_DTYPE:-bfloat16}"
|
|
|
|
| 24 |
export FLASH_ATTENTION_DISABLE="${FLASH_ATTENTION_DISABLE:-0}"
|
| 25 |
export XFORMERS_FORCE_DISABLE="${XFORMERS_FORCE_DISABLE:-1}"
|
| 26 |
|
| 27 |
+
# CUDA/NCCL/perf single-node robust
|
| 28 |
+
export CUDA_MODULE_LOADING="LAZY"
|
| 29 |
export CUDA_DEVICE_MAX_CONNECTIONS="${CUDA_DEVICE_MAX_CONNECTIONS:-32}"
|
| 30 |
+
export CUDA_DEVICE_ORDER="PCI_BUS_ID"
|
| 31 |
+
export PYTORCH_CUDA_ALLOC_CONF="max_split_size_mb:512,garbage_collection_threshold:0.8"
|
| 32 |
export OMP_NUM_THREADS="${OMP_NUM_THREADS:-8}"
|
| 33 |
export MKL_NUM_THREADS="${MKL_NUM_THREADS:-8}"
|
| 34 |
+
export NCCL_DEBUG="INFO"
|
| 35 |
+
export NCCL_ASYNC_ERROR_HANDLING=1
|
| 36 |
+
export NCCL_P2P_DISABLE=0
|
| 37 |
+
export NCCL_IB_DISABLE=1
|
| 38 |
+
export NCCL_SOCKET_IFNAME="lo"
|
| 39 |
export NCCL_BLOCKING_WAIT=1
|
| 40 |
export TORCH_NCCL_BLOCKING_WAIT=1
|
| 41 |
+
export NCCL_TIMEOUT=600
|
| 42 |
|
| 43 |
# HF caches
|
| 44 |
+
export HF_HOME=/app/.cache/huggingface
|
| 45 |
unset TRANSFORMERS_CACHE
|
| 46 |
|
| 47 |
+
# ---------------------- Baixar o modelo antes de iniciar ----------------------
|
| 48 |
+
echo "Baixando o modelo ByteDance-Seed/VINCIE-3B para cache persistente..."
|
| 49 |
+
|
| 50 |
+
# Use Python para baixar o modelo via huggingface_hub
|
| 51 |
+
python -c "
|
| 52 |
+
import os
|
| 53 |
+
from huggingface_hub import snapshot_download
|
| 54 |
+
|
| 55 |
+
os.environ['HF_HOME'] = '/app/.cache/huggingface'
|
| 56 |
+
repo_id = 'ByteDance-Seed/VINCIE-3B'
|
| 57 |
+
snapshot_download(repo_id=repo_id, cache_dir=os.path.join(os.environ['HF_HOME'], 'hub'))
|
| 58 |
+
" || { echo "Erro ao baixar o modelo. Continuando..."; }
|
| 59 |
+
|
| 60 |
+
# Verificar se o cache foi criado
|
| 61 |
+
# Show the hub cache directory, or report that it is missing. The expansion is
# quoted so a path containing spaces or glob characters is passed as one argument.
ls -la "${HF_HOME}/hub" || echo "Cache não encontrado após download."
|
| 62 |
+
|
| 63 |
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
# ---------------------- Builder Apex/Q8 ----------------------
|
| 68 |
if nvidia-smi >/dev/null 2>&1; then
|
| 69 |
+
if [ "${DISABLE_BUILDER:-0}" -eq 0 ]; then
|
| 70 |
+
echo "Executando builder Apex/Q8..."
|
| 71 |
chmod +x /app/builder.sh || true
|
| 72 |
+
# Executa com GPU disponível, busca wheels no HF e compila se necessário (sem FlashAttention)
|
| 73 |
+
SELF_HF_REPO_ID=carlex3321/aduc-sdr # repo de wheels no HF
|
| 74 |
+
HF_UPLOAD_WHEELS=0 # publica wheels geradas
|
| 75 |
+
BUILDER_TIMEOUT_SEC=6000000 # tempo limite
|
| 76 |
+
Q8_REPO="" Q8_COMMIT="" # pin do LTX Q8
|
| 77 |
+
# The builder settings assigned just above are plain (unexported) shell
# variables, so the child process started by `bash -lc` would not see them.
# Export them before launching the builder.
# NOTE(review): presumably /app/builder.sh reads these names — confirm.
export SELF_HF_REPO_ID HF_UPLOAD_WHEELS BUILDER_TIMEOUT_SEC Q8_REPO Q8_COMMIT
# A builder timeout or failure is non-fatal: report it and continue startup.
timeout "${BUILDER_TIMEOUT_SEC:-60000}" bash -lc /app/builder.sh || echo "Builder excedeu tempo/retornou erro, prosseguindo com a aplicação."
|
|
|
|
|
|
|
| 78 |
else
|
| 79 |
+
echo "Builder desabilitado por DISABLE_BUILDER=1"
|
| 80 |
fi
|
| 81 |
else
|
| 82 |
+
echo "GPU não visível, pulando builder Apex/Q8."
|
| 83 |
fi
|
| 84 |
|
| 85 |
+
# ---------------------- Banner / diagnostics ----------------------
|
|
|
|
| 86 |
./info.sh || true
|
| 87 |
+
# Diagnostic listing of /app: long listing first, then a recursive listing
# capped at 2000 lines. Best-effort only; a failure here must not stop startup.
{ ls -la /app && ls -R /app | head -n 2000; } || true
|
| 88 |
+
# Diagnostic listing of /data: long listing first, then a recursive listing
# capped at 2000 lines. Best-effort only; a failure here must not stop startup.
{ ls -la /data && ls -R /data | head -n 2000; } || true
|
| 89 |
+
# Diagnostic listing of the Hugging Face cache directory: long listing first,
# then a recursive listing capped at 2000 lines. Best-effort only.
{ ls -la /app/.cache/huggingface && ls -R /app/.cache/huggingface | head -n 2000; } || true
|
| 90 |
+
# Diagnostic listing of /app/.cache: long listing first, then a recursive
# listing capped at 2000 lines. Best-effort only.
{ ls -la /app/.cache && ls -R /app/.cache | head -n 2000; } || true
|
| 91 |
|
| 92 |
|
|
|
|
|
|
|
|
|
|
| 93 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 94 |
|
| 95 |
+
echo "Subindo serviços..."
|
| 96 |
+
# ---------------------- Start application ----------------------
|
| 97 |
+
python /app/vince.py
|
| 98 |
# Ou subir VINCIE UI se for o caso
|
|
|