carlex3321 committed on
Commit
47d5744
·
verified ·
1 Parent(s): aa104ac

Update start.sh

Browse files
Files changed (1) hide show
  1. start.sh +57 -35
start.sh CHANGED
@@ -5,6 +5,14 @@ echo "======================================================="
5
  echo " ADUC-SDR — Start (VINCIE/SeedVR, 8× L40S)"
6
  echo "======================================================="
7
 
 
 
 
 
 
 
 
 
8
  # ---------------------- Env base ----------------------
9
  export CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0,1,2,3,4,5,6,7}"
10
  export TORCH_DTYPE="${TORCH_DTYPE:-bfloat16}"
@@ -16,61 +24,75 @@ export ENABLE_MATH_SDP="${ENABLE_MATH_SDP:-0}"
16
  export FLASH_ATTENTION_DISABLE="${FLASH_ATTENTION_DISABLE:-0}"
17
  export XFORMERS_FORCE_DISABLE="${XFORMERS_FORCE_DISABLE:-1}"
18
 
19
- # CUDA/NCCL/perf single-node robust
20
- export CUDA_MODULE_LOADING="${CUDA_MODULE_LOADING:-LAZY}"
21
  export CUDA_DEVICE_MAX_CONNECTIONS="${CUDA_DEVICE_MAX_CONNECTIONS:-32}"
22
- export CUDA_DEVICE_ORDER="${CUDA_DEVICE_ORDER:-PCI_BUS_ID}"
23
- export PYTORCH_CUDA_ALLOC_CONF="${PYTORCH_CUDA_ALLOC_CONF:-max_split_size_mb:512,garbage_collection_threshold:0.8}"
24
  export OMP_NUM_THREADS="${OMP_NUM_THREADS:-8}"
25
  export MKL_NUM_THREADS="${MKL_NUM_THREADS:-8}"
26
- export NCCL_DEBUG="${NCCL_DEBUG:-INFO}"
27
- export NCCL_ASYNC_ERROR_HANDLING="${NCCL_ASYNC_ERROR_HANDLING:-1}"
28
- export NCCL_P2P_DISABLE="${NCCL_P2P_DISABLE:-0}"
29
- export NCCL_IB_DISABLE="${NCCL_IB_DISABLE:-1}"
30
- export NCCL_SOCKET_IFNAME="${NCCL_SOCKET_IFNAME:-lo}"
31
  export NCCL_BLOCKING_WAIT=1
32
  export TORCH_NCCL_BLOCKING_WAIT=1
33
- export NCCL_TIMEOUT="${NCCL_TIMEOUT:-600}"
34
 
35
  # HF caches
36
- export HF_HOME="${HF_HOME:-/app/.cache/huggingface}"
37
  unset TRANSFORMERS_CACHE
38
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
 
40
- # ---------------------- Builder (Apex + Q8) ----------------------
41
- # Executa com GPU disponível; busca wheels no HF e compila se necessário (sem FlashAttention)
 
 
42
  if nvidia-smi >/dev/null 2>&1; then
43
- if [ "${DISABLE_BUILDER:-0}" = "0" ]; then
44
- echo "🔧 Executando builder (Apex + Q8)..."
45
  chmod +x /app/builder.sh || true
46
- # Configuráveis:
47
- SELF_HF_REPO_ID="carlex3321/aduc-sdr" # repo de wheels no HF
48
- HF_UPLOAD_WHEELS=0 # publica wheels geradas
49
- BUILDER_TIMEOUT_SEC=6000000 # tempo limite
50
- #Q8_REPO / Q8_COMMIT # pin do LTX Q8
51
- ( timeout ${BUILDER_TIMEOUT_SEC:-60000} bash -lc "/app/builder.sh" ) || {
52
- echo "⚠️ Builder excedeu tempo/retornou erro; prosseguindo com a aplicação."
53
- }
54
  else
55
- echo "ℹ️ Builder desabilitado por DISABLE_BUILDER=1"
56
  fi
57
  else
58
- echo "⚠️ GPU não visível; pulando builder (Apex/Q8)."
59
  fi
60
 
61
-
62
- # ---------------------- Banner ----------------------
63
  ./info.sh || true
 
 
 
 
64
 
65
 
66
- ls -la /app && ls -R /app | head -n 2000
67
- ls -la /data && ls -R /data | head -n 2000
68
-
69
 
70
- echo "🚀 Subindo serviços..."
71
-
72
- # Exemplo: subir UI mínima SD Img2Img (ajuste conforme seu app)
73
- # python app_animatediff_min.py
74
 
 
 
 
75
  # Ou subir VINCIE UI se for o caso
76
- python app_vince.py
 
5
  echo " ADUC-SDR — Start (VINCIE/SeedVR, 8× L40S)"
6
  echo "======================================================="
7
 
8
+ ls -la /app ls -R /app | head -n 2000
9
+ ls -la /data ls -R /data | head -n 2000
10
+ ls -la /app/.cache/huggingface -R /app/.cache/huggingface | head -n 2000
11
+ ls -la /app/.cache -R /app/.cache | head -n 2000
12
+
13
+
14
+
15
+
16
  # ---------------------- Env base ----------------------
17
  export CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0,1,2,3,4,5,6,7}"
18
  export TORCH_DTYPE="${TORCH_DTYPE:-bfloat16}"
 
24
  export FLASH_ATTENTION_DISABLE="${FLASH_ATTENTION_DISABLE:-0}"
25
  export XFORMERS_FORCE_DISABLE="${XFORMERS_FORCE_DISABLE:-1}"
26
 
27
+ # CUDA/NCCL/perf single-node robust
28
+ export CUDA_MODULE_LOADING="LAZY"
29
  export CUDA_DEVICE_MAX_CONNECTIONS="${CUDA_DEVICE_MAX_CONNECTIONS:-32}"
30
+ export CUDA_DEVICE_ORDER="PCI_BUS_ID"
31
+ export PYTORCH_CUDA_ALLOC_CONF="max_split_size_mb:512,garbage_collection_threshold:0.8"
32
  export OMP_NUM_THREADS="${OMP_NUM_THREADS:-8}"
33
  export MKL_NUM_THREADS="${MKL_NUM_THREADS:-8}"
34
+ export NCCL_DEBUG="INFO"
35
+ export NCCL_ASYNC_ERROR_HANDLING=1
36
+ export NCCL_P2P_DISABLE=0
37
+ export NCCL_IB_DISABLE=1
38
+ export NCCL_SOCKET_IFNAME="lo"
39
  export NCCL_BLOCKING_WAIT=1
40
  export TORCH_NCCL_BLOCKING_WAIT=1
41
+ export NCCL_TIMEOUT=600
42
 
43
  # HF caches
44
+ export HF_HOME=/app/.cache/huggingface
45
  unset TRANSFORMERS_CACHE
46
 
47
+ # ---------------------- Baixar o modelo antes de iniciar ----------------------
48
+ echo "Baixando o modelo ByteDance-Seed/VINCIE-3B para cache persistente..."
49
+
50
+ # Use Python para baixar o modelo via huggingface_hub
51
+ python -c "
52
+ import os
53
+ from huggingface_hub import snapshot_download
54
+
55
+ os.environ['HF_HOME'] = '/app/.cache/huggingface'
56
+ repo_id = 'ByteDance-Seed/VINCIE-3B'
57
+ snapshot_download(repo_id=repo_id, cache_dir=os.path.join(os.environ['HF_HOME'], 'hub'))
58
+ " || { echo "Erro ao baixar o modelo. Continuando..."; }
59
+
60
+ # Verificar se o cache foi criado
61
+ ls -la $HF_HOME/hub || echo "Cache não encontrado após download."
62
+
63
 
64
+
65
+
66
+
67
+ # ---------------------- Builder Apex/Q8 ----------------------
68
  if nvidia-smi >/dev/null 2>&1; then
69
+ if [ "${DISABLE_BUILDER:-0}" -eq 0 ]; then
70
+ echo "Executando builder Apex/Q8..."
71
  chmod +x /app/builder.sh || true
72
+ # Executa com GPU disponível, busca wheels no HF e compila se necessário (sem FlashAttention)
73
+ SELF_HF_REPO_ID=carlex3321/aduc-sdr # repo de wheels no HF
74
+ HF_UPLOAD_WHEELS=0 # publica wheels geradas
75
+ BUILDER_TIMEOUT_SEC=6000000 # tempo limite
76
+ Q8_REPO="" Q8_COMMIT="" # pin do LTX Q8
77
+ timeout ${BUILDER_TIMEOUT_SEC:-60000} bash -lc /app/builder.sh || echo "Builder excedeu tempo/retornou erro, prosseguindo com a aplicação."
 
 
78
  else
79
+ echo "Builder desabilitado por DISABLE_BUILDER=1"
80
  fi
81
  else
82
+ echo "GPU não visível, pulando builder Apex/Q8."
83
  fi
84
 
85
+ # Configuráveis
 
86
  ./info.sh || true
87
+ ls -la /app ls -R /app | head -n 2000
88
+ ls -la /data ls -R /data | head -n 2000
89
+ ls -la /app/.cache/huggingface -R /app/.cache/huggingface | head -n 2000
90
+ ls -la /app/.cache -R /app/.cache | head -n 2000
91
 
92
 
 
 
 
93
 
 
 
 
 
94
 
95
+ echo "Subindo serviços..."
96
+ # ---------------------- Banner ----------------------
97
+ python /app/vince.py
98
  # Ou subir VINCIE UI se for o caso