vost committed (verified) · Commit 713c92e · Parent: 98eaea5

Upload 3 files

APPs/SD-Next/Dockerfile ADDED
Added as an empty file.
APPs/llama-swap/config.yaml ADDED
@@ -0,0 +1,161 @@
healthCheckTimeout: 1500
startPort: 65001
ttl: 1200

# --- Global Macros ---
macros:
  "base_vRAM": >-
    --privileged --label ai-type=worker --ulimit memlock=-1 --ulimit stack=67108864
    --device /dev/dri:/dev/dri
    --device /dev/accel/accel0:/dev/accel/accel0
    -v /home/lvivas/Modelos/models:/root/.cache/
    -v /tmp/cache_file:/tmp/cache_file
    -v /home/lvivas/Modelos/grammar:/tmp/grammar
    -e ZES_ENABLE_SYSMAN=1 -e GGML_SYCL_UNIFIED_SHARED_MEMORY=1 -e GGML_SYCL_OVERRIDE_ALLOCATOR=1

  "base_Opts": "--ctx-size 32768 --no-webui --gpu-layers 99 --fit on -t 10 --prio 3 --poll 25 --perf --mlock --no-mmap --split-mode none --flash-attn on"

  "kv_cache_opt": >-
    --cache-type-k q8_0 --cache-type-v q8_0 --swa-full
    --ctx-checkpoints 10 --batch-size 4096 --ubatch-size 1024

  "draft_qwen_08": >-
    -hfd mradermacher/Huihui-Qwen3.5-0.8B-abliterated-GGUF:Q5_K_M
    --draft 5 -ctkd q8_0 -ctvd q8_0

  "run_intel": >-
    -e ONEAPI_DEVICE_SELECTOR=level_zero:0
    ghcr.io/ggml-org/llama.cpp:server-intel

  # Last stable: b8445
  "run_vulkan": >-
    ghcr.io/ggml-org/llama.cpp:server-vulkan

  "run_test": >-
    ghcr.io/ggml-org/llama.cpp:server-vulkan

  "run_openvino": >-
    -e GGML_OPENVINO_DEVICE=GPU
    -e GGML_OPENVINO_PREFILL_CHUNK_SIZE=256
    ghcr.io/ggml-org/llama.cpp:server-openvino

  "chat_default": >-
    ${kv_cache_opt} ${base_Opts} --reasoning off
    --temp 0.6 --top-p 0.95 --top-k 50 --min-p 0.05
    --repeat-penalty 1.05 --presence-penalty 1.0 --repeat-last-n 1024
    --parallel 2

  "code_default": >-
    ${base_Opts} ${kv_cache_opt} --reasoning on
    --temp 0.2 --top-p 0.95 --top-k 20 --min-p 0.00
    --repeat-penalty 1.15 --parallel 1

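# Illustration (an editor's sketch, on the assumption that llama-swap expands
# ${...} macro references textually, including macros nested inside macros as
# "chat_default" and "code_default" do with ${kv_cache_opt} and ${base_Opts}):
# a model cmd line such as
#   docker run --pull always --rm ${base_vRAM} -p ${PORT}:8080 --name JOSIE
# resolves ${base_vRAM} to the privileged/device/volume flags defined above,
# while ${PORT} is a port llama-swap allocates per model starting at
# startPort (65001).
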
# --- Groups ---
groups:
  "Coding":
    swap: false
    exclusive: true
    members:
      - "Qwen3.5-9B"
      - "nomic-embed-text-v1.5"

  "Chat":
    swap: false
    exclusive: true
    members:
      - "JOSIE-4B"
      - "Darkidol-Ballad-9B"

hooks:
  on_startup:
    preload:
      - "Darkidol-Ballad-9B"

# --- Models ---
models:
  "nomic-embed-text-v1.5":
    proxy: "http://127.0.0.1:${PORT}"
    cmd: |
      docker run --pull always --rm ${base_vRAM} -p ${PORT}:8080 --name nomic-V
      ${run_vulkan}
      -hf nomic-ai/nomic-embed-text-v1.5-GGUF:Q4_K_M
      --embeddings --ctx-size 8192 --gpu-layers 99 --parallel 1 --flash-attn on
    cmdStop: "docker stop nomic-V"

  "JOSIE-4B":
    proxy: "http://127.0.0.1:${PORT}"
    cmd: |
      docker run --pull always --rm ${base_vRAM} -p ${PORT}:8080 --name JOSIE
      ${run_openvino}
      -hf mradermacher/JOSIE-4B-Instruct-GGUF:Q4_K_M
      ${chat_default}
    cmdStop: "docker stop JOSIE"

  "Qwen3.5-9B":
    proxy: "http://127.0.0.1:${PORT}"
    cmd: |
      docker run --pull always --rm ${base_vRAM} -p ${PORT}:8080 --name Qwen3.5-9B
      ${run_intel}
      -hf mradermacher/Qwen3.5-9B-ultra-heretic-GGUF:Q4_K_M
      ${code_default} --seed 3407
    cmdStop: "docker stop Qwen3.5-9B"

  "Character-Creator":
    proxy: "http://127.0.0.1:${PORT}"
    cmd: |
      docker run --pull always --rm ${base_vRAM} -p ${PORT}:8080 --name Character-Creator
      ${run_vulkan}
      -hf mradermacher/Llama-3.3-8B-Character-Creator-V2-GGUF:Q4_K_M
      ${chat_default}
    cmdStop: "docker stop Character-Creator"

  "Impish_Bloodmoon_12B":
    proxy: "http://127.0.0.1:${PORT}"
    cmd: |
      docker run --pull always --rm ${base_vRAM} -p ${PORT}:8080 --name Impish_Bloodmoon_12B
      ${run_vulkan}
      -hf SicariusSicariiStuff/Impish_Bloodmoon_12B_GGUF:Q4_K_M
      ${chat_default}
    cmdStop: "docker stop Impish_Bloodmoon_12B"

  "Darkidol-Ballad-9B":
    proxy: "http://127.0.0.1:${PORT}"
    cmd: |
      docker run --pull always --rm ${base_vRAM} -p ${PORT}:8080 --name Darkidol-Ballad-9B
      ${run_vulkan} -hf mradermacher/Darkidol-Ballad-9B-GGUF:Q5_K_M
      ${chat_default} ${draft_qwen_08}
    cmdStop: "docker stop Darkidol-Ballad-9B"

  # Intel variant; container name suffixed with -I so it cannot collide with
  # the Vulkan variant above.
  "Darkidol-Ballad-9B-I":
    proxy: "http://127.0.0.1:${PORT}"
    cmd: |
      docker run --pull always --rm ${base_vRAM} -p ${PORT}:8080 --name Darkidol-Ballad-9B-I
      ${run_intel}
      -hf mradermacher/Darkidol-Ballad-9B-GGUF:Q4_K_M
      ${chat_default}
    cmdStop: "docker stop Darkidol-Ballad-9B-I"

  "TEST-MODELS":
    proxy: "http://127.0.0.1:${PORT}"
    cmd: |
      docker run --pull always --rm ${base_vRAM} -p ${PORT}:8080 --name TEST-MODELS
      ${run_test}
      -hf ReadyArt/Omega-Evolution-9B-v2.0-GGUF:Q5_K_M
      ${draft_qwen_08}
      --gpu-layers 99 --ctx-size 8192 --no-context-shift
      -np 1 --flash-attn on --fit on --swa-full
      --mlock --no-mmap -b 4096 --ubatch-size 1024
      --temp 0.6 --min-p 0.05 --reasoning off --no-warmup
      --grammar-file /tmp/grammar/strict_xml.gbnf
    cmdStop: "docker stop TEST-MODELS"

  "00-HELP":
    proxy: "http://127.0.0.1:${PORT}"
    cmd: |
      docker run --pull always --rm ${base_vRAM} -p ${PORT}:8080 --name I00-HELP
      ${run_test} -h
    cmdStop: "docker stop I00-HELP"

filters:
  stripParams: "top_p, top_k, min_p, presence_penalty, frequency_penalty, repeat_last"
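
With both files in place, clients talk to the llama-swap proxy (listening on :65000 per the compose file below) through its OpenAI-compatible API, and llama-swap starts or swaps to whichever model key the request names. A minimal sketch, assuming the proxy is reachable on localhost:

curl http://localhost:65000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "Qwen3.5-9B", "messages": [{"role": "user", "content": "Hello"}]}'
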
APPs/llama-swap/docker-compose.yaml ADDED
@@ -0,0 +1,59 @@
#version: '3.8'

services:
  llama-swap:
    container_name: llamaswap
    image: ghcr.io/mostlygeek/llama-swap:vulkan
    restart: unless-stopped
    pull_policy: always
    network_mode: host
    privileged: true
    shm_size: '32gb'
    volumes:
      # New unified cache mapping (compatible with the HF cache migration)
      - /home/lvivas/Modelos/models:/root/.cache
      # Grammar files, so the orchestrator can validate the paths
      - /home/lvivas/Modelos/grammar:/tmp/grammar
      # Project configuration
      - ./config:/config
      # Communication with the Docker host (required)
      - /var/run/docker.sock:/var/run/docker.sock
      - /usr/bin/docker:/usr/bin/docker
      # Shared context cache
      - /tmp/cache_file:/tmp/cache_file
    environment:
      # Optional: makes sure the orchestrator knows where to look for files
      - XDG_CACHE_HOME=/root/.cache
    devices:
      - /dev/dri:/dev/dri
      - /dev/accel/accel0:/dev/accel/accel0
    group_add:
      - "44"
      - "991"
    cap_add:
      - SYS_ADMIN
      - SYS_RAWIO
      - IPC_LOCK
      - SYS_RESOURCE
    ulimits:
      memlock: -1
      stack: 67108864
    entrypoint: /app/llama-swap -config /config/config.yaml --listen :65000
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:65000/"]
      interval: 30s
      timeout: 10s
      retries: 3

  # THIS BLOCK MUST STAY ALIGNED WITH llama-swap:
  watch-config:
    image: docker:latest
    container_name: llamaswap-watcher
    restart: unless-stopped
    volumes:
      - ./config:/config
      - /home/lvivas/Modelos/models:/root/.cache
      - /var/run/docker.sock:/var/run/docker.sock
      - /tmp/cache_file:/tmp/cache_file
    # The entrypoint makes the script executable before it runs
    entrypoint: ["sh", "-c", "chmod +x /config/watch-config.sh && /config/watch-config.sh"]
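
The watcher's entrypoint expects /config/watch-config.sh, which is not included in this commit. A minimal sketch of what it would need to do (hypothetical script; the 5-second poll interval and restart-by-container-name are assumptions): checksum the config file in a loop and restart llamaswap through the mounted Docker socket whenever it changes.

#!/bin/sh
# Hypothetical watch-config.sh (not part of this commit): restart the llamaswap
# container whenever /config/config.yaml changes on disk.
CONFIG=/config/config.yaml
LAST=$(md5sum "$CONFIG" | awk '{print $1}')
while true; do
  sleep 5
  NOW=$(md5sum "$CONFIG" | awk '{print $1}')
  if [ "$NOW" != "$LAST" ]; then
    echo "config.yaml changed, restarting llamaswap"
    docker restart llamaswap
    LAST="$NOW"
  fi
done

With both services defined, docker compose up -d brings up the proxy and the watcher together.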