feat(ollama): migrate from Ollama to llama.cpp + llama-swap
Replace the Ollama service with a custom ROCm image combining ghcr.io/ggml-org/llama.cpp:server-rocm and llama-swap v199.

Main motivations:
- Unblock qwen3.5 HF GGUFs (qwen35 architecture not supported in Ollama 0.20.4 for HF-imported models)
- Stay current with llama.cpp upstream without waiting for Ollama releases

Changes:
- ollama/Dockerfile: build llama-swap on top of llama.cpp:server-rocm
- ollama/llama-swap.yaml: define 4 models with full sampler config, GPU offload, and mmproj for the two multimodal HF fine-tunes
- ollama/docker-compose.yml: replace Ollama image with local build; fix broken volume mount (was /ubuntu/.ollama, now explicit /models)
- ollama/Caddyfile: update upstream port 11434→8080 (llama-swap default)
- ai/docker-compose.yml: switch Open WebUI from OLLAMA_BASE_URL to OPENAI_API_BASE_URL pointing at llama-swap /v1 endpoint
This commit is contained in:
parent
299d712400
commit
3034f987d7
@@ -8,7 +8,9 @@ services:
|
||||
- "/srv/docker/ai/data/data:/app/backend/data" # Double data is intentional
|
||||
- "/srv/docker/ai/data/.webui_secret_key:/app/backend/.webui_secret_key"
|
||||
environment:
|
||||
- OLLAMA_BASE_URL=https://ollama.lan.poldebra.me
|
||||
- OPENAI_API_BASE_URL=https://ollama.lan.poldebra.me/v1
|
||||
- OPENAI_API_KEY=sk-no-key-required
|
||||
- ENABLE_OLLAMA_API=false
|
||||
networks:
|
||||
internal:
|
||||
ipv4_address: 172.24.0.5
|
||||
|
||||
@@ -21,7 +21,7 @@
|
||||
X-Forwarded-Host {host}
|
||||
X-Forwarded-Port {server_port}
|
||||
}
|
||||
reverse_proxy 172.23.0.5:11434 {
|
||||
reverse_proxy 172.23.0.5:8080 {
|
||||
header_up X-Forwarded-Proto {scheme}
|
||||
}
|
||||
}
|
||||
|
||||
12
ollama/Dockerfile
Normal file
12
ollama/Dockerfile
Normal file
@@ -0,0 +1,12 @@
|
||||
# syntax=docker/dockerfile:1
|
||||
FROM ghcr.io/ggml-org/llama.cpp:server-rocm
|
||||
|
||||
ARG LLAMA_SWAP_VERSION=v199
|
||||
ADD https://github.com/mostlygeek/llama-swap/releases/download/${LLAMA_SWAP_VERSION}/llama-swap_199_linux_amd64.tar.gz /tmp/llama-swap.tar.gz
|
||||
RUN tar -xzf /tmp/llama-swap.tar.gz -C /usr/local/bin llama-swap \
|
||||
&& chmod +x /usr/local/bin/llama-swap \
|
||||
&& rm /tmp/llama-swap.tar.gz
|
||||
|
||||
EXPOSE 8080
|
||||
ENTRYPOINT ["/usr/local/bin/llama-swap"]
|
||||
CMD ["-config", "/etc/llama-swap/config.yaml", "-listen", ":8080"]
|
||||
@@ -1,15 +1,19 @@
|
||||
services:
|
||||
app:
|
||||
image: ollama/ollama:rocm
|
||||
build: .
|
||||
image: local/llama-swap-rocm:latest
|
||||
restart: unless-stopped
|
||||
hostname: ollama
|
||||
container_name: ollama
|
||||
user: 1000:1000
|
||||
volumes:
|
||||
- "/srv/docker/ollama/data:/ubuntu/.ollama"
|
||||
- "/srv/docker/ollama/data/models:/models:ro"
|
||||
- "./llama-swap.yaml:/etc/llama-swap/config.yaml:ro"
|
||||
devices:
|
||||
- "/dev/kfd:/dev/kfd"
|
||||
- "/dev/dri:/dev/dri"
|
||||
group_add:
|
||||
- video
|
||||
- render
|
||||
networks:
|
||||
internal:
|
||||
ipv4_address: 172.23.0.5
|
||||
|
||||
44
ollama/llama-swap.yaml
Normal file
44
ollama/llama-swap.yaml
Normal file
@@ -0,0 +1,44 @@
|
||||
healthCheckTimeout: 180
|
||||
logLevel: info
|
||||
|
||||
models:
|
||||
"qwen3.5:9b":
|
||||
cmd: |
|
||||
/app/llama-server
|
||||
--host 0.0.0.0 --port ${PORT}
|
||||
--model /models/qwen3.5-9b.gguf
|
||||
--alias qwen3.5:9b
|
||||
--n-gpu-layers 999
|
||||
--ctx-size 8192
|
||||
--temp 1 --top-k 20 --top-p 0.95 --presence-penalty 1.5
|
||||
|
||||
"qwen3.5:9bctxSmall":
|
||||
cmd: |
|
||||
/app/llama-server
|
||||
--host 0.0.0.0 --port ${PORT}
|
||||
--model /models/qwen3.5-9b.gguf
|
||||
--alias qwen3.5:9bctxSmall
|
||||
--n-gpu-layers 999
|
||||
--ctx-size 131072
|
||||
--temp 1 --top-k 20 --top-p 0.95 --presence-penalty 1.5
|
||||
|
||||
"hf.co/HauhauCS/Qwen3.5-9B-Uncensored-HauhauCS-Aggressive:q4_k_m":
|
||||
cmd: |
|
||||
/app/llama-server
|
||||
--host 0.0.0.0 --port ${PORT}
|
||||
--model /models/HauhauCS-Qwen3.5-9B-Uncensored-Aggressive.q4_k_m.gguf
|
||||
--mmproj /models/HauhauCS-Qwen3.5-9B-Uncensored-Aggressive.mmproj.gguf
|
||||
--alias "hf.co/HauhauCS/Qwen3.5-9B-Uncensored-HauhauCS-Aggressive:q4_k_m"
|
||||
--n-gpu-layers 999
|
||||
--ctx-size 32768
|
||||
|
||||
"hf.co/Jackrong/Qwen3.5-9B-Claude-4.6-Opus-Reasoning-Distilled-v2-GGUF:q4_k_m":
|
||||
cmd: |
|
||||
/app/llama-server
|
||||
--host 0.0.0.0 --port ${PORT}
|
||||
--model /models/Jackrong-Qwen3.5-9B-Claude-4.6-Opus-Reasoning-Distilled-v2.q4_k_m.gguf
|
||||
--mmproj /models/Jackrong-Qwen3.5-9B-Claude-4.6-Opus-Reasoning-Distilled-v2.mmproj.gguf
|
||||
--alias "hf.co/Jackrong/Qwen3.5-9B-Claude-4.6-Opus-Reasoning-Distilled-v2-GGUF:q4_k_m"
|
||||
--n-gpu-layers 999
|
||||
--ctx-size 32768
|
||||
--temp 0.6 --top-k 20 --top-p 0.95 --repeat-penalty 1
|
||||
Loading…
x
Reference in New Issue
Block a user