From 3034f987d721fc7b4b0e3ca31b5fa8e9348ce441 Mon Sep 17 00:00:00 2001
From: Davide Polonio
Date: Thu, 9 Apr 2026 23:14:43 +0200
Subject: [PATCH] feat(ollama): migrate from Ollama to llama.cpp + llama-swap
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replace the Ollama service with a custom ROCm image combining
ghcr.io/ggml-org/llama.cpp:server-rocm and llama-swap v199.

Main motivations:

- Unblock qwen35 HF GGUFs (the qwen35 architecture is not supported in
  Ollama 0.20.4 for HF-imported models)
- Stay current with llama.cpp upstream without waiting for Ollama
  releases

Changes:

- ollama/Dockerfile: build llama-swap on top of llama.cpp:server-rocm
- ollama/llama-swap.yaml: define 4 models with per-model sampler
  settings, full GPU offload, and mmproj files for the two multimodal
  HF fine-tunes
- ollama/docker-compose.yml: replace the Ollama image with a local
  build; fix the broken volume mount (was /ubuntu/.ollama, now an
  explicit /models mount)
- ollama/Caddyfile: update the upstream port 11434→8080 (the llama-swap
  default)
- ai/docker-compose.yml: switch Open WebUI from OLLAMA_BASE_URL to
  OPENAI_API_BASE_URL pointing at the llama-swap /v1 endpoint
---
 ai/docker-compose.yml     |  4 +++-
 ollama/Caddyfile          |  2 +-
 ollama/Dockerfile         | 12 ++++++++++++
 ollama/docker-compose.yml | 10 +++++++---
 ollama/llama-swap.yaml    | 44 ++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 67 insertions(+), 5 deletions(-)
 create mode 100644 ollama/Dockerfile
 create mode 100644 ollama/llama-swap.yaml

diff --git a/ai/docker-compose.yml b/ai/docker-compose.yml
index 1f9422a..36aa555 100644
--- a/ai/docker-compose.yml
+++ b/ai/docker-compose.yml
@@ -8,7 +8,9 @@ services:
       - "/srv/docker/ai/data/data:/app/backend/data" # Double data is intentional
       - "/srv/docker/ai/data/.webui_secret_key:/app/backend/.webui_secret_key"
     environment:
-      - OLLAMA_BASE_URL=https://ollama.lan.poldebra.me
+      - OPENAI_API_BASE_URL=https://ollama.lan.poldebra.me/v1
+      - OPENAI_API_KEY=sk-no-key-required
+      - ENABLE_OLLAMA_API=false
     networks:
       internal:
         ipv4_address: 172.24.0.5
diff --git a/ollama/Caddyfile b/ollama/Caddyfile
index 977fa8a..11c4e89 100644
--- a/ollama/Caddyfile
+++ b/ollama/Caddyfile
@@ -21,7 +21,7 @@
 		X-Forwarded-Host {host}
 		X-Forwarded-Port {server_port}
 	}
-	reverse_proxy 172.23.0.5:11434 {
+	reverse_proxy 172.23.0.5:8080 {
 		header_up X-Forwarded-Proto {scheme}
 	}
 }
diff --git a/ollama/Dockerfile b/ollama/Dockerfile
new file mode 100644
index 0000000..e83b713
--- /dev/null
+++ b/ollama/Dockerfile
@@ -0,0 +1,12 @@
+# syntax=docker/dockerfile:1
+FROM ghcr.io/ggml-org/llama.cpp:server-rocm
+
+ARG LLAMA_SWAP_VERSION=v199
+ADD https://github.com/mostlygeek/llama-swap/releases/download/${LLAMA_SWAP_VERSION}/llama-swap_199_linux_amd64.tar.gz /tmp/llama-swap.tar.gz
+RUN tar -xzf /tmp/llama-swap.tar.gz -C /usr/local/bin llama-swap \
+    && chmod +x /usr/local/bin/llama-swap \
+    && rm /tmp/llama-swap.tar.gz
+
+EXPOSE 8080
+ENTRYPOINT ["/usr/local/bin/llama-swap"]
+CMD ["-config", "/etc/llama-swap/config.yaml", "-listen", ":8080"]
diff --git a/ollama/docker-compose.yml b/ollama/docker-compose.yml
index b97db48..83ab5e2 100644
--- a/ollama/docker-compose.yml
+++ b/ollama/docker-compose.yml
@@ -1,15 +1,19 @@
 services:
   app:
-    image: ollama/ollama:rocm
+    build: .
+    image: local/llama-swap-rocm:latest
     restart: unless-stopped
     hostname: ollama
     container_name: ollama
-    user: 1000:1000
     volumes:
-      - "/srv/docker/ollama/data:/ubuntu/.ollama"
+      - "/srv/docker/ollama/data/models:/models:ro"
+      - "./llama-swap.yaml:/etc/llama-swap/config.yaml:ro"
     devices:
       - "/dev/kfd:/dev/kfd"
       - "/dev/dri:/dev/dri"
+    group_add:
+      - video
+      - render
     networks:
       internal:
         ipv4_address: 172.23.0.5
diff --git a/ollama/llama-swap.yaml b/ollama/llama-swap.yaml
new file mode 100644
index 0000000..d9c919c
--- /dev/null
+++ b/ollama/llama-swap.yaml
@@ -0,0 +1,44 @@
+healthCheckTimeout: 180
+logLevel: info
+
+models:
+  "qwen3.5:9b":
+    cmd: |
+      /app/llama-server
+      --host 0.0.0.0 --port ${PORT}
+      --model /models/qwen3.5-9b.gguf
+      --alias qwen3.5:9b
+      --n-gpu-layers 999
+      --ctx-size 131072
+      --temp 1 --top-k 20 --top-p 0.95 --presence-penalty 1.5
+
+  "qwen3.5:9bctxSmall":
+    cmd: |
+      /app/llama-server
+      --host 0.0.0.0 --port ${PORT}
+      --model /models/qwen3.5-9b.gguf
+      --alias qwen3.5:9bctxSmall
+      --n-gpu-layers 999
+      --ctx-size 8192
+      --temp 1 --top-k 20 --top-p 0.95 --presence-penalty 1.5
+
+  "hf.co/HauhauCS/Qwen3.5-9B-Uncensored-HauhauCS-Aggressive:q4_k_m":
+    cmd: |
+      /app/llama-server
+      --host 0.0.0.0 --port ${PORT}
+      --model /models/HauhauCS-Qwen3.5-9B-Uncensored-Aggressive.q4_k_m.gguf
+      --mmproj /models/HauhauCS-Qwen3.5-9B-Uncensored-Aggressive.mmproj.gguf
+      --alias "hf.co/HauhauCS/Qwen3.5-9B-Uncensored-HauhauCS-Aggressive:q4_k_m"
+      --n-gpu-layers 999
+      --ctx-size 32768
+
+  "hf.co/Jackrong/Qwen3.5-9B-Claude-4.6-Opus-Reasoning-Distilled-v2-GGUF:q4_k_m":
+    cmd: |
+      /app/llama-server
+      --host 0.0.0.0 --port ${PORT}
+      --model /models/Jackrong-Qwen3.5-9B-Claude-4.6-Opus-Reasoning-Distilled-v2.q4_k_m.gguf
+      --mmproj /models/Jackrong-Qwen3.5-9B-Claude-4.6-Opus-Reasoning-Distilled-v2.mmproj.gguf
+      --alias "hf.co/Jackrong/Qwen3.5-9B-Claude-4.6-Opus-Reasoning-Distilled-v2-GGUF:q4_k_m"
+      --n-gpu-layers 999
+      --ctx-size 32768
+      --temp 0.6 --top-k 20 --top-p 0.95 --repeat-penalty 1
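
Notes for trying this patch locally (not part of the commit). Since the
service now carries a build section, the image has to be rebuilt, not just
restarted; a sketch of the standard docker compose workflow, using the
"app" service name from ollama/docker-compose.yml:

    cd ollama
    docker compose build --pull
    docker compose up -d
    # follow the logs; llama-swap relays output from the llama-server it spawns
    docker compose logs -f app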
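Once the stack is up, a quick smoke test of the OpenAI-compatible API; a
sketch that assumes the Caddy vhost above already resolves and reuses the
qwen3.5:9b model ID defined in llama-swap.yaml:

    # model IDs returned here are the keys defined in llama-swap.yaml
    curl -s https://ollama.lan.poldebra.me/v1/models

    # the first request for a model makes llama-swap spawn the matching
    # llama-server, so the reply can take up to healthCheckTimeout (180 s)
    curl -s https://ollama.lan.poldebra.me/v1/chat/completions \
      -H "Content-Type: application/json" \
      -d '{"model": "qwen3.5:9b", "messages": [{"role": "user", "content": "Say hi"}]}'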