From 3034f987d721fc7b4b0e3ca31b5fa8e9348ce441 Mon Sep 17 00:00:00 2001
From: Davide Polonio
Date: Thu, 9 Apr 2026 23:14:43 +0200
Subject: [PATCH] feat(ollama): migrate from Ollama to llama.cpp + llama-swap
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replace the Ollama service with a custom ROCm image combining
ghcr.io/ggml-org/llama.cpp:server-rocm and llama-swap v199.

Main motivations:

- Unblock qwen35 HF GGUFs (the qwen35 architecture is not supported in
  Ollama 0.20.4 for HF-imported models)
- Stay current with llama.cpp upstream without waiting for Ollama
  releases

Changes:

- ollama/Dockerfile: build llama-swap on top of llama.cpp:server-rocm
- ollama/llama-swap.yaml: define 4 models with per-model sampler
  settings, full GPU offload, and mmproj files for the two multimodal
  HF fine-tunes
- ollama/docker-compose.yml: replace the Ollama image with a local
  build; fix the broken volume mount (was /ubuntu/.ollama, now an
  explicit /models mount)
- ollama/Caddyfile: update the upstream port 11434→8080 (the llama-swap
  default)
- ai/docker-compose.yml: switch Open WebUI from OLLAMA_BASE_URL to
  OPENAI_API_BASE_URL pointing at the llama-swap /v1 endpoint
---
 ai/docker-compose.yml     |  4 +++-
 ollama/Caddyfile          |  2 +-
 ollama/Dockerfile         | 12 ++++++++++++
 ollama/docker-compose.yml | 10 +++++++---
 ollama/llama-swap.yaml    | 44 ++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 67 insertions(+), 5 deletions(-)
 create mode 100644 ollama/Dockerfile
 create mode 100644 ollama/llama-swap.yaml

diff --git a/ai/docker-compose.yml b/ai/docker-compose.yml
index 1f9422a..36aa555 100644
--- a/ai/docker-compose.yml
+++ b/ai/docker-compose.yml
@@ -8,7 +8,9 @@ services:
       - "/srv/docker/ai/data/data:/app/backend/data" # Double data is intentional
       - "/srv/docker/ai/data/.webui_secret_key:/app/backend/.webui_secret_key"
     environment:
-      - OLLAMA_BASE_URL=https://ollama.lan.poldebra.me
+      - OPENAI_API_BASE_URL=https://ollama.lan.poldebra.me/v1
+      - OPENAI_API_KEY=sk-no-key-required
+      - ENABLE_OLLAMA_API=false
     networks:
       internal:
         ipv4_address: 172.24.0.5
diff --git a/ollama/Caddyfile b/ollama/Caddyfile
index 977fa8a..11c4e89 100644
--- a/ollama/Caddyfile
+++ b/ollama/Caddyfile
@@ -21,7 +21,7 @@
 		X-Forwarded-Host {host}
 		X-Forwarded-Port {server_port}
 	}
-	reverse_proxy 172.23.0.5:11434 {
+	reverse_proxy 172.23.0.5:8080 {
 		header_up X-Forwarded-Proto {scheme}
 	}
 }
diff --git a/ollama/Dockerfile b/ollama/Dockerfile
new file mode 100644
index 0000000..e83b713
--- /dev/null
+++ b/ollama/Dockerfile
@@ -0,0 +1,12 @@
+# syntax=docker/dockerfile:1
+FROM ghcr.io/ggml-org/llama.cpp:server-rocm
+
+ARG LLAMA_SWAP_VERSION=v199
+ADD https://github.com/mostlygeek/llama-swap/releases/download/${LLAMA_SWAP_VERSION}/llama-swap_199_linux_amd64.tar.gz /tmp/llama-swap.tar.gz
+RUN tar -xzf /tmp/llama-swap.tar.gz -C /usr/local/bin llama-swap \
+    && chmod +x /usr/local/bin/llama-swap \
+    && rm /tmp/llama-swap.tar.gz
+
+EXPOSE 8080
+ENTRYPOINT ["/usr/local/bin/llama-swap"]
+CMD ["-config", "/etc/llama-swap/config.yaml", "-listen", ":8080"]
diff --git a/ollama/docker-compose.yml b/ollama/docker-compose.yml
index b97db48..83ab5e2 100644
--- a/ollama/docker-compose.yml
+++ b/ollama/docker-compose.yml
@@ -1,15 +1,19 @@
 services:
   app:
-    image: ollama/ollama:rocm
+    build: .
+    image: local/llama-swap-rocm:latest
     restart: unless-stopped
     hostname: ollama
     container_name: ollama
-    user: 1000:1000
     volumes:
-      - "/srv/docker/ollama/data:/ubuntu/.ollama"
+      - "/srv/docker/ollama/data/models:/models:ro"
+      - "./llama-swap.yaml:/etc/llama-swap/config.yaml:ro"
     devices:
       - "/dev/kfd:/dev/kfd"
       - "/dev/dri:/dev/dri"
+    group_add:
+      - video
+      - render
     networks:
       internal:
         ipv4_address: 172.23.0.5
diff --git a/ollama/llama-swap.yaml b/ollama/llama-swap.yaml
new file mode 100644
index 0000000..d9c919c
--- /dev/null
+++ b/ollama/llama-swap.yaml
@@ -0,0 +1,44 @@
+healthCheckTimeout: 180
+logLevel: info
+
+models:
+  "qwen3.5:9b":
+    cmd: |
+      /app/llama-server
+      --host 0.0.0.0 --port ${PORT}
+      --model /models/qwen3.5-9b.gguf
+      --alias qwen3.5:9b
+      --n-gpu-layers 999
+      --ctx-size 131072
+      --temp 1 --top-k 20 --top-p 0.95 --presence-penalty 1.5
+
+  "qwen3.5:9bctxSmall":
+    cmd: |
+      /app/llama-server
+      --host 0.0.0.0 --port ${PORT}
+      --model /models/qwen3.5-9b.gguf
+      --alias qwen3.5:9bctxSmall
+      --n-gpu-layers 999
+      --ctx-size 8192
+      --temp 1 --top-k 20 --top-p 0.95 --presence-penalty 1.5
+
+  "hf.co/HauhauCS/Qwen3.5-9B-Uncensored-HauhauCS-Aggressive:q4_k_m":
+    cmd: |
+      /app/llama-server
+      --host 0.0.0.0 --port ${PORT}
+      --model /models/HauhauCS-Qwen3.5-9B-Uncensored-Aggressive.q4_k_m.gguf
+      --mmproj /models/HauhauCS-Qwen3.5-9B-Uncensored-Aggressive.mmproj.gguf
+      --alias "hf.co/HauhauCS/Qwen3.5-9B-Uncensored-HauhauCS-Aggressive:q4_k_m"
+      --n-gpu-layers 999
+      --ctx-size 32768
+
+  "hf.co/Jackrong/Qwen3.5-9B-Claude-4.6-Opus-Reasoning-Distilled-v2-GGUF:q4_k_m":
+    cmd: |
+      /app/llama-server
+      --host 0.0.0.0 --port ${PORT}
+      --model /models/Jackrong-Qwen3.5-9B-Claude-4.6-Opus-Reasoning-Distilled-v2.q4_k_m.gguf
+      --mmproj /models/Jackrong-Qwen3.5-9B-Claude-4.6-Opus-Reasoning-Distilled-v2.mmproj.gguf
+      --alias "hf.co/Jackrong/Qwen3.5-9B-Claude-4.6-Opus-Reasoning-Distilled-v2-GGUF:q4_k_m"
+      --n-gpu-layers 999
+      --ctx-size 32768
+      --temp 0.6 --top-k 20 --top-p 0.95 --repeat-penalty 1
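
Notes for trying this patch locally (not part of the commit). Since the
service now carries a build section, the image has to be rebuilt, not just
restarted; a sketch of the standard docker compose workflow, using the
"app" service name from ollama/docker-compose.yml:

    cd ollama
    docker compose build --pull
    docker compose up -d
    # follow the logs; llama-swap relays output from the llama-server it spawns
    docker compose logs -f app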
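Once the stack is up, a quick smoke test of the OpenAI-compatible API; a
sketch that assumes the Caddy vhost above already resolves and reuses the
qwen3.5:9b model ID defined in llama-swap.yaml:

    # model IDs returned here are the keys defined in llama-swap.yaml
    curl -s https://ollama.lan.poldebra.me/v1/models

    # the first request for a model makes llama-swap spawn the matching
    # llama-server, so the reply can take up to healthCheckTimeout (180 s)
    curl -s https://ollama.lan.poldebra.me/v1/chat/completions \
      -H "Content-Type: application/json" \
      -d '{"model": "qwen3.5:9b", "messages": [{"role": "user", "content": "Say hi"}]}'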