feat(ollama): add persistence in Ollama container

Re-enable qwen3.5:9b and qwen3.5:9bctxSmall using fresh unsloth/Qwen3.5-9B-GGUF
quantization, which uses the correct rope.dimension_sections format (4 elements)
compatible with this llama.cpp build. Both models include the mmproj for
multimodal support. The old Ollama-extracted GGUF (mrope_sections, 3 elements)
has been removed.
This commit is contained in:
Davide Polonio 2026-04-10 10:57:34 +02:00
parent ebc71492c3
commit 8ab4213b62

View File

@@ -2,10 +2,27 @@ healthCheckTimeout: 180
logLevel: info
models:
# NOTE: qwen3.5:9b and qwen3.5:9bctxSmall are disabled — the GGUF extracted
# from Ollama uses the old mrope_sections format (3 elements) which this
# llama.cpp build rejects. Download a fresh quantization from HuggingFace
# (e.g. bartowski/Qwen3.5-9B-GGUF) and add them back.
"qwen3.5:9b":
cmd: |
/app/llama-server
--host 0.0.0.0 --port ${PORT}
--model /models/Qwen3.5-9B-Q4_K_M.gguf
--mmproj /models/Qwen3.5-9B-mmproj-F16.gguf
--alias qwen3.5:9b
--n-gpu-layers 999
--ctx-size 8192
--temp 1 --top-k 20 --top-p 0.95 --presence-penalty 1.5
"qwen3.5:9bctxSmall":
cmd: |
/app/llama-server
--host 0.0.0.0 --port ${PORT}
--model /models/Qwen3.5-9B-Q4_K_M.gguf
--mmproj /models/Qwen3.5-9B-mmproj-F16.gguf
--alias qwen3.5:9bctxSmall
--n-gpu-layers 999
--ctx-size 131072
--temp 1 --top-k 20 --top-p 0.95 --presence-penalty 1.5
"hf.co/HauhauCS/Qwen3.5-9B-Uncensored-HauhauCS-Aggressive:q4_k_m":
cmd: |