# Containerized Unsloth llama.cpp MTP guide for DGX Spark (GB10 / ARM64)
#
# Base image is pulled from NGC (nvcr.io), NOT Docker Hub, because Docker Hub
# does not publish official ARM64 CUDA 13 images and the DGX Spark is aarch64.
# Pinned to the 13.0.x line to stay clear of the CUDA 13.2 gibberish issue
# that Unsloth and NVIDIA warn about.
#
# If 13.0.2 is not published for ARM64 on NGC, drop to 13.0.1 (confirmed in use).
#
# Build arguments:
#   CUDA_VERSION        CUDA tag of the NGC base images          (default 13.0.2)
#   UBUNTU_VERSION      Ubuntu tag of the NGC base images        (default 24.04)
#   LLAMA_CPP_REF       llama.cpp branch or tag to build         (default master)
#   CUDA_ARCHITECTURES  value passed to CMAKE_CUDA_ARCHITECTURES (default 121)
ARG CUDA_VERSION=13.0.2
ARG UBUNTU_VERSION=24.04

# ------------------------------------------------------------
# Stage 1: Build llama.cpp following the Unsloth llama.cpp MTP guide logic
# ------------------------------------------------------------
FROM nvcr.io/nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION} AS build

# DGX Spark / GB10 container build support. Build stage only.
ENV LD_LIBRARY_PATH=/usr/local/cuda-13/compat:/usr/local/cuda/lib64:$LD_LIBRARY_PATH

# Same dependency set as the Unsloth guide (listed alphabetically).
RUN apt-get update && apt-get install -y --no-install-recommends \
        build-essential \
        ca-certificates \
        cmake \
        curl \
        git \
        libcurl4-openssl-dev \
        libssl-dev \
        pciutils \
        pkg-config \
    && rm -rf /var/lib/apt/lists/*

# ARM64 CUDA stub linking support for the container build (libcuda is provided
# by the host driver at runtime, but the linker needs a stub at build time).
# Fails early if either the target lib directory or the stub itself is missing,
# instead of leaving dangling symlinks that only surface as a link error later.
RUN set -eux; \
    CUDA_LIB_DIR=""; \
    for d in /usr/local/cuda/targets/aarch64-linux /usr/local/cuda/targets/sbsa-linux; do \
        if [ -d "$d/lib" ]; then CUDA_LIB_DIR="$d/lib"; break; fi; \
    done; \
    if [ -z "$CUDA_LIB_DIR" ]; then \
        echo "Could not find CUDA target lib directory" >&2; \
        exit 1; \
    fi; \
    if [ ! -e "$CUDA_LIB_DIR/stubs/libcuda.so" ]; then \
        echo "Could not find libcuda.so stub in $CUDA_LIB_DIR/stubs" >&2; \
        exit 1; \
    fi; \
    ln -sf "$CUDA_LIB_DIR/stubs/libcuda.so" "$CUDA_LIB_DIR/libcuda.so"; \
    ln -sf "$CUDA_LIB_DIR/stubs/libcuda.so" "$CUDA_LIB_DIR/libcuda.so.1"

WORKDIR /src

# Unsloth guide source. LLAMA_CPP_REF selects the branch or tag to check out;
# the default tracks master, which is what a plain `git clone` gives. Pass
# --build-arg LLAMA_CPP_REF=<tag> to rebuild a known-good revision.
ARG LLAMA_CPP_REF=master
RUN git clone --branch "${LLAMA_CPP_REF}" https://github.com/ggml-org/llama.cpp

# Core CMake flags are identical to the Unsloth guide:
#   -DBUILD_SHARED_LIBS=OFF
#   -DGGML_CUDA=ON
#
# DGX Spark adaptation: target GB10 (sm_121).
# 121 is the correct primary value for GB10. It translates to 121a inside
# the llama.cpp repo (virtual + real targets). If you want to trim PTX
# portability code / compile time, you can swap to 121a-real
# (--build-arg CUDA_ARCHITECTURES=121a-real), but 121 is what NVIDIA's own
# playbooks use.
ARG CUDA_ARCHITECTURES=121
RUN cmake llama.cpp -B llama.cpp/build \
        -DBUILD_SHARED_LIBS=OFF \
        -DGGML_CUDA=ON \
        -DLLAMA_OPENSSL=ON \
        -DCMAKE_CUDA_ARCHITECTURES="${CUDA_ARCHITECTURES}"

# Same target list as the Unsloth guide.
RUN cmake --build llama.cpp/build \
        --config Release \
        -j"$(nproc)" \
        --clean-first \
        --target llama-cli llama-mtmd-cli llama-server llama-gguf-split

# Same copy pattern as the guide, into /out for the runtime stage.
# The final `test -x` makes the build fail here if llama-server was not produced.
RUN set -eux; \
    mkdir -p /out/bin; \
    cp llama.cpp/build/bin/llama-* /out/bin/; \
    ls -lh /out/bin; \
    test -x /out/bin/llama-server

# ------------------------------------------------------------
# Stage 2: Runtime image (slim)
# ------------------------------------------------------------
FROM nvcr.io/nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION} AS runtime

RUN apt-get update && apt-get install -y --no-install-recommends \
        ca-certificates \
        curl \
        libcurl4 \
        libgomp1 \
        libssl3 \
    && rm -rf /var/lib/apt/lists/*

COPY --from=build /out/bin/ /usr/local/bin/

# Runtime: let the NVIDIA container runtime mount the host driver normally.
ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
ENV NVIDIA_VISIBLE_DEVICES=all \
    NVIDIA_DRIVER_CAPABILITIES=compute,utility

# LLAMA_CACHE pins where -hf downloads/caches the GGUF. Set to /models so it
# lands in the host-mounted volume (compose maps ./models -> /models) and
# persists across container rebuilds. This must be a directory path, not an
# HF repo string.
ENV LLAMA_CACHE=/models

# llama-server binds to 127.0.0.1 unless told otherwise, which makes the
# published port unreachable from outside the container. Default the listen
# address to all interfaces; an explicit --host argument still takes precedence.
ENV LLAMA_ARG_HOST=0.0.0.0

# NOTE(review): the container runs as root. Switching to a non-root USER needs
# the host ./models directory to be writable by that UID, so it is left as-is
# here; confirm volume ownership before tightening this.

EXPOSE 8080

ENTRYPOINT ["llama-server"]