# Containerized Unsloth llama.cpp MTP guide for DGX Spark (GB10 / ARM64)
#
# Base image is pulled from NGC (nvcr.io), NOT Docker Hub, because Docker Hub
# does not publish official ARM64 CUDA 13 images and the DGX Spark is aarch64.
# Pinned to the 13.0.x line to stay clear of the CUDA 13.2 gibberish issue
# that Unsloth and NVIDIA warn about.
#
# If 13.0.2 is not published for ARM64 on NGC, drop to 13.0.1 (confirmed in use).

# Declared ahead of the first FROM so that both stages' FROM lines can
# reference the same CUDA / Ubuntu versions.
ARG CUDA_VERSION=13.0.2
ARG UBUNTU_VERSION=24.04

# ------------------------------------------------------------
# Stage 1 ("build"): compile llama.cpp, following the logic of the Unsloth
# llama.cpp MTP guide, on top of the CUDA -devel image.
# ------------------------------------------------------------
FROM nvcr.io/nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION} AS build

# DGX Spark / GB10 container build support: put the CUDA compat directory and
# the toolkit lib64 directory ahead of whatever the base image already has on
# the library search path. This applies to the build stage only.
ENV LD_LIBRARY_PATH="/usr/local/cuda-13/compat:/usr/local/cuda/lib64:${LD_LIBRARY_PATH}"

# Build-time packages: the same set the Unsloth guide installs, listed
# alphabetically. The apt index is refreshed, used, and removed within this
# single layer so no package lists are left behind in the image.
RUN apt-get update \
    && apt-get install --yes --no-install-recommends \
        build-essential \
        ca-certificates \
        cmake \
        curl \
        git \
        libcurl4-openssl-dev \
        libssl-dev \
        pciutils \
        pkg-config \
    && rm -rf /var/lib/apt/lists/*

# ARM64 CUDA stub linking support for the container build. libcuda itself is
# provided by the host driver at runtime, but the linker needs something named
# libcuda.so / libcuda.so.1 at build time, so both names are pointed at the
# toolkit's stub library.
#
# The first CUDA target directory that has a lib/ subdirectory wins
# (aarch64-linux is tried before sbsa-linux); the step fails if neither exists.
RUN set -eux; \
    for target in aarch64-linux sbsa-linux; do \
        candidate="/usr/local/cuda/targets/${target}/lib"; \
        if [ -d "${candidate}" ]; then \
            CUDA_LIB_DIR="${candidate}"; \
            break; \
        fi; \
    done; \
    if [ -z "${CUDA_LIB_DIR:-}" ]; then \
        echo "Could not find CUDA target lib directory"; \
        exit 1; \
    fi; \
    for link_name in libcuda.so libcuda.so.1; do \
        ln -sf "${CUDA_LIB_DIR}/stubs/libcuda.so" "${CUDA_LIB_DIR}/${link_name}"; \
    done

# All source and build work happens under /src.
WORKDIR /src

# Optional llama.cpp ref (tag, branch, or commit SHA) to build.
# Left empty, the repository's default branch head is built, exactly as the
# Unsloth guide does. Pin it for reproducible images, e.g.:
#   docker build --build-arg LLAMA_CPP_REF=<tag-or-sha> .
# Changing the value also invalidates the cached clone layer, which otherwise
# keeps serving whatever commit was current when the layer was first built.
ARG LLAMA_CPP_REF=""

# Unsloth guide source. A full clone is kept so any ref can be checked out.
RUN set -eux; \
    git clone https://github.com/ggml-org/llama.cpp; \
    if [ -n "${LLAMA_CPP_REF:-}" ]; then \
        git -C llama.cpp -c advice.detachedHead=false checkout "${LLAMA_CPP_REF}"; \
    fi

# Configure step. Core CMake flags are identical to the Unsloth guide:
#   -DBUILD_SHARED_LIBS=OFF
#   -DGGML_CUDA=ON
#
# DGX Spark adaptation: target GB10 (sm_121).
#   121 is the correct primary value for GB10. It translates to 121a inside
#   the llama.cpp repo (virtual + real targets), and 121 is what NVIDIA's own
#   playbooks use. If you want to trim PTX portability code / compile time,
#   override it without editing this file:
#     docker build --build-arg CUDA_ARCHITECTURES=121a-real .
ARG CUDA_ARCHITECTURES=121

# CMAKE_BUILD_TYPE is passed explicitly: `--config Release` on the later
# `cmake --build` only affects multi-config generators, so for a single-config
# generator the build type has to be fixed here at configure time.
RUN cmake -S llama.cpp -B llama.cpp/build \
    -DCMAKE_BUILD_TYPE=Release \
    -DBUILD_SHARED_LIBS=OFF \
    -DGGML_CUDA=ON \
    -DLLAMA_OPENSSL=ON \
    -DCMAKE_CUDA_ARCHITECTURES="${CUDA_ARCHITECTURES}"

# Compile only the tools the Unsloth guide builds, using one job per CPU
# reported by nproc.
RUN cmake --build llama.cpp/build \
    --config Release \
    --clean-first \
    --parallel "$(nproc)" \
    --target llama-cli llama-gguf-split llama-mtmd-cli llama-server

# Collect every built llama-* binary in /out/bin (same copy pattern as the
# guide) so the runtime stage can pick them up with a single COPY. The listing
# is printed for the build log, and the step fails unless llama-server is
# present and executable.
RUN set -eux; \
    staging_dir=/out/bin; \
    mkdir -p "${staging_dir}"; \
    cp llama.cpp/build/bin/llama-* "${staging_dir}/"; \
    ls -lh "${staging_dir}"; \
    [ -x "${staging_dir}/llama-server" ]

# ------------------------------------------------------------
# Stage 2: slim runtime image, built on the CUDA -runtime variant of the same
# CUDA / Ubuntu versions used for the build stage.
# ------------------------------------------------------------
FROM nvcr.io/nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}

# Runtime packages only, listed alphabetically. The apt index is removed in
# the same layer that created it.
RUN apt-get update \
    && apt-get install --yes --no-install-recommends \
        ca-certificates \
        curl \
        libcurl4 \
        libgomp1 \
        libssl3 \
    && rm -rf /var/lib/apt/lists/*

# Bring in only the llama-* binaries staged by the build stage.
COPY --from=build /out/bin/ /usr/local/bin/

# Runtime: let the NVIDIA container runtime mount the host driver normally.
ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
ENV NVIDIA_VISIBLE_DEVICES=all
ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility
# LLAMA_CACHE pins where -hf downloads/caches the GGUF. Set to /models so it
# lands in the host-mounted volume (compose maps ./models -> /models) and
# persists across container rebuilds. This must be a directory path, not an
# HF repo string.
ENV LLAMA_CACHE=/models
# llama-server listens on 127.0.0.1 unless told otherwise, which makes the
# port below unreachable through a published/mapped port. Default the bind
# address to all interfaces inside the container; an explicit `--host`
# argument on the command line still takes precedence over this variable.
ENV LLAMA_ARG_HOST=0.0.0.0

# NOTE(review): no USER is set, so the server runs as root. Switching to a
# non-root user would require the mounted /models directory to be writable by
# that user -- confirm against the compose setup before changing.

# llama-server's default HTTP port (documentation only; publish it at run time).
EXPOSE 8080

# Exec form: llama-server is PID 1 and `docker run` arguments are appended.
ENTRYPOINT ["llama-server"]