services:
  # llama.cpp server running the Unsloth Qwen3.6 MTP GGUF with MTP speculative
  # decoding; built from the Dockerfile in this directory.
  llama-mtp:
    image: llama-mtp-unsloth:spark
    container_name: llama-mtp
    restart: unless-stopped

    build:
      context: .
      args:
        # Fall back to 13.0.1 if 13.0.2 is not published for ARM64 on NGC.
        CUDA_VERSION: '13.0.2'
        UBUNTU_VERSION: '24.04'

    ports:
      # host:container — quoted so YAML never reads the mapping as a number.
      - '8080:8080'

    volumes:
      # The Dockerfile sets LLAMA_CACHE=/models, so mounting ./models there
      # keeps the auto-downloaded GGUF across image rebuilds.
      - ./models:/models

    # Arguments mirror the Unsloth llama.cpp MTP run command.
    #   * --host 0.0.0.0 / --port 8080 are added on top of the guide so that
    #     Open WebUI can reach the server.
    #   * -ngl 99, -c 8192, -fa on and -np 1 are taken from the model's HF
    #     card; MTP does not yet support -np > 1 or --mmproj.
    # The folded scalar (>) joins the lines below with single spaces into one
    # argument string; do not put comments inside it.
    command: >
      -hf unsloth/Qwen3.6-27B-MTP-GGUF:UD-Q4_K_XL
      --host 0.0.0.0 --port 8080
      -ngl 99 -c 8192 -fa on -np 1
      --spec-type draft-mtp --spec-draft-n-max 2

    # Reserve all NVIDIA GPU devices for this container.
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities:
                - gpu