# Compose definition for a single service, `llama-mtp`: the image is built
# from the Dockerfile in this directory and started with the llama.cpp server
# arguments given under `command`.
services:
  llama-mtp:
    build:
      context: .
      args:
        # If 13.0.2 is not available for ARM64 on NGC, change to 13.0.1.
        # Both values are quoted so they reach the build as strings and are
        # never re-typed by the YAML parser.
        CUDA_VERSION: "13.0.2"
        UBUNTU_VERSION: "24.04"
    # With `build` present, Compose tags the locally built image with this name.
    image: llama-mtp-unsloth:spark
    container_name: llama-mtp
    ports:
      # host:container, quoted so it is always read as a string.
      - "8080:8080"
    volumes:
      # Persist the auto-downloaded GGUF across rebuilds. Mount target /models
      # matches LLAMA_CACHE=/models set in the Dockerfile.
      - ./models:/models
    # Command follows the Unsloth llama.cpp MTP run command.
    # Additions vs the guide: --host 0.0.0.0 and --port 8080 so Open WebUI can
    # reach the server. -ngl 99 / -c 8192 / -fa on / -np 1 come from the
    # model's HF card (MTP does not support -np > 1 or --mmproj yet).
    # `>-` folds the lines below into one space-separated string and strips
    # the trailing newline; `on` stays plain text because it is inside the
    # block scalar.
    command: >-
      -hf unsloth/Qwen3.6-27B-MTP-GGUF:UD-Q4_K_XL
      --host 0.0.0.0
      --port 8080
      -ngl 99
      -c 8192
      -fa on
      -np 1
      --spec-type draft-mtp
      --spec-draft-n-max 2
    deploy:
      resources:
        reservations:
          devices:
            # Reserve every NVIDIA GPU on the host for this container.
            - driver: nvidia
              count: all
              capabilities: [gpu]
    restart: unless-stopped