nixpkgs/pkgs/development/python-modules/vllm/default.nix
Daniel Fahey 11dcb32fb2 python3Packages.vllm: fix CPU build with upstream oneDNN patches
- Add upstream patches from vLLM that properly handle oneDNN integration for CPU builds (see https://github.com/vllm-project/vllm/pull/26401)
- Set `FETCHCONTENT_SOURCE_DIR_ONEDNN` to `oneDNN.src` via an environment variable instead of a cmake flag using `lib.getDev oneDNN` (sketched below)
- Remove the unused `pythonOlder` import
- Remove the broken CPU-support flag
2025-10-10 10:42:46 +01:00
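
In effect, the oneDNN handling moves from a cmake flag to an environment variable; a minimal sketch of the two forms, assuming the old flag used `lib.cmakeFeature` like the CUDA flags further down (the prior expression is paraphrased from the commit message, not quoted from the old revision):

  # before (paraphrased): point CMake's FetchContent at oneDNN's dev output via cmakeFlags
  cmakeFlags = [ (lib.cmakeFeature "FETCHCONTENT_SOURCE_DIR_ONEDNN" "${lib.getDev oneDNN}") ];

  # after: point FetchContent at the unpacked oneDNN sources through the build environment
  env.FETCHCONTENT_SOURCE_DIR_ONEDNN = "${oneDNN.src}";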

{
lib,
stdenv,
python,
buildPythonPackage,
fetchFromGitHub,
fetchpatch,
symlinkJoin,
autoAddDriverRunpath,
# build system
cmake,
jinja2,
ninja,
packaging,
setuptools,
setuptools-scm,
# dependencies
which,
torch,
outlines,
psutil,
ray,
pandas,
pyarrow,
sentencepiece,
numpy,
transformers,
xformers,
xgrammar,
numba,
fastapi,
uvicorn,
pydantic,
aioprometheus,
pynvml,
openai,
pyzmq,
tiktoken,
torchaudio,
torchvision,
py-cpuinfo,
lm-format-enforcer,
prometheus-fastapi-instrumentator,
cupy,
cbor2,
pybase64,
gguf,
einops,
importlib-metadata,
partial-json-parser,
compressed-tensors,
mistral-common,
msgspec,
numactl,
tokenizers,
oneDNN,
blake3,
depyf,
opencv-python-headless,
cachetools,
llguidance,
python-json-logger,
python-multipart,
llvmPackages,
opentelemetry-sdk,
opentelemetry-api,
opentelemetry-exporter-otlp,
bitsandbytes,
flashinfer,
py-libnuma,
setproctitle,
openai-harmony,
# internal dependency - for overriding in overlays
vllm-flash-attn ? null,
cudaSupport ? torch.cudaSupport,
cudaPackages ? { },
rocmSupport ? torch.rocmSupport,
rocmPackages ? { },
gpuTargets ? [ ],
}:
let
inherit (lib)
lists
strings
trivial
;
inherit (cudaPackages) flags;
shouldUsePkg =
pkg: if pkg != null && lib.meta.availableOn stdenv.hostPlatform pkg then pkg else null;
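# e.g. `shouldUsePkg (cudaPackages.nccl or null)` (the `nccl` binding below) yields the
# package when it exists and is available on the host platform, and null otherwise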
# see CMakeLists.txt, grepping for CUTLASS_REVISION
# https://github.com/vllm-project/vllm/blob/v${version}/CMakeLists.txt
cutlass = fetchFromGitHub {
owner = "NVIDIA";
repo = "cutlass";
tag = "v4.0.0";
hash = "sha256-HJY+Go1viPkSVZPEs/NyMtYJzas4mMLiIZF3kNX+WgA=";
};
flashmla = stdenv.mkDerivation {
pname = "flashmla";
# https://github.com/vllm-project/FlashMLA/blob/${src.rev}/setup.py
version = "1.0.0";
# grep for GIT_TAG in the following file
# https://github.com/vllm-project/vllm/blob/v${version}/cmake/external_projects/flashmla.cmake
src = fetchFromGitHub {
owner = "vllm-project";
repo = "FlashMLA";
rev = "5f65b85703c7ed75fda01e06495077caad207c3f";
hash = "sha256-DO9EFNSoAgyfRRc095v1UjT+Zdzk4cFY0+n28FVEwI0=";
};
dontConfigure = true;
# flashmla normally relies on `git submodule update` to fetch cutlass
buildPhase = ''
rm -rf csrc/cutlass
ln -sf ${cutlass} csrc/cutlass
'';
installPhase = ''
cp -rva . $out
'';
};
vllm-flash-attn' = lib.defaultTo (stdenv.mkDerivation {
pname = "vllm-flash-attn";
# https://github.com/vllm-project/flash-attention/blob/${src.rev}/vllm_flash_attn/__init__.py
version = "2.7.2.post1";
# grep for GIT_TAG in the following file
# https://github.com/vllm-project/vllm/blob/v${version}/cmake/external_projects/vllm_flash_attn.cmake
src = fetchFromGitHub {
owner = "vllm-project";
repo = "flash-attention";
rev = "ee4d25bd84e0cbc7e0b9b9685085fd5db2dcb62a";
hash = "sha256-2r0Habd/kBpvM4/aQFIYyj+uQAa3M9gjk3DcBZHFNfA=";
};
patches = [
# fix Hopper build failure
# https://github.com/Dao-AILab/flash-attention/pull/1719
# https://github.com/Dao-AILab/flash-attention/pull/1723
(fetchpatch {
url = "https://github.com/Dao-AILab/flash-attention/commit/dad67c88d4b6122c69d0bed1cebded0cded71cea.patch";
hash = "sha256-JSgXWItOp5KRpFbTQj/cZk+Tqez+4mEz5kmH5EUeQN4=";
})
(fetchpatch {
url = "https://github.com/Dao-AILab/flash-attention/commit/e26dd28e487117ee3e6bc4908682f41f31e6f83a.patch";
hash = "sha256-NkCEowXSi+tiWu74Qt+VPKKavx0H9JeteovSJKToK9A=";
})
];
dontConfigure = true;
# vllm-flash-attn normally relies on `git submodule update` to fetch cutlass and composable_kernel
buildPhase = ''
rm -rf csrc/cutlass
ln -sf ${cutlass} csrc/cutlass
''
+ lib.optionalString rocmSupport ''
rm -rf csrc/composable_kernel;
ln -sf ${rocmPackages.composable_kernel} csrc/composable_kernel
'';
installPhase = ''
cp -rva . $out
'';
}) vllm-flash-attn;
cpuSupport = !cudaSupport && !rocmSupport;
# https://github.com/pytorch/pytorch/blob/v2.8.0/torch/utils/cpp_extension.py#L2411-L2414
supportedTorchCudaCapabilities =
let
real = [
"3.5"
"3.7"
"5.0"
"5.2"
"5.3"
"6.0"
"6.1"
"6.2"
"7.0"
"7.2"
"7.5"
"8.0"
"8.6"
"8.7"
"8.9"
"9.0"
"9.0a"
# Blackwell (SM100+) capabilities temporarily disabled due to CUTLASS API incompatibility
# FlashMLA kernels require CUTLASS v4.2.1+ APIs not available in bundled v4.0.0
# TODO: Re-enable when vLLM upgrades CUTLASS (see https://github.com/vllm-project/vllm/pull/24673)
# "10.0"
# "10.0a"
# "10.1"
# "10.1a"
# "10.3"
# "10.3a"
# "12.0"
# "12.0a"
# "12.1"
# "12.1a"
];
ptx = lists.map (x: "${x}+PTX") real;
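# i.e. every real capability also gets a PTX variant, e.g. "8.6" -> "8.6+PTX"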
in
real ++ ptx;
# NOTE: The lists.subtractLists function is perhaps a bit unintuitive. It subtracts the elements
# of the first list *from* the second list. That means:
# lists.subtractLists a b = b - a
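#   e.g. lists.subtractLists [ "8.0" ] [ "8.0" "8.6" "9.0" ] == [ "8.6" "9.0" ]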
# For CUDA
supportedCudaCapabilities = lists.intersectLists flags.cudaCapabilities supportedTorchCudaCapabilities;
unsupportedCudaCapabilities = lists.subtractLists supportedCudaCapabilities flags.cudaCapabilities;
isCudaJetson = cudaSupport && cudaPackages.flags.isJetsonBuild;
# Throw an evaluation error (via trivial.throwIf) if no supported GPU targets remain; the error lists the unsupported targets that were requested.
gpuArchWarner =
supported: unsupported:
trivial.throwIf (supported == [ ]) (
"No supported GPU targets specified. Requested GPU targets: "
+ strings.concatStringsSep ", " unsupported
) supported;
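# e.g. gpuArchWarner [ "8.6" "9.0" ] [ "12.0" ] evaluates to [ "8.6" "9.0" ],
# whereas gpuArchWarner [ ] [ "12.0" ] aborts evaluation with the error above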
# Create the gpuTargetString.
gpuTargetString = strings.concatStringsSep ";" (
if gpuTargets != [ ] then
# If gpuTargets is specified, it always takes priority.
gpuTargets
else if cudaSupport then
gpuArchWarner supportedCudaCapabilities unsupportedCudaCapabilities
else if rocmSupport then
rocmPackages.clr.gpuTargets
else
throw "No GPU targets specified"
);
mergedCudaLibraries = with cudaPackages; [
cuda_cudart # cuda_runtime.h, -lcudart
cuda_cccl
libcurand # curand_kernel.h
libcusparse # cusparse.h
libcusolver # cusolverDn.h
cuda_nvtx
cuda_nvrtc
# cusparselt # cusparseLt.h
libcublas
];
# Some packages are not available on all platforms
nccl = shouldUsePkg (cudaPackages.nccl or null);
getAllOutputs = p: [
(lib.getBin p)
(lib.getLib p)
(lib.getDev p)
];
in
buildPythonPackage rec {
pname = "vllm";
version = "0.11.0";
pyproject = true;
stdenv = torch.stdenv;
src = fetchFromGitHub {
owner = "vllm-project";
repo = "vllm";
tag = "v${version}";
hash = "sha256-uYK/e9McEyrDTACMk5S0cGCjai9rf6HMR9dpPL7ISYc=";
};
patches = [
./0002-setup.py-nix-support-respect-cmakeFlags.patch
./0003-propagate-pythonpath.patch
./0005-drop-intel-reqs.patch
# TODO: Remove the below patches when included in vLLM release
(fetchpatch {
url = "https://github.com/vllm-project/vllm/commit/9705fba7b727a3b9c275b012258608531e2223d1.patch";
hash = "sha256-DxRGLiwkegMlMjqFmFc0igpaVv06/Y2WjL+ISoIOET4=";
})
# the patch above is the prerequisite commit needed for the patch below to apply cleanly
# oneDNN / CPU fix from https://github.com/vllm-project/vllm/pull/26401
(fetchpatch {
url = "https://github.com/vllm-project/vllm/commit/d7be1f2a480bdc62a6a1ec0126a401e3d42985fe.patch";
hash = "sha256-Zi1k5wiOPjsbWHFKpcLq9Ns43wIP37Mbvesi5K80zaQ=";
})
];
postPatch = ''
# pythonRelaxDeps does not cover build-system
substituteInPlace pyproject.toml \
--replace-fail "torch ==" "torch >=" \
--replace-fail "setuptools>=77.0.3,<80.0.0" "setuptools"
# Ignore the python version check because it hard-codes minor versions and
# lags behind `ray`'s python interpreter support
substituteInPlace CMakeLists.txt \
--replace-fail \
'set(PYTHON_SUPPORTED_VERSIONS' \
'set(PYTHON_SUPPORTED_VERSIONS "${lib.versions.majorMinor python.version}"'
# Pass build environment PYTHONPATH to vLLM's Python configuration scripts
substituteInPlace CMakeLists.txt \
--replace-fail '$PYTHONPATH' '$ENV{PYTHONPATH}'
'';
nativeBuildInputs = [
which
]
++ lib.optionals rocmSupport [
rocmPackages.hipcc
]
++ lib.optionals cudaSupport [
cudaPackages.cuda_nvcc
autoAddDriverRunpath
]
++ lib.optionals isCudaJetson [
cudaPackages.autoAddCudaCompatRunpath
];
build-system = [
cmake
jinja2
ninja
packaging
setuptools
setuptools-scm
torch
];
buildInputs =
lib.optionals cpuSupport [
oneDNN
]
++ lib.optionals (cpuSupport && stdenv.hostPlatform.isLinux) [
numactl
]
++ lib.optionals cudaSupport (
mergedCudaLibraries
++ (with cudaPackages; [
nccl
cudnn
libcufile
])
)
++ lib.optionals rocmSupport (
with rocmPackages;
[
clr
rocthrust
rocprim
hipsparse
hipblas
]
)
++ lib.optionals stdenv.cc.isClang [
llvmPackages.openmp
];
dependencies = [
aioprometheus
blake3
cachetools
cbor2
depyf
fastapi
llguidance
lm-format-enforcer
numpy
openai
opencv-python-headless
outlines
pandas
prometheus-fastapi-instrumentator
py-cpuinfo
pyarrow
pybase64
pydantic
python-json-logger
python-multipart
pyzmq
ray
sentencepiece
tiktoken
tokenizers
msgspec
gguf
einops
importlib-metadata
partial-json-parser
compressed-tensors
mistral-common
torch
torchaudio
torchvision
transformers
uvicorn
xformers
xgrammar
numba
opentelemetry-sdk
opentelemetry-api
opentelemetry-exporter-otlp
bitsandbytes
setproctitle
openai-harmony
# vLLM needs Torch's compiler to be present in order to use torch.compile
torch.stdenv.cc
]
++ uvicorn.optional-dependencies.standard
++ aioprometheus.optional-dependencies.starlette
++ lib.optionals stdenv.targetPlatform.isLinux [
py-libnuma
psutil
]
++ lib.optionals cudaSupport [
cupy
pynvml
flashinfer
];
dontUseCmakeConfigure = true;
cmakeFlags = [
]
++ lib.optionals cudaSupport [
(lib.cmakeFeature "FETCHCONTENT_SOURCE_DIR_CUTLASS" "${lib.getDev cutlass}")
(lib.cmakeFeature "FLASH_MLA_SRC_DIR" "${lib.getDev flashmla}")
(lib.cmakeFeature "VLLM_FLASH_ATTN_SRC_DIR" "${lib.getDev vllm-flash-attn'}")
(lib.cmakeFeature "TORCH_CUDA_ARCH_LIST" "${gpuTargetString}")
(lib.cmakeFeature "CUTLASS_NVCC_ARCHS_ENABLED" "${cudaPackages.flags.cmakeCudaArchitecturesString}")
(lib.cmakeFeature "CUDA_TOOLKIT_ROOT_DIR" "${symlinkJoin {
name = "cuda-merged-${cudaPackages.cudaMajorMinorVersion}";
paths = builtins.concatMap getAllOutputs mergedCudaLibraries;
}}")
(lib.cmakeFeature "CAFFE2_USE_CUDNN" "ON")
(lib.cmakeFeature "CAFFE2_USE_CUFILE" "ON")
(lib.cmakeFeature "CUTLASS_ENABLE_CUBLAS" "ON")
];
env =
lib.optionalAttrs cudaSupport {
VLLM_TARGET_DEVICE = "cuda";
CUDA_HOME = "${lib.getDev cudaPackages.cuda_nvcc}";
}
// lib.optionalAttrs rocmSupport {
VLLM_TARGET_DEVICE = "rocm";
# Otherwise it tries to enumerate the host's supported ROCm gfx archs, which is not possible in the sandbox.
PYTORCH_ROCM_ARCH = lib.strings.concatStringsSep ";" rocmPackages.clr.gpuTargets;
ROCM_HOME = "${rocmPackages.clr}";
}
// lib.optionalAttrs cpuSupport {
VLLM_TARGET_DEVICE = "cpu";
FETCHCONTENT_SOURCE_DIR_ONEDNN = "${oneDNN.src}";
};
preConfigure = ''
# See: https://github.com/vllm-project/vllm/blob/v0.7.1/setup.py#L75-L109
# There's also NVCC_THREADS but Nix/Nixpkgs doesn't really have this concept.
export MAX_JOBS="$NIX_BUILD_CORES"
'';
pythonRelaxDeps = true;
pythonImportsCheck = [ "vllm" ];
passthru = {
# make internal dependency available to overlays
vllm-flash-attn = vllm-flash-attn';
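# e.g. (sketch) a downstream overlay can substitute its own build:
#   vllm.override { vllm-flash-attn = myVllmFlashAttn; }  # `myVllmFlashAttn` is a hypothetical derivation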
# skip bulk updates: the pinned cutlass fetcher must be updated manually alongside the version
skipBulkUpdate = true;
};
meta = {
description = "High-throughput and memory-efficient inference and serving engine for LLMs";
changelog = "https://github.com/vllm-project/vllm/releases/tag/v${version}";
homepage = "https://github.com/vllm-project/vllm";
license = lib.licenses.asl20;
maintainers = with lib.maintainers; [
happysalada
lach
daniel-fahey
];
badPlatforms = [
# CMake Error at cmake/cpu_extension.cmake:78 (find_isa):
# find_isa Function invoked with incorrect arguments for function named:
# find_isa
"x86_64-darwin"
];
};
}