# NOTE: At runtime, FlashInfer will fall back to PyTorch's JIT compilation if a
# requested kernel wasn't pre-compiled in AOT mode, and JIT compilation always
# requires the CUDA toolkit (via nvcc) to be available.
#
# This means that if you plan to use flashinfer, you will need to set the
# environment variable `CUDA_HOME` to `cudatoolkit`.
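#
# For example, a downstream dev shell could provide it roughly like this
# (a sketch only; the attribute paths `python3Packages.flashinfer` and
# `cudaPackages.cudatoolkit` are assumptions that may differ in your nixpkgs):
#
#   pkgs.mkShell {
#     packages = [ pkgs.python3Packages.flashinfer ];
#     shellHook = ''
#       export CUDA_HOME=${pkgs.cudaPackages.cudatoolkit}
#     '';
#   }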
{
  lib,
  config,
  buildPythonPackage,
  fetchFromGitHub,

  # build-system
  setuptools,

  # nativeBuildInputs
  cmake,
  ninja,
  cudaPackages,

  # dependencies
  click,
  einops,
  numpy,
  pynvml,
  tabulate,
  torch,
  tqdm,
}:

buildPythonPackage rec {
  pname = "flashinfer";
  version = "0.3.1";
  pyproject = true;

  src = fetchFromGitHub {
    owner = "flashinfer-ai";
    repo = "flashinfer";
    tag = "v${version}";
    fetchSubmodules = true;
    hash = "sha256-e9PfLfU0DdoLKlXiHylCbGd125c7Iw9y4NDIOAP0xHs=";
  };

  build-system = [ setuptools ];

  nativeBuildInputs = [
    cmake
    ninja
    (lib.getBin cudaPackages.cuda_nvcc)
  ];
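
  # cmake here is only a build-time tool; the package itself is configured and
  # built with setuptools, so skip the cmake configure hook that listing cmake
  # in nativeBuildInputs would otherwise enable.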
  dontUseCmakeConfigure = true;

  buildInputs = with cudaPackages; [
    cuda_cccl
    cuda_cudart
    libcublas
    libcurand
  ];

  # FlashInfer offers two installation modes:
  #
  # JIT mode: CUDA kernels are compiled at runtime using PyTorch's JIT, with
  # compiled kernels cached for future use. JIT mode allows fast installation,
  # as no CUDA kernels are pre-compiled, making it ideal for development and
  # testing. The JIT version is also available as an sdist on PyPI.
  #
  # AOT mode: Core CUDA kernels are pre-compiled and included in the library,
  # reducing runtime compilation overhead. If a required kernel is not
  # pre-compiled, it will be compiled at runtime using JIT. AOT mode is
  # recommended for production environments.
  #
  # Here we opt for the AOT version.
  preConfigure = ''
    export FLASHINFER_ENABLE_AOT=1
    export TORCH_NVCC_FLAGS="--maxrregcount=64"
    export MAX_JOBS="$NIX_BUILD_CORES"
  '';
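
  # A consumer who wants the JIT-only variant could override this derivation;
  # a rough sketch (untested, and it assumes that leaving FLASHINFER_ENABLE_AOT
  # unset is enough to skip AOT kernel compilation):
  #
  #   flashinfer.overridePythonAttrs (old: {
  #     preConfigure = ''
  #       export MAX_JOBS="$NIX_BUILD_CORES"
  #     '';
  #   })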
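
  # Only pre-compile (AOT) kernels for the CUDA capabilities that this torch
  # build targets; concatStringsSep turns e.g. [ "8.9" "9.0" ] into "8.9;9.0".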
  FLASHINFER_CUDA_ARCH_LIST = lib.concatStringsSep ";" torch.cudaCapabilities;

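  # Upstream's metadata declares a dependency on nvidia-cudnn-frontend, which
  # (as assumed here) is not available as a Nix python package; drop it.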
  pythonRemoveDeps = [
    "nvidia-cudnn-frontend"
  ];

  dependencies = [
    click
    einops
    numpy
    pynvml
    tabulate
    torch
    tqdm
  ];

  meta = {
    broken = !torch.cudaSupport || !config.cudaSupport;
    homepage = "https://flashinfer.ai/";
    description = "Library and kernel generator for Large Language Models";
    longDescription = ''
      FlashInfer is a library and kernel generator for Large Language Models
      that provides high-performance implementations of LLM GPU kernels such
      as FlashAttention, PageAttention and LoRA. FlashInfer focuses on LLM
      serving and inference, and delivers state-of-the-art performance across
      diverse scenarios.
    '';
    license = lib.licenses.asl20;
    maintainers = with lib.maintainers; [ breakds ];
  };
}