# NOTE: At runtime, FlashInfer will fall back to PyTorch’s JIT compilation if a
# requested kernel wasn’t pre-compiled in AOT mode, and JIT compilation always
# requires the CUDA toolkit (via nvcc) to be available.
#
# This means that if you plan to use flashinfer, you will need to set the
# environment variable `CUDA_HOME` to `cudatoolkit` (see the example sketch at
# the end of this file).
{
  lib,
  config,
  buildPythonPackage,
  fetchFromGitHub,

  # build-system
  setuptools,

  # nativeBuildInputs
  cmake,
  ninja,
  cudaPackages,

  # dependencies
  click,
  einops,
  numpy,
  pynvml,
  tabulate,
  torch,
  tqdm,
}:

buildPythonPackage rec {
  pname = "flashinfer";
  version = "0.3.1";
  pyproject = true;

  src = fetchFromGitHub {
    owner = "flashinfer-ai";
    repo = "flashinfer";
    tag = "v${version}";
    fetchSubmodules = true;
    hash = "sha256-e9PfLfU0DdoLKlXiHylCbGd125c7Iw9y4NDIOAP0xHs=";
  };

  build-system = [ setuptools ];

  nativeBuildInputs = [
    cmake
    ninja
    (lib.getBin cudaPackages.cuda_nvcc)
  ];

  dontUseCmakeConfigure = true;

  buildInputs = with cudaPackages; [
    cuda_cccl
    cuda_cudart
    libcublas
    libcurand
  ];

  # FlashInfer offers two installation modes:
  #
  # JIT mode: CUDA kernels are compiled at runtime using PyTorch’s JIT, with
  # compiled kernels cached for future use. JIT mode allows fast installation,
  # as no CUDA kernels are pre-compiled, making it ideal for development and
  # testing. The JIT version is also available as an sdist on PyPI.
  #
  # AOT mode: Core CUDA kernels are pre-compiled and included in the library,
  # reducing runtime compilation overhead. If a required kernel is not
  # pre-compiled, it will be compiled at runtime using JIT. AOT mode is
  # recommended for production environments.
  #
  # Here we opt for the AOT version.
  preConfigure = ''
    export FLASHINFER_ENABLE_AOT=1
    export TORCH_NVCC_FLAGS="--maxrregcount=64"
    export MAX_JOBS="$NIX_BUILD_CORES"
  '';

  FLASHINFER_CUDA_ARCH_LIST = lib.concatStringsSep ";" torch.cudaCapabilities;

  pythonRemoveDeps = [ "nvidia-cudnn-frontend" ];

  dependencies = [
    click
    einops
    numpy
    pynvml
    tabulate
    torch
    tqdm
  ];

  meta = {
    broken = !torch.cudaSupport || !config.cudaSupport;
    homepage = "https://flashinfer.ai/";
    description = "Library and kernel generator for Large Language Models";
    longDescription = ''
      FlashInfer is a library and kernel generator for Large Language Models
      that provides a high-performance implementation of LLM GPU kernels such
      as FlashAttention, PageAttention and LoRA. FlashInfer focuses on LLM
      serving and inference, and delivers state-of-the-art performance across
      diverse scenarios.
    '';
    license = lib.licenses.asl20;
    maintainers = with lib.maintainers; [ breakds ];
  };
}
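# Example: a minimal sketch of how a consumer of this package might satisfy
# the CUDA_HOME requirement from the NOTE above by exposing the CUDA toolkit
# in a development shell. This is illustrative only and not part of the
# derivation; `pkgs.mkShell`, `python3.withPackages`, and
# `cudaPackages.cudatoolkit` are the usual nixpkgs attributes, but the exact
# shape of your environment may differ:
#
#   pkgs.mkShell {
#     packages = [
#       (pkgs.python3.withPackages (ps: [ ps.flashinfer ]))
#     ];
#     shellHook = ''
#       # Point FlashInfer's JIT fallback at the CUDA toolkit (nvcc).
#       export CUDA_HOME=${pkgs.cudaPackages.cudatoolkit}
#     '';
#   }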
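# Example: a minimal sketch of how the architecture list fed into
# FLASHINFER_CUDA_ARCH_LIST can be influenced from the consuming side. It is
# derived from torch.cudaCapabilities, which typically follows the
# nixpkgs-wide cudaCapabilities setting; the capability value "8.6" below is
# only an illustration:
#
#   import <nixpkgs> {
#     config = {
#       allowUnfree = true;
#       cudaSupport = true;
#       cudaCapabilities = [ "8.6" ];
#     };
#   }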