95 lines
1.8 KiB
Nix
95 lines
1.8 KiB
Nix
{
  lib,
  buildPythonPackage,
  fetchFromGitHub,
  replaceVars,

  # build-system
  cmake,
  pybind11,
  setuptools,

  # dependencies
  sentencepiece,
  tiktoken,
  tokenizers,

  # tests
  pytestCheckHook,
  transformers,
}:

let
  # pybind11 source tree that gets substituted into the patch applied below.
  # See the corresponding lines in upstream's CMakeLists.txt:
  # https://github.com/meta-pytorch/tokenizers/blob/v1.0.1/CMakeLists.txt#L174-L175
  pybind11Source = fetchFromGitHub {
    owner = "pybind";
    repo = "pybind11";
    tag = "v2.13.6";
    hash = "sha256-SNLdtrOjaC3lGHN9MAqTf51U9EzNKQLyTMNPe0GcdrU=";
  };
in
# Python package definition for pytorch-tokenizers, built from the
# meta-pytorch/tokenizers GitHub repository (submodules included).
buildPythonPackage rec {
  pname = "pytorch-tokenizers";
  version = "1.0.1";
  pyproject = true;

  src = fetchFromGitHub {
    owner = "meta-pytorch";
    repo = "tokenizers";
    tag = "v${version}";
    fetchSubmodules = true;
    hash = "sha256-1BGazimbauNBN/VfLiuhk21VEhbP07GEpPc+GAfKTQY=";
  };

  # The patch file is a template: its `pybind11` placeholder is filled in with
  # the store path of the source fetched above.
  # NOTE(review): judging by the patch's file name, this stops the build from
  # downloading pybind11 itself -- confirm against the patch contents.
  patches = [
    (replaceVars ./dont-fetch-pybind11.patch {
      pybind11 = pybind11Source;
    })
  ];

  # Strip the "pip>=23" and "pytest" entries out of pyproject.toml.
  # `--replace-fail` aborts the build if either entry is no longer present.
  postPatch = ''
    substituteInPlace pyproject.toml \
      --replace-fail '"pip>=23",' "" \
      --replace-fail '"pytest",' ""
  '';

  build-system = [
    cmake
    pybind11
    setuptools
  ];

  # cmake is only needed as a tool here; its own configure phase is disabled.
  # NOTE(review): presumably the pyproject build drives CMake itself -- confirm.
  dontUseCmakeConfigure = true;

  dependencies = [
    sentencepiece
    tiktoken
    tokenizers
  ];

  nativeCheckInputs = [
    pytestCheckHook
    transformers
  ];

  # Delete the in-tree `pytorch_tokenizers` directory before running the tests.
  # NOTE(review): presumably so that the installed package (including the
  # compiled extension module) is imported instead of the source tree -- confirm.
  preCheck = ''
    rm -rf pytorch_tokenizers
  '';

  disabledTestPaths = [
    # Require downloading models from huggingface
    "test/test_hf_tokenizer.py"
  ];

  # Check both the top-level package and its `pytorch_tokenizers_cpp` submodule.
  pythonImportsCheck = [
    "pytorch_tokenizers"
    "pytorch_tokenizers.pytorch_tokenizers_cpp"
  ];

  meta = {
    description = "C++ implementations for various tokenizers (sentencepiece, tiktoken, etc.)";
    homepage = "https://github.com/meta-pytorch/tokenizers";
    license = lib.licenses.bsd3;
    maintainers = [ lib.maintainers.GaetanLepage ];
  };
}