Files
nixpkgs/pkgs/development/python-modules/unstructured/default.nix
2025-09-26 19:47:12 +00:00

286 lines
4.5 KiB
Nix

{
  lib,
  buildPythonPackage,
  fetchFromGitHub,

  # build-system
  setuptools,

  # base dependencies: networking and async
  anyio,
  backoff,
  certifi,
  httpcore,
  httpx,
  h11,
  nest-asyncio,
  requests,
  requests-toolbelt,
  sniffio,
  urllib3,

  # base dependencies: parsing and text processing
  beautifulsoup4,
  chardet,
  charset-normalizer,
  emoji,
  filetype,
  html5lib,
  idna,
  joblib,
  # jsonpath-python,
  nltk,
  olefile,
  orderly-set,
  python-dateutil,
  python-iso639,
  python-magic,
  python-oxmsg,
  rapidfuzz,
  regex,
  soupsieve,
  webencodings,

  # base dependencies: data handling
  dataclasses-json,
  deepdiff,
  marshmallow,
  mypy-extensions,
  packaging,
  typing-extensions,
  typing-inspect,

  # base dependencies: system utilities
  cffi,
  cryptography,
  psutil,
  pycparser,
  six,
  tqdm,
  wrapt,

  # document format support
  markdown,
  pdfminer-six,
  pdfplumber,
  # pi-heif,
  pikepdf,
  pypandoc,
  pypdf,
  python-docx,
  unstructured-client,
  # unstructured-pytesseract,

  # optional dependencies, grouped by extra
  # csv
  pytz,
  tzdata,

  # markdown
  importlib-metadata,
  zipp,

  # pdf
  opencv-python,
  # paddlepaddle is accepted here, but its only use in the package body
  # (the paddleocr extra) is commented out.
  paddlepaddle,
  pdf2image,
  # unstructured-paddleocr,

  # pptx
  lxml,
  pillow,
  python-pptx,
  xlsxwriter,

  # xlsx
  et-xmlfile,
  networkx,
  numpy,
  openpyxl,
  pandas,
  xlrd,

  # huggingface
  langdetect,
  sacremoses,
  sentencepiece,
  torch,
  transformers,

  # local-inference
  unstructured-inference,

  # test dependencies
  pytestCheckHook,
  black,
  coverage,
  click,
  freezegun,
  # label-studio-sdk,
  mypy,
  pytest-cov-stub,
  pytest-mock,
  vcrpy,
  grpcio,
}:
# Package definition for the `unstructured` Python library, built from the
# upstream GitHub tag that matches `version`.
let
  version = "0.18.15";
in
buildPythonPackage rec {
  pname = "unstructured";
  inherit version;
  pyproject = true;

  src = fetchFromGitHub {
    owner = "Unstructured-IO";
    repo = "unstructured";
    tag = version;
    hash = "sha256-rzspozQQ+WrS3cKAGe9O7clAIDo4P/6PdZzCXIRdNn8=";
  };

  build-system = [ setuptools ];

  dependencies = [
    # Base dependencies
    anyio
    backoff
    beautifulsoup4
    certifi
    cffi
    chardet
    charset-normalizer
    click
    cryptography
    dataclasses-json
    deepdiff
    emoji
    filetype
    h11
    html5lib
    httpcore
    httpx
    idna
    joblib
    # jsonpath-python
    langdetect
    lxml
    marshmallow
    mypy-extensions
    nest-asyncio
    nltk
    numpy
    olefile
    orderly-set
    packaging
    psutil
    pycparser
    pypdf
    python-dateutil
    python-iso639
    python-magic
    python-oxmsg
    rapidfuzz
    regex
    requests
    requests-toolbelt
    six
    sniffio
    soupsieve
    tqdm
    typing-extensions
    typing-inspect
    unstructured-client
    urllib3
    webencodings
    wrapt
  ];

  # Extras, one list per document format or feature. `rec` lets `all-docs`
  # refer to its sibling lists.
  optional-dependencies = rec {
    # Concatenation of the per-format extras; `paddleocr` and `huggingface`
    # are not part of it.
    all-docs = csv ++ docx ++ epub ++ pdf ++ req-markdown ++ odt ++ org ++ pptx ++ xlsx;
    csv = [
      numpy
      pandas
      python-dateutil
      pytz
      tzdata
    ];
    docx = [
      lxml
      python-docx
      typing-extensions
    ];
    epub = [ pypandoc ];
    req-markdown = [
      importlib-metadata
      markdown
      zipp
    ];
    odt = [
      lxml
      pypandoc
      python-docx
      typing-extensions
    ];
    org = [
      pypandoc
    ];
    paddleocr = [
      opencv-python
      # paddlepaddle # 3.12 not supported for now
      pdf2image
      # unstructured-paddleocr
    ];
    pdf = [
      pdf2image
      pdfminer-six
      pdfplumber
      # pi-heif
      pikepdf
      pypdf
      unstructured-inference
      # unstructured-pytesseract
    ];
    pptx = [
      lxml
      pillow
      python-pptx
      xlsxwriter
    ];
    xlsx = [
      et-xmlfile
      networkx
      numpy
      openpyxl
      pandas
      xlrd
    ];
    huggingface = [
      langdetect
      sacremoses
      sentencepiece
      torch
      transformers
    ];
  };

  pythonImportsCheck = [ "unstructured" ];

  # The tests try to download punkt from nltk.
  # TODO: figure out how to make it available so the tests can be enabled.
  doCheck = false;

  # Inputs for the test suite, which is currently disabled via doCheck above.
  nativeCheckInputs = [
    pytestCheckHook
    black
    coverage
    click
    freezegun
    mypy
    pytest-cov-stub
    pytest-mock
    vcrpy
    grpcio
  ];

  # Attributes are referenced through `lib.` explicitly instead of wrapping the
  # whole set in `with lib;`, so no unrelated `lib` names leak into scope.
  meta = {
    description = "Open source libraries and APIs to build custom preprocessing pipelines for labeling, training, or production machine learning pipelines";
    # NOTE(review): confirm this release still installs an
    # `unstructured-ingest` executable.
    mainProgram = "unstructured-ingest";
    homepage = "https://github.com/Unstructured-IO/unstructured";
    changelog = "https://github.com/Unstructured-IO/unstructured/blob/${src.tag}/CHANGELOG.md";
    license = lib.licenses.asl20;
    maintainers = with lib.maintainers; [ happysalada ];
  };
}