286 lines
4.5 KiB
Nix
286 lines
4.5 KiB
Nix
{
  lib,
  buildPythonPackage,
  fetchFromGitHub,

  # build-system
  setuptools,

  # core networking and async dependencies
  anyio,
  backoff,
  certifi,
  httpcore,
  httpx,
  h11,
  nest-asyncio,
  requests,
  requests-toolbelt,
  sniffio,
  urllib3,

  # core parsing and processing
  beautifulsoup4,
  chardet,
  charset-normalizer,
  emoji,
  filetype,
  html5lib,
  idna,
  joblib,
  # jsonpath-python, # disabled here; presumably not available in the package set -- TODO confirm
  nltk,
  olefile,
  orderly-set,
  python-dateutil,
  python-iso639,
  python-magic,
  python-oxmsg,
  rapidfuzz,
  regex,
  soupsieve,
  webencodings,

  # core data handling
  dataclasses-json,
  deepdiff,
  marshmallow,
  mypy-extensions,
  packaging,
  typing-extensions,
  typing-inspect,

  # core system utilities
  cffi,
  cryptography,
  psutil,
  pycparser,
  six,
  tqdm,
  wrapt,

  # document format support
  markdown,
  pdfminer-six,
  pdfplumber,
  # pi-heif, # disabled here; presumably not available in the package set -- TODO confirm
  pikepdf,
  pypandoc,
  pypdf,
  python-docx,
  unstructured-client,
  # unstructured-pytesseract, # disabled here; presumably not available in the package set -- TODO confirm

  # optional dependencies
  # csv
  pytz,
  tzdata,
  # markdown
  importlib-metadata,
  zipp,
  # pdf / paddleocr
  opencv-python,
  # paddlepaddle is accepted as an argument but currently unused: its entry
  # in the `paddleocr` extra below is commented out.
  paddlepaddle,
  pdf2image,
  # unstructured-paddleocr, # disabled here; presumably not available in the package set -- TODO confirm
  # pptx
  lxml,
  pillow,
  python-pptx,
  xlsxwriter,
  # xlsx
  et-xmlfile,
  networkx,
  numpy,
  openpyxl,
  pandas,
  xlrd,
  # huggingface
  langdetect,
  sacremoses,
  sentencepiece,
  torch,
  transformers,
  # local-inference
  unstructured-inference,

  # test dependencies
  pytestCheckHook,
  black,
  coverage,
  click,
  freezegun,
  # label-studio-sdk, # disabled here; presumably not available in the package set -- TODO confirm
  mypy,
  pytest-cov-stub,
  pytest-mock,
  vcrpy,
  grpcio,
}:

# Builds the `unstructured` Python package from the Unstructured-IO/unstructured
# GitHub repository at the tag matching `version`, using the setuptools
# pyproject build. Base requirements go in `dependencies`; per-format extras
# are exposed through `optional-dependencies`.
buildPythonPackage rec {
  pname = "unstructured";
  version = "0.18.15";
  pyproject = true;

  src = fetchFromGitHub {
    owner = "Unstructured-IO";
    repo = "unstructured";
    # The upstream tag is the bare version string.
    tag = version;
    hash = "sha256-rzspozQQ+WrS3cKAGe9O7clAIDo4P/6PdZzCXIRdNn8=";
  };

  build-system = [ setuptools ];

  dependencies = [
    # Base dependencies
    anyio
    backoff
    beautifulsoup4
    certifi
    cffi
    chardet
    charset-normalizer
    click
    cryptography
    dataclasses-json
    deepdiff
    emoji
    filetype
    h11
    html5lib
    httpcore
    httpx
    idna
    joblib
    # jsonpath-python
    langdetect
    lxml
    marshmallow
    mypy-extensions
    nest-asyncio
    nltk
    numpy
    olefile
    orderly-set
    packaging
    psutil
    pycparser
    pypdf
    python-dateutil
    python-iso639
    python-magic
    python-oxmsg
    rapidfuzz
    regex
    requests
    requests-toolbelt
    six
    sniffio
    soupsieve
    tqdm
    typing-extensions
    typing-inspect
    unstructured-client
    urllib3
    webencodings
    wrapt
  ];

  # `rec` lets `all-docs` be assembled from the individual extras defined
  # after it in the same set.
  optional-dependencies = rec {
    all-docs = csv ++ docx ++ epub ++ pdf ++ req-markdown ++ odt ++ org ++ pptx ++ xlsx;
    csv = [
      numpy
      pandas
      python-dateutil
      pytz
      tzdata
    ];
    docx = [
      lxml
      python-docx
      typing-extensions
    ];
    epub = [ pypandoc ];
    req-markdown = [
      importlib-metadata
      markdown
      zipp
    ];
    odt = [
      lxml
      pypandoc
      python-docx
      typing-extensions
    ];
    org = [ pypandoc ];
    # Not part of `all-docs`.
    paddleocr = [
      opencv-python
      # paddlepaddle # 3.12 not supported for now
      pdf2image
      # unstructured-paddleocr
    ];
    pdf = [
      pdf2image
      pdfminer-six
      pdfplumber
      # pi-heif
      pikepdf
      pypdf
      unstructured-inference
      # unstructured-pytesseract
    ];
    pptx = [
      lxml
      pillow
      python-pptx
      xlsxwriter
    ];
    xlsx = [
      et-xmlfile
      networkx
      numpy
      openpyxl
      pandas
      xlrd
    ];
    # Not part of `all-docs`.
    huggingface = [
      langdetect
      sacremoses
      sentencepiece
      torch
      transformers
    ];
  };

  pythonImportsCheck = [ "unstructured" ];

  # The test suite tries to download `punkt` through nltk.
  # TODO: figure out how to make it available so the tests can be enabled.
  doCheck = false;

  # Kept ready for when the tests are enabled; see `doCheck` above.
  nativeCheckInputs = [
    pytestCheckHook
    black
    coverage
    click
    freezegun
    mypy
    pytest-cov-stub
    pytest-mock
    vcrpy
    grpcio
  ];

  meta = {
    description = "Open source libraries and APIs to build custom preprocessing pipelines for labeling, training, or production machine learning pipelines";
    # NOTE(review): confirm this executable is still installed by this version;
    # the ingest CLI may now live in the separate unstructured-ingest package.
    mainProgram = "unstructured-ingest";
    homepage = "https://github.com/Unstructured-IO/unstructured";
    changelog = "https://github.com/Unstructured-IO/unstructured/blob/${src.tag}/CHANGELOG.md";
    license = lib.licenses.asl20;
    maintainers = with lib.maintainers; [ happysalada ];
  };
}