diff --git a/nixos/doc/manual/release-notes/rl-2511.section.md b/nixos/doc/manual/release-notes/rl-2511.section.md
index d176a850962f..dcb03a56919f 100644
--- a/nixos/doc/manual/release-notes/rl-2511.section.md
+++ b/nixos/doc/manual/release-notes/rl-2511.section.md
@@ -50,6 +50,8 @@
 
 - [go-httpbin](https://github.com/mccutchen/go-httpbin), a reasonably complete and well-tested golang port of httpbin, with zero dependencies outside the go stdlib. Available as [services.go-httpbin](#opt-services.go-httpbin.enable).
 
+- [llama-swap](https://github.com/mostlygeek/llama-swap), a lightweight transparent proxy server that provides automatic model swapping to llama.cpp's server (or any server with an OpenAI compatible endpoint). Available as [services.llama-swap](#opt-services.llama-swap.enable).
+
 - [tuwunel](https://matrix-construct.github.io/tuwunel/), a federated chat server implementing the Matrix protocol, forked from Conduwuit. Available as [services.matrix-tuwunel](#opt-services.matrix-tuwunel.enable).
 
 - [Broadcast Box](https://github.com/Glimesh/broadcast-box), a WebRTC broadcast server. Available as [services.broadcast-box](options.html#opt-services.broadcast-box.enable).
diff --git a/nixos/modules/module-list.nix b/nixos/modules/module-list.nix
index 6390837ca4cb..5326f1b216d5 100644
--- a/nixos/modules/module-list.nix
+++ b/nixos/modules/module-list.nix
@@ -1217,6 +1217,7 @@
   ./services/networking/libreswan.nix
   ./services/networking/livekit-ingress.nix
   ./services/networking/livekit.nix
+  ./services/networking/llama-swap.nix
   ./services/networking/lldpd.nix
   ./services/networking/logmein-hamachi.nix
   ./services/networking/lokinet.nix
diff --git a/nixos/modules/services/networking/llama-swap.nix b/nixos/modules/services/networking/llama-swap.nix
new file mode 100644
index 000000000000..c23107a89d58
--- /dev/null
+++ b/nixos/modules/services/networking/llama-swap.nix
@@ -0,0 +1,127 @@
+{
+  config,
+  lib,
+  pkgs,
+  ...
+}:
+let
+  cfg = config.services.llama-swap;
+  settingsFormat = pkgs.formats.yaml { };
+  # YAML rendering of `settings`, passed to llama-swap through `--config`.
+  configFile = settingsFormat.generate "config.yaml" cfg.settings;
+in
+{
+  options.services.llama-swap = {
+    enable = lib.mkEnableOption "llama-swap";
+
+    package = lib.mkPackageOption pkgs "llama-swap" { };
+
+    port = lib.mkOption {
+      default = 8080;
+      example = 11343;
+      type = lib.types.port;
+      description = ''
+        Port that llama-swap listens on.
+      '';
+    };
+
+    openFirewall = lib.mkOption {
+      type = lib.types.bool;
+      default = false;
+      description = ''
+        Whether to open the firewall for llama-swap.
+        This adds {option}`services.llama-swap.port` to [](#opt-networking.firewall.allowedTCPPorts).
+      '';
+    };
+
+    settings = lib.mkOption {
+      type = lib.types.submodule { freeformType = settingsFormat.type; };
+      description = ''
+        llama-swap configuration. Refer to the [llama-swap example configuration](https://github.com/mostlygeek/llama-swap/blob/main/config.example.yaml)
+        for details on supported values.
+      '';
+      # Escaping note: in the rendered example, ${llama-server} is interpolated by
+      # Nix, while \${PORT} is kept as a literal ${PORT} placeholder in the command.
+      example = lib.literalExpression ''
+        let
+          llama-cpp = pkgs.llama-cpp.override { rocmSupport = true; };
+          llama-server = lib.getExe' llama-cpp "llama-server";
+        in
+        {
+          healthCheckTimeout = 60;
+          models = {
+            "some-model" = {
+              cmd = "''${llama-server} --port \''${PORT} -m /var/lib/llama-cpp/models/some-model.gguf -ngl 0 --no-webui";
+              aliases = [
+                "the-best"
+              ];
+            };
+            "other-model" = {
+              proxy = "http://127.0.0.1:5555";
+              cmd = "''${llama-server} --port 5555 -m /var/lib/llama-cpp/models/other-model.gguf -ngl 0 -c 4096 -np 4 --no-webui";
+              concurrencyLimit = 4;
+            };
+          };
+        };
+      '';
+    };
+  };
+  config = lib.mkIf cfg.enable {
+    systemd.services.llama-swap = {
+      description = "Model swapping for LLaMA C++ Server (or any local OpenAI compatible server)";
+      after = [ "network.target" ];
+      wantedBy = [ "multi-user.target" ];
+
+      serviceConfig = {
+        Type = "exec";
+        ExecStart = "${lib.getExe cfg.package} --listen :${toString cfg.port} --config ${configFile}";
+        Restart = "on-failure";
+        RestartSec = 3;
+
+        # for GPU acceleration
+        PrivateDevices = false;
+
+        # hardening
+        DynamicUser = true;
+        CapabilityBoundingSet = "";
+        RestrictAddressFamilies = [
+          "AF_INET"
+          "AF_INET6"
+          "AF_UNIX"
+        ];
+        NoNewPrivileges = true;
+        PrivateMounts = true;
+        PrivateTmp = true;
+        PrivateUsers = true;
+        ProtectClock = true;
+        ProtectControlGroups = true;
+        ProtectHome = true;
+        ProtectKernelLogs = true;
+        ProtectKernelModules = true;
+        ProtectKernelTunables = true;
+        ProtectSystem = "strict";
+        MemoryDenyWriteExecute = true;
+        LockPersonality = true;
+        RemoveIPC = true;
+        RestrictNamespaces = true;
+        RestrictRealtime = true;
+        RestrictSUIDSGID = true;
+        SystemCallArchitectures = "native";
+        SystemCallFilter = [
+          "@system-service"
+          "~@privileged"
+        ];
+        SystemCallErrorNumber = "EPERM";
+        ProtectProc = "invisible";
+        ProtectHostname = true;
+        ProcSubset = "pid";
+      };
+    };
+    networking.firewall = lib.mkIf cfg.openFirewall { allowedTCPPorts = [ cfg.port ]; };
+  };
+
+  meta.maintainers = with lib.maintainers; [
+    jk
+    podium868909
+  ];
+}