llama-swap: init module

Co-authored-by: podium868909 <89096245@proton.me>
2025-06-16 19:57:05 +08:00
parent 615db5eecf
commit 110edff547
3 changed files with 127 additions and 0 deletions
--- a/nixos/doc/manual/release-notes/rl-2511.section.md
+++ b/nixos/doc/manual/release-notes/rl-2511.section.md
@@ -50,6 +50,8 @@

 - [go-httpbin](https://github.com/mccutchen/go-httpbin), a reasonably complete and well-tested golang port of httpbin, with zero dependencies outside the go stdlib. Available as [services.go-httpbin](#opt-services.go-httpbin.enable).

+- [llama-swap](https://github.com/mostlygeek/llama-swap), a lightweight transparent proxy server that provides automatic model swapping for llama.cpp's server (or any server with an OpenAI-compatible endpoint). Available as [services.llama-swap](#opt-services.llama-swap.enable).
+
 - [tuwunel](https://matrix-construct.github.io/tuwunel/), a federated chat server implementing the Matrix protocol, forked from Conduwuit. Available as [services.matrix-tuwunel](#opt-services.matrix-tuwunel.enable).

 - [Broadcast Box](https://github.com/Glimesh/broadcast-box), a WebRTC broadcast server. Available as [services.broadcast-box](options.html#opt-services.broadcast-box.enable).
--- a/nixos/modules/module-list.nix
+++ b/nixos/modules/module-list.nix
@@ -1217,6 +1217,7 @@
  ./services/networking/libreswan.nix
  ./services/networking/livekit-ingress.nix
  ./services/networking/livekit.nix
+  ./services/networking/llama-swap.nix
  ./services/networking/lldpd.nix
  ./services/networking/logmein-hamachi.nix
  ./services/networking/lokinet.nix
--- a/nixos/modules/services/networking/llama-swap.nix
+++ b/nixos/modules/services/networking/llama-swap.nix
@@ -0,0 +1,124 @@
+{
+  config,
+  lib,
+  pkgs,
+  ...
+}:
+let
+  cfg = config.services.llama-swap; # values of the options declared below
+  settingsFormat = pkgs.formats.yaml { }; # supplies both the option type and the YAML generator
+  configFile = settingsFormat.generate "config.yaml" cfg.settings; # cfg.settings rendered as config.yaml, passed via --config
+in
+{
+  options.services.llama-swap = {
+    enable = lib.mkEnableOption "the llama-swap service"; # mkEnableOption itself prepends "Whether to enable "
+
+    package = lib.mkPackageOption pkgs "llama-swap" { }; # package providing the llama-swap executable
+
+    port = lib.mkOption {
+      type = lib.types.port;
+      description = ''
+        Port that llama-swap listens on.
+      '';
+      default = 8080;
+      example = 11343;
+    };
+
+    openFirewall = lib.mkOption {
+      type = lib.types.bool;
+      default = false;
+      description = ''
+        Whether to open the firewall for llama-swap.
+        This adds {option}`services.llama-swap.port` to [](#opt-networking.firewall.allowedTCPPorts).
+      '';
+    };
+
+    settings = lib.mkOption {
+      type = lib.types.submodule { freeformType = settingsFormat.type; };
+      description = ''
+        llama-swap configuration. Refer to the [llama-swap example configuration](https://github.com/mostlygeek/llama-swap/blob/main/config.example.yaml)
+        for details on supported values.
+      '';
+      # Escaping inside the indented string below: `''${` renders a literal `${` and
+      # `\''${` renders `\${`, so the displayed example is valid Nix and keeps ${PORT} literal.
+      example = lib.literalExpression ''
+        let
+          llama-cpp = pkgs.llama-cpp.override { rocmSupport = true; };
+          llama-server = lib.getExe' llama-cpp "llama-server";
+        in
+        {
+          healthCheckTimeout = 60;
+          models = {
+            "some-model" = {
+              cmd = "''${llama-server} --port \''${PORT} -m /var/lib/llama-cpp/models/some-model.gguf -ngl 0 --no-webui";
+              aliases = [ "the-best" ];
+            };
+            "other-model" = {
+              proxy = "http://127.0.0.1:5555";
+              cmd = "''${llama-server} --port 5555 -m /var/lib/llama-cpp/models/other-model.gguf -ngl 0 -c 4096 -np 4 --no-webui";
+              concurrencyLimit = 4;
+            };
+          };
+        }
+      '';
+    };
+  };
+  config = lib.mkIf cfg.enable {
+    systemd.services.llama-swap = {
+      description = "Model swapping for LLaMA C++ Server (or any local OpenAI compatible server)";
+      after = [ "network.target" ];
+      wantedBy = [ "multi-user.target" ];
+
+      serviceConfig = {
+        Type = "exec";
+        ExecStart = "${lib.getExe cfg.package} --listen :${toString cfg.port} --config ${configFile}";
+        Restart = "on-failure";
+        RestartSec = 3;
+
+        # keep device nodes reachable; needed for GPU acceleration
+        PrivateDevices = false;
+
+        # hardening (the sandbox also applies to the commands llama-swap launches)
+        DynamicUser = true;
+        CapabilityBoundingSet = "";
+        RestrictAddressFamilies = [
+          "AF_INET"
+          "AF_INET6"
+          "AF_UNIX"
+        ];
+        NoNewPrivileges = true;
+        PrivateMounts = true;
+        PrivateTmp = true;
+        PrivateUsers = true;
+        ProtectClock = true;
+        ProtectControlGroups = true;
+        ProtectHome = true;
+        ProtectKernelLogs = true;
+        ProtectKernelModules = true;
+        ProtectKernelTunables = true;
+        ProtectSystem = "strict";
+        MemoryDenyWriteExecute = true;
+        LockPersonality = true;
+        RemoveIPC = true;
+        RestrictNamespaces = true;
+        RestrictRealtime = true;
+        RestrictSUIDSGID = true;
+        SystemCallArchitectures = "native";
+        SystemCallFilter = [
+          "@system-service"
+          "~@privileged"
+        ];
+        SystemCallErrorNumber = "EPERM";
+        ProtectProc = "invisible";
+        ProtectHostname = true;
+        ProcSubset = "pid";
+      };
+    };
+    networking.firewall.allowedTCPPorts = lib.mkIf cfg.openFirewall [ cfg.port ]; # only when openFirewall is set
+  };
+
+  meta.maintainers = [
+    lib.maintainers.jk
+    lib.maintainers.podium868909
+  ];
+}