From d6b326d6590f23a2fc833e2649f3c7afc77130a4 Mon Sep 17 00:00:00 2001 From: Jacek Galowicz Date: Thu, 3 Jul 2025 09:46:59 +0000 Subject: [PATCH] test-driver: Implement debugging breakpoint hooks Co-authored-by: Maximilian Bosch --- .../writing-nixos-tests.section.md | 51 ++++++++++++++++++ nixos/doc/manual/redirects.json | 6 +++ nixos/lib/test-driver/default.nix | 2 + .../test-driver/src/test_driver/__init__.py | 10 ++++ .../lib/test-driver/src/test_driver/debug.py | 53 +++++++++++++++++++ .../lib/test-driver/src/test_driver/driver.py | 11 ++++ nixos/lib/test-script-prepend.py | 2 + nixos/lib/testing/run.nix | 39 +++++++++++--- 8 files changed, 167 insertions(+), 7 deletions(-) create mode 100644 nixos/lib/test-driver/src/test_driver/debug.py diff --git a/nixos/doc/manual/development/writing-nixos-tests.section.md b/nixos/doc/manual/development/writing-nixos-tests.section.md index 6dc84e6f7b67..f69ea279abb5 100644 --- a/nixos/doc/manual/development/writing-nixos-tests.section.md +++ b/nixos/doc/manual/development/writing-nixos-tests.section.md @@ -340,3 +340,54 @@ id-prefix: test-opt- list-id: test-options-list source: @NIXOS_TEST_OPTIONS_JSON@ ``` + +## Accessing VMs in the sandbox with SSH {#sec-test-sandbox-breakpoint} + +As explained in [](#sec-nixos-test-ssh-access), it's possible to configure an +SSH backdoor based on AF_VSOCK. This can be used to SSH into a VM of a running +build in a sandbox. + +This can be done when something in the test fails, e.g. + +```nix +{ + nodes.machine = {}; + + sshBackdoor.enable = true; + enableDebugHook = true; + + testScript = '' + start_all() + machine.succeed("false") # this will fail + ''; +} +``` + +For the AF_VSOCK feature to work, `/dev/vhost-vsock` is needed in the sandbox +which can be done with e.g. + +``` +nix-build -A nixosTests.foo --option sandbox-paths /dev/vhost-vsock +``` + +This will halt the test execution on a test-failure and print instructions +on how to enter the sandbox shell of the VM test. 
Inside, one can log into +e.g. `machine` with + +``` +ssh -F ./ssh_config vsock/3 +``` + +As described in [](#sec-nixos-test-ssh-access), the numbers for vsock start at +`3` instead of `1`. So the first VM in the network (sorted alphabetically) can +be accessed with `vsock/3`. + +Alternatively, it's possible to explicitly set a breakpoint with +`debug.breakpoint()`. This also has the benefit that one can step through +`testScript` with `pdb` like this: + +``` +$ sudo /nix/store/eeeee-attach +bash# telnet 127.0.0.1 4444 +pdb$ … +``` diff --git a/nixos/doc/manual/redirects.json b/nixos/doc/manual/redirects.json index e7c195379cc7..e08b4d31c36c 100644 --- a/nixos/doc/manual/redirects.json +++ b/nixos/doc/manual/redirects.json @@ -1902,6 +1902,9 @@ "test-opt-sshBackdoor.vsockOffset": [ "index.html#test-opt-sshBackdoor.vsockOffset" ], + "test-opt-enableDebugHook": [ + "index.html#test-opt-enableDebugHook" + ], "test-opt-defaults": [ "index.html#test-opt-defaults" ], @@ -2010,6 +2013,9 @@ "sec-nixos-test-testing-hardware-features": [ "index.html#sec-nixos-test-testing-hardware-features" ], + "sec-test-sandbox-breakpoint": [ + "index.html#sec-test-sandbox-breakpoint" + ], "chap-developing-the-test-driver": [ "index.html#chap-developing-the-test-driver" ], diff --git a/nixos/lib/test-driver/default.nix b/nixos/lib/test-driver/default.nix index 91db5d8be3c2..bb07bba363a1 100644 --- a/nixos/lib/test-driver/default.nix +++ b/nixos/lib/test-driver/default.nix @@ -14,6 +14,7 @@ extraPythonPackages ? 
(_: [ ]), nixosTests, }: + python3Packages.buildPythonApplication { pname = "nixos-test-driver"; version = "1.1"; @@ -32,6 +33,7 @@ python3Packages.buildPythonApplication { junit-xml ptpython ipython + remote-pdb ] ++ extraPythonPackages python3Packages; diff --git a/nixos/lib/test-driver/src/test_driver/__init__.py b/nixos/lib/test-driver/src/test_driver/__init__.py index 86e663da9b7d..823a948bc41b 100755 --- a/nixos/lib/test-driver/src/test_driver/__init__.py +++ b/nixos/lib/test-driver/src/test_driver/__init__.py @@ -5,6 +5,7 @@ from pathlib import Path import ptpython.ipython +from test_driver.debug import Debug, DebugAbstract, DebugNop from test_driver.driver import Driver from test_driver.logger import ( CompositeLogger, @@ -65,6 +66,10 @@ def main() -> None: help="drop into a python repl and run the tests interactively", action=argparse.BooleanOptionalAction, ) + arg_parser.add_argument( + "--debug-hook-attach", + help="Enable interactive debugging breakpoints for sandboxed runs", + ) arg_parser.add_argument( "--start-scripts", metavar="START-SCRIPT", @@ -129,6 +134,10 @@ def main() -> None: if not args.keep_vm_state: logger.info("Machine state will be reset. 
To keep it, pass --keep-vm-state") + debugger: DebugAbstract = DebugNop() + if args.debug_hook_attach is not None: + debugger = Debug(logger, args.debug_hook_attach) + with Driver( args.start_scripts, args.vlans, @@ -137,6 +146,7 @@ def main() -> None: logger, args.keep_vm_state, args.global_timeout, + debug=debugger, ) as driver: if args.interactive: history_dir = os.getcwd() diff --git a/nixos/lib/test-driver/src/test_driver/debug.py b/nixos/lib/test-driver/src/test_driver/debug.py new file mode 100644 index 000000000000..7f783fe96de8 --- /dev/null +++ b/nixos/lib/test-driver/src/test_driver/debug.py @@ -0,0 +1,53 @@ +import logging +import os +import random +import shutil +import subprocess +import sys +from abc import ABC, abstractmethod + +from remote_pdb import RemotePdb # type:ignore + +from test_driver.logger import AbstractLogger + + +class DebugAbstract(ABC): + @abstractmethod + def breakpoint(self, host: str = "127.0.0.1", port: int = 4444) -> None: + pass + + +class DebugNop(DebugAbstract): + def __init__(self) -> None: + pass + + def breakpoint(self, host: str = "127.0.0.1", port: int = 4444) -> None: + pass + + +class Debug(DebugAbstract): + def __init__(self, logger: AbstractLogger, attach_command: str) -> None: + self.breakpoint_on_failure = False + self.logger = logger + self.attach = attach_command + + def breakpoint(self, host: str = "127.0.0.1", port: int = 4444) -> None: + """ + Call this function to stop execution and put the process on sleep while + at the same time have the test driver provide a debug shell on TCP port + `port`. This is meant to be used for sandboxed tests that have the test + driver feature `enableDebugHook` enabled. 
+ """ + pattern = str(random.randrange(999999, 9999999)) + self.logger.log_test_error( + f"Breakpoint reached, run 'sudo {self.attach} {pattern}'" + ) + os.environ["bashInteractive"] = shutil.which("bash") # type:ignore + if os.fork() == 0: + subprocess.run(["sleep", pattern]) + else: + # RemotePdb writes log messages to both stderr AND the logger, + # which is the same here. Hence, disabling the remote_pdb logger + # to avoid duplicate messages in the build log. + logging.root.manager.loggerDict["remote_pdb"].disabled = True # type:ignore + RemotePdb(host=host, port=port).set_trace(sys._getframe().f_back) diff --git a/nixos/lib/test-driver/src/test_driver/driver.py b/nixos/lib/test-driver/src/test_driver/driver.py index 57b434f09e29..361bb1c2a93d 100644 --- a/nixos/lib/test-driver/src/test_driver/driver.py +++ b/nixos/lib/test-driver/src/test_driver/driver.py @@ -13,6 +13,7 @@ from unittest import TestCase from colorama import Style +from test_driver.debug import DebugAbstract, DebugNop from test_driver.errors import MachineError, RequestedAssertionFailed from test_driver.logger import AbstractLogger from test_driver.machine import Machine, NixStartScript, retry @@ -67,6 +68,7 @@ class Driver: global_timeout: int race_timer: threading.Timer logger: AbstractLogger + debug: DebugAbstract def __init__( self, @@ -77,12 +79,14 @@ class Driver: logger: AbstractLogger, keep_vm_state: bool = False, global_timeout: int = 24 * 60 * 60 * 7, + debug: DebugAbstract = DebugNop(), ): self.tests = tests self.out_dir = out_dir self.global_timeout = global_timeout self.race_timer = threading.Timer(global_timeout, self.terminate_test) self.logger = logger + self.debug = debug tmp_dir = get_tmp_dir() @@ -159,6 +163,7 @@ class Driver: polling_condition=self.polling_condition, Machine=Machine, # for typing t=AssertionTester(), + debug=self.debug, ) machine_symbols = {pythonize_name(m.name): m for m in self.machines} # If there's exactly one machine, make it available under the name @@ 
-224,8 +229,14 @@ class Driver: for line in f"{exc_prefix}: {exc}".splitlines(): self.logger.log_test_error(line) + self.debug.breakpoint() + sys.exit(1) + except Exception: + self.debug.breakpoint() + raise + def run_tests(self) -> None: """Run the test script (for non-interactive test runs)""" self.logger.info( diff --git a/nixos/lib/test-script-prepend.py b/nixos/lib/test-script-prepend.py index 31dad14ef8dd..067fd20fe7c5 100644 --- a/nixos/lib/test-script-prepend.py +++ b/nixos/lib/test-script-prepend.py @@ -1,6 +1,7 @@ # This file contains type hints that can be prepended to Nix test scripts so they can be type # checked. +from test_driver.debug import DebugAbstract from test_driver.driver import Driver from test_driver.vlan import VLan from test_driver.machine import Machine @@ -52,4 +53,5 @@ join_all: Callable[[], None] serial_stdout_off: Callable[[], None] serial_stdout_on: Callable[[], None] polling_condition: PollingConditionProtocol +debug: DebugAbstract t: TestCase diff --git a/nixos/lib/testing/run.nix b/nixos/lib/testing/run.nix index 45ab311cfbc4..ab1dc7733d5f 100644 --- a/nixos/lib/testing/run.nix +++ b/nixos/lib/testing/run.nix @@ -7,6 +7,7 @@ }: let inherit (lib) types mkOption; + inherit (hostPkgs.stdenv.hostPlatform) isDarwin isLinux; # TODO (lib): Also use lib equivalent in nodes.nix /** @@ -26,7 +27,6 @@ let */ f: lib.mkOverride (opt.highestPrio - 1) (f opt.value); - in { options = { @@ -42,6 +42,15 @@ in ''; }; + enableDebugHook = lib.mkEnableOption "" // { + description = '' + Halt test execution after any test failure and provide the possibility to + hook into the sandbox to connect with either the test driver via + `telnet localhost 4444` or with the VMs via SSH and vsocks (see also + `sshBackdoor.enable`). 
+ ''; + }; + rawTestDerivation = mkOption { type = types.package; description = '' @@ -74,15 +83,23 @@ in rawTestDerivation = hostPkgs.stdenv.mkDerivation config.rawTestDerivationArg; rawTestDerivationArg = finalAttrs: - assert lib.assertMsg (!config.sshBackdoor.enable) - "The SSH backdoor is currently not supported for non-interactive testing! Please make sure to only set `interactive.sshBackdoor.enable = true;`!"; + assert lib.assertMsg ( + config.sshBackdoor.enable -> isLinux + ) "The SSH backdoor is not supported for macOS host systems!"; + + assert lib.assertMsg ( + config.enableDebugHook -> isLinux + ) "The debugging hook is not supported for macOS host systems!"; { name = "vm-test-run-${config.name}"; requiredSystemFeatures = - [ "nixos-test" ] - ++ lib.optionals hostPkgs.stdenv.hostPlatform.isLinux [ "kvm" ] - ++ lib.optionals hostPkgs.stdenv.hostPlatform.isDarwin [ "apple-virt" ]; + [ "nixos-test" ] ++ lib.optional isLinux "kvm" ++ lib.optional isDarwin "apple-virt"; + + nativeBuildInputs = lib.optionals config.enableDebugHook [ + hostPkgs.openssh + hostPkgs.inetutils + ]; buildCommand = '' mkdir -p $out @@ -90,7 +107,15 @@ in # effectively mute the XMLLogger export LOGFILE=/dev/null - ${config.driver}/bin/nixos-test-driver -o $out + ${lib.optionalString config.enableDebugHook '' + ln -sf \ + ${hostPkgs.systemd}/lib/systemd/ssh_config.d/20-systemd-ssh-proxy.conf \ + ssh_config + ''} + + ${config.driver}/bin/nixos-test-driver \ + -o $out \ + ${lib.optionalString config.enableDebugHook "--debug-hook-attach=${hostPkgs.breakpointHook.attach}"} ''; passthru = config.passthru;