From fb1647ec6e02030b1ea0a2a84686f9cd49c02ace Mon Sep 17 00:00:00 2001
From: Philip Taron
Date: Wed, 17 Sep 2025 12:50:49 -0700
Subject: [PATCH] ci.eval.compare: explain the various metrics under the
 --explain flag

Add an --explain flag to cmp-stats that appends a GitHub-flavored-Markdown
footnote to each metric name in the comparison table, explaining what that
metric measures, and pass the flag in the CI comparison step.

---
 ci/eval/compare/cmp-stats.py | 59 +++++++++++++++++++++++++++++++++---
 ci/eval/compare/default.nix  |  2 +-
 2 files changed, 56 insertions(+), 5 deletions(-)

diff --git a/ci/eval/compare/cmp-stats.py b/ci/eval/compare/cmp-stats.py
index 7c536b6330c8..a456e721fffb 100644
--- a/ci/eval/compare/cmp-stats.py
+++ b/ci/eval/compare/cmp-stats.py
@@ -8,6 +8,7 @@ import warnings
 from pathlib import Path
 from scipy.stats import ttest_rel
 from tabulate import tabulate
+from typing import Final
 
 
 def flatten_data(json_data: dict) -> dict:
@@ -86,6 +87,49 @@ def load_all_metrics(path: Path) -> dict:
     return metrics
 
 
+def metric_table_name(name: str, explain: bool) -> str:
+    """
+    Returns the name of the metric, plus a footnote to explain it if needed.
+    """
+    return f"{name}[^{name}]" if explain else name
+
+
+METRIC_EXPLANATION_FOOTNOTE: Final[str] = """
+
+[^time.cpu]: Number of seconds of CPU time used by the Nix evaluator process, as accounted by the OS. On UNIX systems, this comes from [`getrusage(RUSAGE_SELF)`](https://man7.org/linux/man-pages/man2/getrusage.2.html).
+[^time.gc]: Number of seconds of CPU time spent performing garbage collection, as reported by the Boehm garbage collector.
+[^time.gcFraction]: The fraction of total CPU time spent performing GC.
+[^gc.cycles]: Number of times garbage collection has been performed.
+[^gc.heapSize]: Size in bytes of the garbage collector heap.
+[^gc.totalBytes]: Total size in bytes of all allocations made through the garbage collector.
+[^envs.bytes]: Size in bytes of all `Env` objects allocated by the Nix evaluator. An `Env` holds the variables in scope at a given point of evaluation, e.g. within a `let` body or a function call.
+[^list.bytes]: Size in bytes of all [lists](https://nix.dev/manual/nix/stable/language/syntax.html#list-literal) allocated by the Nix evaluator.
+[^sets.bytes]: Size in bytes of all [attrsets](https://nix.dev/manual/nix/stable/language/syntax.html#attrs-literal) allocated by the Nix evaluator.
+[^symbols.bytes]: Size in bytes of all items in the Nix evaluator symbol table.
+[^values.bytes]: Size in bytes of all values allocated by the Nix evaluator.
+[^envs.number]: The count of all `Env` objects allocated.
+[^nrAvoided]: The number of thunk allocations that were avoided.
+[^nrExprs]: The number of expression objects ever created.
+[^nrFunctionCalls]: The number of function calls ever made.
+[^nrLookups]: The number of lookups into an attrset ever made.
+[^nrOpUpdateValuesCopied]: The number of attrset values copied in the process of merging attrsets.
+[^nrOpUpdates]: The number of attrset merge operations (`//`) performed.
+[^nrPrimOpCalls]: The number of calls to primops (Nix builtins) ever made.
+[^nrThunks]: The number of [thunks](https://nix.dev/manual/nix/latest/language/evaluation.html#laziness) ever made. A thunk is a delayed computation, represented by an expression reference and a closure.
+[^sets.number]: The number of attrsets ever made.
+[^symbols.number]: The number of symbols ever added to the symbol table.
+[^values.number]: The number of values ever made.
+[^envs.elements]: The total number of values contained within `Env` objects.
+[^list.concats]: The number of list concatenation operations (`++`) performed.
+[^list.elements]: The total number of values contained within lists.
+[^sets.elements]: The total number of values contained within attrsets.
+[^sizes.Attr]: Size in bytes of the `Attr` type.
+[^sizes.Bindings]: Size in bytes of the `Bindings` type.
+[^sizes.Env]: Size in bytes of the `Env` type.
+[^sizes.Value]: Size in bytes of the `Value` type.
+"""
+
+
 def metric_sort_key(name: str) -> str:
     if name in ("time.cpu", "time.gc", "time.gcFraction"):
         return (1, name)
@@ -99,7 +143,7 @@ def metric_sort_key(name: str) -> str:
     return (5, name)
 
 
-def dataframe_to_markdown(df: pd.DataFrame) -> str:
+def dataframe_to_markdown(df: pd.DataFrame, explain: bool) -> str:
     df = df.sort_values(
         by=df.columns[0], ascending=True, key=lambda s: s.map(metric_sort_key)
     )
@@ -108,14 +152,18 @@ def dataframe_to_markdown(df: pd.DataFrame) -> str:
     headers = [str(column) for column in df.columns]
     table = [
         [
-            row["metric"],
+            # The metric acts as its own footnote name
+            metric_table_name(row["metric"], explain),
             # Check for no change and NaN in p_value/t_stat
             *[None if np.isnan(val) or np.allclose(val, 0) else val for val in row[1:]],
         ]
         for _, row in df.iterrows()
     ]
 
-    return tabulate(table, headers, tablefmt="github", floatfmt=".4f", missingval="-")
+    result = tabulate(table, headers, tablefmt="github", floatfmt=".4f", missingval="-")
+    if explain:
+        result += METRIC_EXPLANATION_FOOTNOTE
+    return result
 
 
 def perform_pairwise_tests(before_metrics: dict, after_metrics: dict) -> pd.DataFrame:
@@ -173,6 +221,9 @@ def main():
     parser = argparse.ArgumentParser(
         description="Performance comparison of Nix evaluation statistics"
     )
+    parser.add_argument(
+        "--explain", action="store_true", help="Explain the evaluation statistics"
+    )
     parser.add_argument(
         "before", help="File or directory containing baseline (data before)"
     )
@@ -191,7 +242,7 @@ def main():
     before_metrics = load_all_metrics(before_stats)
     after_metrics = load_all_metrics(after_stats)
     df1 = perform_pairwise_tests(before_metrics, after_metrics)
-    markdown_table = dataframe_to_markdown(df1)
+    markdown_table = dataframe_to_markdown(df1, explain=options.explain)
     print(markdown_table)
 
 
diff --git a/ci/eval/compare/default.nix b/ci/eval/compare/default.nix
index 7b9f03e602a8..3a025a0238f6 100644
--- a/ci/eval/compare/default.nix
+++ b/ci/eval/compare/default.nix
@@ -209,7 +209,7 @@ runCommand "compare"
       echo
     } >> $out/step-summary.md
 
-    cmp-stats ${combined}/before/stats ${combined}/after/stats >> $out/step-summary.md
+    cmp-stats --explain ${combined}/before/stats ${combined}/after/stats >> $out/step-summary.md
   else
     # Package chunks are the same in both revisions
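
Note (editor's sketch, not part of the patch): with --explain, each metric
name in the table doubles as its own GitHub-flavored-Markdown footnote
label, and METRIC_EXPLANATION_FOOTNOTE supplies the matching definitions.
A minimal standalone illustration of that rendering path, using made-up
rows and column names (the real ones come from perform_pairwise_tests):

    from tabulate import tabulate

    # Hypothetical metrics and p-values; the real table is built from the
    # DataFrame produced by perform_pairwise_tests().
    rows = [
        ["gc.cycles[^gc.cycles]", 0.0312],
        ["nrThunks[^nrThunks]", 0.4821],
    ]
    table = tabulate(rows, ["metric", "p_value"], tablefmt="github", floatfmt=".4f")

    # The footnote definitions must appear somewhere in the same Markdown
    # document for GitHub to link them to the [^...] references above.
    footnotes = "\n".join([
        "[^gc.cycles]: Number of times garbage collection has been performed.",
        "[^nrThunks]: The number of thunks ever made.",
    ])
    print(table + "\n\n" + footnotes)

GitHub then renders each "name[^name]" cell as the metric name followed by
a footnote link that jumps to the matching definition below the table.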