#!/usr/bin/env python3

# usage: auto-layer.py graph_file [ignore_file] [layer_limit]

# graph_file: Path to a JSON file as generated by writeReferencesGraph
# ignore_file: Path to a file with a list of store paths that should not appear in the output
# layer_limit: Maximum number of layers to generate, default 100
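
# For reference, a minimal sketch of the graph_file contents this script
# assumes (only the "path", "references" and "narSize" fields are read here;
# the store hashes and sizes are invented for illustration):
#
#   [
#     {
#       "path": "/nix/store/aaaa...-glibc-2.38",
#       "references": [],
#       "narSize": 30000000
#     },
#     {
#       "path": "/nix/store/bbbb...-hello-2.12",
#       "references": ["/nix/store/aaaa...-glibc-2.38"],
#       "narSize": 200000
#     }
#   ]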

# This module tries to split a dependency graph of nix store paths into a
# limited set of layers that together cover all mentioned paths. It tries to
# choose the layers such that different inputs often have the largest layers
# in common, so most layers can be shared, while the differences between the
# results end up in smaller layers.

# It does this by splitting off the N largest store paths (by nar size) into
# their own layers, including some of their dependencies.
# Specifically, for a large store path L, it creates a layer with L and any
# store path D that L depends on and for which there is no store path in the
# input that depends on D but not on L.
# Then, if any store paths are depended on by multiple of the chosen large
# store paths, those common dependencies get their own layer, one per set of
# large store paths that depends on them.
# N is iteratively increased until the layer limit is reached.

# The reasoning for this algorithm is as follows:

# Most closures contain a few large store paths and many small store paths. If
# we want to share as many bytes as possible with other layered images, we
# should focus on putting the largest paths in their own layers.

# If we had data on how often each store path is used and how likely each
# combination of store paths is, we might be able to infer which large store
# paths are better off being combined into a single layer. However, getting
# that information, let alone keeping it up to date, is very difficult. If we
# can't tell that two large store paths are often going to appear together,
# then we're better off giving each of them their own layer.

# This leaves a lot of smaller store paths to be assigned to layers. Anything
# that depends on a large store path L also depends on all the store paths
# that L depends on, so it makes sense to move the dependencies of L into the
# same layer as L.

# Possible improvements:
# - Specifying a size limit below which the algorithm stops using large store
#   paths as new layer roots might further improve sharing, as the layer
#   boundaries would depend less on the number of larger store paths in the
#   input.

import json
import os
import sys

def layer_count(layer_split):
    return len(set(layer_split.values()))

def path_key(path):
    # Sort key for store paths: by name first, then hash, so sorted output is
    # stable and easy to read.
    hash, name = path.split('-', 1)
    return name, hash

def closure(*todo, key):
    """
    Find all dependencies of the arguments, including the arguments themselves.
    """
    todo = set(todo)
    done = set()
    while todo:
        x = todo.pop()
        if x not in done:
            done.add(x)
            todo.update(key(x))
    return done

def dependencies(*todo, key):
    """
    Find all dependencies of the arguments, excluding the arguments themselves.
    """
    return closure(*todo, key=key) - set(todo)
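
# A tiny, hypothetical example of how these helpers behave: with
#   deps = {"a": ["b"], "b": ["c"], "c": []}
# closure("a", key=deps.get) returns {"a", "b", "c"}, while
# dependencies("a", key=deps.get) returns {"b", "c"}.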

def minimal_cover(paths, key):
    """
    The minimal set of paths that together cover all input paths with their
    closures. None of the resulting paths depend on each other.
    """
    paths = set(paths)
    paths_deps = set.union(*(dependencies(d, key=key) for d in paths))
    return paths - paths_deps
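
# Continuing the hypothetical example above, minimal_cover({"a", "b"},
# key=deps.get) returns {"a"}: the closure of "a" already covers "b".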

def auto_layer(graph, ignore_paths, layer_limit):
    # Compute all direct users of each path.
    nodes = {x["path"]: x | {"users": set()} for x in graph}
    for user in nodes:
        for ref in nodes[user]["references"]:
            nodes[ref]["users"] |= {user}

    def node_deps(path):
        nonlocal nodes
        return nodes[path]["references"]

    def node_users(path):
        nonlocal nodes
        return nodes[path]["users"]
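
    # The graph can now be walked in both directions: node_deps follows the
    # "references" edges down, node_users follows the inverted edges up.
    # Illustrative: if hello references glibc, node_deps(hello_path) contains
    # glibc_path and node_users(glibc_path) contains hello_path.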

    nodes_by_size = sorted(graph, key=lambda node: node["narSize"])

    # Here starts the main algorithm:
    # The goal is to split the set of store paths into layers such that the
    # layers are likely to be reusable and the closure size is spread out over
    # the layers. We do this by iteratively taking the largest store path and
    # giving it its own layer. This primary store path becomes the identity of
    # the layer. We also add every dependency of the primary store path to the
    # same layer, unless it is also used by something that doesn't depend on
    # the primary store path. More generally, we put store paths together in
    # the same layer when the set of primary paths that depend on them is the
    # same.

    # layer_split defines how the layers are currently split. We start with a
    # single layer holding everything, encoded by mapping every store path to
    # the empty set of primary paths.
    # In general, layer_split maps each store path to the set of primary paths
    # that depend on it, and that set defines and identifies the layer.
    layer_split = {path: frozenset() for path in nodes}
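
    # Illustrative: if primary paths P1 and P2 both depend on a store path d,
    # and neither depends on the other, d ends up with
    # layer_split[d] == frozenset({P1, P2}), i.e. in the layer shared by P1
    # and P2.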

    primary_paths = set()
    while nodes_by_size:
        # Every iteration, we choose the next biggest path to be the root of
        # a new layer.
        new_primary_path = nodes_by_size.pop()["path"]
        primary_paths.add(new_primary_path)
        new_layer_split = layer_split.copy()
        new_layer_split[new_primary_path] = frozenset({new_primary_path})
        new_primary_path_deps = dependencies(new_primary_path, key=node_deps)
        new_primary_path_users = dependencies(new_primary_path, key=node_users)

        # Update the recorded set of primary paths for every dependency of
        # the new primary path.
        for dep in new_primary_path_deps:
            # Drop primary paths that themselves depend on the new primary
            # path; the new primary path takes their place for dep.
            new_layer_split[dep] -= new_primary_path_users
            # Record the new primary path unless dep is already covered by a
            # recorded primary path that the new primary path depends on.
            if not new_layer_split[dep] & new_primary_path_deps:
                new_layer_split[dep] |= {new_primary_path}

        # If we exceed the layer limit, we give up. The previous split should
        # be good enough.
        if layer_count(new_layer_split) > layer_limit:
            break
        layer_split = new_layer_split

    # Main algorithm done; the layers have been chosen.
    # Now give each layer some metadata, mostly for debugging.

    def layer_info(layer_id):
        nonlocal nodes
        nonlocal layer_split
        # The full set of paths in this layer is all the paths that were
        # assigned to it.
        paths = {path
                 for path, layer_id_2 in layer_split.items()
                 if layer_id == layer_id_2}
        layerSize = sum(nodes[path]["narSize"] for path in paths)
        return {
            "usedBy": sorted(layer_id, key=path_key),
            "paths": sorted(paths, key=path_key),
            "layerSize": layerSize,
            "closureSize": sum(nodes[path]["narSize"]
                               for path in closure(*paths, key=node_deps)),
        }
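
    # Illustrative shape of one layer_info result (paths and sizes invented):
    #   {
    #     "usedBy": ["/nix/store/cccc...-python3-3.11"],
    #     "paths": ["/nix/store/dddd...-expat-2.5",
    #               "/nix/store/cccc...-python3-3.11"],
    #     "layerSize": 60000000,
    #     "closureSize": 95000000
    #   }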

    layers = {layer_id: layer_info(layer_id)
              for layer_id in set(layer_split.values())}

    # The layer order doesn't actually matter to Docker, but it's still kind
    # of neat to have layers come after all of their dependencies. The easiest
    # way to do that is to order by closure size, since a layer's closure is
    # necessarily larger than that of any of its dependencies, which it
    # includes.
    layer_order = sorted(layers.values(), key=lambda info: info["closureSize"])

    if os.environ.get("DEBUG"):
        print(json.dumps(layer_order, indent=2), file=sys.stderr)

    # Sanity check that no store path ends up in multiple layers.
    total_layer_size = sum(layer["layerSize"] for layer in layer_order)
    total_nar_size = sum(node["narSize"] for node in graph)
    assert total_layer_size == total_nar_size, (total_layer_size, total_nar_size)

    # Format as a list of layers, each defined as a list of store paths.
    return [[path
             for path in layer["paths"]
             if path not in ignore_paths]
            for layer in layer_order
            if set(layer["paths"]) - ignore_paths]
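
# auto_layer returns a list of layers, each a list of store paths, with
# dependencies ordered before the layers that use them. For the two-path
# sketch near the top of this file, the output (with a layer_limit of at
# least 2) would be:
#   [["/nix/store/aaaa...-glibc-2.38"], ["/nix/store/bbbb...-hello-2.12"]]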

if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(
        prog='auto-layer',
        description='Split store paths into docker layers.'
    )
    parser.add_argument('graph_file')
    # nargs='?' makes these positionals optional, matching the usage line at
    # the top; without it, argparse ignores the defaults and requires both.
    parser.add_argument('ignore_file', nargs='?', default="/dev/null")
    parser.add_argument('layer_limit', nargs='?', type=int, default=100)
    args = parser.parse_args()

    with open(args.graph_file) as f:
        graph = json.load(f)

    with open(args.ignore_file) as f:
        # Skip blank lines so a trailing newline doesn't add an empty entry.
        ignore_paths = {line.strip() for line in f if line.strip()}

    print(json.dumps(auto_layer(graph, ignore_paths, args.layer_limit)))