From c4d00e062a1a36f48d57a3686e20d65c42c6db85 Mon Sep 17 00:00:00 2001
From: Daniel Barlow
Date: Tue, 30 Jul 2024 22:37:43 +0100
Subject: [PATCH] add health check service and example that uses it

---
 examples/l2tp.nix                | 34 ++++++++++++++++++-------
 modules/health-check/default.nix | 44 ++++++++++++++++++++++++++++++++
 modules/health-check/service.nix | 44 ++++++++++++++++++++++++++++++++
 overlay.nix                      |  2 ++
 4 files changed, 114 insertions(+), 10 deletions(-)
 create mode 100644 modules/health-check/default.nix
 create mode 100644 modules/health-check/service.nix

diff --git a/examples/l2tp.nix b/examples/l2tp.nix
index f7b66ca..a22709e 100644
--- a/examples/l2tp.nix
+++ b/examples/l2tp.nix
@@ -53,6 +53,7 @@ in rec {
     # ../modules/mount
     ../modules/ppp
     ../modules/round-robin
+    ../modules/health-check
     ../modules/profiles/gateway.nix
   ];
   hostname = "thing";
@@ -106,18 +107,31 @@ in rec {
                 target = lns.address;
                 dependencies = [services.bootstrap-dhcpc check-address];
               };
-            in svc.l2tp.build {
-              lns = lns.address;
-              ppp-options = [
-                "debug" "+ipv6" "noauth"
-                "name" rsecrets.l2tp.name
-                "password" rsecrets.l2tp.password
-              ];
-              dependencies = [config.services.lns-address route check-address];
-            };
+              l2tpd = svc.l2tp.build {
+                lns = lns.address;
+                ppp-options = [
+                  "debug" "+ipv6" "noauth"
+                  "name" rsecrets.l2tp.name
+                  "password" rsecrets.l2tp.password
+                ];
+                dependencies = [config.services.lns-address route check-address];
+              };
+            in
+              svc.health-check.build {
+                service = l2tpd;
+                threshold = 3;
+                interval = 2;
+                # NOTE(review): the checker waits for this command to exit, so
+                # this assumes ping returns by itself - confirm for the busybox
+                # ping in use, and add a count/deadline if it does not.
+                healthCheck = pkgs.writeAshScript "ping-check" {} "ping 1.1.1.1";
+              };
         in svc.round-robin.build {
           name = "wan";
-          services = [ l2tp pppoe ];
+          services = [
+            pppoe
+            l2tp
+          ];
         };
       dhcp6.enable = true;
     };
diff --git a/modules/health-check/default.nix b/modules/health-check/default.nix
new file mode 100644
index 0000000..9ac8858
--- /dev/null
+++ b/modules/health-check/default.nix
@@ -0,0 +1,44 @@
+## Health check
+##
+## Runs a service and a separate periodic health process. When the
+## health check starts failing over a period of time, kill the service.
+## (Usually that means the supervisor will restart it, but you can
+## have other behaviours by e.g. combining this service with a round-robin
+## for failover)
+
+
+{ lib, pkgs, config, ...}:
+let
+  inherit (lib) mkOption types;
+  inherit (pkgs) liminix;
+#  inherit (pkgs.liminix.services) longrun;
+in {
+  options = {
+    system.service.health-check = mkOption {
+      description = "run a service while periodically checking it is healthy";
+      type = liminix.lib.types.serviceDefn;
+    };
+  };
+  config.system.service.health-check = config.system.callService ./service.nix {
+    service = mkOption {
+      description = "the service to run and to restart when the health check keeps failing";
+      type = liminix.lib.types.service;
+    };
+    interval = mkOption {
+      description = "interval between checks, in seconds";
+      type = types.int;
+      default = 10;
+      example = 10;
+    };
+    threshold = mkOption {
+      description = "number of consecutive failures required for the service to be kicked";
+      type = types.int;
+      example = 3;
+    };
+    healthCheck = mkOption {
+      description = "health check command or script. Expected to exit 0 if the service is healthy or any other exit status otherwise";
+      type = types.path;
+    };
+  };
+  config.programs.busybox.applets = ["expr"];
+}
diff --git a/modules/health-check/service.nix b/modules/health-check/service.nix
new file mode 100644
index 0000000..80b3e34
--- /dev/null
+++ b/modules/health-check/service.nix
@@ -0,0 +1,44 @@
+{
+  liminix, lib, lim, s6
+}:
+{ service, interval, threshold, healthCheck } :
+let
+  inherit (liminix.services) longrun;
+  inherit (builtins) toString;
+  inherit (service) name;
+  # Companion longrun: waits for the wrapped service to be up and
+  # ready, then runs healthCheck every `interval` seconds. Once
+  # `threshold` consecutive runs have exited non-zero it restarts the
+  # service with s6-svc -r and waits for it to come up again. At least
+  # one failure is always required, even if threshold is 0 or negative.
+  checker = let name' = "check-${name}"; in longrun {
+    name = name';
+    run = ''
+      fails=0
+      echo waiting for /run/service/${name}
+      ${s6}/bin/s6-svwait -U /run/service/${name} || exit
+      while sleep ${toString interval} ; do
+          ${healthCheck}
+          if test $? -gt 0; then
+            fails=$(expr $fails + 1)
+          else
+            fails=0
+          fi
+          echo fails $fails/${toString threshold} for ${name}
+          if test "$fails" -gt 0 && test "$fails" -ge "${toString threshold}" ; then
+            echo time to die
+            ${s6}/bin/s6-svc -r /run/service/${name}
+            echo bounced
+            fails=0
+            echo waiting for /run/service/${name}
+            ${s6}/bin/s6-svwait -U /run/service/${name}
+          fi
+      done
+    '';
+  };
+  # The wrapped service is returned with the checker appended to its
+  # buildInputs and dependencies.
+in service.overrideAttrs(o: {
+  buildInputs = (lim.orEmpty o.buildInputs) ++ [ checker ];
+  dependencies = (lim.orEmpty o.dependencies) ++ [ checker ];
+})
diff --git a/overlay.nix b/overlay.nix
index 3fe3667..b33c7c2 100644
--- a/overlay.nix
+++ b/overlay.nix
@@ -47,6 +47,8 @@ extraPkgs // {
   # liminix library functions
   lim = {
     parseInt = s: (builtins.fromTOML "r=${s}").r;
+    # returns x unchanged unless it is null, in which case returns []
+    orEmpty = x: if x != null then x else [];
   };
 
   # keep these alphabetical