add health check service and example that uses it

This commit is contained in:
Daniel Barlow 2024-07-30 22:37:43 +01:00
parent 8fa3443923
commit c4d00e062a
4 changed files with 102 additions and 10 deletions

View File

@ -53,6 +53,7 @@ in rec {
# ../modules/mount # ../modules/mount
../modules/ppp ../modules/ppp
../modules/round-robin ../modules/round-robin
../modules/health-check
../modules/profiles/gateway.nix ../modules/profiles/gateway.nix
]; ];
hostname = "thing"; hostname = "thing";
@ -106,18 +107,28 @@ in rec {
target = lns.address; target = lns.address;
dependencies = [services.bootstrap-dhcpc check-address]; dependencies = [services.bootstrap-dhcpc check-address];
}; };
in svc.l2tp.build { l2tpd= svc.l2tp.build {
lns = lns.address; lns = lns.address;
ppp-options = [ ppp-options = [
"debug" "+ipv6" "noauth" "debug" "+ipv6" "noauth"
"name" rsecrets.l2tp.name "name" rsecrets.l2tp.name
"password" rsecrets.l2tp.password "password" rsecrets.l2tp.password
]; ];
dependencies = [config.services.lns-address route check-address]; dependencies = [config.services.lns-address route check-address];
}; };
in
svc.health-check.build {
service = l2tpd;
threshold = 3;
interval = 2;
healthCheck = pkgs.writeAshScript "ping-check" {} "ping 1.1.1.1";
};
in svc.round-robin.build { in svc.round-robin.build {
name = "wan"; name = "wan";
services = [ l2tp pppoe ]; services = [
pppoe
l2tp
];
}; };
dhcp6.enable = true; dhcp6.enable = true;
}; };

View File

@ -0,0 +1,43 @@
## Health check
##
## Runs a service and a separate periodic health process. When the
## health check starts failing over a period of time, kill the service.
## (Usually that means the supervisor will restart it, but you can
## have other behaviours by e.g. combining this service with a round-robin
## for failover)
{ lib, pkgs, config, ...}:
let
  inherit (pkgs) liminix;
  mkOption = lib.mkOption;
  types = lib.types;
in {
  # Declares the service definition slot that the config section fills in.
  options.system.service.health-check = mkOption {
    description = "run a service while periodically checking it is healthy";
    type = liminix.lib.types.serviceDefn;
  };

  config = {
    # The script generated by ./service.nix counts failures with expr(1),
    # so make sure busybox is built with that applet.
    programs.busybox.applets = ["expr"];

    # Parameters accepted by svc.health-check.build; they are passed
    # through to ./service.nix.
    system.service.health-check = config.system.callService ./service.nix {
      # the service to be supervised by the health check
      service = mkOption {
        type = liminix.lib.types.service;
      };
      healthCheck = mkOption {
        description = "health check command or script. Expected to exit 0 if the service is healthy or any other exit status otherwise";
        type = types.path;
      };
      interval = mkOption {
        description = "interval between checks, in seconds";
        type = types.int;
        default = 10;
        example = 10;
      };
      # no default: callers must choose a threshold explicitly
      threshold = mkOption {
        description = "number of consecutive failures required for the service to be kicked";
        type = types.int;
        example = 3;
      };
    };
  };
}

View File

@ -0,0 +1,37 @@
# Health-check service builder.
#
# Takes an existing `service` and returns it with an extra dependency on a
# companion longrun named "check-<name>". The companion runs `healthCheck`
# every `interval` seconds and asks s6 to restart the service once the check
# has failed `threshold` times in a row.
#
#   service     - the service to watch (its `name` locates /run/service/<name>)
#   interval    - seconds to sleep between checks
#   threshold   - consecutive failures needed before the service is restarted
#   healthCheck - command/script; exit status 0 means healthy
{
  liminix, lib, lim, s6
}:
{ service, interval, threshold, healthCheck } :
let
  inherit (liminix.services) longrun;
  inherit (builtins) toString;
  inherit (service) name;
  checker = let name' = "check-${name}"; in longrun {
    name = name';
    run = ''
      fails=0
      echo waiting for /run/service/${name}
      # do not start checking until the service reports up-and-ready
      ${s6}/bin/s6-svwait -U /run/service/${name} || exit
      while sleep ${toString interval} ; do
        ${healthCheck}
        if test $? -gt 0; then
          fails=$(expr $fails + 1)
        else
          fails=0
        fi
        echo fails $fails/${toString threshold} for ${name}
        # Restart on reaching the threshold (not on exceeding it), so that
        # "threshold" really is the number of consecutive failures required.
        # The "-gt 0" guard keeps a threshold of 0 or less from restarting
        # a service whose check is passing.
        if test "$fails" -gt 0 && test "$fails" -ge "${toString threshold}" ; then
          echo time to die
          ${s6}/bin/s6-svc -r /run/service/${name}
          echo bounced
          fails=0
          echo waiting for /run/service/${name}
          ${s6}/bin/s6-svwait -U /run/service/${name}
        fi
      done
    '';
  };
in service.overrideAttrs(o: {
  # `or null` covers a service that has no such attribute at all;
  # lim.orEmpty then covers an attribute that is present but null.
  buildInputs = (lim.orEmpty (o.buildInputs or null)) ++ [ checker ];
  dependencies = (lim.orEmpty (o.dependencies or null)) ++ [ checker ];
})

View File

@ -47,6 +47,7 @@ extraPkgs // {
# liminix library functions # liminix library functions
lim = { lim = {
parseInt = s: (builtins.fromTOML "r=${s}").r; parseInt = s: (builtins.fromTOML "r=${s}").r;
orEmpty = x: if x != null then x else [];
}; };
# keep these alphabetical # keep these alphabetical