diff --git a/homeConfigurations/tejing/news/default.nix b/homeConfigurations/tejing/news/default.nix index c7c3079..8eb9584 100644 --- a/homeConfigurations/tejing/news/default.nix +++ b/homeConfigurations/tejing/news/default.nix @@ -39,9 +39,6 @@ in # Enable my sfeed module my.sfeed.enable = true; - # Update every hour, at a non-round time - my.sfeed.update = "*:45:35"; - # Pass useful args through to submodule config my.sfeed.rc._module.args = { inherit pkgs my; }; my.sfeed.rc.imports = [ diff --git a/homeConfigurations/tejing/news/rc.public.nix b/homeConfigurations/tejing/news/rc.public.nix index 8b5ab67..e4c17dc 100644 --- a/homeConfigurations/tejing/news/rc.public.nix +++ b/homeConfigurations/tejing/news/rc.public.nix @@ -56,15 +56,14 @@ in }; # Gets around cloudflare's bot detection, last time I checked - fetch.imitate_browser = { + helper.curl_cf = { code = '' - curl --no-alpn -fsSLm 15 -A ${escapeShellArg innocuousUserAgent} "$2"''; + curl --no-alpn -fsSL -A ${escapeShellArg innocuousUserAgent} "$@"''; inputs = [ pkgs.curl ]; }; - fetch.imitate_browser_post = { + fetch.imitate_browser = { code = '' - curl --no-alpn -fsSLm 15 -A ${escapeShellArg innocuousUserAgent} -X POST "$2"''; - inputs = [ pkgs.curl ]; + curl_cf -m 15 "$2"''; }; # A good general pattern for scraping html pages @@ -114,7 +113,7 @@ in helper.normalize_dates = { code = ''awk -F'\t' -v OFS=$'\t' -f '' + pkgs.writeText "sfeed_process_dates.awk" '' BEGIN { - datecmd = "${pkgs.coreutils}/bin/date -f- +%s" + datecmd = "${pkgs.coreutils}/bin/date -uf- +%s" PROCINFO[datecmd, "pty"] = 1 } { diff --git a/homeConfigurations/tejing/news/rc.secret.nix b/homeConfigurations/tejing/news/rc.secret.nix index fd6b671..7431b81 100644 Binary files a/homeConfigurations/tejing/news/rc.secret.nix and b/homeConfigurations/tejing/news/rc.secret.nix differ diff --git a/homeModules/sfeed/default.nix b/homeModules/sfeed/default.nix index 31f63e9..f2eb737 100644 --- a/homeModules/sfeed/default.nix +++ b/homeModules/sfeed/default.nix @@ -8,7 +8,7 @@ let optionalString optionalAttrs splitString mapAttrsToList groupBy' nameValuePair removeSuffix unique filterAttrs makeBinPath mkEnableOption mkOption mkIf; - inherit (lib.types) attrsOf submodule str ints listOf package nullOr submoduleWith; + inherit (lib.types) attrsOf submodule str ints float listOf package nullOr submoduleWith; inherit (pkgs) resholve; # Changing these 2 lines will safely relocate this module's options @@ -33,8 +33,7 @@ let defineFunctions = funcs: concatMapStringsSep "\n" ({name, value}: '' ${name}() { - ${indentString " " '' - ${removeSuffix "\n" value}''} + ${indentString " " (removeSuffix "\n" value)} }'') (if isList funcs then funcs @@ -44,12 +43,9 @@ let # like branches caseStatement = word: default: branches: '' case ${word} in - ${indentString " " '' - ${concatMapStringsSep "\n" ({ patterns, commands }: '' + ${indentString " " (concatMapStringsSep "\n" ({ patterns, commands }: '' ${patterns}) - ${indentString " " '' - ${removeSuffix "\n" commands} - ;;''}'') + ${indentString " " (removeSuffix "\n" commands + "\n;;")}'') (mapAttrsToList (_: patterns: { patterns = concatMapStringsSep "|" escapeShellArg patterns; @@ -63,44 +59,116 @@ let (mapAttrsToList (pattern: commands: { inherit pattern commands; }) branches)) - ++ [ { patterns = "*"; commands = default; } ])}''} + ++ [ { patterns = "*"; commands = default; } ]))} esac''; - # Code for the feed() function - feed_function = defineFunctions { - feed = '' - [ "$(jobs | wc -l)" -ge ${toString cfg.jobs} ] && wait -nf - _feed 
"$@" & + settings = concatMapStringsSep "\n" ({name,value}: "${name}=${escapeShellArg value}") (mapAttrsToList nameValuePair { + maxjobs = cfg.jobs; + sfeedpath="${config.home.homeDirectory}/.sfeed/feeds"; + stamppath="${config.home.homeDirectory}/.sfeed/stamps"; + }); + + makedirs = concatMapStringsSep "\n" (dir: "mkdir -p ${dir}") [ + ''"$stamppath"'' + ]; + + cp_function = defineFunctions { + cp_function = '' + test -n "$(declare -f "$1")" || return + eval "''${_/#"$1"/"$2"}" ''; }; + # Code for the _feed() function + feed_function = '' + cp_function _feed __feed + ${defineFunctions { + "_feed" = '' + local name="$(printf '%s' "$1" | tr '/' '_')" + local file="$sfeedpath/$name" + local stamp="$stamppath/$name" + if __feed "$@"; then + touch "$file" + rm -f "$stamp" + return 0 + else + touch "$stamp" + return 1 + fi + ''; + }}''; + + # Code for the adaptfeed() function + adaptfeed_function = defineFunctions { + adaptfeed = '' + local file="''${sfeedpath}/$(printf '%s' "''${10}" | tr '/' '_')" + local stamp="''${sfeedpath%%/feeds}/stamps/$(printf '%s' "''${10}" | tr '/' '_')" + mkdir -p "''${sfeedpath%%/feeds}/stamps" + + if [ -e "$file" ]; then + local lastchecked="$(stat -c'%Y' "$file")" + else + local lastchecked=0 + fi + + if [ -e "$stamp" ]; then + local lastfailed="$(stat -c'%Y' "$stamp")" + else + local lastfailed=0 + fi + + # Temporarily set pipefail + local restore_opts="$(shopt -po pipefail)";set -o pipefail + + if sort -k1,1n "$file" | awk -F'\t' -v recentdiv="$1" -v recentmin="$2" -v regressiondiv="$3" -v regressionmin="$4" -v maxdelay="$5" -v firststeplength="$6" -v weightdoublingtime="$7" -v backofffactor="$8" -v deferprobablility="$9" -v lastchecked="$lastchecked" -v lastfailed="$lastfailed" -f ${./regressions.awk}; then + # Restore pipefail setting + eval "$restore_opts" + else + # Restore pipefail setting + eval "$restore_opts" + + shift 9 + feed "$@" || return + fi + ''; + }; + + # Command to go in the feeds() function, for a given feed + feed_command = name: {url, baseurl, encoding, adapt, ... }: + (if adapt.enable then + "adaptfeed " + escapeShellArgs [ + adapt.recentdiv + adapt.recentmin + adapt.regressiondiv + adapt.regressionmin + adapt.maxdelay + adapt.firststeplength + adapt.weightdoublingtime + adapt.backofffactor + adapt.deferprobablility + ] + else + "feed" + ) + " " + escapeShellArgs ( + [ name url ] ++ + (if encoding != "" + then [ baseurl encoding ] + else + (if baseurl != "" + then [ baseurl ] + else [] + ) + ) + ); + # Code for the feeds() function feeds_function = defineFunctions { feeds = if length (attrValues cfg.rc.feeds) == 0 then ":" else - (concatMapStringsSep "\n" - (feed_args: "feed " + escapeShellArgs feed_args) - (mapAttrsToList - (name: {url, baseurl ? "", encoding ? "", ... 
}: - [ name url ] ++ - (if encoding != "" - then [ baseurl encoding ] - else - (if baseurl != "" - then [ baseurl ] - else []))) - cfg.rc.feeds)); + concatStringsSep "\n" (mapAttrsToList feed_command cfg.rc.feeds); }; # Code for all helper functions defined through cfg.rc.helper - helper_functions = defineFunctions - (mapAttrs (_: v: v.code) cfg.rc.helper - // optionalAttrs - (any - (feed: any (name: hasAttr name feed) overrideableFuncs) - (attrValues cfg.rc.feeds)) - {cp_function = '' - test -n "$(declare -f "$1")" || return - eval "''${_/#"$1"/"$2"}"'';}); + helper_functions = defineFunctions (mapAttrs (_: v: v.code) cfg.rc.helper); # Code to define all configured implementations for ${name} and override the ${name}() function functionOverride = name: optionalString @@ -124,15 +192,19 @@ let # Generate the entire sfeedrc file sfeedrc = resholve.writeScript "sfeedrc" { interpreter = "none"; - inputs = unique (concatMap (x: if x ? inputs then x.inputs else []) (concatMap attrValues (attrValues cfg.rc))); + inputs = unique ([ pkgs.gawk ] ++ concatMap (x: if x ? inputs then x.inputs else []) (concatMap attrValues (attrValues cfg.rc))); execer = unique (concatMap (x: if x ? execer then x.execer else []) (concatMap attrValues (attrValues cfg.rc))); - fake.function = [ "_feed" ] ++ map (name: "__${name}") overrideableFuncs; + fake.function = [ "feed" "__feed" ] ++ map (name: "__${name}") overrideableFuncs; } (concatStringsSep "\n\n" (map (removeSuffix "\n") (filter (x: x != "") ( [ + settings + makedirs + cp_function feed_function + adaptfeed_function feeds_function helper_functions ] ++ map functionOverride overrideableFuncs)))); @@ -160,6 +232,64 @@ let type = str; description = "URL of the feed"; }; + baseurl = mkOption { + type = str; + default = ""; + description = "Base URL of the feed. If set to the empty string, the feed url is used."; + }; + encoding = mkOption { + type = str; + default = ""; + description = "Encoding of the feed. 
If set to the empty string, it is autodetected, falling back to utf-8."; + }; + adapt = { + enable = mkEnableOption "adapting the timing of checks for this feed to its history" // {default = true;}; + recentdiv = mkOption { + type = ints.positive; + default = 15; + description = "Divide the time from latest entry to latest check by this to determine recentdelay"; + }; + recentmin = mkOption { + type = ints.unsigned; + default = 60*60*24; + description = "Clamp recentdelay to be at least this"; + }; + regressiondiv = mkOption { + type = ints.positive; + default = 50; + description = "Divide the regression-estimated time per release by this to determine regressiondelay"; + }; + regressionmin = mkOption { + type = ints.unsigned; + default = 60*30; + description = "Clamp regressiondelay to be at least this"; + }; + maxdelay = mkOption { + type = ints.unsigned; + default = 60*60*24*30; + description = "Clamp the minimum of recentdelay and regressiondelay to be at most this, giving the final check delay"; + }; + firststeplength = mkOption { + type = ints.unsigned; + default = 60*60; + description = "The regression should consider the history to start this amount of time before the first entry"; + }; + weightdoublingtime = mkOption { + type = ints.positive; + default = 60*60*24*90; + description = "The doubling time of the exponential time-weighting used in the regression"; + }; + backofffactor = mkOption { + type = float; + default = 1.2; + description = "Check again after a failure if time since last success is more than this multiple of the time between last success and last failure."; + }; + deferprobablility = mkOption { + type = float; + default = 0.1; + description = "Probability (between 0 and 1) with which to randomly defer checks when they are otherwise considered ready. 
Breaks up cadence between feeds over time."; + }; + }; } // listToAttrs (map (func: nameValuePair func (mkOption { type = nullOr str; default = null; @@ -205,14 +335,24 @@ in { options = putopt { enable = mkEnableOption "sfeed"; - update = mkOption { - type = str; - default = "hourly"; - description = "systemd calendar event string describing when to update feeds (see man systemd.time)"; + update.averagedelay = mkOption { + type = ints.positive; + default = 600; + description = "Average delay, in seconds, between runs of the service."; + }; + update.deviation = mkOption { + type = ints.unsigned; + default = 30; + description = "How long, in seconds, to randomly deviate either way on the delay."; + }; + update.accuracy = mkOption { + type = ints.unsigned; + default = 5; + description = "How long, in seconds, to non-randomly deviate either way on the delay to coalesce with other wakeups."; }; jobs = mkOption { type = ints.positive; - default = 12; + default = 16; description = "number of simultaneous fetches to run"; }; rc = mkOption { @@ -231,12 +371,13 @@ in home.file.".sfeed/sfeedrc".source = sfeedrc; systemd.user.services.sfeed_update = { - Unit.Description = "news feed update"; + Unit.Description = "news feed updater"; Service.Environment = [ "PATH=${makeBinPath (attrValues { inherit (pkgs) sfeed curl glibc # for iconv + findutils # for xargs coreutils ; })}" ]; @@ -245,8 +386,10 @@ in systemd.user.timers.sfeed_update = { Unit.Description = "news feed update timer"; Install.WantedBy = [ "timers.target" ]; - Timer.OnCalendar = cfg.update; - Timer.Persistent = true; + Timer.OnActiveSec = 0; + Timer.OnUnitInactiveSec = cfg.update.averagedelay - cfg.update.deviation - cfg.update.accuracy; + Timer.RandomizedDelaySec = 2 * cfg.update.deviation; + Timer.AccuracySec = 2 * cfg.update.accuracy; }; }; } diff --git a/homeModules/sfeed/regressions.awk b/homeModules/sfeed/regressions.awk new file mode 100644 index 0000000..e4955a7 --- /dev/null +++ b/homeModules/sfeed/regressions.awk @@ -0,0 +1,132 @@ +@namespace "regression" + +# Does a weighted linear regression of a step function f(t) with a weight function of exp(r*t) + +# The core regression formula used is: +# y = b1*x + b0 +# b0 = (Σ[y]*Σ[x^2] - Σ[x]*Σ[x*y]) / (Σ[1]*Σ[x^2] - Σ[x]^2) +# b1 = (Σ[1]*Σ[x*y] - Σ[x]*Σ[y]) / (Σ[1]*Σ[x^2] - Σ[x]^2) +# where Σ[g(x,y)] = ∫[t₀ -> t] exp(r*x)*g(x,f(x))*dx + +# Persistent variables used by the algorithm: +# r = growth constant of the weight function +# irt = r*t of the first step's start time +# prt = r*t of the last step's end time +# sy = (∫[irt/r -> prt/r] exp(r*x) *f(x)*dx)*r /exp(prt) +# sxy = (∫[irt/r -> prt/r] exp(r*x)*x*f(x)*dx)*r^2/exp(prt) + +# Components of the formula in terms of those variables: +# with drt = irt - prt +# with ef = exp(drt) +# Σ[1] = exp(prt)/r*(1-ef) +# Σ[y] = exp(prt)/r*sy +# Σ[x] = exp(prt)/r^2*(prt-1-ef*(irt-1)) +# Σ[x*y] = exp(prt)/r^2*sxy +# Σ[x^2] = exp(prt)/r^3*(prt^2-2*prt+2-ef*(irt^2-2*irt+2)) + +# b0 = (Σ[y]*Σ[x^2] - Σ[x]*Σ[x*y]) / (Σ[1]*Σ[x^2] - Σ[x]^2) +# = (exp(prt)/r*sy*exp(prt)/r^3*(prt^2-2*prt+2-ef*(irt^2-2*irt+2)) - exp(prt)/r^2*(prt-1-ef*(irt-1))*exp(prt)/r^2*sxy) / +# (exp(prt)/r*(1-ef)*exp(prt)/r^3*(prt^2-2*prt+2-ef*(irt^2-2*irt+2)) - (exp(prt)/r^2*(prt-1-ef*(irt-1)))^2) +# = (sy*(prt^2-2*prt+2-ef*(irt^2-2*irt+2)) - (prt-1-ef*(irt-1))*sxy) / ((1-ef)*(prt^2-2*prt+2-ef*(irt^2-2*irt+2)) - (prt-1-ef*(irt-1))^2) +# = (sy*(prt^2-ef*irt^2)+(sxy+2*sy)*(1-prt-ef*(1-irt))) / (ef*(ef-drt^2-2)+1) +# b1 = (Σ[1]*Σ[x*y] - Σ[x]*Σ[y]) / (Σ[1]*Σ[x^2] - Σ[x]^2) +# = 
(exp(prt)/r*(1-ef)*exp(prt)/r^2*sxy - exp(prt)/r^2*(prt-1-ef*(irt-1))*exp(prt)/r*sy) / +# (exp(prt)/r*(1-ef)*exp(prt)/r^3*(prt^2-2*prt+2-ef*(irt^2-2*irt+2)) - (exp(prt)/r^2*(prt-1-ef*(irt-1)))^2) +# = r*((1-ef)*sxy - (prt-1-ef*(irt-1))*sy)/((1-ef)*(prt^2-2*prt+2-ef*(irt^2-2*irt+2)) - ((prt-1-ef*(irt-1)))^2) +# = r*((1-ef)*(sxy+sy)+(ef*irt-prt)*sy) / (ef*(ef-drt^2-2)+1) + +# a version of exp() that doesn't warn about rounding to zero +BEGIN { exp_warn_breakpoint = -6554261109157969/8796093022208; } +function exp_no_warn(x) { + if (x < exp_warn_breakpoint) return 0; + return exp(x); +} + +# configure the weight function +function config(dt) { + # calculate the growth constant of the exponential growth of the weight function, in terms of the given doubling time + r = log(2)/dt; +} + +# initialize regression, with a given start time for the (not yet created) first step +function init(t) { + irt = prt = r*t; + sy = sxy = 0; +} + +# add a step of the step function, ending at the given time, with the given value +function step(t, v) { + rt = r*t; + drt = prt-rt; + if (drt > 0) { + printf "Warning: Entries are not in time order! Skipping.\n" >>"/dev/stderr"; + return; + } + ef = exp_no_warn(drt); + sy = ef*(sy - v ) + v; + sxy = ef*(sxy - v*(prt - 1)) + v*(rt - 1); + prt = rt; +} + +# calculate the reciprocal of the slope of the linear regression result +function recip_slope() { + drt = irt - prt; + ef = exp_no_warn(drt); + divisor = r*((1 - ef)*(sxy + sy) + (ef*irt - prt)*sy); + if (divisor == 0) return strtonum("+inf"); + return (ef*(ef - drt^2 - 2) + 1)/divisor; +} + +@namespace "awk" + +function min(x, y) { + if (x <= y) + return x; + else + return y; +} + +function max(x, y) { + if (x >= y) + return x; + else + return y; +} + +BEGIN { + regression::config(weightdoublingtime); +} + +NR==1 { + regression::init($1 - firststeplength); + count = 0; +} + +{ + regression::step($1, count); + count += 1; + lastentry = $1; +} + +END { + regression::step(lastchecked, count); + + # Regression-based estimator + regressiondelay = regression::recip_slope()/regressiondiv; + regressiondelay = max(regressiondelay,regressionmin); + + # Estimator based on time since last entry + recentdelay = (lastchecked-lastentry)/recentdiv; + recentdelay = max(recentdelay,recentmin); + + # Use the minimum of the 2 estimators + checkdelay = min(regressiondelay,recentdelay); + checkdelay = min(checkdelay,maxdelay); + + now = systime(); + srand(now + count); # Probably random enough for what we need. + if (now >= max(lastchecked,lastfailed) + checkdelay || (lastfailed > lastchecked && (now-lastchecked)/(lastfailed-lastchecked) > backofffactor)) + exit (rand() > deferprobablility) # Fail, indicating it's time to re-check, but occasionally put it off, to break up cadence between feeds over time. + else + exit 0 # Succeed, indicating it can wait till later +}
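
The exit status of regressions.awk is what adaptfeed branches on: zero means the feed can wait, non-zero means it is due for a fetch (or, occasionally, randomly deferred). Below is a minimal sketch of exercising that decision by hand for a single feed, using the default tuning values introduced in this change; the feed name, the location of regressions.awk, and gawk being on PATH are illustrative assumptions only.

    #!/usr/bin/env bash
    # Sketch: ask regressions.awk whether one feed is due for a re-check.
    # "some_feed" is a hypothetical example; per this module the real files live
    # under ~/.sfeed/feeds (sfeed TSV, column 1 = UNIX timestamp) and ~/.sfeed/stamps.
    feed="$HOME/.sfeed/feeds/some_feed"
    stamp="$HOME/.sfeed/stamps/some_feed"

    lastchecked=0; [ -e "$feed" ]  && lastchecked="$(stat -c'%Y' "$feed")"
    lastfailed=0;  [ -e "$stamp" ] && lastfailed="$(stat -c'%Y' "$stamp")"

    set -o pipefail   # as in adaptfeed: a failed sort also forces a fetch

    # Tuning values below are the module defaults (adapt.recentdiv etc.);
    # the variable names, including deferprobablility, match regressions.awk.
    if sort -k1,1n "$feed" | gawk -F'\t' \
        -v recentdiv=15 -v recentmin=$((60*60*24)) \
        -v regressiondiv=50 -v regressionmin=$((60*30)) \
        -v maxdelay=$((60*60*24*30)) -v firststeplength=$((60*60)) \
        -v weightdoublingtime=$((60*60*24*90)) -v backofffactor=1.2 \
        -v deferprobablility=0.1 \
        -v lastchecked="$lastchecked" -v lastfailed="$lastfailed" \
        -f regressions.awk
    then
        echo "not due yet: skip this run"
    else
        echo "due: fetch the feed now"
    fi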