From 128ebf04ce98144b27582721fdd25351953248f9 Mon Sep 17 00:00:00 2001 From: Dan Helfman Date: Tue, 15 Oct 2019 10:49:14 -0700 Subject: [PATCH] Dead man's switch via healthchecks.io integration (#223) + new monitoring documentation. --- NEWS | 8 + README.md | 6 +- borgmatic/commands/borgmatic.py | 9 ++ borgmatic/config/schema.yaml | 15 +- borgmatic/hook.py | 33 ++++ ...reparation-and-cleanup-steps-to-backups.md | 5 +- docs/how-to/inspect-your-backups.md | 72 +-------- docs/how-to/monitor-your-backups.md | 152 ++++++++++++++++++ docs/how-to/restore-a-backup.md | 1 + docs/how-to/set-up-backups.md | 8 +- setup.py | 3 +- test_requirements.txt | 1 + tests/unit/test_hook.py | 30 ++++ 13 files changed, 255 insertions(+), 88 deletions(-) create mode 100644 docs/how-to/monitor-your-backups.md diff --git a/NEWS b/NEWS index b4142507..89d9884a 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,11 @@ +1.3.25 + * #223: Dead man's switch to detect when backups start failing silently, implemented via + healthchecks.io hook integration. See the documentation for more information: + https://torsion.org/borgmatic/docs/how-to/monitor-your-backups/#healthchecks-hook + * Documentation on monitoring and alerting options for borgmatic backups: + https://torsion.org/borgmatic/docs/how-to/monitor-your-backups/ + * Automatically rewrite links when developing on documentation locally. + 1.3.24 * #86: Add "borgmatic list --successful" flag to only list successful (non-checkpoint) archives. * Add a suggestion form to all documentation pages, so users can submit ideas for improving the diff --git a/README.md b/README.md index 92cce0b0..0d2dd496 100644 --- a/README.md +++ b/README.md @@ -63,6 +63,7 @@ href="https://asciinema.org/a/203761" target="_blank">screencast. 
* [Make per-application backups](https://torsion.org/borgmatic/docs/how-to/make-per-application-backups/) * [Deal with very large backups](https://torsion.org/borgmatic/docs/how-to/deal-with-very-large-backups/) * [Inspect your backups](https://torsion.org/borgmatic/docs/how-to/inspect-your-backups/) + * [Monitor your backups](https://torsion.org/borgmatic/docs/how-to/monitor-your-backups/) * [Restore a backup](https://torsion.org/borgmatic/docs/how-to/restore-a-backup/) * [Add preparation and cleanup steps to backups](https://torsion.org/borgmatic/docs/how-to/add-preparation-and-cleanup-steps-to-backups/) * [Upgrade borgmatic](https://torsion.org/borgmatic/docs/how-to/upgrade/) @@ -116,8 +117,3 @@ your thing. In general, contributions are very welcome. We don't bite! Also, please check out the [borgmatic development how-to](https://torsion.org/borgmatic/docs/how-to/develop-on-borgmatic/) for info on cloning source code, running tests, etc. - - diff --git a/borgmatic/commands/borgmatic.py b/borgmatic/commands/borgmatic.py index a3735505..64b427b7 100644 --- a/borgmatic/commands/borgmatic.py +++ b/borgmatic/commands/borgmatic.py @@ -60,6 +60,9 @@ def run_configuration(config_filename, config, arguments): 'pre-backup', global_arguments.dry_run, ) + hook.ping_healthchecks( + hooks.get('healthchecks'), config_filename, global_arguments.dry_run, 'start' + ) except (OSError, CalledProcessError) as error: encountered_error = error yield from make_error_log_records( @@ -95,6 +98,9 @@ def run_configuration(config_filename, config, arguments): 'post-backup', global_arguments.dry_run, ) + hook.ping_healthchecks( + hooks.get('healthchecks'), config_filename, global_arguments.dry_run + ) except (OSError, CalledProcessError) as error: encountered_error = error yield from make_error_log_records( @@ -113,6 +119,9 @@ def run_configuration(config_filename, config, arguments): error=encountered_error, output=getattr(encountered_error, 'output', ''), ) + hook.ping_healthchecks( + 
hooks.get('healthchecks'), config_filename, global_arguments.dry_run, 'fail' + ) except (OSError, CalledProcessError) as error: yield from make_error_log_records( '{}: Error running on-error hook'.format(config_filename), error diff --git a/borgmatic/config/schema.yaml b/borgmatic/config/schema.yaml index d7597db1..071d3545 100644 --- a/borgmatic/config/schema.yaml +++ b/borgmatic/config/schema.yaml @@ -337,8 +337,8 @@ map: example: false hooks: desc: | - Shell commands or scripts to execute at various points during a borgmatic run. - IMPORTANT: All provided commands and scripts are executed with user permissions of + Shell commands, scripts, or integrations to execute at various points during a borgmatic + run. IMPORTANT: All provided commands and scripts are executed with user permissions of borgmatic. Do not forget to set secure permissions on this configuration file (chmod 0600) as well as on any script called from a hook (chmod 0700) to prevent potential shell injection or privilege escalation. @@ -363,10 +363,17 @@ map: seq: - type: str desc: | - List of one or more shell commands or scripts to execute when an exception occurs - during a backup or when running a before_backup or after_backup hook. + List of one or more shell commands or scripts to execute when an exception + occurs during a backup or when running a before_backup or after_backup hook. example: - echo "Error while creating a backup or running a backup hook." + healthchecks: + type: str + desc: | + Healthchecks ping URL or UUID to notify when a backup begins, ends, or errors. + Create an account at https://healthchecks.io if you'd like to use this service. 
+ example: + https://hc-ping.com/your-uuid-here before_everything: seq: - type: str diff --git a/borgmatic/hook.py b/borgmatic/hook.py index 16dc3769..895c412f 100644 --- a/borgmatic/hook.py +++ b/borgmatic/hook.py @@ -1,6 +1,8 @@ import logging import os +import requests + from borgmatic import execute logger = logging.getLogger(__name__) @@ -69,3 +71,40 @@ def execute_hook(commands, umask, config_filename, description, dry_run, **conte finally: if original_umask: os.umask(original_umask) + + +def ping_healthchecks(ping_url_or_uuid, config_filename, dry_run, append=None): + ''' + Ping the given healthchecks.io URL or UUID, appending the append string if any. A value that + doesn't start with "http" is treated as a UUID and expanded to https://hc-ping.com/<UUID>. Use + the given configuration filename in any log entries. If this is a dry run, then log what would + be pinged, but don't actually ping anything. + ''' + if not ping_url_or_uuid: + logger.debug('{}: No healthchecks hook set'.format(config_filename)) + return + + ping_url = ( + ping_url_or_uuid + if ping_url_or_uuid.startswith('http') + else 'https://hc-ping.com/{}'.format(ping_url_or_uuid) + ) + dry_run_label = ' (dry run; not actually pinging)' if dry_run else '' + + if append: + ping_url = '{}/{}'.format(ping_url, append) + + logger.info( + '{}: Pinging healthchecks.io{}{}'.format( + config_filename, ' ' + append if append else '', dry_run_label + ) + ) + logger.debug('{}: Using healthchecks.io ping URL {}'.format(config_filename, ping_url)) + + # Honor the dry run: the messages above are still logged, but no request is sent. + if dry_run: + return + + # Limit the urllib3 logger to ERROR and above before making the request. + logging.getLogger('urllib3').setLevel(logging.ERROR) + requests.get(ping_url) diff --git a/docs/how-to/add-preparation-and-cleanup-steps-to-backups.md b/docs/how-to/add-preparation-and-cleanup-steps-to-backups.md index 8e609878..b7d04d32 100644 --- a/docs/how-to/add-preparation-and-cleanup-steps-to-backups.md +++ b/docs/how-to/add-preparation-and-cleanup-steps-to-backups.md @@ -48,8 +48,8 @@ a backup or a backup hook, but not if an error occurs during a `before_everything` hook. borgmatic also runs `on_error` hooks if an error occurs, either when creating -a backup or running a backup hook.
See the [error alerting -documentation](https://torsion.org/borgmatic/docs/how-to/inspect-your-backups.md) +a backup or running a backup hook. See the [monitoring and alerting +documentation](https://torsion.org/borgmatic/docs/how-to/monitor-your-backups.md) for more information. ## Hook output @@ -73,3 +73,4 @@ invoked by hooks. * [Set up backups with borgmatic](https://torsion.org/borgmatic/docs/how-to/set-up-backups.md) * [Make per-application backups](https://torsion.org/borgmatic/docs/how-to/make-per-application-backups.md) * [Inspect your backups](https://torsion.org/borgmatic/docs/how-to/inspect-your-backups.md) + * [Monitor your backups](https://torsion.org/borgmatic/docs/how-to/monitor-your-backups.md) diff --git a/docs/how-to/inspect-your-backups.md b/docs/how-to/inspect-your-backups.md index 73f522bd..aac26dc5 100644 --- a/docs/how-to/inspect-your-backups.md +++ b/docs/how-to/inspect-your-backups.md @@ -22,7 +22,7 @@ borgmatic --verbosity 2 ## Backup summary -If you're less concerned with progress during a backup, and you just want to +If you're less concerned with progress during a backup, and you only want to see the summary of archive statistics at the end, you can use the stats option when performing a backup: @@ -83,78 +83,10 @@ Note that the [sample borgmatic systemd service file](https://torsion.org/borgmatic/docs/how-to/set-up-backups/#systemd) already has this rate limit disabled. -## Error alerting - -When an error occurs during a backup, borgmatic can run configurable shell -commands to fire off custom error notifications or take other actions, so you -can get alerted as soon as something goes wrong. Here's a not-so-useful -example: - -```yaml -hooks: - on_error: - - echo "Error while creating a backup or running a backup hook." -``` - -The `on_error` hook supports interpolating particular runtime variables into -the hook command. 
Here's an example that assumes you provide a separate shell -script to handle the alerting: - -```yaml -hooks: - on_error: - - send-text-message.sh "{configuration_filename}" "{repository}" -``` - -In this example, when the error occurs, borgmatic interpolates a few runtime -values into the hook command: the borgmatic configuration filename, and the -path of the repository. Here's the full set of supported variables you can use -here: - - * `configuration_filename`: borgmatic configuration filename in which the - error occurred - * `repository`: path of the repository in which the error occurred (may be - blank if the error occurs in a hook) - * `error`: the error message itself - * `output`: output of the command that failed (may be blank if an error - occurred without running a command) - -Note that borgmatic does not run `on_error` hooks if an error occurs within a -`before_everything` or `after_everything` hook. For more about hooks, see the -[borgmatic hooks -documentation](https://torsion.org/borgmatic/docs/how-to/add-preparation-and-cleanup-steps-to-backups.md), -especially the security information. - - -## Scripting borgmatic - -To consume the output of borgmatic in other software, you can include an -optional `--json` flag with `create`, `list`, or `info` to get the output -formatted as JSON. - -Note that when you specify the `--json` flag, Borg's other non-JSON output is -suppressed so as not to interfere with the captured JSON. Also note that JSON -output only shows up at the console, and not in syslog. - -### Successful backups - -`borgmatic list` includes support for a `--successful` flag that only lists -successful (non-checkpoint) backups. Combined with a built-in Borg flag like -`--last`, you can list the last successful backup for use in your monitoring -scripts. 
Here's an example combined with `--json`: - -```bash -borgmatic list --successful --last 1 --json -``` - -Note that this particular combination will only work if you've got a single -backup "series" in your repository. If you're instead backing up, say, from -multiple different hosts into a single repository, then you'll need to get -fancier with your archive listing. See `borg list --help` for more flags. - ## Related documentation * [Set up backups with borgmatic](https://torsion.org/borgmatic/docs/how-to/set-up-backups.md) + * [Monitor your backups](https://torsion.org/borgmatic/docs/how-to/monitor-your-backups.md) * [Add preparation and cleanup steps to backups](https://torsion.org/borgmatic/docs/how-to/add-preparation-and-cleanup-steps-to-backups.md) * [Develop on borgmatic](https://torsion.org/borgmatic/docs/how-to/develop-on-borgmatic.md) diff --git a/docs/how-to/monitor-your-backups.md b/docs/how-to/monitor-your-backups.md new file mode 100644 index 00000000..4dd9ae41 --- /dev/null +++ b/docs/how-to/monitor-your-backups.md @@ -0,0 +1,152 @@ +--- +title: How to monitor your backups +--- + +## Monitoring and alerting + +Having backups is great, but they won't do you a lot of good unless you have +confidence that they're running on a regular basis. That's where monitoring +and alerting comes in. + +There are several different ways you can monitor your backups and find out +whether they're succeeding. Which of these you choose to do is up to you and +your particular infrastructure: + +1. **Job runner alerts**: The easiest place to start is with failure alerts +from the [scheduled job +runner](https://torsion.org/borgmatic/docs/how-to/set-up-backups/#autopilot) (cron, +systemd, etc.) that's running borgmatic. But note that if the job doesn't even +get scheduled (e.g. due to the job runner not running), you probably won't get +an alert at all! Still, this is a decent first line of defense, especially +when combined with some of the other approaches below. +2. 
**borgmatic error hooks**: The `on_error` hook allows you to run an arbitrary +command or script when borgmatic itself encounters an error running your +backups. So for instance, you can run a script to send yourself a text message +alert. But note that if borgmatic doesn't actually run, this alert won't fire. +See [error +hooks](https://torsion.org/borgmatic/docs/how-to/monitor-your-backups/#error-hooks) +below for how to configure this. +3. **borgmatic Healthchecks hook**: This feature integrates with the +[Healthchecks](https://healthchecks.io/) service, and pings Healthchecks +whenever borgmatic runs. That way, Healthchecks can alert you when something +goes wrong or it doesn't hear from borgmatic for a configured interval. (See +[Healthchecks +hook](https://torsion.org/borgmatic/docs/how-to/monitor-your-backups/#healthchecks-hook) +below for how to configure this.) +4. **Third-party monitoring software**: You can use traditional monitoring +software to consume borgmatic JSON output and track when the last +successful backup occurred. See [scripting +borgmatic](https://torsion.org/borgmatic/docs/how-to/monitor-your-backups/#scripting-borgmatic) +below for how to configure this. +5. **Borg hosting providers**: Most [Borg hosting +providers](https://torsion.org/borgmatic/#hosting-providers) include +monitoring and alerting as part of their offering. This gives you a dashboard +to check on all of your backups, and can alert you if the service doesn't hear +from borgmatic for a configured interval. +6. **borgmatic consistency checks**: While not strictly part of monitoring, if you +really want confidence that your backups are not only running but are +restorable as well, you can configure particular [consistency +checks](https://torsion.org/borgmatic/docs/how-to/deal-with-very-large-backups/#consistency-check-configuration) +or even script full [restore +tests](https://torsion.org/borgmatic/docs/how-to/restore-a-backup/).
+ + +## Error hooks + +When an error occurs during a backup, borgmatic can run configurable shell +commands to fire off custom error notifications or take other actions, so you +can get alerted as soon as something goes wrong. Here's a not-so-useful +example: + +```yaml +hooks: + on_error: + - echo "Error while creating a backup or running a backup hook." +``` + +The `on_error` hook supports interpolating particular runtime variables into +the hook command. Here's an example that assumes you provide a separate shell +script to handle the alerting: + +```yaml +hooks: + on_error: + - send-text-message.sh "{configuration_filename}" "{repository}" +``` + +In this example, when the error occurs, borgmatic interpolates a few runtime +values into the hook command: the borgmatic configuration filename, and the +path of the repository. Here's the full set of supported variables you can use +here: + + * `configuration_filename`: borgmatic configuration filename in which the + error occurred + * `repository`: path of the repository in which the error occurred (may be + blank if the error occurs in a hook) + * `error`: the error message itself + * `output`: output of the command that failed (may be blank if an error + occurred without running a command) + +Note that borgmatic does not run `on_error` hooks if an error occurs within a +`before_everything` or `after_everything` hook. For more about hooks, see the +[borgmatic hooks +documentation](https://torsion.org/borgmatic/docs/how-to/add-preparation-and-cleanup-steps-to-backups.md), +especially the security information. + + +## Healthchecks hook + +[Healthchecks](https://healthchecks.io/) is a service that provides "instant +alerts when your cron jobs fail silently", and borgmatic has built-in +integration with it. Once you create a Healthchecks account and project on +their site, all you need to do is configure borgmatic with the unique "Ping +URL" for your project. 
Here's an example: + + +```yaml +hooks: + healthchecks: https://hc-ping.com/addffa72-da17-40ae-be9c-ff591afb942a +``` + +With this hook in place, borgmatic will ping your Healthchecks project when a +backup begins, ends, or errors. Then you can configure Healthchecks to notify +you by a [variety of +mechanisms](https://healthchecks.io/#welcome-integrations) when backups fail +or it doesn't hear from borgmatic for a certain period of time. + + +## Scripting borgmatic + +To consume the output of borgmatic in other software, you can include an +optional `--json` flag with `create`, `list`, or `info` to get the output +formatted as JSON. + +Note that when you specify the `--json` flag, Borg's other non-JSON output is +suppressed so as not to interfere with the captured JSON. Also note that JSON +output only shows up at the console, and not in syslog. + + +### Successful backups + +`borgmatic list` includes support for a `--successful` flag that only lists +successful (non-checkpoint) backups. Combined with a built-in Borg flag like +`--last`, you can list the last successful backup for use in your monitoring +scripts. Here's an example combined with `--json`: + +```bash +borgmatic list --successful --last 1 --json +``` + +Note that this particular combination will only work if you've got a single +backup "series" in your repository. If you're instead backing up, say, from +multiple different hosts into a single repository, then you'll need to get +fancier with your archive listing. See `borg list --help` for more flags. 
+ + +## Related documentation + + * [Set up backups with borgmatic](https://torsion.org/borgmatic/docs/how-to/set-up-backups.md) + * [Inspect your backups](https://torsion.org/borgmatic/docs/how-to/inspect-your-backups.md) + * [Add preparation and cleanup steps to backups](https://torsion.org/borgmatic/docs/how-to/add-preparation-and-cleanup-steps-to-backups.md) + * [Restore a backup](https://torsion.org/borgmatic/docs/how-to/restore-a-backup.md) + * [Develop on borgmatic](https://torsion.org/borgmatic/docs/how-to/develop-on-borgmatic.md) diff --git a/docs/how-to/restore-a-backup.md b/docs/how-to/restore-a-backup.md index 336a356e..0f771abe 100644 --- a/docs/how-to/restore-a-backup.md +++ b/docs/how-to/restore-a-backup.md @@ -65,3 +65,4 @@ Like a whole-archive restore, this also restores into the current directory. * [Set up backups with borgmatic](https://torsion.org/borgmatic/docs/how-to/set-up-backups.md) * [Inspect your backups](https://torsion.org/borgmatic/docs/how-to/inspect-your-backups.md) + * [Monitor your backups](https://torsion.org/borgmatic/docs/how-to/monitor-your-backups.md) diff --git a/docs/how-to/set-up-backups.md b/docs/how-to/set-up-backups.md index f9c53b15..526cfb5f 100644 --- a/docs/how-to/set-up-backups.md +++ b/docs/how-to/set-up-backups.md @@ -228,7 +228,7 @@ found character that cannot start any token in "config.yaml", line 230, column 1 ``` -YAML does not allow tabs. So to fix this, simply replace any tabs in your +YAML does not allow tabs. So to fix this, replace any tabs in your configuration file with the requisite number of spaces. ### libyaml compilation errors @@ -247,10 +247,6 @@ it. 
* [Make per-application backups](https://torsion.org/borgmatic/docs/how-to/make-per-application-backups.md) * [Deal with very large backups](https://torsion.org/borgmatic/docs/how-to/deal-with-very-large-backups.md) * [Inspect your backups](https://torsion.org/borgmatic/docs/how-to/inspect-your-backups.md) + * [Monitor your backups](https://torsion.org/borgmatic/docs/how-to/monitor-your-backups.md) * [borgmatic configuration reference](https://torsion.org/borgmatic/docs/reference/configuration.md) * [borgmatic command-line reference](https://torsion.org/borgmatic/docs/reference/command-line.md) - - diff --git a/setup.py b/setup.py index 921def49..de307d53 100644 --- a/setup.py +++ b/setup.py @@ -1,6 +1,6 @@ from setuptools import find_packages, setup -VERSION = '1.3.24' +VERSION = '1.3.25' setup( @@ -31,6 +31,7 @@ setup( obsoletes=['atticmatic'], install_requires=( 'pykwalify>=1.6.0,<14.06', + 'requests', 'ruamel.yaml>0.15.0,<0.17.0', 'setuptools', 'colorama>=0.4.1,<0.5', diff --git a/test_requirements.txt b/test_requirements.txt index 827d5327..d7962068 100644 --- a/test_requirements.txt +++ b/test_requirements.txt @@ -20,5 +20,6 @@ pytest==5.1.2 pytest-cov==2.7.1 python-dateutil==2.8.0 PyYAML==5.1.2 +requests==2.22.0 ruamel.yaml>0.15.0,<0.17.0 toml==0.10.0 diff --git a/tests/unit/test_hook.py b/tests/unit/test_hook.py index 647e92bb..bd42f002 100644 --- a/tests/unit/test_hook.py +++ b/tests/unit/test_hook.py @@ -79,3 +79,36 @@ def test_execute_hook_on_error_logs_as_error(): ).once() module.execute_hook([':'], None, 'config.yaml', 'on-error', dry_run=False) + + +def test_ping_healthchecks_hits_ping_url(): + ping_url = 'https://example.com' + # once() makes the test fail if the ping request is never actually sent. + flexmock(module.requests).should_receive('get').with_args(ping_url).once() + + module.ping_healthchecks(ping_url, 'config.yaml', dry_run=False) + + +def test_ping_healthchecks_without_ping_url_does_not_raise(): + flexmock(module.requests).should_receive('get').never() + + module.ping_healthchecks(ping_url_or_uuid=None,
config_filename='config.yaml', dry_run=False) + + +def test_ping_healthchecks_with_ping_uuid_hits_corresponding_url(): + ping_uuid = 'abcd-efgh-ijkl-mnop' + flexmock(module.requests).should_receive('get').with_args( + 'https://hc-ping.com/{}'.format(ping_uuid) + ).once() + + module.ping_healthchecks(ping_uuid, 'config.yaml', dry_run=False) + + +def test_ping_healthchecks_hits_ping_url_with_append(): + ping_url = 'https://example.com' + append = 'failed-so-hard' + flexmock(module.requests).should_receive('get').with_args( + '{}/{}'.format(ping_url, append) + ).once() + + module.ping_healthchecks(ping_url, 'config.yaml', dry_run=False, append=append)