From 32019ea8f31ef07158ca1694363ce3037881c74c Mon Sep 17 00:00:00 2001 From: Dan Helfman Date: Fri, 25 Aug 2023 10:52:00 -0700 Subject: [PATCH] Add documentation for Grafana Loki hook (#743). --- NEWS | 3 + borgmatic/hooks/loki.py | 5 ++ borgmatic/hooks/mariadb.py | 3 +- borgmatic/hooks/mysql.py | 3 +- docs/how-to/monitor-your-backups.md | 122 ++++++++++++++++++--------- tests/integration/hooks/test_loki.py | 23 +++-- tests/unit/hooks/test_loki.py | 78 +++++++++-------- 7 files changed, 154 insertions(+), 83 deletions(-) diff --git a/NEWS b/NEWS index e46fe854..1c380cfd 100644 --- a/NEWS +++ b/NEWS @@ -6,6 +6,9 @@ only restorable with a "mysql_databases:" configuration. * #738: Fix for potential data loss (data not getting restored) in which the database "restore" action didn't actually restore anything and indicated success anyway. + * #743: Add a monitoring hook for sending backup status and logs to to Grafana Loki. See the + documentation for more information: + https://torsion.org/borgmatic/docs/how-to/monitor-your-backups/#loki-hook * Remove the deprecated use of the MongoDB hook's "--db" flag for database restoration. * Add source code reference documentation for getting oriented with the borgmatic code as a developer: https://torsion.org/borgmatic/docs/reference/source-code/ diff --git a/borgmatic/hooks/loki.py b/borgmatic/hooks/loki.py index 9a5dae8f..9734ddc5 100644 --- a/borgmatic/hooks/loki.py +++ b/borgmatic/hooks/loki.py @@ -67,6 +67,7 @@ class Loki_log_buffer: request_body = self.to_request() self.root['streams'][0]['values'] = [] request_header = {'Content-Type': 'application/json'} + try: result = requests.post(self.url, headers=request_header, data=request_body, timeout=5) result.raise_for_status() @@ -100,6 +101,7 @@ class Loki_log_handler(logging.Handler): Add an arbitrary string as a log entry to the stream. ''' self.buffer.add_value(msg) + if len(self.buffer) > MAX_BUFFER_LINES: self.buffer.flush() @@ -116,6 +118,7 @@ def initialize_monitor(hook_config, config, config_filename, monitoring_log_leve ''' url = hook_config.get('url') loki = Loki_log_handler(url, dry_run) + for key, value in hook_config.get('labels').items(): if value == '__hostname': loki.add_label(key, platform.node()) @@ -125,6 +128,7 @@ def initialize_monitor(hook_config, config, config_filename, monitoring_log_leve loki.add_label(key, config_filename) else: loki.add_label(key, value) + logging.getLogger().addHandler(loki) @@ -143,6 +147,7 @@ def destroy_monitor(hook_config, config, config_filename, monitoring_log_level, Remove the monitor handler that was added to the root logger. ''' logger = logging.getLogger() + for handler in tuple(logger.handlers): if isinstance(handler, Loki_log_handler): handler.flush() diff --git a/borgmatic/hooks/mariadb.py b/borgmatic/hooks/mariadb.py index a7c8c1f6..b3c9a3eb 100644 --- a/borgmatic/hooks/mariadb.py +++ b/borgmatic/hooks/mariadb.py @@ -62,7 +62,7 @@ def execute_dump_command( ): ''' Kick off a dump for the given MariaDB database (provided as a configuration dict) to a named - pipe constructed from the given dump path and database names. Use the given log prefix in any + pipe constructed from the given dump path and database name. Use the given log prefix in any log entries. Return a subprocess.Popen instance for the dump process ready to spew to a named pipe. But if @@ -72,6 +72,7 @@ def execute_dump_command( dump_filename = dump.make_data_source_dump_filename( dump_path, database['name'], database.get('hostname') ) + if os.path.exists(dump_filename): logger.warning( f'{log_prefix}: Skipping duplicate dump of MariaDB database "{database_name}" to {dump_filename}' diff --git a/borgmatic/hooks/mysql.py b/borgmatic/hooks/mysql.py index be37c8d4..8f1b83b7 100644 --- a/borgmatic/hooks/mysql.py +++ b/borgmatic/hooks/mysql.py @@ -62,7 +62,7 @@ def execute_dump_command( ): ''' Kick off a dump for the given MySQL/MariaDB database (provided as a configuration dict) to a - named pipe constructed from the given dump path and database names. Use the given log prefix in + named pipe constructed from the given dump path and database name. Use the given log prefix in any log entries. Return a subprocess.Popen instance for the dump process ready to spew to a named pipe. But if @@ -72,6 +72,7 @@ def execute_dump_command( dump_filename = dump.make_data_source_dump_filename( dump_path, database['name'], database.get('hostname') ) + if os.path.exists(dump_filename): logger.warning( f'{log_prefix}: Skipping duplicate dump of MySQL database "{database_name}" to {dump_filename}' diff --git a/docs/how-to/monitor-your-backups.md b/docs/how-to/monitor-your-backups.md index 05d1163a..be3deb7e 100644 --- a/docs/how-to/monitor-your-backups.md +++ b/docs/how-to/monitor-your-backups.md @@ -38,11 +38,11 @@ below for how to configure this. borgmatic integrates with monitoring services like [Healthchecks](https://healthchecks.io/), [Cronitor](https://cronitor.io), -[Cronhub](https://cronhub.io), [PagerDuty](https://www.pagerduty.com/), and -[ntfy](https://ntfy.sh/) and pings these services whenever borgmatic runs. -That way, you'll receive an alert when something goes wrong or (for certain -hooks) the service doesn't hear from borgmatic for a configured interval. See -[Healthchecks +[Cronhub](https://cronhub.io), [PagerDuty](https://www.pagerduty.com/), +[ntfy](https://ntfy.sh/), and [Grafana Loki](https://grafana.com/oss/loki/) +and pings these services whenever borgmatic runs. That way, you'll receive an +alert when something goes wrong or (for certain hooks) the service doesn't +hear from borgmatic for a configured interval. See [Healthchecks hook](https://torsion.org/borgmatic/docs/how-to/monitor-your-backups/#healthchecks-hook), [Cronitor hook](https://torsion.org/borgmatic/docs/how-to/monitor-your-backups/#cronitor-hook), @@ -50,7 +50,10 @@ hook](https://torsion.org/borgmatic/docs/how-to/monitor-your-backups/#cronitor-h hook](https://torsion.org/borgmatic/docs/how-to/monitor-your-backups/#cronhub-hook), [PagerDuty hook](https://torsion.org/borgmatic/docs/how-to/monitor-your-backups/#pagerduty-hook), -and [ntfy hook](https://torsion.org/borgmatic/docs/how-to/monitor-your-backups/#ntfy-hook) +[ntfy +hook](https://torsion.org/borgmatic/docs/how-to/monitor-your-backups/#ntfy-hook), +and [Loki +hook](https://torsion.org/borgmatic/docs/how-to/monitor-your-backups/#loki-hook), below for how to configure this. While these services offer different features, you probably only need to use @@ -129,7 +132,7 @@ especially the security information. ## Healthchecks hook [Healthchecks](https://healthchecks.io/) is a service that provides "instant -alerts when your cron jobs fail silently", and borgmatic has built-in +alerts when your cron jobs fail silently," and borgmatic has built-in integration with it. Once you create a Healthchecks account and project on their site, all you need to do is configure borgmatic with the unique "Ping URL" for your project. Here's an example: @@ -144,21 +147,19 @@ healthchecks: this option in the `hooks:` section of your configuration. With this hook in place, borgmatic pings your Healthchecks project when a -backup begins, ends, or errors. Specifically, after the `before_backup` -hooks run, borgmatic lets Healthchecks know that it has started if any of -the `create`, `prune`, `compact`, or `check` actions are run. +backup begins, ends, or errors, but only when any of the `create`, `prune`, +`compact`, or `check` actions are run. Then, if the actions complete successfully, borgmatic notifies Healthchecks of -the success after the `after_backup` hooks run and includes borgmatic logs in -the payload data sent to Healthchecks. This means that borgmatic logs show up -in the Healthchecks UI, although be aware that Healthchecks currently has a -10-kilobyte limit for the logs in each ping. +the success and includes borgmatic logs in the payload data sent to +Healthchecks. This means that borgmatic logs show up in the Healthchecks UI, +although be aware that Healthchecks currently has a 10-kilobyte limit for the +logs in each ping. -If an error occurs during any action or hook, borgmatic notifies Healthchecks -after the `on_error` hooks run, also tacking on logs including the error -itself. But the logs are only included for errors that occur when a `create`, -`prune`, `compact`, or `check` action is run. +If an error occurs during any action or hook, borgmatic notifies Healthchecks, +also tacking on logs including the error itself. But the logs are only +included for errors that occur when a `create`, `prune`, `compact`, or `check` +action is run. You can customize the verbosity of the logs that are sent to Healthchecks with borgmatic's `--monitoring-verbosity` flag. The `--list` and `--stats` flags @@ -175,7 +176,7 @@ or it doesn't hear from borgmatic for a certain period of time. ## Cronitor hook [Cronitor](https://cronitor.io/) provides "Cron monitoring and uptime healthchecks -for websites, services and APIs", and borgmatic has built-in +for websites, services and APIs," and borgmatic has built-in integration with it. Once you create a Cronitor account and cron job monitor on their site, all you need to do is configure borgmatic with the unique "Ping API URL" for your monitor. Here's an example: @@ -190,13 +191,9 @@ cronitor: this option in the `hooks:` section of your configuration. With this hook in place, borgmatic pings your Cronitor monitor when a backup -begins, ends, or errors. Specifically, after the `before_backup` -hooks run, borgmatic lets Cronitor know that it has started if any of the -`prune`, `compact`, `create`, or `check` actions are run. Then, if the actions -complete successfully, borgmatic notifies Cronitor of the success after the -`after_backup` hooks run. And if an error occurs during any action or hook, -borgmatic notifies Cronitor after the `on_error` hooks run. +begins, ends, or errors, but only when any of the `prune`, `compact`, +`create`, or `check` actions are run. Then, if the actions complete +successfully or errors, borgmatic notifies Cronitor accordingly. You can configure Cronitor to notify you by a [variety of mechanisms](https://cronitor.io/docs/cron-job-notifications) when backups fail @@ -206,7 +203,7 @@ or it doesn't hear from borgmatic for a certain period of time. ## Cronhub hook [Cronhub](https://cronhub.io/) provides "instant alerts when any of your -background jobs fail silently or run longer than expected", and borgmatic has +background jobs fail silently or run longer than expected," and borgmatic has built-in integration with it. Once you create a Cronhub account and monitor on their site, all you need to do is configure borgmatic with the unique "Ping URL" for your monitor. Here's an example: @@ -221,13 +218,9 @@ cronhub: this option in the `hooks:` section of your configuration. With this hook in place, borgmatic pings your Cronhub monitor when a backup -begins, ends, or errors. Specifically, after the `before_backup` -hooks run, borgmatic lets Cronhub know that it has started if any of the -`prune`, `compact`, `create`, or `check` actions are run. Then, if the actions -complete successfully, borgmatic notifies Cronhub of the success after the -`after_backup` hooks run. And if an error occurs during any action or hook, -borgmatic notifies Cronhub after the `on_error` hooks run. +begins, ends, or errors, but only when any of the `prune`, `compact`, +`create`, or `check` actions are run. Then, if the actions complete +successfully or errors, borgmatic notifies Cronhub accordingly. Note that even though you configure borgmatic with the "start" variant of the ping URL, borgmatic substitutes the correct state into the URL when pinging @@ -266,10 +259,9 @@ pagerduty: this option in the `hooks:` section of your configuration. With this hook in place, borgmatic creates a PagerDuty event for your service -whenever backups fail. Specifically, if an error occurs during a `create`, -`prune`, `compact`, or `check` action, borgmatic sends an event to PagerDuty -before the `on_error` hooks run. Note that borgmatic does not contact -PagerDuty when a backup starts or ends without error. +whenever backups fail, but only when any of the `create`, `prune`, `compact`, +or `check` actions are run. Note that borgmatic does not contact PagerDuty +when a backup starts or when it ends without error. You can configure PagerDuty to notify you by a [variety of mechanisms](https://support.pagerduty.com/docs/notifications) when backups @@ -328,6 +320,58 @@ ntfy: the `ntfy:` option in the `hooks:` section of your configuration. +## Loki hook + +[Grafana Loki](https://grafana.com/oss/loki/) is a "horizontally scalable, +highly available, multi-tenant log aggregation system inspired by Prometheus." +borgmatic has built-in integration with Loki, sending both backup status and +borgmatic logs. + +You can configure borgmatic to use either a [self-hosted Loki +instance](https://grafana.com/docs/loki/latest/installation/) or [a Grafana +Cloud account](https://grafana.com/auth/sign-up/create-user). Start by setting +your Loki API push URL. Here's an example: + +```yaml +loki: + url: http://localhost:3100/loki/api/v1/push +``` + +With this hook in place, borgmatic sends its logs to your Loki instance as any +of the `prune`, `compact`, `create`, or `check` actions are run. Then, after +the actions complete, borgmatic notifies Loki of success or failure. + +This hook supports sending arbitrary labels to Loki. For instance: + +```yaml +loki: + url: http://localhost:3100/loki/api/v1/push + + labels: + app: borgmatic + hostname: example.org +``` + +There are also a few placeholders you can optionally use as label values: + + * `__config`: name of the borgmatic configuration file + * `__config_path`: full path of the borgmatic configuration file + * `__hostname`: the local machine hostname + +These placeholders are only substituted for the whole label value, not +interpolated into a larger string. For instance: + +```yaml +loki: + url: http://localhost:3100/loki/api/v1/push + + labels: + app: borgmatic + config: __config + hostname: __hostname +``` + + ## Scripting borgmatic To consume the output of borgmatic in other software, you can include an diff --git a/tests/integration/hooks/test_loki.py b/tests/integration/hooks/test_loki.py index 3eac29d3..64c262f8 100644 --- a/tests/integration/hooks/test_loki.py +++ b/tests/integration/hooks/test_loki.py @@ -6,9 +6,9 @@ from flexmock import flexmock from borgmatic.hooks import loki as module -def test_log_handler_label_replacment(): +def test_initialize_monitor_replaces_labels(): ''' - Assert that label placeholders get replaced + Assert that label placeholders get replaced. ''' hook_config = { 'url': 'http://localhost:3100/loki/api/v1/push', @@ -17,18 +17,20 @@ def test_log_handler_label_replacment(): config_filename = '/mock/path/test.yaml' dry_run = True module.initialize_monitor(hook_config, flexmock(), config_filename, flexmock(), dry_run) + for handler in tuple(logging.getLogger().handlers): if isinstance(handler, module.Loki_log_handler): assert handler.buffer.root['streams'][0]['stream']['hostname'] == platform.node() assert handler.buffer.root['streams'][0]['stream']['config'] == 'test.yaml' assert handler.buffer.root['streams'][0]['stream']['config_full'] == config_filename return + assert False -def test_initalize_adds_log_handler(): +def test_initialize_monitor_adds_log_handler(): ''' - Assert that calling initialize_monitor adds our logger to the root logger + Assert that calling initialize_monitor adds our logger to the root logger. ''' hook_config = {'url': 'http://localhost:3100/loki/api/v1/push', 'labels': {'app': 'borgmatic'}} module.initialize_monitor( @@ -38,15 +40,17 @@ def test_initalize_adds_log_handler(): monitoring_log_level=flexmock(), dry_run=True, ) + for handler in tuple(logging.getLogger().handlers): if isinstance(handler, module.Loki_log_handler): return + assert False -def test_ping_adds_log_message(): +def test_ping_monitor_adds_log_message(): ''' - Assert that calling ping_monitor adds a message to our logger + Assert that calling ping_monitor adds a message to our logger. ''' hook_config = {'url': 'http://localhost:3100/loki/api/v1/push', 'labels': {'app': 'borgmatic'}} config_filename = 'test.yaml' @@ -55,6 +59,7 @@ def test_ping_adds_log_message(): module.ping_monitor( hook_config, flexmock(), config_filename, module.monitor.State.FINISH, flexmock(), dry_run ) + for handler in tuple(logging.getLogger().handlers): if isinstance(handler, module.Loki_log_handler): assert any( @@ -65,18 +70,20 @@ def test_ping_adds_log_message(): ) ) return + assert False -def test_log_handler_gets_removed(): +def test_destroy_monitor_removes_log_handler(): ''' - Assert that destroy_monitor removes the logger from the root logger + Assert that destroy_monitor removes the logger from the root logger. ''' hook_config = {'url': 'http://localhost:3100/loki/api/v1/push', 'labels': {'app': 'borgmatic'}} config_filename = 'test.yaml' dry_run = True module.initialize_monitor(hook_config, flexmock(), config_filename, flexmock(), dry_run) module.destroy_monitor(hook_config, flexmock(), config_filename, flexmock(), dry_run) + for handler in tuple(logging.getLogger().handlers): if isinstance(handler, module.Loki_log_handler): assert False diff --git a/tests/unit/hooks/test_loki.py b/tests/unit/hooks/test_loki.py index 33e35ecd..ccf19553 100644 --- a/tests/unit/hooks/test_loki.py +++ b/tests/unit/hooks/test_loki.py @@ -6,93 +6,103 @@ from flexmock import flexmock from borgmatic.hooks import loki as module -def test_log_handler_gets_labels(): +def test_loki_log_buffer_add_value_gets_raw(): ''' - Assert that adding labels works - ''' - buffer = module.Loki_log_buffer(flexmock(), False) - buffer.add_label('test', 'label') - assert buffer.root['streams'][0]['stream']['test'] == 'label' - buffer.add_label('test2', 'label2') - assert buffer.root['streams'][0]['stream']['test2'] == 'label2' - - -def test_log_buffer_gets_raw(): - ''' - Assert that adding values to the log buffer increases it's length + Assert that adding values to the log buffer increases it's length. ''' buffer = module.Loki_log_buffer(flexmock(), False) assert len(buffer) == 0 + buffer.add_value('Some test log line') assert len(buffer) == 1 + buffer.add_value('Another test log line') assert len(buffer) == 2 -def test_log_buffer_gets_log_messages(): +def test_loki_log_buffer_json_serializes_empty_buffer(): ''' - Assert that adding log records works - ''' - handler = module.Loki_log_handler(flexmock(), False) - handler.emit(flexmock(getMessage=lambda: 'Some test log line')) - assert len(handler.buffer) == 1 - - -def test_log_buffer_json(): - ''' - Assert that the buffer correctly serializes when empty + Assert that the buffer correctly serializes when empty. ''' buffer = module.Loki_log_buffer(flexmock(), False) + assert json.loads(buffer.to_request()) == json.loads('{"streams":[{"stream":{},"values":[]}]}') -def test_log_buffer_json_labels(): +def test_loki_log_buffer_json_serializes_labels(): ''' - Assert that the buffer correctly serializes with labels + Assert that the buffer correctly serializes with labels. ''' buffer = module.Loki_log_buffer(flexmock(), False) buffer.add_label('test', 'label') + assert json.loads(buffer.to_request()) == json.loads( '{"streams":[{"stream":{"test": "label"},"values":[]}]}' ) -def test_log_buffer_json_log_lines(): +def test_loki_log_buffer_json_serializes_log_lines(): ''' - Assert that log lines end up in the correct place in the log buffer + Assert that log lines end up in the correct place in the log buffer. ''' buffer = module.Loki_log_buffer(flexmock(), False) buffer.add_value('Some test log line') + assert json.loads(buffer.to_request())['streams'][0]['values'][0][1] == 'Some test log line' -def test_log_handler_post(): +def test_loki_log_handler_add_label_gets_labels(): ''' - Assert that the flush function sends a post request after a certain limit + Assert that adding labels works. + ''' + buffer = module.Loki_log_buffer(flexmock(), False) + + buffer.add_label('test', 'label') + assert buffer.root['streams'][0]['stream']['test'] == 'label' + + buffer.add_label('test2', 'label2') + assert buffer.root['streams'][0]['stream']['test2'] == 'label2' + + +def test_loki_log_handler_emit_gets_log_messages(): + ''' + Assert that adding log records works. + ''' + handler = module.Loki_log_handler(flexmock(), False) + handler.emit(flexmock(getMessage=lambda: 'Some test log line')) + + assert len(handler.buffer) == 1 + + +def test_loki_log_handler_raw_posts_to_server(): + ''' + Assert that the flush function sends a post request after a certain limit. ''' handler = module.Loki_log_handler(flexmock(), False) flexmock(module.requests).should_receive('post').and_return( flexmock(raise_for_status=lambda: '') ).once() + for num in range(int(module.MAX_BUFFER_LINES * 1.5)): handler.raw(num) -def test_log_handler_post_failiure(): +def test_loki_log_handler_raw_post_failure_does_not_raise(): ''' - Assert that the flush function catches request exceptions + Assert that the flush function catches request exceptions. ''' handler = module.Loki_log_handler(flexmock(), False) flexmock(module.requests).should_receive('post').and_return( flexmock(raise_for_status=lambda: (_ for _ in ()).throw(requests.RequestException())) ).once() + for num in range(int(module.MAX_BUFFER_LINES * 1.5)): handler.raw(num) -def test_log_handler_empty_flush_noop(): +def test_loki_log_handler_flush_with_empty_buffer_does_not_raise(): ''' - Test that flushing an empty buffer does indeed nothing + Test that flushing an empty buffer does indeed nothing. ''' handler = module.Loki_log_handler(flexmock(), False) handler.flush()