Add documentation for Grafana Loki hook (#743).

2023-08-25 10:52:00 -07:00 · 2023-08-25 10:52:00 -07:00 · 32019ea8f3
parent fa9a061033
commit 32019ea8f3
7 changed files with 154 additions and 83 deletions
--- a/3
+++ b/3
@ -6,6 +6,9 @@
   only restorable with a "mysql_databases:" configuration.
 * #738: Fix for potential data loss (data not getting restored) in which the database "restore"
   action didn't actually restore anything and indicated success anyway.
+ * #743: Add a monitoring hook for sending backup status and logs to to Grafana Loki. See the
+   documentation for more information:
+   https://torsion.org/borgmatic/docs/how-to/monitor-your-backups/#loki-hook
 * Remove the deprecated use of the MongoDB hook's "--db" flag for database restoration.
 * Add source code reference documentation for getting oriented with the borgmatic code as a
   developer: https://torsion.org/borgmatic/docs/reference/source-code/
--- a/borgmatic/hooks/loki.py
+++ b/borgmatic/hooks/loki.py
@ -67,6 +67,7 @@ class Loki_log_buffer:
        request_body = self.to_request()
        self.root['streams'][0]['values'] = []
        request_header = {'Content-Type': 'application/json'}
+
        try:
            result = requests.post(self.url, headers=request_header, data=request_body, timeout=5)
            result.raise_for_status()
@ -100,6 +101,7 @@ class Loki_log_handler(logging.Handler):
        Add an arbitrary string as a log entry to the stream.
        '''
        self.buffer.add_value(msg)
+
        if len(self.buffer) > MAX_BUFFER_LINES:
            self.buffer.flush()

@ -116,6 +118,7 @@ def initialize_monitor(hook_config, config, config_filename, monitoring_log_leve
    '''
    url = hook_config.get('url')
    loki = Loki_log_handler(url, dry_run)
+
    for key, value in hook_config.get('labels').items():
        if value == '__hostname':
            loki.add_label(key, platform.node())
@ -125,6 +128,7 @@ def initialize_monitor(hook_config, config, config_filename, monitoring_log_leve
            loki.add_label(key, config_filename)
        else:
            loki.add_label(key, value)
+
    logging.getLogger().addHandler(loki)


@ -143,6 +147,7 @@ def destroy_monitor(hook_config, config, config_filename, monitoring_log_level,
    Remove the monitor handler that was added to the root logger.
    '''
    logger = logging.getLogger()
+
    for handler in tuple(logger.handlers):
        if isinstance(handler, Loki_log_handler):
            handler.flush()
--- a/borgmatic/hooks/mariadb.py
+++ b/borgmatic/hooks/mariadb.py
@ -62,7 +62,7 @@ def execute_dump_command(
 ):
    '''
    Kick off a dump for the given MariaDB database (provided as a configuration dict) to a named
-    pipe constructed from the given dump path and database names. Use the given log prefix in any
+    pipe constructed from the given dump path and database name. Use the given log prefix in any
    log entries.

    Return a subprocess.Popen instance for the dump process ready to spew to a named pipe. But if
@ -72,6 +72,7 @@ def execute_dump_command(
    dump_filename = dump.make_data_source_dump_filename(
        dump_path, database['name'], database.get('hostname')
    )
+
    if os.path.exists(dump_filename):
        logger.warning(
            f'{log_prefix}: Skipping duplicate dump of MariaDB database "{database_name}" to {dump_filename}'
--- a/borgmatic/hooks/mysql.py
+++ b/borgmatic/hooks/mysql.py
@ -62,7 +62,7 @@ def execute_dump_command(
 ):
    '''
    Kick off a dump for the given MySQL/MariaDB database (provided as a configuration dict) to a
-    named pipe constructed from the given dump path and database names. Use the given log prefix in
+    named pipe constructed from the given dump path and database name. Use the given log prefix in
    any log entries.

    Return a subprocess.Popen instance for the dump process ready to spew to a named pipe. But if
@ -72,6 +72,7 @@ def execute_dump_command(
    dump_filename = dump.make_data_source_dump_filename(
        dump_path, database['name'], database.get('hostname')
    )
+
    if os.path.exists(dump_filename):
        logger.warning(
            f'{log_prefix}: Skipping duplicate dump of MySQL database "{database_name}" to {dump_filename}'
--- a/docs/how-to/monitor-your-backups.md
+++ b/docs/how-to/monitor-your-backups.md
@ -38,11 +38,11 @@ below for how to configure this.

 borgmatic integrates with monitoring services like
 [Healthchecks](https://healthchecks.io/), [Cronitor](https://cronitor.io),
-[Cronhub](https://cronhub.io), [PagerDuty](https://www.pagerduty.com/), and
-[ntfy](https://ntfy.sh/) and pings these services whenever borgmatic runs.
-That way, you'll receive an alert when something goes wrong or (for certain
-hooks) the service doesn't hear from borgmatic for a configured interval. See
-[Healthchecks
+[Cronhub](https://cronhub.io), [PagerDuty](https://www.pagerduty.com/),
+[ntfy](https://ntfy.sh/), and [Grafana Loki](https://grafana.com/oss/loki/)
+and pings these services whenever borgmatic runs. That way, you'll receive an
+alert when something goes wrong or (for certain hooks) the service doesn't
+hear from borgmatic for a configured interval. See [Healthchecks
 hook](https://torsion.org/borgmatic/docs/how-to/monitor-your-backups/#healthchecks-hook),
 [Cronitor
 hook](https://torsion.org/borgmatic/docs/how-to/monitor-your-backups/#cronitor-hook),
@ -50,7 +50,10 @@ hook](https://torsion.org/borgmatic/docs/how-to/monitor-your-backups/#cronitor-h
 hook](https://torsion.org/borgmatic/docs/how-to/monitor-your-backups/#cronhub-hook),
 [PagerDuty
 hook](https://torsion.org/borgmatic/docs/how-to/monitor-your-backups/#pagerduty-hook),
-and [ntfy hook](https://torsion.org/borgmatic/docs/how-to/monitor-your-backups/#ntfy-hook)
+[ntfy
+hook](https://torsion.org/borgmatic/docs/how-to/monitor-your-backups/#ntfy-hook),
+and [Loki
+hook](https://torsion.org/borgmatic/docs/how-to/monitor-your-backups/#loki-hook),
 below for how to configure this.

 While these services offer different features, you probably only need to use
@ -129,7 +132,7 @@ especially the security information.
 ## Healthchecks hook

 [Healthchecks](https://healthchecks.io/) is a service that provides "instant
-alerts when your cron jobs fail silently", and borgmatic has built-in
+alerts when your cron jobs fail silently," and borgmatic has built-in
 integration with it. Once you create a Healthchecks account and project on
 their site, all you need to do is configure borgmatic with the unique "Ping
 URL" for your project. Here's an example:
@ -144,21 +147,19 @@ healthchecks:
 this option in the `hooks:` section of your configuration.

 With this hook in place, borgmatic pings your Healthchecks project when a
-backup begins, ends, or errors. Specifically, after the <a
-href="https://torsion.org/borgmatic/docs/how-to/add-preparation-and-cleanup-steps-to-backups/">`before_backup`
-hooks</a> run, borgmatic lets Healthchecks know that it has started if any of
-the `create`, `prune`, `compact`, or `check` actions are run.
+backup begins, ends, or errors, but only when any of the `create`, `prune`,
+`compact`, or `check` actions are run.

 Then, if the actions complete successfully, borgmatic notifies Healthchecks of
-the success after the `after_backup` hooks run and includes borgmatic logs in
-the payload data sent to Healthchecks. This means that borgmatic logs show up
-in the Healthchecks UI, although be aware that Healthchecks currently has a
-10-kilobyte limit for the logs in each ping.
+the success and includes borgmatic logs in the payload data sent to
+Healthchecks. This means that borgmatic logs show up in the Healthchecks UI,
+although be aware that Healthchecks currently has a 10-kilobyte limit for the
+logs in each ping.

-If an error occurs during any action or hook, borgmatic notifies Healthchecks
-after the `on_error` hooks run, also tacking on logs including the error
-itself. But the logs are only included for errors that occur when a `create`,
-`prune`, `compact`, or `check` action is run.
+If an error occurs during any action or hook, borgmatic notifies Healthchecks,
+also tacking on logs including the error itself. But the logs are only
+included for errors that occur when a `create`, `prune`, `compact`, or `check`
+action is run.

 You can customize the verbosity of the logs that are sent to Healthchecks with
 borgmatic's `--monitoring-verbosity` flag. The `--list` and `--stats` flags
@ -175,7 +176,7 @@ or it doesn't hear from borgmatic for a certain period of time.
 ## Cronitor hook

 [Cronitor](https://cronitor.io/) provides "Cron monitoring and uptime healthchecks
-for websites, services and APIs", and borgmatic has built-in
+for websites, services and APIs," and borgmatic has built-in
 integration with it. Once you create a Cronitor account and cron job monitor on
 their site, all you need to do is configure borgmatic with the unique "Ping
 API URL" for your monitor. Here's an example:
@ -190,13 +191,9 @@ cronitor:
 this option in the `hooks:` section of your configuration.

 With this hook in place, borgmatic pings your Cronitor monitor when a backup
-begins, ends, or errors. Specifically, after the <a
-href="https://torsion.org/borgmatic/docs/how-to/add-preparation-and-cleanup-steps-to-backups/">`before_backup`
-hooks</a> run, borgmatic lets Cronitor know that it has started if any of the
-`prune`, `compact`, `create`, or `check` actions are run. Then, if the actions
-complete successfully, borgmatic notifies Cronitor of the success after the
-`after_backup` hooks run. And if an error occurs during any action or hook,
-borgmatic notifies Cronitor after the `on_error` hooks run.
+begins, ends, or errors, but only when any of the `prune`, `compact`,
+`create`, or `check` actions are run. Then, if the actions complete
+successfully or errors, borgmatic notifies Cronitor accordingly.

 You can configure Cronitor to notify you by a [variety of
 mechanisms](https://cronitor.io/docs/cron-job-notifications) when backups fail
@ -206,7 +203,7 @@ or it doesn't hear from borgmatic for a certain period of time.
 ## Cronhub hook

 [Cronhub](https://cronhub.io/) provides "instant alerts when any of your
-background jobs fail silently or run longer than expected", and borgmatic has
+background jobs fail silently or run longer than expected," and borgmatic has
 built-in integration with it. Once you create a Cronhub account and monitor on
 their site, all you need to do is configure borgmatic with the unique "Ping
 URL" for your monitor. Here's an example:
@ -221,13 +218,9 @@ cronhub:
 this option in the `hooks:` section of your configuration.

 With this hook in place, borgmatic pings your Cronhub monitor when a backup
-begins, ends, or errors. Specifically, after the <a
-href="https://torsion.org/borgmatic/docs/how-to/add-preparation-and-cleanup-steps-to-backups/">`before_backup`
-hooks</a> run, borgmatic lets Cronhub know that it has started if any of the
-`prune`, `compact`, `create`, or `check` actions are run. Then, if the actions
-complete successfully, borgmatic notifies Cronhub of the success after the
-`after_backup` hooks run. And if an error occurs during any action or hook,
-borgmatic notifies Cronhub after the `on_error` hooks run.
+begins, ends, or errors, but only when any of the `prune`, `compact`,
+`create`, or `check` actions are run. Then, if the actions complete
+successfully or errors, borgmatic notifies Cronhub accordingly.

 Note that even though you configure borgmatic with the "start" variant of the
 ping URL, borgmatic substitutes the correct state into the URL when pinging
@ -266,10 +259,9 @@ pagerduty:
 this option in the `hooks:` section of your configuration.

 With this hook in place, borgmatic creates a PagerDuty event for your service
-whenever backups fail. Specifically, if an error occurs during a `create`,
-`prune`, `compact`, or `check` action, borgmatic sends an event to PagerDuty
-before the `on_error` hooks run. Note that borgmatic does not contact
-PagerDuty when a backup starts or ends without error.
+whenever backups fail, but only when any of the `create`, `prune`, `compact`,
+or `check` actions are run. Note that borgmatic does not contact PagerDuty
+when a backup starts or when it ends without error.

 You can configure PagerDuty to notify you by a [variety of
 mechanisms](https://support.pagerduty.com/docs/notifications) when backups
@ -328,6 +320,58 @@ ntfy:
 the `ntfy:` option in the `hooks:` section of your configuration.


+## Loki hook
+
+[Grafana Loki](https://grafana.com/oss/loki/) is a "horizontally scalable,
+highly available, multi-tenant log aggregation system inspired by Prometheus."
+borgmatic has built-in integration with Loki, sending both backup status and
+borgmatic logs.
+
+You can configure borgmatic to use either a [self-hosted Loki
+instance](https://grafana.com/docs/loki/latest/installation/) or [a Grafana
+Cloud account](https://grafana.com/auth/sign-up/create-user). Start by setting
+your Loki API push URL. Here's an example:
+
+```yaml
+loki:
+    url: http://localhost:3100/loki/api/v1/push
+```
+
+With this hook in place, borgmatic sends its logs to your Loki instance as any
+of the `prune`, `compact`, `create`, or `check` actions are run. Then, after
+the actions complete, borgmatic notifies Loki of success or failure.
+
+This hook supports sending arbitrary labels to Loki. For instance:
+
+```yaml
+loki:
+    url: http://localhost:3100/loki/api/v1/push
+
+    labels:
+        app: borgmatic
+        hostname: example.org
+```
+
+There are also a few placeholders you can optionally use as label values:
+
+ * `__config`: name of the borgmatic configuration file
+ * `__config_path`: full path of the borgmatic configuration file
+ * `__hostname`: the local machine hostname
+
+These placeholders are only substituted for the whole label value, not
+interpolated into a larger string. For instance:
+
+```yaml
+loki:
+    url: http://localhost:3100/loki/api/v1/push
+
+    labels:
+        app: borgmatic
+        config: __config
+        hostname: __hostname
+```
+
+
 ## Scripting borgmatic

 To consume the output of borgmatic in other software, you can include an
--- a/tests/integration/hooks/test_loki.py
+++ b/tests/integration/hooks/test_loki.py
@ -6,9 +6,9 @@ from flexmock import flexmock
 from borgmatic.hooks import loki as module


-def test_log_handler_label_replacment():
+def test_initialize_monitor_replaces_labels():
    '''
-    Assert that label placeholders get replaced
+    Assert that label placeholders get replaced.
    '''
    hook_config = {
        'url': 'http://localhost:3100/loki/api/v1/push',
@ -17,18 +17,20 @@ def test_log_handler_label_replacment():
    config_filename = '/mock/path/test.yaml'
    dry_run = True
    module.initialize_monitor(hook_config, flexmock(), config_filename, flexmock(), dry_run)
+
    for handler in tuple(logging.getLogger().handlers):
        if isinstance(handler, module.Loki_log_handler):
            assert handler.buffer.root['streams'][0]['stream']['hostname'] == platform.node()
            assert handler.buffer.root['streams'][0]['stream']['config'] == 'test.yaml'
            assert handler.buffer.root['streams'][0]['stream']['config_full'] == config_filename
            return
+
    assert False


-def test_initalize_adds_log_handler():
+def test_initialize_monitor_adds_log_handler():
    '''
-    Assert that calling initialize_monitor adds our logger to the root logger
+    Assert that calling initialize_monitor adds our logger to the root logger.
    '''
    hook_config = {'url': 'http://localhost:3100/loki/api/v1/push', 'labels': {'app': 'borgmatic'}}
    module.initialize_monitor(
@ -38,15 +40,17 @@ def test_initalize_adds_log_handler():
        monitoring_log_level=flexmock(),
        dry_run=True,
    )
+
    for handler in tuple(logging.getLogger().handlers):
        if isinstance(handler, module.Loki_log_handler):
            return
+
    assert False


-def test_ping_adds_log_message():
+def test_ping_monitor_adds_log_message():
    '''
-    Assert that calling ping_monitor adds a message to our logger
+    Assert that calling ping_monitor adds a message to our logger.
    '''
    hook_config = {'url': 'http://localhost:3100/loki/api/v1/push', 'labels': {'app': 'borgmatic'}}
    config_filename = 'test.yaml'
@ -55,6 +59,7 @@ def test_ping_adds_log_message():
    module.ping_monitor(
        hook_config, flexmock(), config_filename, module.monitor.State.FINISH, flexmock(), dry_run
    )
+
    for handler in tuple(logging.getLogger().handlers):
        if isinstance(handler, module.Loki_log_handler):
            assert any(
@ -65,18 +70,20 @@ def test_ping_adds_log_message():
                )
            )
            return
+
    assert False


-def test_log_handler_gets_removed():
+def test_destroy_monitor_removes_log_handler():
    '''
-    Assert that destroy_monitor removes the logger from the root logger
+    Assert that destroy_monitor removes the logger from the root logger.
    '''
    hook_config = {'url': 'http://localhost:3100/loki/api/v1/push', 'labels': {'app': 'borgmatic'}}
    config_filename = 'test.yaml'
    dry_run = True
    module.initialize_monitor(hook_config, flexmock(), config_filename, flexmock(), dry_run)
    module.destroy_monitor(hook_config, flexmock(), config_filename, flexmock(), dry_run)
+
    for handler in tuple(logging.getLogger().handlers):
        if isinstance(handler, module.Loki_log_handler):
            assert False
--- a/tests/unit/hooks/test_loki.py
+++ b/tests/unit/hooks/test_loki.py
@ -6,93 +6,103 @@ from flexmock import flexmock
 from borgmatic.hooks import loki as module


-def test_log_handler_gets_labels():
+def test_loki_log_buffer_add_value_gets_raw():
    '''
-    Assert that adding labels works
-    '''
-    buffer = module.Loki_log_buffer(flexmock(), False)
-    buffer.add_label('test', 'label')
-    assert buffer.root['streams'][0]['stream']['test'] == 'label'
-    buffer.add_label('test2', 'label2')
-    assert buffer.root['streams'][0]['stream']['test2'] == 'label2'
-
-
-def test_log_buffer_gets_raw():
-    '''
-    Assert that adding values to the log buffer increases it's length
+    Assert that adding values to the log buffer increases it's length.
    '''
    buffer = module.Loki_log_buffer(flexmock(), False)
    assert len(buffer) == 0
+
    buffer.add_value('Some test log line')
    assert len(buffer) == 1
+
    buffer.add_value('Another test log line')
    assert len(buffer) == 2


-def test_log_buffer_gets_log_messages():
+def test_loki_log_buffer_json_serializes_empty_buffer():
    '''
-    Assert that adding log records works
-    '''
-    handler = module.Loki_log_handler(flexmock(), False)
-    handler.emit(flexmock(getMessage=lambda: 'Some test log line'))
-    assert len(handler.buffer) == 1
-
-
-def test_log_buffer_json():
-    '''
-    Assert that the buffer correctly serializes when empty
+    Assert that the buffer correctly serializes when empty.
    '''
    buffer = module.Loki_log_buffer(flexmock(), False)
+
    assert json.loads(buffer.to_request()) == json.loads('{"streams":[{"stream":{},"values":[]}]}')


-def test_log_buffer_json_labels():
+def test_loki_log_buffer_json_serializes_labels():
    '''
-    Assert that the buffer correctly serializes with labels
+    Assert that the buffer correctly serializes with labels.
    '''
    buffer = module.Loki_log_buffer(flexmock(), False)
    buffer.add_label('test', 'label')
+
    assert json.loads(buffer.to_request()) == json.loads(
        '{"streams":[{"stream":{"test": "label"},"values":[]}]}'
    )


-def test_log_buffer_json_log_lines():
+def test_loki_log_buffer_json_serializes_log_lines():
    '''
-    Assert that log lines end up in the correct place in the log buffer
+    Assert that log lines end up in the correct place in the log buffer.
    '''
    buffer = module.Loki_log_buffer(flexmock(), False)
    buffer.add_value('Some test log line')
+
    assert json.loads(buffer.to_request())['streams'][0]['values'][0][1] == 'Some test log line'


-def test_log_handler_post():
+def test_loki_log_handler_add_label_gets_labels():
    '''
-    Assert that the flush function sends a post request after a certain limit
+    Assert that adding labels works.
+    '''
+    buffer = module.Loki_log_buffer(flexmock(), False)
+
+    buffer.add_label('test', 'label')
+    assert buffer.root['streams'][0]['stream']['test'] == 'label'
+
+    buffer.add_label('test2', 'label2')
+    assert buffer.root['streams'][0]['stream']['test2'] == 'label2'
+
+
+def test_loki_log_handler_emit_gets_log_messages():
+    '''
+    Assert that adding log records works.
+    '''
+    handler = module.Loki_log_handler(flexmock(), False)
+    handler.emit(flexmock(getMessage=lambda: 'Some test log line'))
+
+    assert len(handler.buffer) == 1
+
+
+def test_loki_log_handler_raw_posts_to_server():
+    '''
+    Assert that the flush function sends a post request after a certain limit.
    '''
    handler = module.Loki_log_handler(flexmock(), False)
    flexmock(module.requests).should_receive('post').and_return(
        flexmock(raise_for_status=lambda: '')
    ).once()
+
    for num in range(int(module.MAX_BUFFER_LINES * 1.5)):
        handler.raw(num)


-def test_log_handler_post_failiure():
+def test_loki_log_handler_raw_post_failure_does_not_raise():
    '''
-    Assert that the flush function catches request exceptions
+    Assert that the flush function catches request exceptions.
    '''
    handler = module.Loki_log_handler(flexmock(), False)
    flexmock(module.requests).should_receive('post').and_return(
        flexmock(raise_for_status=lambda: (_ for _ in ()).throw(requests.RequestException()))
    ).once()
+
    for num in range(int(module.MAX_BUFFER_LINES * 1.5)):
        handler.raw(num)


-def test_log_handler_empty_flush_noop():
+def test_loki_log_handler_flush_with_empty_buffer_does_not_raise():
    '''
-    Test that flushing an empty buffer does indeed nothing
+    Test that flushing an empty buffer does indeed nothing.
    '''
    handler = module.Loki_log_handler(flexmock(), False)
    handler.flush()