Retry failing backups (#28, #432).

Reviewed-on: borgmatic-collective/borgmatic#432
2021-11-15 19:34:24 +00:00 · 2021-11-15 19:34:24 +00:00 · 180018fd81
parent 794ae94ac4 976a877a25
commit 180018fd81
3 changed files with 188 additions and 4 deletions
--- a/borgmatic/commands/borgmatic.py
+++ b/borgmatic/commands/borgmatic.py
@@ -4,6 +4,8 @@ import json
 import logging
 import os
 import sys
+import time
+from queue import Queue
 from subprocess import CalledProcessError

 import colorama
@@ -52,6 +54,8 @@ def run_configuration(config_filename, config, arguments):

    local_path = location.get('local_path', 'borg')
    remote_path = location.get('remote_path')
+    retries = storage.get('retries', 0)
+    retry_timeout = storage.get('retry_timeout', 0)
    borg_environment.initialize(storage)
    encountered_error = None
    error_repository = ''
@@ -120,7 +124,16 @@ def run_configuration(config_filename, config, arguments):
        )

    if not encountered_error:
-        for repository_path in location['repositories']:
+        repo_queue = Queue()
+        for repo in location['repositories']:
+            repo_queue.put((repo, 0),)
+
+        while not repo_queue.empty():
+            repository_path, retry_num = repo_queue.get()
+            timeout = retry_num * retry_timeout
+            if timeout:
+                logger.warning(f'Sleeping {timeout}s before next retry')
+                time.sleep(timeout)
            try:
                yield from run_actions(
                    arguments=arguments,
@@ -134,11 +147,15 @@ def run_configuration(config_filename, config, arguments):
                    repository_path=repository_path,
                )
            except (OSError, CalledProcessError, ValueError) as error:
-                encountered_error = error
-                error_repository = repository_path
                yield from make_error_log_records(
                    '{}: Error running actions for repository'.format(repository_path), error
                )
+                if retry_num < retries:
+                    repo_queue.put((repository_path, retry_num + 1),)
+                    logger.warning(f'Retrying.. attempt {retry_num + 1}/{retries}')
+                    continue
+                encountered_error = error
+                error_repository = repository_path

    if not encountered_error:
        try:
@@ -257,7 +274,7 @@ def run_actions(
    hooks,
    local_path,
    remote_path,
-    repository_path
+    repository_path,
 ):  # pragma: no cover
    '''
    Given parsed command-line arguments as an argparse.ArgumentParser instance, several different
--- a/borgmatic/config/schema.yaml
+++ b/borgmatic/config/schema.yaml
@@ -251,6 +251,18 @@ properties:
                    Remote network upload rate limit in kiBytes/second. Defaults
                    to unlimited.
                example: 100
+            retries:
+                type: integer
+                description: |
+                    Number of times to retry a backup before failing. Defaults
+                    to 0 (i.e. does not attempt retry).
+                example: 3
+            retry_timeout:
+                type: integer
+                description: |
+                    Wait time in seconds between retries, to allow transient
+                    issues to pass. Defaults to 0s.
+                example: 10
            temporary_directory:
                type: string
                description: |
--- a/tests/unit/commands/test_borgmatic.py
+++ b/tests/unit/commands/test_borgmatic.py
@@ -1,5 +1,6 @@
 import logging
 import subprocess
+import time

 from flexmock import flexmock

@@ -184,6 +185,160 @@ def test_run_configuration_bails_for_on_error_hook_soft_failure():
    assert results == expected_results


+def test_run_retries_soft_error():
+    # Run action first fails, second passes
+    flexmock(module.borg_environment).should_receive('initialize')
+    flexmock(module.command).should_receive('execute_hook')
+    flexmock(module).should_receive('run_actions').and_raise(OSError).and_return([])
+    expected_results = [flexmock()]
+    flexmock(module).should_receive('make_error_log_records').and_return(expected_results).once()
+    config = {'location': {'repositories': ['foo']}, 'storage': {'retries': 1}}
+    arguments = {'global': flexmock(monitoring_verbosity=1, dry_run=False), 'create': flexmock()}
+    results = list(module.run_configuration('test.yaml', config, arguments))
+    assert results == expected_results
+
+
+def test_run_retries_hard_error():
+    # Run action fails twice
+    flexmock(module.borg_environment).should_receive('initialize')
+    flexmock(module.command).should_receive('execute_hook')
+    flexmock(module).should_receive('run_actions').and_raise(OSError).times(2)
+    expected_results = [flexmock(), flexmock()]
+    flexmock(module).should_receive('make_error_log_records').with_args(
+        'foo: Error running actions for repository', OSError
+    ).and_return(expected_results[:1]).with_args(
+        'foo: Error running actions for repository', OSError
+    ).and_return(
+        expected_results[1:]
+    ).twice()
+    config = {'location': {'repositories': ['foo']}, 'storage': {'retries': 1}}
+    arguments = {'global': flexmock(monitoring_verbosity=1, dry_run=False), 'create': flexmock()}
+    results = list(module.run_configuration('test.yaml', config, arguments))
+    assert results == expected_results
+
+
+def test_run_repos_ordered():
+    flexmock(module.borg_environment).should_receive('initialize')
+    flexmock(module.command).should_receive('execute_hook')
+    flexmock(module).should_receive('run_actions').and_raise(OSError).times(2)
+    expected_results = [flexmock(), flexmock()]
+    flexmock(module).should_receive('make_error_log_records').with_args(
+        'foo: Error running actions for repository', OSError
+    ).and_return(expected_results[:1]).ordered()
+    flexmock(module).should_receive('make_error_log_records').with_args(
+        'bar: Error running actions for repository', OSError
+    ).and_return(expected_results[1:]).ordered()
+    config = {'location': {'repositories': ['foo', 'bar']}}
+    arguments = {'global': flexmock(monitoring_verbosity=1, dry_run=False), 'create': flexmock()}
+    results = list(module.run_configuration('test.yaml', config, arguments))
+    assert results == expected_results
+
+
+def test_run_retries_round_robbin():
+    flexmock(module.borg_environment).should_receive('initialize')
+    flexmock(module.command).should_receive('execute_hook')
+    flexmock(module).should_receive('run_actions').and_raise(OSError).times(4)
+    expected_results = [flexmock(), flexmock(), flexmock(), flexmock()]
+    flexmock(module).should_receive('make_error_log_records').with_args(
+        'foo: Error running actions for repository', OSError
+    ).and_return(expected_results[0:1]).ordered()
+    flexmock(module).should_receive('make_error_log_records').with_args(
+        'bar: Error running actions for repository', OSError
+    ).and_return(expected_results[1:2]).ordered()
+    flexmock(module).should_receive('make_error_log_records').with_args(
+        'foo: Error running actions for repository', OSError
+    ).and_return(expected_results[2:3]).ordered()
+    flexmock(module).should_receive('make_error_log_records').with_args(
+        'bar: Error running actions for repository', OSError
+    ).and_return(expected_results[3:4]).ordered()
+    config = {'location': {'repositories': ['foo', 'bar']}, 'storage': {'retries': 1}}
+    arguments = {'global': flexmock(monitoring_verbosity=1, dry_run=False), 'create': flexmock()}
+    results = list(module.run_configuration('test.yaml', config, arguments))
+    assert results == expected_results
+
+
+def test_run_retries_one_passes():
+    flexmock(module.borg_environment).should_receive('initialize')
+    flexmock(module.command).should_receive('execute_hook')
+    flexmock(module).should_receive('run_actions').and_raise(OSError).and_raise(OSError).and_return(
+        []
+    ).and_raise(OSError).times(4)
+    expected_results = [flexmock(), flexmock(), flexmock()]
+    flexmock(module).should_receive('make_error_log_records').with_args(
+        'foo: Error running actions for repository', OSError
+    ).and_return(expected_results[0:1]).ordered()
+    flexmock(module).should_receive('make_error_log_records').with_args(
+        'bar: Error running actions for repository', OSError
+    ).and_return(expected_results[1:2]).ordered()
+    flexmock(module).should_receive('make_error_log_records').with_args(
+        'bar: Error running actions for repository', OSError
+    ).and_return(expected_results[2:3]).ordered()
+    config = {'location': {'repositories': ['foo', 'bar']}, 'storage': {'retries': 1}}
+    arguments = {'global': flexmock(monitoring_verbosity=1, dry_run=False), 'create': flexmock()}
+    results = list(module.run_configuration('test.yaml', config, arguments))
+    assert results == expected_results
+
+
+def test_run_retry_timeout():
+    flexmock(module.borg_environment).should_receive('initialize')
+    flexmock(module.command).should_receive('execute_hook')
+    flexmock(module).should_receive('run_actions').and_raise(OSError).times(4)
+    expected_results = [flexmock(), flexmock(), flexmock(), flexmock()]
+    flexmock(module).should_receive('make_error_log_records').with_args(
+        'foo: Error running actions for repository', OSError
+    ).and_return(expected_results[0:1]).ordered()
+
+    flexmock(time).should_receive('sleep').with_args(10).and_return().ordered()
+    flexmock(module).should_receive('make_error_log_records').with_args(
+        'foo: Error running actions for repository', OSError
+    ).and_return(expected_results[1:2]).ordered()
+
+    flexmock(time).should_receive('sleep').with_args(20).and_return().ordered()
+    flexmock(module).should_receive('make_error_log_records').with_args(
+        'foo: Error running actions for repository', OSError
+    ).and_return(expected_results[2:3]).ordered()
+
+    flexmock(time).should_receive('sleep').with_args(30).and_return().ordered()
+    flexmock(module).should_receive('make_error_log_records').with_args(
+        'foo: Error running actions for repository', OSError
+    ).and_return(expected_results[3:4]).ordered()
+    config = {'location': {'repositories': ['foo']}, 'storage': {'retries': 3, 'retry_timeout': 10}}
+    arguments = {'global': flexmock(monitoring_verbosity=1, dry_run=False), 'create': flexmock()}
+    results = list(module.run_configuration('test.yaml', config, arguments))
+    assert results == expected_results
+
+
+def test_run_retries_timeout_multiple_repos():
+    flexmock(module.borg_environment).should_receive('initialize')
+    flexmock(module.command).should_receive('execute_hook')
+    flexmock(module).should_receive('run_actions').and_raise(OSError).and_raise(OSError).and_return(
+        []
+    ).and_raise(OSError).times(4)
+    expected_results = [flexmock(), flexmock(), flexmock()]
+    flexmock(module).should_receive('make_error_log_records').with_args(
+        'foo: Error running actions for repository', OSError
+    ).and_return(expected_results[0:1]).ordered()
+    flexmock(module).should_receive('make_error_log_records').with_args(
+        'bar: Error running actions for repository', OSError
+    ).and_return(expected_results[1:2]).ordered()
+
+    # Sleep before retrying foo (and passing)
+    flexmock(time).should_receive('sleep').with_args(10).and_return().ordered()
+
+    # Sleep before retrying bar (and failing)
+    flexmock(time).should_receive('sleep').with_args(10).and_return().ordered()
+    flexmock(module).should_receive('make_error_log_records').with_args(
+        'bar: Error running actions for repository', OSError
+    ).and_return(expected_results[2:3]).ordered()
+    config = {
+        'location': {'repositories': ['foo', 'bar']},
+        'storage': {'retries': 1, 'retry_timeout': 10},
+    }
+    arguments = {'global': flexmock(monitoring_verbosity=1, dry_run=False), 'create': flexmock()}
+    results = list(module.run_configuration('test.yaml', config, arguments))
+    assert results == expected_results
+
+
 def test_load_configurations_collects_parsed_configurations():
    configuration = flexmock()
    other_configuration = flexmock()