summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorKaushik Lingarkar <kaushikl@codeaurora.org>2020-11-10 13:24:54 -0700
committerKaushik Lingarkar <kaushikl@codeaurora.org>2020-12-02 14:39:20 -0800
commitba2c8e16b798c2eaf4e56dd66d8c1cd00999e096 (patch)
tree99527761cfc82b0bb4c808ef221b6a78d7baa1d8
parent8fdb0f9ac0a7f68b3f942cb4a9fd4c94e488ab57 (diff)
Fix replication to retry on lock errorsv3.0.16v2.16.26
Versions of Git released since 2014 have created a new status "failed to update ref" which replaces the two statuses "failed to lock" and "failed to write". So, we now see the newer status when the remote is unable to lock a ref. Refer Git commit: https://github.com/git/git/commit/6629ea2d4a5faa0a84367f6d4aedba53cb0f26b4 Config 'lockErrorMaxRetries' is not removed as part of this change as folks who have it configured currently don't run into unexpected behavior with retries when they upgrade to a newer version of the plugin. Also, the "failed to lock" check is not removed for folks still using a version of Git older than 2014. Change-Id: I9b3b15bebd55df30cbee50a0e0c2190d04f2f443
-rw-r--r--src/main/java/com/googlesource/gerrit/plugins/replication/Destination.java4
-rw-r--r--src/main/java/com/googlesource/gerrit/plugins/replication/DestinationConfiguration.java13
-rw-r--r--src/main/java/com/googlesource/gerrit/plugins/replication/PushOne.java44
-rw-r--r--src/main/resources/Documentation/config.md26
4 files changed, 60 insertions, 27 deletions
diff --git a/src/main/java/com/googlesource/gerrit/plugins/replication/Destination.java b/src/main/java/com/googlesource/gerrit/plugins/replication/Destination.java
index e057626..522abbd 100644
--- a/src/main/java/com/googlesource/gerrit/plugins/replication/Destination.java
+++ b/src/main/java/com/googlesource/gerrit/plugins/replication/Destination.java
@@ -734,8 +734,8 @@ public class Destination {
return config.getProjects();
}
- int getLockErrorMaxRetries() {
- return config.getLockErrorMaxRetries();
+ int getUpdateRefErrorMaxRetries() {
+ return config.getUpdateRefErrorMaxRetries();
}
String getRemoteConfigName() {
diff --git a/src/main/java/com/googlesource/gerrit/plugins/replication/DestinationConfiguration.java b/src/main/java/com/googlesource/gerrit/plugins/replication/DestinationConfiguration.java
index f688cfc..c2b361f 100644
--- a/src/main/java/com/googlesource/gerrit/plugins/replication/DestinationConfiguration.java
+++ b/src/main/java/com/googlesource/gerrit/plugins/replication/DestinationConfiguration.java
@@ -28,7 +28,7 @@ public class DestinationConfiguration {
private final int rescheduleDelay;
private final int retryDelay;
private final int drainQueueAttempts;
- private final int lockErrorMaxRetries;
+ private final int updateRefErrorMaxRetries;
private final ImmutableList<String> adminUrls;
private final int poolThreads;
private final boolean createMissingRepos;
@@ -56,8 +56,11 @@ public class DestinationConfiguration {
Math.max(0, getInt(remoteConfig, cfg, "drainQueueAttempts", DEFAULT_DRAIN_QUEUE_ATTEMPTS));
poolThreads = Math.max(0, getInt(remoteConfig, cfg, "threads", 1));
authGroupNames = ImmutableList.copyOf(cfg.getStringList("remote", name, "authGroup"));
- lockErrorMaxRetries = cfg.getInt("replication", "lockErrorMaxRetries", 0);
-
+ updateRefErrorMaxRetries =
+ cfg.getInt(
+ "replication",
+ "updateRefErrorMaxRetries",
+ cfg.getInt("replication", "lockErrorMaxRetries", 0));
createMissingRepos = cfg.getBoolean("remote", name, "createMissingRepositories", true);
replicatePermissions = cfg.getBoolean("remote", name, "replicatePermissions", true);
replicateProjectDeletions = cfg.getBoolean("remote", name, "replicateProjectDeletions", false);
@@ -89,8 +92,8 @@ public class DestinationConfiguration {
return poolThreads;
}
- public int getLockErrorMaxRetries() {
- return lockErrorMaxRetries;
+ public int getUpdateRefErrorMaxRetries() {
+ return updateRefErrorMaxRetries;
}
public ImmutableList<String> getUrls() {
diff --git a/src/main/java/com/googlesource/gerrit/plugins/replication/PushOne.java b/src/main/java/com/googlesource/gerrit/plugins/replication/PushOne.java
index 4e5cca6..f2a077e 100644
--- a/src/main/java/com/googlesource/gerrit/plugins/replication/PushOne.java
+++ b/src/main/java/com/googlesource/gerrit/plugins/replication/PushOne.java
@@ -88,6 +88,17 @@ class PushOne implements ProjectRunnable, CanceledWhileRunning {
static final String ALL_REFS = "..all..";
static final String ID_MDC_KEY = "pushOneId";
+ // The string here needs to match the one returned by Git(versions prior to 2014) server.
+ // See:
+ // https://github.com/git/git/blob/b4d75ac1d152bbab44b0777a4cc0c48db75f6024/builtin/receive-pack.c#L587
+ // https://github.com/eclipse/jgit/blob/8774f541904ca9afba1786b4da14c1aedf4dda78/org.eclipse.jgit/src/org/eclipse/jgit/transport/ReceivePack.java#L1859
+ static final String LOCK_FAILURE = "failed to lock";
+
+ // The string here needs to match the one returned by Git server.
+ // See:
+ // https://github.com/git/git/blob/e67fbf927dfdf13d0b21dc6ea15dc3c7ef448ea0/builtin/receive-pack.c#L1611
+ static final String UPDATE_REF_FAILURE = "failed to update ref";
+
interface Factory {
PushOne create(Project.NameKey d, URIish u);
}
@@ -111,8 +122,8 @@ class PushOne implements ProjectRunnable, CanceledWhileRunning {
private final int maxRetries;
private boolean canceled;
private final ListMultimap<String, ReplicationState> stateMap = LinkedListMultimap.create();
- private final int maxLockRetries;
- private int lockRetryCount;
+ private final int maxUpdateRefRetries;
+ private int updateRefRetryCount;
private final int id;
private final long createdAt;
private final ReplicationMetrics metrics;
@@ -146,8 +157,8 @@ class PushOne implements ProjectRunnable, CanceledWhileRunning {
replicationQueue = rq;
projectName = d;
uri = u;
- lockRetryCount = 0;
- maxLockRetries = pool.getLockErrorMaxRetries();
+ updateRefRetryCount = 0;
+ maxUpdateRefRetries = pool.getUpdateRefErrorMaxRetries();
id = ig.next();
stateLog = sl;
createdAt = System.nanoTime();
@@ -375,12 +386,12 @@ class PushOne implements ProjectRunnable, CanceledWhileRunning {
Throwable cause = e.getCause();
if (cause instanceof JSchException && cause.getMessage().startsWith("UnknownHostKey:")) {
repLog.error("Cannot replicate to {}: {}", uri, cause.getMessage());
- } else if (e instanceof LockFailureException) {
- lockRetryCount++;
- repLog.error("Cannot replicate to {} due to lock failure", uri);
+ } else if (e instanceof UpdateRefFailureException) {
+ updateRefRetryCount++;
+ repLog.error("Cannot replicate to {} due to a lock or write ref failure", uri);
// The remote push operation should be retried.
- if (lockRetryCount <= maxLockRetries) {
+ if (updateRefRetryCount <= maxUpdateRefRetries) {
if (canceledWhileRunning.get()) {
logCanceledWhileRunningException(e);
} else {
@@ -388,7 +399,10 @@ class PushOne implements ProjectRunnable, CanceledWhileRunning {
}
} else {
repLog.error(
- "Giving up after {} lock failures during replication to {}", lockRetryCount, uri);
+ "Giving up after {} '{}' failures during replication to {}",
+ updateRefRetryCount,
+ e.getMessage(),
+ uri);
}
} else {
if (canceledWhileRunning.get()) {
@@ -642,7 +656,8 @@ class PushOne implements ProjectRunnable, CanceledWhileRunning {
cmds.add(new RemoteRefUpdate(git, (Ref) null, dst, force, null, null));
}
- private void updateStates(Collection<RemoteRefUpdate> refUpdates) throws LockFailureException {
+ private void updateStates(Collection<RemoteRefUpdate> refUpdates)
+ throws UpdateRefFailureException {
Set<String> doneRefs = new HashSet<>();
boolean anyRefFailed = false;
RemoteRefUpdate.Status lastRefStatusError = RemoteRefUpdate.Status.OK;
@@ -686,8 +701,9 @@ class PushOne implements ProjectRunnable, CanceledWhileRunning {
+ " of destination repository.",
u.getRemoteName(), uri),
logStatesArray);
- } else if ("failed to lock".equals(u.getMessage())) {
- throw new LockFailureException(uri, u.getMessage());
+ } else if (LOCK_FAILURE.equals(u.getMessage())
+ || UPDATE_REF_FAILURE.equals(u.getMessage())) {
+ throw new UpdateRefFailureException(uri, u.getMessage());
} else {
stateLog.error(
String.format(
@@ -726,10 +742,10 @@ class PushOne implements ProjectRunnable, CanceledWhileRunning {
stateMap.clear();
}
- public static class LockFailureException extends TransportException {
+ public static class UpdateRefFailureException extends TransportException {
private static final long serialVersionUID = 1L;
- LockFailureException(URIish uri, String message) {
+ UpdateRefFailureException(URIish uri, String message) {
super(uri, message);
}
}
diff --git a/src/main/resources/Documentation/config.md b/src/main/resources/Documentation/config.md
index 1309aeb..0e47010 100644
--- a/src/main/resources/Documentation/config.md
+++ b/src/main/resources/Documentation/config.md
@@ -90,19 +90,23 @@ gerrit.sshConnectionTimeout
: Timeout for SSH connections. If 0, there is no timeout and
the client waits indefinitely. By default, 2 minutes.
-replication.lockErrorMaxRetries
-: Number of times to retry a replication operation if a lock
- error is detected.
+<a name="replication.updateRefErrorMaxRetries">replication.updateRefErrorMaxRetries</a>
+: Number of times to retry a replication operation if an update
+ ref error is detected.
If two or more replication operations (to the same GIT and Ref)
are scheduled at approximately the same time (and end up on different
replication threads), there is a large probability that the last
- push to complete will fail with a remote "failure to lock" error.
+ push to complete will fail with a remote "failed to update ref" error.
+ This error may also occur due to a transient issue like file system
+ being full which was previously returned as "failed to write" by git.
+
This option allows Gerrit to retry the replication push when the
- "failure to lock" error is detected.
+ "failed to update ref" error is detected. Also retry when the error
+ "failed to lock" is detected as that is the legacy string used by git.
A good value would be 3 retries or less, depending on how often
- you see lockError collisions in your server logs. A too highly set
+ you see updateRefError collisions in your server logs. A too highly set
value risks keeping around the replication operations in the queue
for a long time, and the number of items in the queue will increase
with time.
@@ -117,6 +121,16 @@ replication.lockErrorMaxRetries
Default: 0 (disabled, i.e. never retry)
+replication.lockErrorMaxRetries
+: Refer to the [replication.updateRefErrorMaxRetries][4] section.
+
+ If both `lockErrorMaxRetries` and `updateRefErrorMaxRetries` are
+ configured, then `updateRefErrorMaxRetries` takes precedence.
+
+ Default: 0 (disabled, i.e. never retry)
+
+[4]: #replication.updateRefErrorMaxRetries
+
replication.maxRetries
: Maximum number of times to retry a push operation that previously
failed.