From bb56657f210c704334ad66723e3a297e1ed37e1b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sa=C5=A1a=20=C5=BDivkov?= Date: Tue, 4 Jul 2017 16:35:12 +0200 Subject: Fix race condition when scheduling a replication The state of PushOne object was modified after the operation was scheduled. Depending on the delay parameter the PushOne.run() could be called before (when the delay is zero or close to zero) or after (longer delay) the current thread calls e.addState(ref, state). Thus, the thread executing PushOne.run() may or may not see the update of its stateMap. Further, the stateMap is a LinkedListMultimap which is not thread safe and the PushOne doesn't synchronize access to it. Change-Id: Ib26a5933a7993a6d2f0501e5daaf06a7892e597f --- .../java/com/googlesource/gerrit/plugins/replication/Destination.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/main/java/com/googlesource/gerrit/plugins/replication/Destination.java b/src/main/java/com/googlesource/gerrit/plugins/replication/Destination.java index 603a70f..dd401af 100644 --- a/src/main/java/com/googlesource/gerrit/plugins/replication/Destination.java +++ b/src/main/java/com/googlesource/gerrit/plugins/replication/Destination.java @@ -295,13 +295,14 @@ public class Destination { if (e == null) { e = opFactory.create(project, uri); addRef(e, ref); + e.addState(ref, state); pool.schedule(e, config.getDelay(), TimeUnit.SECONDS); pending.put(uri, e); } else if (!e.getRefs().contains(ref)) { addRef(e, ref); + e.addState(ref, state); } state.increasePushTaskCount(project.get(), ref); - e.addState(ref, state); repLog.info("scheduled {}:{} => {} to run after {}s", project, ref, e, config.getDelay()); } } -- cgit v1.2.3 From 7dd3d90e1c4bb92dfde626eaa429fb37afa4d41c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sa=C5=A1a=20=C5=BDivkov?= Date: Tue, 4 Jul 2017 17:27:46 +0200 Subject: Use rescheduleDelay instead of replicationDelay when rescheduling The replicationDelay was used for two purposes: a) initial delay to schedule a replication b) delay when rescheduling the replication due to an in-flight push The replicationDelay could be set to zero, which make sense for the case a) but doesn't make sense for the case b). Actually, when replicationDelay is set to zero, the case b) will cause an excessive number of retries (over 1K/second) and will also fire a replication failed event for each retry. The latter is yet another issue and will be addressed in another change. Introduce a new config parameter rescheduleDelay and use it for the case b). Make sure it cannot bet set to a value lower than 3 seconds to avoid the same issue. In order to keep backwards compabitility with using the replicationDelay as rescheduleDelay, a plugin init step is added. For each remote section which doesn't already define rescheduleDelay, it will set rescheduleDelay to the current value of the replicationDelay, unless replicationDelay is set to zero. In the latter case it will assume the default value for the rescheduleDelay. Change-Id: Ia78f46460b531b04ee36ec2d5ab4228dba5c0c50 --- BUILD | 1 + .../gerrit/plugins/replication/Destination.java | 2 +- .../replication/DestinationConfiguration.java | 12 +++- .../gerrit/plugins/replication/Init.java | 70 ++++++++++++++++++++++ src/main/resources/Documentation/config.md | 11 ++++ 5 files changed, 94 insertions(+), 2 deletions(-) create mode 100644 src/main/java/com/googlesource/gerrit/plugins/replication/Init.java diff --git a/BUILD b/BUILD index eb92f6d..1cad80c 100644 --- a/BUILD +++ b/BUILD @@ -8,6 +8,7 @@ gerrit_plugin( "Implementation-Title: Replication plugin", "Implementation-URL: https://gerrit-review.googlesource.com/#/admin/projects/plugins/replication", "Gerrit-PluginName: replication", + "Gerrit-InitStep: com.googlesource.gerrit.plugins.replication.Init", "Gerrit-Module: com.googlesource.gerrit.plugins.replication.ReplicationModule", "Gerrit-SshModule: com.googlesource.gerrit.plugins.replication.SshModule", ], diff --git a/src/main/java/com/googlesource/gerrit/plugins/replication/Destination.java b/src/main/java/com/googlesource/gerrit/plugins/replication/Destination.java index dd401af..f8e2d7b 100644 --- a/src/main/java/com/googlesource/gerrit/plugins/replication/Destination.java +++ b/src/main/java/com/googlesource/gerrit/plugins/replication/Destination.java @@ -389,7 +389,7 @@ public class Destination { pending.put(uri, pushOp); switch (reason) { case COLLISION: - pool.schedule(pushOp, config.getDelay(), TimeUnit.SECONDS); + pool.schedule(pushOp, config.getRescheduleDelay(), TimeUnit.SECONDS); break; case TRANSPORT_ERROR: case REPOSITORY_MISSING: diff --git a/src/main/java/com/googlesource/gerrit/plugins/replication/DestinationConfiguration.java b/src/main/java/com/googlesource/gerrit/plugins/replication/DestinationConfiguration.java index fc109bf..856ffb1 100644 --- a/src/main/java/com/googlesource/gerrit/plugins/replication/DestinationConfiguration.java +++ b/src/main/java/com/googlesource/gerrit/plugins/replication/DestinationConfiguration.java @@ -20,7 +20,11 @@ import org.eclipse.jgit.lib.Config; import org.eclipse.jgit.transport.RemoteConfig; class DestinationConfiguration { + static final int DEFAULT_REPLICATION_DELAY = 15; + static final int DEFAULT_RESCHEDULE_DELAY = 3; + private final int delay; + private final int rescheduleDelay; private final int retryDelay; private final int lockErrorMaxRetries; private final ImmutableList adminUrls; @@ -40,7 +44,9 @@ class DestinationConfiguration { this.remoteConfig = remoteConfig; String name = remoteConfig.getName(); urls = ImmutableList.copyOf(cfg.getStringList("remote", name, "url")); - delay = Math.max(0, getInt(remoteConfig, cfg, "replicationdelay", 15)); + delay = Math.max(0, getInt(remoteConfig, cfg, "replicationdelay", DEFAULT_REPLICATION_DELAY)); + rescheduleDelay = + Math.max(3, getInt(remoteConfig, cfg, "rescheduledelay", DEFAULT_RESCHEDULE_DELAY)); projects = ImmutableList.copyOf(cfg.getStringList("remote", name, "projects")); adminUrls = ImmutableList.copyOf(cfg.getStringList("remote", name, "adminUrl")); retryDelay = Math.max(0, getInt(remoteConfig, cfg, "replicationretry", 1)); @@ -63,6 +69,10 @@ class DestinationConfiguration { return delay; } + public int getRescheduleDelay() { + return rescheduleDelay; + } + public int getRetryDelay() { return retryDelay; } diff --git a/src/main/java/com/googlesource/gerrit/plugins/replication/Init.java b/src/main/java/com/googlesource/gerrit/plugins/replication/Init.java new file mode 100644 index 0000000..a9fdb4f --- /dev/null +++ b/src/main/java/com/googlesource/gerrit/plugins/replication/Init.java @@ -0,0 +1,70 @@ +// Copyright (C) 2017 The Android Open Source Project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package com.googlesource.gerrit.plugins.replication; + +import static com.googlesource.gerrit.plugins.replication.DestinationConfiguration.DEFAULT_REPLICATION_DELAY; +import static com.googlesource.gerrit.plugins.replication.DestinationConfiguration.DEFAULT_RESCHEDULE_DELAY; + +import com.google.common.base.Strings; +import com.google.gerrit.extensions.annotations.PluginName; +import com.google.gerrit.pgm.init.api.ConsoleUI; +import com.google.gerrit.pgm.init.api.InitStep; +import com.google.gerrit.server.config.SitePaths; +import com.google.inject.Inject; +import java.io.File; +import org.eclipse.jgit.storage.file.FileBasedConfig; +import org.eclipse.jgit.util.FS; + +public class Init implements InitStep { + private final String pluginName; + private final SitePaths site; + private final ConsoleUI ui; + + @Inject + Init(@PluginName String pluginName, SitePaths site, ConsoleUI ui) { + this.pluginName = pluginName; + this.site = site; + this.ui = ui; + } + + @Override + public void run() throws Exception { + File configFile = site.etc_dir.resolve(pluginName + ".config").toFile(); + if (!configFile.exists()) { + return; + } + + FileBasedConfig config = new FileBasedConfig(configFile, FS.DETECTED); + config.load(); + for (String name : config.getSubsections("remote")) { + if (!Strings.isNullOrEmpty(config.getString("remote", name, "rescheduleDelay"))) { + continue; + } + + int replicationDelay = + config.getInt("remote", name, "replicationDelay", DEFAULT_REPLICATION_DELAY); + if (replicationDelay > 0) { + int delay = Math.max(replicationDelay, DEFAULT_RESCHEDULE_DELAY); + ui.message("Setting remote.%s.rescheduleDelay = %d\n", name, delay); + config.setInt("remote", name, "rescheduleDelay", delay); + } else { + ui.message( + "INFO: Assuming default (%d s) for remote.%s.rescheduleDelay\n", + DEFAULT_RESCHEDULE_DELAY, name); + } + } + config.save(); + } +} diff --git a/src/main/resources/Documentation/config.md b/src/main/resources/Documentation/config.md index 50664dd..d8fe9e5 100644 --- a/src/main/resources/Documentation/config.md +++ b/src/main/resources/Documentation/config.md @@ -216,6 +216,17 @@ remote.NAME.replicationDelay By default, 15 seconds. +remote.NAME.rescheduleDelay +: Delay when rescheduling a push operation due to an in-flight push + running for the same project. + + Cannot be set to a value lower than 3 seconds to avoid a tight loop + of schedule/run which could cause 1K+ retries per second. + + A configured value lower than 3 seconds will be rounded to 3 seconds. + + By default, 3 seconds. + remote.NAME.replicationRetry : Time to wait before scheduling a remote push operation previously failed due to an offline remote server. -- cgit v1.2.3 From f88c39a52c45933f0196b94e570904383aa865ec Mon Sep 17 00:00:00 2001 From: David Pursehouse Date: Fri, 2 Jun 2017 04:05:34 +0000 Subject: Fix minor nits in documentation of replication.maxRetries Change-Id: I80e46269e12b9e7a429634dd90ce52e90cc6ad79 --- src/main/resources/Documentation/config.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/main/resources/Documentation/config.md b/src/main/resources/Documentation/config.md index d8fe9e5..b1058b5 100644 --- a/src/main/resources/Documentation/config.md +++ b/src/main/resources/Documentation/config.md @@ -108,13 +108,13 @@ replication.maxRetries : Maximum number of times to retry a push operation that previously failed. - When a push operation reaches its maximum number of retries + When a push operation reaches its maximum number of retries, the replication event is discarded from the queue and the remote - destinations could be out of sync. + destinations may remain out of sync. Can be overridden at remote-level by setting replicationMaxRetries. - By default, push are retried indefinitely. + By default, pushes are retried indefinitely. remote.NAME.url : Address of the remote server to push to. Multiple URLs may be -- cgit v1.2.3