Use msec-level precision for refs/multi-site/version value
Very active repositories may have multiple updates within the same
second which deserve to be traced in the project version. Previously
with the maximum precision of 1 second there was the risk of losing
track of missed replication updates for multiple modifications.
The warning 'NOT Updating project <> in shared ref-db' was noticed
many times, which was a cause of concern for the lack of
traceability of potential issues.
NOTE: This introduces a new replication-lag metric with a more
accurate version which tracks at msec level.
TODO: Once this change is merged up to master, the legacy metric
'multi_site/subscriber/subscriber_replication_status/sec_behind'
must be removed because it would be confusing.
Bug: Issue 17013
Change-Id: I9cc78ae0ce67b90ec1a82091b3ca60294086350e
diff --git a/src/main/java/com/googlesource/gerrit/plugins/multisite/consumer/ReplicationStatus.java b/src/main/java/com/googlesource/gerrit/plugins/multisite/consumer/ReplicationStatus.java
index 8700107..d677df8 100644
--- a/src/main/java/com/googlesource/gerrit/plugins/multisite/consumer/ReplicationStatus.java
+++ b/src/main/java/com/googlesource/gerrit/plugins/multisite/consumer/ReplicationStatus.java
@@ -99,6 +99,10 @@
}
public Long getMaxLag() {
+ return getMaxLagMillis() / 1000;
+ }
+
+ public Long getMaxLagMillis() {
Collection<Long> lags = replicationStatusPerProject.values();
if (lags.isEmpty()) {
return 0L;
diff --git a/src/main/java/com/googlesource/gerrit/plugins/multisite/consumer/SubscriberMetrics.java b/src/main/java/com/googlesource/gerrit/plugins/multisite/consumer/SubscriberMetrics.java
index efe1f91..8c4efba 100644
--- a/src/main/java/com/googlesource/gerrit/plugins/multisite/consumer/SubscriberMetrics.java
+++ b/src/main/java/com/googlesource/gerrit/plugins/multisite/consumer/SubscriberMetrics.java
@@ -39,6 +39,8 @@
"subscriber_msg_consumer_failure_counter";
public static final String REPLICATION_LAG_SEC =
"multi_site/subscriber/subscriber_replication_status/sec_behind";
+ private static final String REPLICATION_LAG_MSEC =
+ "multi_site/subscriber/subscriber_replication_status/msec_behind";
private final Counter1<String> subscriberSuccessCounter;
private final Counter1<String> subscriberFailureCounter;
@@ -68,6 +70,13 @@
Long.class,
new Description("Replication lag (sec)").setGauge().setUnit(Description.Units.SECONDS),
replicationStatus::getMaxLag);
+ metricMaker.newCallbackMetric(
+ REPLICATION_LAG_MSEC,
+ Long.class,
+ new Description("Replication lag (msec)")
+ .setGauge()
+ .setUnit(Description.Units.MILLISECONDS),
+ replicationStatus::getMaxLagMillis);
}
/**
diff --git a/src/main/java/com/googlesource/gerrit/plugins/multisite/validation/ProjectVersionRefUpdateImpl.java b/src/main/java/com/googlesource/gerrit/plugins/multisite/validation/ProjectVersionRefUpdateImpl.java
index 469122b..4bbfd91 100644
--- a/src/main/java/com/googlesource/gerrit/plugins/multisite/validation/ProjectVersionRefUpdateImpl.java
+++ b/src/main/java/com/googlesource/gerrit/plugins/multisite/validation/ProjectVersionRefUpdateImpl.java
@@ -286,7 +286,7 @@
}
private long getCurrentGlobalVersionNumber() {
- return System.currentTimeMillis() / 1000;
+ return System.currentTimeMillis();
}
private Boolean isSuccessful(RefUpdate.Result result) {
diff --git a/src/main/resources/Documentation/about.md b/src/main/resources/Documentation/about.md
index 159e4a0..d7c5648 100644
--- a/src/main/resources/Documentation/about.md
+++ b/src/main/resources/Documentation/about.md
@@ -99,3 +99,7 @@
* Subscriber replication lag (sec behind the producer)
`metric=site/multi_site/subscriber/subscriber_replication_status/sec_behind, type=com.google.gerrit.metrics.dropwizard.CallbackMetricImpl`
+
+* Subscriber replication lag (millisec behind the producer)
+
+`metric=site/multi_site/subscriber/subscriber_replication_status/msec_behind, type=com.google.gerrit.metrics.dropwizard.CallbackMetricImpl`
diff --git a/src/test/java/com/googlesource/gerrit/plugins/multisite/consumer/ReplicationStatusTest.java b/src/test/java/com/googlesource/gerrit/plugins/multisite/consumer/ReplicationStatusTest.java
index a067df9..d49108c 100644
--- a/src/test/java/com/googlesource/gerrit/plugins/multisite/consumer/ReplicationStatusTest.java
+++ b/src/test/java/com/googlesource/gerrit/plugins/multisite/consumer/ReplicationStatusTest.java
@@ -71,6 +71,14 @@
replicationStatusCache.put("projectB", 3L);
objectUnderTest.start();
+ assertThat(objectUnderTest.getMaxLagMillis()).isEqualTo(10L);
+ }
+
+ @Test
+ public void shouldConvertMillisLagFromPersistedCacheOnStartToSecs() {
+ replicationStatusCache.put("projectA", 10000L);
+
+ objectUnderTest.start();
assertThat(objectUnderTest.getMaxLag()).isEqualTo(10L);
}
@@ -81,7 +89,7 @@
objectUnderTest.start();
objectUnderTest.doUpdateLag(Project.nameKey("projectA"), 20L);
- assertThat(objectUnderTest.getMaxLag()).isEqualTo(20L);
+ assertThat(objectUnderTest.getMaxLagMillis()).isEqualTo(20L);
}
@Test
diff --git a/src/test/java/com/googlesource/gerrit/plugins/multisite/consumer/SubscriberMetricsTest.java b/src/test/java/com/googlesource/gerrit/plugins/multisite/consumer/SubscriberMetricsTest.java
index 30254a7..6413b18 100644
--- a/src/test/java/com/googlesource/gerrit/plugins/multisite/consumer/SubscriberMetricsTest.java
+++ b/src/test/java/com/googlesource/gerrit/plugins/multisite/consumer/SubscriberMetricsTest.java
@@ -71,7 +71,7 @@
@Test
public void shouldLogProjectVersionWhenReceivingRefUpdatedEventWithoutLag() {
- Optional<Long> globalRefDbVersion = Optional.of(System.currentTimeMillis() / 1000);
+ Optional<Long> globalRefDbVersion = Optional.of(System.currentTimeMillis());
when(projectVersionRefUpdate.getProjectRemoteVersion(A_TEST_PROJECT_NAME))
.thenReturn(globalRefDbVersion);
when(projectVersionRefUpdate.getProjectLocalVersion(A_TEST_PROJECT_NAME))
@@ -86,7 +86,7 @@
@Test
public void shouldLogProjectVersionWhenReceivingRefUpdatedEventWithALag() {
- Optional<Long> globalRefDbVersion = Optional.of(System.currentTimeMillis() / 1000);
+ Optional<Long> globalRefDbVersion = Optional.of(System.currentTimeMillis());
long replicationLag = 60;
when(projectVersionRefUpdate.getProjectRemoteVersion(A_TEST_PROJECT_NAME))
.thenReturn(globalRefDbVersion.map(ts -> ts + replicationLag));
@@ -104,7 +104,7 @@
public void
shouldLogUponProjectDeletionSuccessWhenLocalVersionDoesNotExistAndSubscriberMetricsExist()
throws Exception {
- long nowSecs = System.currentTimeMillis() / 1000;
+ long nowSecs = System.currentTimeMillis();
long replicationLagSecs = 60;
Optional<Long> globalRefDbVersion = Optional.of(nowSecs);
when(projectVersionRefUpdate.getProjectRemoteVersion(A_TEST_PROJECT_NAME))
@@ -146,7 +146,7 @@
@Test
public void shouldNotLogUponProjectDeletionSuccessWhenLocalVersionStillExists() throws Exception {
Event eventMessage = projectDeletionSuccess();
- Optional<Long> anyRefVersionValue = Optional.of(System.currentTimeMillis() / 1000);
+ Optional<Long> anyRefVersionValue = Optional.of(System.currentTimeMillis());
when(projectVersionRefUpdate.getProjectLocalVersion(A_TEST_PROJECT_NAME))
.thenReturn(anyRefVersionValue);
@@ -157,7 +157,7 @@
@Test
public void shouldRemoveProjectMetricsUponProjectDeletionSuccess() throws Exception {
- long nowSecs = System.currentTimeMillis() / 1000;
+ long nowSecs = System.currentTimeMillis();
long replicationLagSecs = 60;
Optional<Long> globalRefDbVersion = Optional.of(nowSecs);
when(projectVersionRefUpdate.getProjectRemoteVersion(A_TEST_PROJECT_NAME))