Skip to content

Commit 1e7f7ad

Browse files
committed
reset-replication: return {"elapsed_ms": u64} JSON in response body
Before: the handler returned 200 with an empty body, so operators had to grep libsql logs to see how long the reset took. After: the handler returns 200 with {"elapsed_ms": N}. The streamer (and any other caller) can parse this into a StatsD histogram for dashboard latency tracking, without server-side log scraping. Backwards compatible: older clients that ignore the body still work, and new clients that expect the body handle an empty body (e.g. from an older server) gracefully. New turmoil test: reset_replication_response_includes_elapsed_ms verifies the field is present and within sanity bounds.
1 parent 2ffd4b3 commit 1e7f7ad

3 files changed

Lines changed: 60 additions & 6 deletions

File tree

libsql-server/src/http/admin/mod.rs

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -572,12 +572,18 @@ async fn handle_checkpoint<C>(
572572
/// - metastore config (jwt_key, block_writes, etc.) is preserved
573573
///
574574
/// Other namespaces on this pod are completely unaffected.
575+
#[derive(serde::Serialize)]
576+
struct ResetReplicationResp {
577+
/// Wall-clock duration of the reset in milliseconds, for operator-visible metrics.
578+
elapsed_ms: u64,
579+
}
580+
575581
async fn handle_reset_replication<C>(
576582
State(app_state): State<Arc<AppState<C>>>,
577583
Path(namespace): Path<NamespaceName>,
578-
) -> crate::Result<()> {
579-
app_state.namespaces.reset_replication(namespace).await?;
580-
Ok(())
584+
) -> crate::Result<axum::Json<ResetReplicationResp>> {
585+
let elapsed_ms = app_state.namespaces.reset_replication(namespace).await?;
586+
Ok(axum::Json(ResetReplicationResp { elapsed_ms }))
581587
}
582588

583589
#[derive(serde::Deserialize, Default)]

libsql-server/src/namespace/store.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -262,7 +262,7 @@ impl NamespaceStore {
262262
/// Brief unavailability window: from the moment we take the write lock
263263
/// until `make_namespace` returns. Other namespaces on the pod are
264264
/// completely unaffected.
265-
pub async fn reset_replication(&self, namespace: NamespaceName) -> crate::Result<()> {
265+
pub async fn reset_replication(&self, namespace: NamespaceName) -> crate::Result<u64> {
266266
if self.inner.has_shutdown.load(Ordering::Relaxed) {
267267
return Err(Error::NamespaceStoreShutdown);
268268
}
@@ -393,11 +393,11 @@ impl NamespaceStore {
393393
.await?;
394394
lock.replace(ns);
395395

396-
let elapsed_ms = start.elapsed().as_millis();
396+
let elapsed_ms = start.elapsed().as_millis() as u64;
397397
tracing::info!(
398398
"reset_replication: rebuilt replication log for namespace {namespace} in {elapsed_ms}ms"
399399
);
400-
Ok(())
400+
Ok(elapsed_ms)
401401
}
402402

403403
// This is only called on replica

libsql-server/tests/namespaces/mod.rs

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -347,6 +347,54 @@ fn integrity_check_defaults_to_quick_when_full_omitted() {
347347
sim.run().unwrap();
348348
}
349349

350+
#[test]
351+
fn reset_replication_response_includes_elapsed_ms() {
352+
// Operators (and ops dashboards) want to see how long reset took
353+
// without having to grep logs. The JSON response body includes
354+
// elapsed_ms, which must be a non-negative integer for any successful
355+
// reset.
356+
let mut sim = Builder::new()
357+
.simulation_duration(Duration::from_secs(1000))
358+
.build();
359+
let tmp = tempdir().unwrap();
360+
make_primary(&mut sim, tmp.path().to_path_buf());
361+
362+
sim.client("client", async {
363+
let client = Client::new();
364+
client
365+
.post("http://primary:9090/v1/namespaces/elapsed/create", json!({}))
366+
.await?;
367+
368+
let db = Database::open_remote_with_connector(
369+
"http://elapsed.primary:8080",
370+
"",
371+
TurmoilConnector,
372+
)?;
373+
let conn = db.connect()?;
374+
conn.execute("create table t(v text)", ()).await?;
375+
conn.execute("insert into t values ('x')", ()).await?;
376+
377+
let resp = client
378+
.post(
379+
"http://primary:9090/v1/namespaces/elapsed/reset-replication",
380+
json!({}),
381+
)
382+
.await?;
383+
assert_eq!(resp.status(), hyper::http::StatusCode::OK);
384+
let v = resp.json_value().await?;
385+
let elapsed = v["elapsed_ms"].as_u64().expect("elapsed_ms must be u64");
386+
// We can't assert a specific value (depends on hardware) but we
387+
// can assert it's present and non-negative. In practice this is
388+
// typically single-digit milliseconds for an empty/small table.
389+
// Max bound guards against a runaway regression.
390+
assert!(elapsed < 30_000, "elapsed_ms={elapsed} exceeds 30s sanity bound");
391+
392+
Ok(())
393+
});
394+
395+
sim.run().unwrap();
396+
}
397+
350398
#[test]
351399
fn reset_replication_is_idempotent() {
352400
// An operator (or the streamer's retry-after-reset path) may call

0 commit comments

Comments
 (0)