Skip to content

Commit 1de5a78

Browse files
committed
Add POST /v1/namespaces/:ns/integrity-check admin endpoint
Runs `PRAGMA quick_check` (or `PRAGMA integrity_check` if `{full:true}`) on a namespace's live data file.

Lets callers (cloud-sync-streamer, operators) classify the failure mode BEFORE attempting recovery:
- `ok:true` -> the live DB is fine; the corruption must be in wallog/snapshots (Mode A) -> use reset-replication (~1s recovery).
- `ok:false` -> the live DB itself is corrupt (Mode B) -> reset-replication would propagate the corruption; fall back to delete + bulk-import.

Implementation detail: SQLite surfaces severe corruption as prepare/connect errors rather than PRAGMA rows. The endpoint normalizes those into the same `{ok:false, message:...}` response shape so the caller gets a uniform classification signal (HTTP 200) rather than a server error (HTTP 500).

Changes:
- libsql-server/src/namespace/mod.rs: `Namespace::integrity_check()`
- libsql-server/src/namespace/store.rs: `NamespaceStore::integrity_check()`
- libsql-server/src/http/admin/mod.rs: handler + route

Verified with /tmp/test_integrity_check.sh against a healthy namespace (ok), a namespace with a poisoned data file (Mode B detected), and a non-existent namespace (proper 404).
1 parent e157523 commit 1de5a78

3 files changed

Lines changed: 131 additions & 0 deletions

File tree

libsql-server/src/http/admin/mod.rs

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -162,6 +162,10 @@ where
162162
"/v1/namespaces/:namespace/reset-replication",
163163
post(handle_reset_replication),
164164
)
165+
.route(
166+
"/v1/namespaces/:namespace/integrity-check",
167+
post(handle_integrity_check),
168+
)
165169
.route("/v1/namespaces/:namespace", delete(handle_delete_namespace))
166170
.route("/v1/namespaces/:namespace/stats", get(stats::handle_stats))
167171
.route(
@@ -576,6 +580,53 @@ async fn handle_reset_replication<C>(
576580
Ok(())
577581
}
578582

583+
#[derive(serde::Deserialize, Default)]
584+
struct IntegrityCheckReq {
585+
/// If true, run full `PRAGMA integrity_check` (O(DB size), thorough).
586+
/// Default is `PRAGMA quick_check` which is fast and catches the
587+
/// critical corruption classes.
588+
#[serde(default)]
589+
full: bool,
590+
}
591+
592+
#[derive(serde::Serialize)]
593+
struct IntegrityCheckResp {
594+
ok: bool,
595+
/// Raw SQLite diagnostic text. `"ok"` on success, otherwise one or
596+
/// more messages describing integrity issues.
597+
message: String,
598+
/// "quick" or "full", mirrors the `full` request field.
599+
check: &'static str,
600+
}
601+
602+
/// Run `PRAGMA quick_check` (default) or `PRAGMA integrity_check` on a
603+
/// namespace's live data file without touching other namespaces.
604+
///
605+
/// Use this to classify the failure mode before recovery:
606+
/// - `ok` → live DB is fine, any corruption is in wallog/snapshots (Mode A)
607+
/// → caller should use `POST /v1/namespaces/:ns/reset-replication`.
608+
/// - non-"ok" → live DB itself is corrupt (Mode B)
609+
/// → caller should restore from backup, not reset-replication.
610+
///
611+
/// Cheap: ~10ms for quick_check on small-to-medium namespaces.
612+
async fn handle_integrity_check<C>(
613+
State(app_state): State<Arc<AppState<C>>>,
614+
Path(namespace): Path<NamespaceName>,
615+
payload: Option<Json<IntegrityCheckReq>>,
616+
) -> crate::Result<Json<IntegrityCheckResp>> {
617+
let full = payload.map(|p| p.0.full).unwrap_or(false);
618+
let message = app_state
619+
.namespaces
620+
.integrity_check(namespace, full)
621+
.await?;
622+
let ok = message.trim() == "ok";
623+
Ok(Json(IntegrityCheckResp {
624+
ok,
625+
message,
626+
check: if full { "full" } else { "quick" },
627+
}))
628+
}
629+
579630
#[derive(serde::Deserialize)]
580631
struct EnableHeapProfileRequest {
581632
#[serde(default)]

libsql-server/src/namespace/mod.rs

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,48 @@ impl Namespace {
9191
Ok(())
9292
}
9393

94+
/// Run `PRAGMA quick_check` (or `integrity_check` if `full=true`) on
95+
/// the namespace's live DB file and return the result string.
96+
///
97+
/// For a healthy DB this returns `"ok"`. Anything else is an
98+
/// integrity diagnostic message from SQLite.
99+
///
100+
/// A catastrophically corrupt DB can fail before PRAGMA runs (e.g.
101+
/// `malformed database schema` raised while the prepared statement
102+
/// is parsing the schema). We normalize that into the same
103+
/// `Ok(String)` return path so callers get a uniform classification
104+
/// signal instead of a server error.
105+
async fn integrity_check(&self, full: bool) -> anyhow::Result<String> {
106+
// Even creating a connection can fail ("malformed database schema")
107+
// when the DB is badly corrupt — that IS an integrity signal so we
108+
// surface it as `Ok(String)` rather than an Err that becomes a 500.
109+
let conn = match self.db.connection_maker().create().await {
110+
Ok(c) => c,
111+
Err(e) => {
112+
return Ok(format!("connection failed: {e}"));
113+
}
114+
};
115+
let pragma = if full { "integrity_check" } else { "quick_check" };
116+
let result = conn.with_raw(move |raw| -> rusqlite::Result<Vec<String>> {
117+
let mut stmt = raw.prepare(&format!("PRAGMA {pragma}"))?;
118+
let mut rows = stmt.query([])?;
119+
let mut out = Vec::new();
120+
while let Some(row) = rows.next()? {
121+
let s: String = row.get(0)?;
122+
out.push(s);
123+
}
124+
Ok(out)
125+
});
126+
match result {
127+
Ok(rows) => Ok(rows.join("\n")),
128+
Err(e) => {
129+
// SQLite surfaces integrity failures as prepare/query errors
130+
// rather than PRAGMA rows. Treat those as integrity signals.
131+
Ok(format!("{e}"))
132+
}
133+
}
134+
}
135+
94136
async fn shutdown(mut self, should_checkpoint: bool) -> anyhow::Result<()> {
95137
self.tasks.shutdown().await;
96138
if should_checkpoint {

libsql-server/src/namespace/store.rs

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -153,6 +153,44 @@ impl NamespaceStore {
153153
Ok(())
154154
}
155155

156+
/// Run `PRAGMA quick_check` (or `integrity_check` if `full=true`) on
157+
/// the namespace's live DB and return the result string. Takes only
158+
/// a read lock on the namespace; other operations (including other
159+
/// namespaces on this pod) are unaffected.
160+
///
161+
/// A healthy DB returns `"ok"`. Any other text is diagnostic output
162+
/// from SQLite. Use this to classify Mode A (wallog/snapshot
163+
/// corruption, live DB OK) vs Mode B (live DB corruption) before
164+
/// deciding on a recovery procedure.
165+
pub async fn integrity_check(
166+
&self,
167+
namespace: NamespaceName,
168+
full: bool,
169+
) -> crate::Result<String> {
170+
if !self.inner.metadata.exists(&namespace).await {
171+
return Err(Error::NamespaceDoesntExist(namespace.to_string()));
172+
}
173+
// Force-load the namespace so we can run a query against it.
174+
let db_config = self.inner.metadata.handle(namespace.clone()).await;
175+
let _ = self
176+
.load_namespace(&namespace, db_config, RestoreOption::Latest)
177+
.await?;
178+
179+
let entry = self
180+
.inner
181+
.store
182+
.get_with(namespace.clone(), async { Default::default() })
183+
.await;
184+
let lock = entry.read().await;
185+
if let Some(ns) = &*lock {
186+
ns.integrity_check(full)
187+
.await
188+
.map_err(|e| Error::Internal(format!("integrity_check: {e}")))
189+
} else {
190+
Err(Error::NamespaceDoesntExist(namespace.to_string()))
191+
}
192+
}
193+
156194
pub async fn reset(
157195
&self,
158196
namespace: NamespaceName,

0 commit comments

Comments
 (0)