From 731d6d66a66e4a74dfdcea4dfd7db99c86e0ed54 Mon Sep 17 00:00:00 2001 From: yiekheng Date: Sun, 10 May 2026 11:39:56 +0800 Subject: [PATCH] fix(unpair): stop session manager from racing the web's status write MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Symptom ------- Click "Unpair" on a connected account. The web action sets \`status='unpaired'\`, but the account detail page often still shows "Disconnected" — and on accounts that had been previously connected, the QR pair flow restarts a few seconds later all on its own. Cause ----- Two races inside the session manager: 1. The web's \`unpairAccountAction\` notifies the bot via \`pg_notify\` and then writes \`status='unpaired'\` to the row. The bot's \`handleUnpair\` calls \`sessionManager.stop()\` which closes the Baileys socket; Baileys eventually fires a \`connection: close\` event which the manager's \`handleEvent\` translates into a \`status='disconnected'\` UPDATE. Whichever write lands second wins. The user clicks Unpair and sees Disconnected. 2. The same close-event handler schedules a 5-second \`stop().then(start())\` reconnect for accounts whose \`lastConnectedAt\` is set. Five seconds after unpair, the bot silently re-opens the socket, the row flips to \`pending\`, and the QR carousel restarts. Fix --- \`stop(accountId, { intentional: true })\` marks the account in a new \`intentionalStops\` Set. When the close event lands, \`handleEvent\` drains the flag (with \`Set.delete()\` returning whether the key was present, so it's exactly-once and a stale flag can't bleed into a later session) and skips both the DB UPDATE and the reconnect schedule. The caller — only \`handleUnpair\` for now — is the one choosing the row's next state, so we step out of its way. The flag is set ONLY when callers ask for it. Internal recoveries (restartRequired auto re-open, ephemeral-close back-off) keep the default behaviour and continue to write \`disconnected\` + reschedule. Drive-bys --------- - Refresh the stale "the row is gone by the time we run" comment in unpair-handler — the row stays alive now (the operator can re-pair without retyping the label). Look up the account first so the audit log carries the real \`operatorId\` instead of \`null\`. The delete-account flow really does delete the row before notifying us; the lookup tolerates that and falls back to \`null\`. Co-Authored-By: Claude Opus 4.7 (1M context) --- apps/bot/src/ipc/unpair-handler.ts | 21 ++++-- apps/bot/src/whatsapp/session-manager.ts | 87 +++++++++++++++--------- 2 files changed, 71 insertions(+), 37 deletions(-) diff --git a/apps/bot/src/ipc/unpair-handler.ts b/apps/bot/src/ipc/unpair-handler.ts index d6b4960..07e1abe 100644 --- a/apps/bot/src/ipc/unpair-handler.ts +++ b/apps/bot/src/ipc/unpair-handler.ts @@ -8,17 +8,26 @@ import { pgNotifyWeb } from "./notify.js"; import { logger } from "../logger.js"; /** - * Unpair handler: stop the live Baileys session and remove session files. - * The web's unpair action deletes the account row before notifying us, - * so we expect the row to be gone by the time we run. Audit log uses a - * null operator since the row is no longer queryable. + * Unpair handler: stop the live Baileys session and remove the on-disk + * session files. The web action keeps the account row alive (status = + * 'unpaired') so the operator can re-pair without retyping the label; + * the {intentional: true} stop tells the session manager not to race + * the web's status write with its own "disconnected" update or + * schedule a reconnect for a session we just chose to tear down. + * + * For the delete-account flow the row IS gone by the time we run; + * the audit log lookup tolerates that. */ export async function handleUnpair(accountId: string): Promise { - await sessionManager.stop(accountId); + await sessionManager.stop(accountId, { intentional: true }); await rm(join(env.SESSIONS_DIR, accountId), { recursive: true, force: true }); try { + const row = await db.query.whatsappAccounts.findFirst({ + where: (a, { eq }) => eq(a.id, accountId), + columns: { operatorId: true }, + }); await writeAuditLog(db, { - operatorId: null, + operatorId: row?.operatorId ?? null, source: "web", action: "account.unpaired", targetType: "whatsapp_account", diff --git a/apps/bot/src/whatsapp/session-manager.ts b/apps/bot/src/whatsapp/session-manager.ts index 771d936..7853f53 100644 --- a/apps/bot/src/whatsapp/session-manager.ts +++ b/apps/bot/src/whatsapp/session-manager.ts @@ -40,6 +40,13 @@ class SessionManager { private states = new Map(); private listeners = new Set(); private reconnectTimers = new Map(); + /** + * Account IDs whose next close event was triggered by us on purpose + * (unpair, delete, app shutdown). When an entry is present, the close + * handler skips the DB status write and the auto-reconnect schedule — + * the caller has already chosen the row's next state. + */ + private intentionalStops = new Set(); on(listener: SessionListener): () => void { this.listeners.add(listener); @@ -92,7 +99,10 @@ class SessionManager { this.sessions.set(accountId, session); } - async stop(accountId: string): Promise { + async stop( + accountId: string, + opts?: { intentional?: boolean }, + ): Promise { const timer = this.reconnectTimers.get(accountId); if (timer) { clearTimeout(timer); @@ -100,6 +110,12 @@ class SessionManager { } const session = this.sessions.get(accountId); if (!session) return; + if (opts?.intentional) { + // Mark the upcoming close event as expected so handleEvent won't + // race with the caller's DB write. We only set this when the + // caller will manage the row's status themselves (unpair, delete). + this.intentionalStops.add(accountId); + } await session.close(); this.sessions.delete(accountId); } @@ -135,43 +151,52 @@ class SessionManager { }) .where(eq(whatsappAccounts.id, accountId)); } else if (event.type === "close") { + // Drain the intentional-stop flag exactly once so a stale flag + // can't bleed into a later, unrelated session. + const wasIntentional = this.intentionalStops.delete(accountId); this.transition(accountId, { kind: "close", loggedOut: event.loggedOut }); - await db - .update(whatsappAccounts) - .set({ status: event.loggedOut ? "logged_out" : "disconnected" }) - .where(eq(whatsappAccounts.id, accountId)); - - if (event.loggedOut) { - await this.stop(accountId); - } else if (event.restartRequired) { - // Status 515 — the post-pair-success reconnect. Always re-open - // immediately (no 5 s back-off, no `lastConnectedAt` gate). If - // we don't, the auth handshake never completes and the user - // sees a spurious "Pairing timed out". - const timer = setTimeout(() => { - this.reconnectTimers.delete(accountId); - void this.stop(accountId).then(() => this.start(accountId)); - }, 250); - this.reconnectTimers.set(accountId, timer); + if (wasIntentional) { + // Caller (unpair/delete handler) is writing the row themselves. + // Don't overwrite their status, and don't schedule a reconnect + // for a session we just chose to tear down. } else { - // Other ephemeral closes (refs exhausted, network blip): only - // auto-reconnect for accounts that have been linked at least - // once. During an initial pair attempt this would otherwise - // restart the pair dance and rotate the QR every few seconds. - const account = await db.query.whatsappAccounts.findFirst({ - where: (a, { eq }) => eq(a.id, accountId), - columns: { lastConnectedAt: true }, - }); - if (account?.lastConnectedAt) { + await db + .update(whatsappAccounts) + .set({ status: event.loggedOut ? "logged_out" : "disconnected" }) + .where(eq(whatsappAccounts.id, accountId)); + + if (event.loggedOut) { + await this.stop(accountId); + } else if (event.restartRequired) { + // Status 515 — the post-pair-success reconnect. Always re-open + // immediately (no 5 s back-off, no `lastConnectedAt` gate). If + // we don't, the auth handshake never completes and the user + // sees a spurious "Pairing timed out". const timer = setTimeout(() => { this.reconnectTimers.delete(accountId); void this.stop(accountId).then(() => this.start(accountId)); - }, 5000); + }, 250); this.reconnectTimers.set(accountId, timer); } else { - // Brand-new account that hasn't authenticated yet — let the - // pair-handler clean up via its timeout. - await this.stop(accountId); + // Other ephemeral closes (refs exhausted, network blip): only + // auto-reconnect for accounts that have been linked at least + // once. During an initial pair attempt this would otherwise + // restart the pair dance and rotate the QR every few seconds. + const account = await db.query.whatsappAccounts.findFirst({ + where: (a, { eq }) => eq(a.id, accountId), + columns: { lastConnectedAt: true }, + }); + if (account?.lastConnectedAt) { + const timer = setTimeout(() => { + this.reconnectTimers.delete(accountId); + void this.stop(accountId).then(() => this.start(accountId)); + }, 5000); + this.reconnectTimers.set(accountId, timer); + } else { + // Brand-new account that hasn't authenticated yet — let the + // pair-handler clean up via its timeout. + await this.stop(accountId); + } } } } else if (event.type === "qr") {