fix(unpair): stop session manager from racing the web's status write

Symptom
-------
Click "Unpair" on a connected account. The web action sets
\`status='unpaired'\`, but the account detail page often still shows
"Disconnected" — and on accounts that had been previously connected,
the QR pair flow restarts a few seconds later all on its own.

Cause
-----
Two races inside the session manager:

1. The web's \`unpairAccountAction\` notifies the bot via \`pg_notify\`
   and then writes \`status='unpaired'\` to the row. The bot's
   \`handleUnpair\` calls \`sessionManager.stop()\` which closes the
   Baileys socket; Baileys eventually fires a \`connection: close\`
   event which the manager's \`handleEvent\` translates into a
   \`status='disconnected'\` UPDATE. Whichever write lands second wins.
   The user clicks Unpair and sees Disconnected.

2. The same close-event handler schedules a 5-second
   \`stop().then(start())\` reconnect for accounts whose
   \`lastConnectedAt\` is set. Five seconds after unpair, the bot
   silently re-opens the socket, the row flips to \`pending\`, and the
   QR carousel restarts.

Fix
---
\`stop(accountId, { intentional: true })\` marks the account in a new
\`intentionalStops\` Set. When the close event lands, \`handleEvent\`
drains the flag (with \`Set.delete()\` returning whether the key was
present, so it's exactly-once and a stale flag can't bleed into a
later session) and skips both the DB UPDATE and the reconnect
schedule. The caller — only \`handleUnpair\` for now — is the one
choosing the row's next state, so we step out of its way.

The flag is set ONLY when callers ask for it. Internal recoveries
(restartRequired auto re-open, ephemeral-close back-off) keep the
default behaviour and continue to write \`disconnected\` + reschedule.

Drive-bys
---------
- Refresh the stale "the row is gone by the time we run" comment in
  unpair-handler — the row stays alive now (the operator can re-pair
  without retyping the label). Look up the account first so the
  audit log carries the real \`operatorId\` instead of \`null\`. The
  delete-account flow really does delete the row before notifying us;
  the lookup tolerates that and falls back to \`null\`.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
yiekheng 2026-05-10 11:39:56 +08:00
parent 7b991a565d
commit 731d6d66a6
2 changed files with 71 additions and 37 deletions

View File

@ -8,17 +8,26 @@ import { pgNotifyWeb } from "./notify.js";
import { logger } from "../logger.js"; import { logger } from "../logger.js";
/** /**
* Unpair handler: stop the live Baileys session and remove session files. * Unpair handler: stop the live Baileys session and remove the on-disk
* The web's unpair action deletes the account row before notifying us, * session files. The web action keeps the account row alive (status =
* so we expect the row to be gone by the time we run. Audit log uses a * 'unpaired') so the operator can re-pair without retyping the label;
* null operator since the row is no longer queryable. * the {intentional: true} stop tells the session manager not to race
* the web's status write with its own "disconnected" update or
* schedule a reconnect for a session we just chose to tear down.
*
* For the delete-account flow the row IS gone by the time we run;
* the audit log lookup tolerates that.
*/ */
export async function handleUnpair(accountId: string): Promise<void> { export async function handleUnpair(accountId: string): Promise<void> {
await sessionManager.stop(accountId); await sessionManager.stop(accountId, { intentional: true });
await rm(join(env.SESSIONS_DIR, accountId), { recursive: true, force: true }); await rm(join(env.SESSIONS_DIR, accountId), { recursive: true, force: true });
try { try {
const row = await db.query.whatsappAccounts.findFirst({
where: (a, { eq }) => eq(a.id, accountId),
columns: { operatorId: true },
});
await writeAuditLog(db, { await writeAuditLog(db, {
operatorId: null, operatorId: row?.operatorId ?? null,
source: "web", source: "web",
action: "account.unpaired", action: "account.unpaired",
targetType: "whatsapp_account", targetType: "whatsapp_account",

View File

@ -40,6 +40,13 @@ class SessionManager {
private states = new Map<string, SessionState>(); private states = new Map<string, SessionState>();
private listeners = new Set<SessionListener>(); private listeners = new Set<SessionListener>();
private reconnectTimers = new Map<string, NodeJS.Timeout>(); private reconnectTimers = new Map<string, NodeJS.Timeout>();
/**
* Account IDs whose next close event was triggered by us on purpose
* (unpair, delete, app shutdown). When an entry is present, the close
* handler skips the DB status write and the auto-reconnect schedule
* the caller has already chosen the row's next state.
*/
private intentionalStops = new Set<string>();
on(listener: SessionListener): () => void { on(listener: SessionListener): () => void {
this.listeners.add(listener); this.listeners.add(listener);
@ -92,7 +99,10 @@ class SessionManager {
this.sessions.set(accountId, session); this.sessions.set(accountId, session);
} }
async stop(accountId: string): Promise<void> { async stop(
accountId: string,
opts?: { intentional?: boolean },
): Promise<void> {
const timer = this.reconnectTimers.get(accountId); const timer = this.reconnectTimers.get(accountId);
if (timer) { if (timer) {
clearTimeout(timer); clearTimeout(timer);
@ -100,6 +110,12 @@ class SessionManager {
} }
const session = this.sessions.get(accountId); const session = this.sessions.get(accountId);
if (!session) return; if (!session) return;
if (opts?.intentional) {
// Mark the upcoming close event as expected so handleEvent won't
// race with the caller's DB write. We only set this when the
// caller will manage the row's status themselves (unpair, delete).
this.intentionalStops.add(accountId);
}
await session.close(); await session.close();
this.sessions.delete(accountId); this.sessions.delete(accountId);
} }
@ -135,43 +151,52 @@ class SessionManager {
}) })
.where(eq(whatsappAccounts.id, accountId)); .where(eq(whatsappAccounts.id, accountId));
} else if (event.type === "close") { } else if (event.type === "close") {
// Drain the intentional-stop flag exactly once so a stale flag
// can't bleed into a later, unrelated session.
const wasIntentional = this.intentionalStops.delete(accountId);
this.transition(accountId, { kind: "close", loggedOut: event.loggedOut }); this.transition(accountId, { kind: "close", loggedOut: event.loggedOut });
await db if (wasIntentional) {
.update(whatsappAccounts) // Caller (unpair/delete handler) is writing the row themselves.
.set({ status: event.loggedOut ? "logged_out" : "disconnected" }) // Don't overwrite their status, and don't schedule a reconnect
.where(eq(whatsappAccounts.id, accountId)); // for a session we just chose to tear down.
if (event.loggedOut) {
await this.stop(accountId);
} else if (event.restartRequired) {
// Status 515 — the post-pair-success reconnect. Always re-open
// immediately (no 5 s back-off, no `lastConnectedAt` gate). If
// we don't, the auth handshake never completes and the user
// sees a spurious "Pairing timed out".
const timer = setTimeout(() => {
this.reconnectTimers.delete(accountId);
void this.stop(accountId).then(() => this.start(accountId));
}, 250);
this.reconnectTimers.set(accountId, timer);
} else { } else {
// Other ephemeral closes (refs exhausted, network blip): only await db
// auto-reconnect for accounts that have been linked at least .update(whatsappAccounts)
// once. During an initial pair attempt this would otherwise .set({ status: event.loggedOut ? "logged_out" : "disconnected" })
// restart the pair dance and rotate the QR every few seconds. .where(eq(whatsappAccounts.id, accountId));
const account = await db.query.whatsappAccounts.findFirst({
where: (a, { eq }) => eq(a.id, accountId), if (event.loggedOut) {
columns: { lastConnectedAt: true }, await this.stop(accountId);
}); } else if (event.restartRequired) {
if (account?.lastConnectedAt) { // Status 515 — the post-pair-success reconnect. Always re-open
// immediately (no 5 s back-off, no `lastConnectedAt` gate). If
// we don't, the auth handshake never completes and the user
// sees a spurious "Pairing timed out".
const timer = setTimeout(() => { const timer = setTimeout(() => {
this.reconnectTimers.delete(accountId); this.reconnectTimers.delete(accountId);
void this.stop(accountId).then(() => this.start(accountId)); void this.stop(accountId).then(() => this.start(accountId));
}, 5000); }, 250);
this.reconnectTimers.set(accountId, timer); this.reconnectTimers.set(accountId, timer);
} else { } else {
// Brand-new account that hasn't authenticated yet — let the // Other ephemeral closes (refs exhausted, network blip): only
// pair-handler clean up via its timeout. // auto-reconnect for accounts that have been linked at least
await this.stop(accountId); // once. During an initial pair attempt this would otherwise
// restart the pair dance and rotate the QR every few seconds.
const account = await db.query.whatsappAccounts.findFirst({
where: (a, { eq }) => eq(a.id, accountId),
columns: { lastConnectedAt: true },
});
if (account?.lastConnectedAt) {
const timer = setTimeout(() => {
this.reconnectTimers.delete(accountId);
void this.stop(accountId).then(() => this.start(accountId));
}, 5000);
this.reconnectTimers.set(accountId, timer);
} else {
// Brand-new account that hasn't authenticated yet — let the
// pair-handler clean up via its timeout.
await this.stop(accountId);
}
} }
} }
} else if (event.type === "qr") { } else if (event.type === "qr") {