From 2e6fbfa7a5d5d28d99f4784de831fafe5b148ef0 Mon Sep 17 00:00:00 2001 From: yiekheng Date: Sun, 10 May 2026 20:27:52 +0800 Subject: [PATCH] fix(bot): reschedule was silently dropped under stately policy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Scheduled reminder for May 10 8:20 PM never fired. Bot logs showed "reminder.fire: scheduled" with jobId: null at 12:18 UTC — pg-boss returned null because the queue was on policy=stately, which dedupes sends across the (created/active/retry) state cone by singletonKey. A previous schedule for the same reminder (next recurring fire, created earlier) was still in 'created' state, so the new send for today 8:20 PM hit the dedupe and was silently rejected. Two fixes: 1. Switch the queue policy back to 'standard' (the default) and force-flip any existing 'stately' queue row on boot. Standard lets us enqueue across reschedules. 2. scheduleReminderFire now does a pre-send cancel: any 'created' job for this singletonKey is moved to 'cancelled' before the new boss.send. The new schedule wins; old stale jobs are tombstoned so the recurring/edit path produces exactly-one upcoming fire. Duplicate-fire safety (the 'qwerd msg three times' bug) is already covered at the handler level by the inner-mutex recent-run check inside fireReminderInner — that's what stately was guarding against, and the inner check works under standard too. Co-Authored-By: Claude Opus 4.7 (1M context) --- apps/bot/src/scheduler/reminder-jobs.ts | 68 +++++++++++++++++-------- 1 file changed, 48 insertions(+), 20 deletions(-) diff --git a/apps/bot/src/scheduler/reminder-jobs.ts b/apps/bot/src/scheduler/reminder-jobs.ts index 3a03311..8b7a056 100644 --- a/apps/bot/src/scheduler/reminder-jobs.ts +++ b/apps/bot/src/scheduler/reminder-jobs.ts @@ -8,31 +8,30 @@ import { fireReminder, type FireReminderPayload } from "./fire-reminder.js"; export const REMINDER_FIRE_QUEUE = "reminder.fire"; export async function registerReminderJobs(boss: PgBoss): Promise { - // 'stately' = at most 1 job per (state, singletonKey). Combined with - // singletonKey="reminder:" on every send, that means a duplicate - // schedule call (e.g. operator double-clicked Save, or the - // pg_notify('bot.command') consumer fired twice in the same tick) - // is folded into the existing 'created' job instead of producing a - // second run. The default 'standard' policy DOES NOT dedupe by - // singletonKey — that's how we ended up firing a reminder twice - // when two reminder.fire jobs landed within microseconds. - // https://github.com/timgit/pg-boss/blob/master/docs/usage.md#queue-policies - await boss.createQueue(REMINDER_FIRE_QUEUE, { policy: "stately" }); + // 'standard' (the default) lets us enqueue a new fire even when an + // older one for the same singletonKey is still 'created'. We need + // that for the recurring/edit path: when a reminder is rescheduled, + // scheduleReminderFire() first cancels the stale 'created' job for + // this reminder and then sends a new one — under 'stately' the + // SECOND send returns null (it dedupes against the first across + // states), so a reschedule silently dropped the new fire and the + // reminder never fired at the new time. Duplicate-fire safety is + // covered at the handler level by the inner-mutex recent-run check + // in fire-reminder.ts (see DUPLICATE_FIRE_WINDOW_MS), which catches + // the microsecond-spaced send case 'stately' was supposed to guard. + await boss.createQueue(REMINDER_FIRE_QUEUE, { policy: "standard" }); // pg-boss v12's createQueue is idempotent and DOES NOT update the - // policy on an existing queue row. Earlier deployments created the - // queue with the default 'standard' policy, so the queue keeps - // accepting duplicate enqueues and the same reminder fires twice or - // three times. Force the policy via direct SQL on every boot — this - // also future-proofs the upgrade path if pg-boss ever lets us flip - // policy through its API. + // policy on an existing queue row. Earlier deployments forced + // policy='stately' here, which broke reschedules. Force-flip back to + // 'standard' on every boot so an old queue row doesn't strand us. try { await db.execute( - sql`UPDATE pgboss.queue SET policy = 'stately' WHERE name = ${REMINDER_FIRE_QUEUE} AND policy <> 'stately'`, + sql`UPDATE pgboss.queue SET policy = 'standard' WHERE name = ${REMINDER_FIRE_QUEUE} AND policy <> 'standard'`, ); } catch (err) { logger.warn( { err }, - "reminder.fire: failed to force queue policy=stately (handler-level dedupe still applies)", + "reminder.fire: failed to force queue policy=standard (handler-level dedupe still applies)", ); } await boss.work( @@ -62,6 +61,33 @@ export async function scheduleReminderFire( reminderId: string, scheduledAt: Date, ): Promise { + const singletonKey = `reminder:${reminderId}`; + // Replace-then-send. Any 'created' (i.e. not yet started) job for + // this reminder is the stale next-fire from the previous schedule + // attempt; nuke it so the new schedule wins. Active/completed jobs + // are left alone — those represent in-flight or already-fired runs + // and the handler-level dedupe handles overlap. + try { + const cancelled = await db.execute( + sql`UPDATE pgboss.job + SET state = 'cancelled', completed_on = now() + WHERE name = ${REMINDER_FIRE_QUEUE} + AND singleton_key = ${singletonKey} + AND state = 'created' + RETURNING id`, + ); + if (cancelled.rows.length > 0) { + logger.info( + { reminderId, cancelled: cancelled.rows.length }, + "reminder.fire: cancelled stale created jobs before reschedule", + ); + } + } catch (err) { + // If the cancellation step fails, log but still try to send. Worst + // case we end up with two created jobs and the handler-level + // recent-run dedupe drops the duplicate fire. + logger.warn({ err, reminderId }, "reminder.fire: pre-send cancel failed"); + } const id = await boss.send( REMINDER_FIRE_QUEUE, { reminderId }, @@ -70,8 +96,10 @@ export async function scheduleReminderFire( retryLimit: 3, retryDelay: 30, retryBackoff: true, - // Use the reminderId as a singleton key so re-scheduling cancels the old job - singletonKey: `reminder:${reminderId}`, + // Singleton key kept on the job row for diagnostics + the + // pre-send cancel above, even though 'standard' policy doesn't + // dedupe by it. + singletonKey, }, ); logger.info({ reminderId, jobId: id, scheduledAt }, "reminder.fire: scheduled");