fix(bot): reschedule was silently dropped under stately policy
Scheduled reminder for May 10 8:20 PM never fired. Bot logs showed
"reminder.fire: scheduled" with jobId: null at 12:18 UTC — pg-boss
returned null because the queue was on policy=stately, which dedupes
sends across the (created/active/retry) state cone by singletonKey.
A previous schedule for the same reminder (next recurring fire,
created earlier) was still in 'created' state, so the new send for
today 8:20 PM hit the dedupe and was silently rejected.
Two fixes:
1. Switch the queue policy back to 'standard' (the default) and
force-flip any existing 'stately' queue row on boot. Standard
lets us enqueue across reschedules.
2. scheduleReminderFire now does a pre-send cancel: any 'created'
job for this singletonKey is moved to 'cancelled' before the new
boss.send. The new schedule wins; old stale jobs are tombstoned
so the recurring/edit path produces exactly-one upcoming fire.
Duplicate-fire safety (the 'qwerd msg three times' bug) is already
covered at the handler level by the inner-mutex recent-run check
inside fireReminderInner — that's what stately was guarding against,
and the inner check works under standard too.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
991b7ae0ab
commit
2e6fbfa7a5
@ -8,31 +8,30 @@ import { fireReminder, type FireReminderPayload } from "./fire-reminder.js";
|
||||
export const REMINDER_FIRE_QUEUE = "reminder.fire";
|
||||
|
||||
export async function registerReminderJobs(boss: PgBoss): Promise<void> {
|
||||
// 'stately' = at most 1 job per (state, singletonKey). Combined with
|
||||
// singletonKey="reminder:<id>" on every send, that means a duplicate
|
||||
// schedule call (e.g. operator double-clicked Save, or the
|
||||
// pg_notify('bot.command') consumer fired twice in the same tick)
|
||||
// is folded into the existing 'created' job instead of producing a
|
||||
// second run. The default 'standard' policy DOES NOT dedupe by
|
||||
// singletonKey — that's how we ended up firing a reminder twice
|
||||
// when two reminder.fire jobs landed within microseconds.
|
||||
// https://github.com/timgit/pg-boss/blob/master/docs/usage.md#queue-policies
|
||||
await boss.createQueue(REMINDER_FIRE_QUEUE, { policy: "stately" });
|
||||
// 'standard' (the default) lets us enqueue a new fire even when an
|
||||
// older one for the same singletonKey is still 'created'. We need
|
||||
// that for the recurring/edit path: when a reminder is rescheduled,
|
||||
// scheduleReminderFire() first cancels the stale 'created' job for
|
||||
// this reminder and then sends a new one — under 'stately' the
|
||||
// SECOND send returns null (it dedupes against the first across
|
||||
// states), so a reschedule silently dropped the new fire and the
|
||||
// reminder never fired at the new time. Duplicate-fire safety is
|
||||
// covered at the handler level by the inner-mutex recent-run check
|
||||
// in fire-reminder.ts (see DUPLICATE_FIRE_WINDOW_MS), which catches
|
||||
// the microsecond-spaced send case 'stately' was supposed to guard.
|
||||
await boss.createQueue(REMINDER_FIRE_QUEUE, { policy: "standard" });
|
||||
// pg-boss v12's createQueue is idempotent and DOES NOT update the
|
||||
// policy on an existing queue row. Earlier deployments created the
|
||||
// queue with the default 'standard' policy, so the queue keeps
|
||||
// accepting duplicate enqueues and the same reminder fires twice or
|
||||
// three times. Force the policy via direct SQL on every boot — this
|
||||
// also future-proofs the upgrade path if pg-boss ever lets us flip
|
||||
// policy through its API.
|
||||
// policy on an existing queue row. Earlier deployments forced
|
||||
// policy='stately' here, which broke reschedules. Force-flip back to
|
||||
// 'standard' on every boot so an old queue row doesn't strand us.
|
||||
try {
|
||||
await db.execute(
|
||||
sql`UPDATE pgboss.queue SET policy = 'stately' WHERE name = ${REMINDER_FIRE_QUEUE} AND policy <> 'stately'`,
|
||||
sql`UPDATE pgboss.queue SET policy = 'standard' WHERE name = ${REMINDER_FIRE_QUEUE} AND policy <> 'standard'`,
|
||||
);
|
||||
} catch (err) {
|
||||
logger.warn(
|
||||
{ err },
|
||||
"reminder.fire: failed to force queue policy=stately (handler-level dedupe still applies)",
|
||||
"reminder.fire: failed to force queue policy=standard (handler-level dedupe still applies)",
|
||||
);
|
||||
}
|
||||
await boss.work<FireReminderPayload>(
|
||||
@ -62,6 +61,33 @@ export async function scheduleReminderFire(
|
||||
reminderId: string,
|
||||
scheduledAt: Date,
|
||||
): Promise<string | null> {
|
||||
const singletonKey = `reminder:${reminderId}`;
|
||||
// Replace-then-send. Any 'created' (i.e. not yet started) job for
|
||||
// this reminder is the stale next-fire from the previous schedule
|
||||
// attempt; nuke it so the new schedule wins. Active/completed jobs
|
||||
// are left alone — those represent in-flight or already-fired runs
|
||||
// and the handler-level dedupe handles overlap.
|
||||
try {
|
||||
const cancelled = await db.execute(
|
||||
sql`UPDATE pgboss.job
|
||||
SET state = 'cancelled', completed_on = now()
|
||||
WHERE name = ${REMINDER_FIRE_QUEUE}
|
||||
AND singleton_key = ${singletonKey}
|
||||
AND state = 'created'
|
||||
RETURNING id`,
|
||||
);
|
||||
if (cancelled.rows.length > 0) {
|
||||
logger.info(
|
||||
{ reminderId, cancelled: cancelled.rows.length },
|
||||
"reminder.fire: cancelled stale created jobs before reschedule",
|
||||
);
|
||||
}
|
||||
} catch (err) {
|
||||
// If the cancellation step fails, log but still try to send. Worst
|
||||
// case we end up with two created jobs and the handler-level
|
||||
// recent-run dedupe drops the duplicate fire.
|
||||
logger.warn({ err, reminderId }, "reminder.fire: pre-send cancel failed");
|
||||
}
|
||||
const id = await boss.send(
|
||||
REMINDER_FIRE_QUEUE,
|
||||
{ reminderId },
|
||||
@ -70,8 +96,10 @@ export async function scheduleReminderFire(
|
||||
retryLimit: 3,
|
||||
retryDelay: 30,
|
||||
retryBackoff: true,
|
||||
// Use the reminderId as a singleton key so re-scheduling cancels the old job
|
||||
singletonKey: `reminder:${reminderId}`,
|
||||
// Singleton key kept on the job row for diagnostics + the
|
||||
// pre-send cancel above, even though 'standard' policy doesn't
|
||||
// dedupe by it.
|
||||
singletonKey,
|
||||
},
|
||||
);
|
||||
logger.info({ reminderId, jobId: id, scheduledAt }, "reminder.fire: scheduled");
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user