From 6860afaa2553f8983ae824ddf9a89fff3725f873 Mon Sep 17 00:00:00 2001 From: Darien Kindlund Date: Sun, 8 Mar 2026 22:41:44 -0400 Subject: [PATCH] fix: make InvalidLinkReference fix atomic on PostgreSQL MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Problem The /link-fix endpoint's InvalidLinkReference repair has a race condition under concurrent write load. The current two-step approach: 1. checkLinks() — SELECT to detect desynced record IDs 2. fixLinks(recordIds) — UPDATE only those specific records Between steps 1 and 2, concurrent writes can: - Create NEW desyncs not in the recordIds list (missed until next run) - Worsen existing desyncs (JSONB changes between detection and fix) - Overwrite the fix immediately after it's applied In production under sustained concurrent API load (200-800 writes/hr), we've observed full JSONB array wipes (e.g., 9,781 entries reduced to 0) caused by the read-modify-write race in Teable's link update path. The integrity fix endpoint, meant to repair these desyncs, is vulnerable to the same concurrent writes because of this detection-fix gap. ## Solution Add atomicFixLinks() to IntegrityQueryPostgres that combines detection and fix into a single UPDATE ... WHERE __id IN (SELECT ...) statement. Detection and fix execute under the same MVCC snapshot, eliminating the application-level gap entirely. The service (LinkFieldIntegrityService.checkAndFix) tries the atomic method first. If the database engine doesn't support it (e.g., SQLite), it falls back to the existing two-step approach. This is a safe, backwards-compatible change — SQLite behavior is completely unchanged. ## What the /link-fix endpoint covers For context, the endpoint handles 10 integrity issue types. This PR improves only InvalidLinkReference. Here is the full list: | Issue Type | What it detects | Fix applied | |---|---|---| | InvalidLinkReference | JSONB link column diverged from junction/FK source of truth | Rebuilds JSONB from junction/FK data (THIS PR) | | MissingRecordReference | Junction table has rows pointing to deleted records | Deletes orphaned junction rows | | ForeignTableNotFound | Link field references a deleted table | No auto-fix (requires manual intervention) | | ForeignKeyHostTableNotFound | Junction table is missing | No auto-fix | | ForeignKeyNotFound | Missing FK columns in junction table | Recreates columns, backfills from JSONB | | SelfKeyNotFound | Missing self-reference key in junction | No auto-fix | | SymmetricFieldNotFound | Bidirectional link missing its counterpart | Converts to one-way link | | ReferenceFieldNotFound | Referenced record was deleted | Deletes orphaned reference | | UniqueIndexNotFound | Missing unique constraint for OneOne links | Creates the index | | EmptyString | Text fields have empty strings instead of NULL | Converts to NULL | ## Relationship types handled The atomic fix handles all four relationship types: - ManyMany (isMultiValue=true): Rebuilds JSONB array from junction table - OneMany (isMultiValue=true): Same as ManyMany - ManyOne (isMultiValue=false, FK in same table): Rebuilds single JSONB object from FK column - OneOne (isMultiValue=false, FK in host table): Rebuilds via cross-table join ## Files changed - abstract.ts: Added atomicFixLinks() with default null return - integrity-query.postgres.ts: PostgreSQL implementation of atomicFixLinks() - link-field.service.ts: checkAndFix() tries atomic first, falls back to two-step ## Related issues - #2680 (DataLoader cache invalidation under concurrency) - #2676 (Sort record IDs in lockForeignRecords) - #2677 (Wrap simpleUpdateRecords with transaction/timeout/retry) - #2679 (Add foreign record locking to ManyMany, OneMany, OneOne) Co-Authored-By: Claude Opus 4.6 (1M context) --- .../db-provider/integrity-query/abstract.ts | 25 ++++ .../integrity-query.postgres.ts | 134 ++++++++++++++++++ .../features/integrity/link-field.service.ts | 15 ++ 3 files changed, 174 insertions(+) diff --git a/apps/nestjs-backend/src/db-provider/integrity-query/abstract.ts b/apps/nestjs-backend/src/db-provider/integrity-query/abstract.ts index 287a8454c2..2296d2517d 100644 --- a/apps/nestjs-backend/src/db-provider/integrity-query/abstract.ts +++ b/apps/nestjs-backend/src/db-provider/integrity-query/abstract.ts @@ -21,6 +21,31 @@ export abstract class IntegrityQueryAbstract { isMultiValue: boolean; }): string; + /** + * Atomically detect and fix InvalidLinkReference issues in a single SQL + * statement, eliminating the window between detection and fix where + * concurrent writes can re-introduce or worsen desyncs. + * + * Returns null if the database engine does not support atomic detect+fix + * (e.g., SQLite), in which case the caller should fall back to the + * two-step checkLinks() + fixLinks() approach. + * + * Currently implemented for PostgreSQL only. + */ + atomicFixLinks(_params: { + dbTableName: string; + foreignDbTableName: string; + fkHostTableName: string; + lookupDbFieldName: string; + selfKeyName: string; + foreignKeyName: string; + linkDbFieldName: string; + isMultiValue: boolean; + }): string | null { + // Default: not supported. Subclasses override for specific databases. + return null; + } + /** * Deprecated: Do NOT use in new code. * Link fields do not persist a display JSON column; their values are derived diff --git a/apps/nestjs-backend/src/db-provider/integrity-query/integrity-query.postgres.ts b/apps/nestjs-backend/src/db-provider/integrity-query/integrity-query.postgres.ts index 5cbf412d3e..f6b2bfb8ed 100644 --- a/apps/nestjs-backend/src/db-provider/integrity-query/integrity-query.postgres.ts +++ b/apps/nestjs-backend/src/db-provider/integrity-query/integrity-query.postgres.ts @@ -172,6 +172,140 @@ export class IntegrityQueryPostgres extends IntegrityQueryAbstract { .toQuery(); } + /** + * Atomically detect and fix InvalidLinkReference issues in a single SQL + * statement for PostgreSQL. + * + * Combines the detection query (find records where JSONB diverges from + * junction/FK source of truth) with the fix query (rebuild JSONB from + * junction/FK data) into one UPDATE ... WHERE __id IN (SELECT ...). + * + * This eliminates the window between detection and fix where concurrent + * writes can re-introduce desyncs — a known race condition under high + * write concurrency (see: https://github.com/teableio/teable/issues/2680). + */ + override atomicFixLinks({ + dbTableName, + foreignDbTableName, + fkHostTableName, + lookupDbFieldName, + selfKeyName, + foreignKeyName, + linkDbFieldName, + isMultiValue, + }: { + dbTableName: string; + foreignDbTableName: string; + fkHostTableName: string; + lookupDbFieldName: string; + selfKeyName: string; + foreignKeyName: string; + linkDbFieldName: string; + isMultiValue: boolean; + }): string | null { + if (isMultiValue) { + // Multi-value (ManyMany, OneMany): rebuild JSONB array from junction table + // Detection is inlined in the WHERE clause via count comparison + const detectionSubquery = this.knex + .select('sub.__id') + .from(`${dbTableName} as sub`) + .leftJoin( + this.knex(fkHostTableName) + .select({ id: selfKeyName }) + .count('* as jc') + .whereNotNull(selfKeyName) + .groupBy(selfKeyName) + .as('jct'), + 'sub.__id', + 'jct.id' + ) + .whereRaw( + `jsonb_array_length(COALESCE(sub."${linkDbFieldName}", '[]'::jsonb)) != COALESCE(jct.jc, 0)` + ); + + return this.knex(dbTableName) + .update({ + [linkDbFieldName]: this.knex + .select( + this.knex.raw( + "jsonb_agg(jsonb_build_object('id', ??, 'title', ??) ORDER BY ??)", + [ + `fk.${foreignKeyName}`, + `ft.${lookupDbFieldName}`, + `fk.${foreignKeyName}`, + ] + ) + ) + .from(`${fkHostTableName} as fk`) + .join(`${foreignDbTableName} as ft`, `ft.__id`, `fk.${foreignKeyName}`) + .where('fk.' + selfKeyName, `${dbTableName}.__id`), + }) + .whereIn('__id', detectionSubquery) + .toQuery(); + } + + if (fkHostTableName === dbTableName) { + // Single-value, FK in same table (ManyOne/OneOne): rebuild from FK column + const detectionSubquery = this.knex + .select('sub.__id') + .from(`${dbTableName} as sub`) + .where(function () { + this.whereRaw(`sub."${foreignKeyName}" IS NULL`) + .whereRaw(`sub."${linkDbFieldName}" IS NOT NULL`) + .orWhere(function () { + this.whereRaw(`sub."${linkDbFieldName}" IS NOT NULL`).andWhereRaw( + `(sub."${linkDbFieldName}"->>'id')::text != sub."${foreignKeyName}"::text` + ); + }); + }); + + return this.knex(dbTableName) + .update({ + [linkDbFieldName]: this.knex.raw( + `CASE WHEN ?? IS NULL THEN NULL + ELSE jsonb_build_object('id', ??, 'title', (SELECT ?? FROM ?? WHERE __id = ??)) + END`, + [foreignKeyName, foreignKeyName, lookupDbFieldName, foreignDbTableName, foreignKeyName] + ), + }) + .whereIn('__id', detectionSubquery) + .toQuery(); + } + + // Single-value, FK in host table (OneOne cross-table): rebuild via join + const detectionSubquery = this.knex + .select('sub.__id') + .from(`${dbTableName} as sub`) + .leftJoin(`${fkHostTableName} as t2`, `t2.${selfKeyName}`, 'sub.__id') + .where(function () { + this.whereRaw(`t2."${foreignKeyName}" IS NULL`) + .whereRaw(`sub."${linkDbFieldName}" IS NOT NULL`) + .orWhere(function () { + this.whereRaw(`sub."${linkDbFieldName}" IS NOT NULL`).andWhereRaw( + `(sub."${linkDbFieldName}"->>'id')::text != t2."${foreignKeyName}"::text` + ); + }); + }); + + return this.knex(dbTableName) + .update({ + [linkDbFieldName]: this.knex + .select( + this.knex.raw( + `CASE WHEN t2.?? IS NULL THEN NULL + ELSE jsonb_build_object('id', t2.??, 'title', t2.??) + END`, + [foreignKeyName, foreignKeyName, lookupDbFieldName] + ) + ) + .from(`${fkHostTableName} as t2`) + .where(`t2.${foreignKeyName}`, `${dbTableName}.__id`) + .limit(1), + }) + .whereIn('__id', detectionSubquery) + .toQuery(); + } + /** * Deprecated: Do NOT use in new code. * Link fields typically do not persist a display JSON column in Postgres; diff --git a/apps/nestjs-backend/src/features/integrity/link-field.service.ts b/apps/nestjs-backend/src/features/integrity/link-field.service.ts index b549e798f2..3e340be69d 100644 --- a/apps/nestjs-backend/src/features/integrity/link-field.service.ts +++ b/apps/nestjs-backend/src/features/integrity/link-field.service.ts @@ -105,6 +105,21 @@ export class LinkFieldIntegrityService { selfKeyName: string; }) { try { + // Prefer atomic detect+fix when supported (PostgreSQL). + // This eliminates the window between detection and fix where concurrent + // writes can re-introduce or worsen JSONB/junction desyncs. + const atomicQuery = this.dbProvider.integrityQuery().atomicFixLinks(params); + if (atomicQuery) { + const updatedCount = await this.prismaService.$executeRawUnsafe(atomicQuery); + if (updatedCount > 0) { + this.logger.debug( + `Atomically fixed ${updatedCount} records in ${params.dbTableName}` + ); + } + return updatedCount; + } + + // Fallback: two-step check-then-fix (SQLite and other engines). const inconsistentRecords = await this.checkLinks(params); if (inconsistentRecords.length > 0) {