From 711a50921cbf0a7e5d41c8ff2616e8dee1fa449e Mon Sep 17 00:00:00 2001 From: "Gavin M. Roy" Date: Thu, 11 Jun 2026 19:52:09 -0400 Subject: [PATCH 1/2] fix(codegen): map gram.y IDENT to quoted identifiers too postgres/grammar.js defined quoted_identifier but nothing referenced it, so CREATE TABLE "Foo" (id int) and SELECT "a""b" produced ERROR nodes. In PostgreSQL's lexer a double-quoted identifier is an IDENT terminal, so every IDENT call site must accept both forms. - Map IDENT/UIDENT in BASE_TOKEN_MAP to a new hidden _ident rule, choice($.identifier, $.quoted_identifier), emitted with the lexer rules. Hidden, so the CST keeps (ColId (identifier)) stable and quoted forms surface as (ColId (quoted_identifier)) - word stays on the bare identifier token as tree-sitter requires; the prec.left/prec.dynamic keyword-vs-identifier wrappers now wrap _ident, which is safe because quoted identifiers never lex as keywords - Add corpus cases for quoted table/column names, quoted qualified names, and COLLATE with a quoted collation in index options No new GLR conflicts. Validated against pglifecycle rust-rewrite: its quoted_identifiers_unquote and parses_index_options tests pass with this checkout patched in (the latter was fixed by #28). 
Co-authored-by: Claude --- postgres/grammar.js | 30 +++++++------ postgres/src/grammar.json | 37 ++++++++++------ postgres/src/node-types.json | 52 ++++++++++++++++++++++ postgres/src/parser.c | 4 +- postgres/test/corpus/ddl.txt | 63 +++++++++++++++++++++++++++ postgres/test/corpus/select.txt | 76 +++++++++++++++++++++++++++++++++ script/codegen.js | 10 ++++- 7 files changed, 244 insertions(+), 28 deletions(-) diff --git a/postgres/grammar.js b/postgres/grammar.js index 88281ab..f6efbc3 100644 --- a/postgres/grammar.js +++ b/postgres/grammar.js @@ -205,7 +205,7 @@ module.exports = grammar({ seq($.kw_connection, $.kw_limit, $.SignedIconst), seq($.kw_valid, $.kw_until, $.Sconst), seq($.kw_user, $.role_list), - prec.left(11, prec.dynamic(11, $.identifier)) + prec.left(11, prec.dynamic(11, $._ident)) ), CreateOptRoleElem: $ => choice( $.AlterOptRoleElem, @@ -311,7 +311,7 @@ module.exports = grammar({ ), zone_value: $ => choice( $.Sconst, - prec.left(11, prec.dynamic(11, $.identifier)), + prec.left(11, prec.dynamic(11, $._ident)), seq($.ConstInterval, $.Sconst, optional($.opt_interval)), prec.left(20, prec.dynamic(20, seq($.ConstInterval, '(', $.Iconst, ')', $.Sconst))), $.NumericOnly, @@ -991,7 +991,7 @@ module.exports = grammar({ RowSecurityOptionalWithCheck: $ => prec.left(11, prec.dynamic(11, seq($.kw_with, $.kw_check, '(', $.a_expr, ')'))), RowSecurityDefaultToRole: $ => seq($.kw_to, $.role_list), RowSecurityOptionalToRole: $ => seq($.kw_to, $.role_list), - RowSecurityDefaultPermissive: $ => prec.left(11, prec.dynamic(11, seq($.kw_as, $.identifier))), + RowSecurityDefaultPermissive: $ => prec.left(11, prec.dynamic(11, seq($.kw_as, $._ident))), RowSecurityDefaultForCmd: $ => seq($.kw_for, $.row_security_cmd), row_security_cmd: $ => choice( $.kw_all, @@ -1134,7 +1134,7 @@ module.exports = grammar({ $.old_aggr_elem, seq($.old_aggr_list, ',', $.old_aggr_elem) ), - old_aggr_elem: $ => prec.left(11, prec.dynamic(11, seq($.identifier, '=', $.def_arg))), + old_aggr_elem: $ 
=> prec.left(11, prec.dynamic(11, seq($._ident, '=', $.def_arg))), opt_enum_val_list: $ => $.enum_val_list, enum_val_list: $ => choice( $.Sconst, @@ -1966,7 +1966,7 @@ module.exports = grammar({ seq($.createdb_opt_name, optional($.opt_equal), $.kw_default) ), createdb_opt_name: $ => choice( - prec.left(11, prec.dynamic(11, $.identifier)), + prec.left(11, prec.dynamic(11, $._ident)), seq($.kw_connection, $.kw_limit), $.kw_encoding, $.kw_location, @@ -2505,7 +2505,7 @@ module.exports = grammar({ seq($.xmltable_column_option_list, $.xmltable_column_option_el) ), xmltable_column_option_el: $ => choice( - prec.left(11, prec.dynamic(11, seq($.identifier, $.b_expr))), + prec.left(11, prec.dynamic(11, seq($._ident, $.b_expr))), seq($.kw_default, $.b_expr), prec.right(5, prec.dynamic(5, seq($.kw_not, $.kw_null))), $.kw_null, @@ -2997,7 +2997,7 @@ module.exports = grammar({ ), extract_list: $ => seq($.extract_arg, $.kw_from, $.a_expr), extract_arg: $ => choice( - prec.left(11, prec.dynamic(11, $.identifier)), + prec.left(11, prec.dynamic(11, $._ident)), $.kw_year, $.kw_month, $.kw_day, @@ -3226,30 +3226,30 @@ module.exports = grammar({ prec.left(7, prec.dynamic(7, '=')) ), ColId: $ => choice( - prec.left(11, prec.dynamic(11, $.identifier)), + prec.left(11, prec.dynamic(11, $._ident)), $.unreserved_keyword, $.col_name_keyword ), type_function_name: $ => choice( - prec.left(11, prec.dynamic(11, $.identifier)), + prec.left(11, prec.dynamic(11, $._ident)), $.unreserved_keyword, $.type_func_name_keyword ), NonReservedWord: $ => choice( - prec.left(11, prec.dynamic(11, $.identifier)), + prec.left(11, prec.dynamic(11, $._ident)), $.unreserved_keyword, $.col_name_keyword, $.type_func_name_keyword ), ColLabel: $ => choice( - prec.left(11, prec.dynamic(11, $.identifier)), + prec.left(11, prec.dynamic(11, $._ident)), $.unreserved_keyword, $.col_name_keyword, $.type_func_name_keyword, $.reserved_keyword ), BareColLabel: $ => choice( - prec.left(11, prec.dynamic(11, $.identifier)), + 
prec.left(11, prec.dynamic(11, $._ident)), $.bare_label_keyword ), bare_label_keyword: $ => choice( @@ -4729,6 +4729,12 @@ module.exports = grammar({ // scanner would be needed to handle these correctly. quoted_identifier: _ => token(/"([^"]|"")*"/), + // gram.y's IDENT terminal matches either identifier form (scan.l lexes + // quoted identifiers into IDENT). Hidden so the CST surfaces the + // (identifier) / (quoted_identifier) leaf directly. 'word' below must + // stay on the bare identifier token, so this cannot replace it there. + _ident: $ => choice($.identifier, $.quoted_identifier), + // Positional parameter: $1, $2, ... param: _ => /\$[0-9]+/, diff --git a/postgres/src/grammar.json b/postgres/src/grammar.json index 8c2390e..674e0af 100644 --- a/postgres/src/grammar.json +++ b/postgres/src/grammar.json @@ -822,7 +822,7 @@ "value": 11, "content": { "type": "SYMBOL", - "name": "identifier" + "name": "_ident" } } } @@ -2213,7 +2213,7 @@ "value": 11, "content": { "type": "SYMBOL", - "name": "identifier" + "name": "_ident" } } }, @@ -14215,7 +14215,7 @@ }, { "type": "SYMBOL", - "name": "identifier" + "name": "_ident" } ] } @@ -16112,7 +16112,7 @@ "members": [ { "type": "SYMBOL", - "name": "identifier" + "name": "_ident" }, { "type": "STRING", @@ -31994,7 +31994,7 @@ "value": 11, "content": { "type": "SYMBOL", - "name": "identifier" + "name": "_ident" } } }, @@ -39799,7 +39799,7 @@ "members": [ { "type": "SYMBOL", - "name": "identifier" + "name": "_ident" }, { "type": "SYMBOL", @@ -48339,7 +48339,7 @@ "value": 11, "content": { "type": "SYMBOL", - "name": "identifier" + "name": "_ident" } } }, @@ -51260,7 +51260,7 @@ "value": 11, "content": { "type": "SYMBOL", - "name": "identifier" + "name": "_ident" } } }, @@ -51285,7 +51285,7 @@ "value": 11, "content": { "type": "SYMBOL", - "name": "identifier" + "name": "_ident" } } }, @@ -51310,7 +51310,7 @@ "value": 11, "content": { "type": "SYMBOL", - "name": "identifier" + "name": "_ident" } } }, @@ -51339,7 +51339,7 @@ 
"value": 11, "content": { "type": "SYMBOL", - "name": "identifier" + "name": "_ident" } } }, @@ -51372,7 +51372,7 @@ "value": 11, "content": { "type": "SYMBOL", - "name": "identifier" + "name": "_ident" } } }, @@ -60943,6 +60943,19 @@ "value": "\"([^\"]|\"\")*\"" } }, + "_ident": { + "type": "CHOICE", + "members": [ + { + "type": "SYMBOL", + "name": "identifier" + }, + { + "type": "SYMBOL", + "name": "quoted_identifier" + } + ] + }, "param": { "type": "PATTERN", "value": "\\$[0-9]+" diff --git a/postgres/src/node-types.json b/postgres/src/node-types.json index 2a71ce2..1397b06 100644 --- a/postgres/src/node-types.json +++ b/postgres/src/node-types.json @@ -1064,6 +1064,10 @@ "type": "kw_valid", "named": true }, + { + "type": "quoted_identifier", + "named": true + }, { "type": "role_list", "named": true @@ -1973,6 +1977,10 @@ { "type": "identifier", "named": true + }, + { + "type": "quoted_identifier", + "named": true } ] } @@ -2345,6 +2353,10 @@ "type": "identifier", "named": true }, + { + "type": "quoted_identifier", + "named": true + }, { "type": "unreserved_keyword", "named": true @@ -2368,6 +2380,10 @@ "type": "identifier", "named": true }, + { + "type": "quoted_identifier", + "named": true + }, { "type": "reserved_keyword", "named": true @@ -6203,6 +6219,10 @@ "type": "identifier", "named": true }, + { + "type": "quoted_identifier", + "named": true + }, { "type": "type_func_name_keyword", "named": true @@ -7627,6 +7647,10 @@ { "type": "kw_as", "named": true + }, + { + "type": "quoted_identifier", + "named": true } ] } @@ -13722,6 +13746,10 @@ { "type": "kw_template", "named": true + }, + { + "type": "quoted_identifier", + "named": true } ] } @@ -14446,6 +14474,10 @@ { "type": "kw_year", "named": true + }, + { + "type": "quoted_identifier", + "named": true } ] } @@ -18040,6 +18072,10 @@ { "type": "identifier", "named": true + }, + { + "type": "quoted_identifier", + "named": true } ] } @@ -23572,6 +23608,10 @@ "type": "identifier", "named": true }, + { + "type": 
"quoted_identifier", + "named": true + }, { "type": "type_func_name_keyword", "named": true @@ -25787,6 +25827,10 @@ { "type": "kw_path", "named": true + }, + { + "type": "quoted_identifier", + "named": true } ] } @@ -25849,6 +25893,10 @@ { "type": "opt_interval", "named": true + }, + { + "type": "quoted_identifier", + "named": true } ] } @@ -27958,6 +28006,10 @@ "type": "param", "named": true }, + { + "type": "quoted_identifier", + "named": true + }, { "type": "string_literal", "named": true diff --git a/postgres/src/parser.c b/postgres/src/parser.c index 2498e2c..1a01824 100644 --- a/postgres/src/parser.c +++ b/postgres/src/parser.c @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7b5d892a5330a4292d9d80d549eaedc22cd59298af257a04492e929b5e5a812f -size 96783450 +oid sha256:37faf2c5c7d96c8556b84a53dcd355385c733e4d045e9de1459529619f3e1f55 +size 97199903 diff --git a/postgres/test/corpus/ddl.txt b/postgres/test/corpus/ddl.txt index e712c72..c556b85 100644 --- a/postgres/test/corpus/ddl.txt +++ b/postgres/test/corpus/ddl.txt @@ -397,3 +397,66 @@ CREATE INDEX idx_expr ON t USING btree ((lower(a)) DESC); (index_elem_options (opt_asc_desc (kw_desc))))))))) + +================================================================================ +CREATE TABLE quoted identifiers +================================================================================ + +CREATE TABLE "My Table" ("Col""umn" int); + +-------------------------------------------------------------------------------- + +(source_file + (toplevel_stmt + (stmt + (CreateStmt + (kw_create) + (kw_table) + (qualified_name + (ColId + (quoted_identifier))) + (OptTableElementList + (TableElementList + (TableElement + (columnDef + (ColId + (quoted_identifier)) + (Typename + (SimpleTypename + (Numeric + (kw_int)))))))))))) + +================================================================================ +CREATE INDEX column COLLATE quoted DESC 
+================================================================================ + +CREATE INDEX i ON t (c COLLATE "fr_FR" DESC); + +-------------------------------------------------------------------------------- + +(source_file + (toplevel_stmt + (stmt + (IndexStmt + (kw_create) + (kw_index) + (opt_single_name + (ColId + (identifier))) + (kw_on) + (relation_expr + (qualified_name + (ColId + (identifier)))) + (index_params + (index_elem + (ColId + (identifier)) + (index_elem_options + (opt_collate + (kw_collate) + (any_name + (ColId + (quoted_identifier)))) + (opt_asc_desc + (kw_desc))))))))) diff --git a/postgres/test/corpus/select.txt b/postgres/test/corpus/select.txt index 4ce879f..49cd60b 100644 --- a/postgres/test/corpus/select.txt +++ b/postgres/test/corpus/select.txt @@ -257,3 +257,79 @@ VALUES (1), (2); (AexprConst (Iconst (integer_literal)))))))))))))) + +================================================================================ +SELECT quoted identifiers and qualified names +================================================================================ + +SELECT "a""b" FROM "Sch ema"."T"; + +-------------------------------------------------------------------------------- + +(source_file + (toplevel_stmt + (stmt + (SelectStmt + (select_no_parens + (simple_select + (kw_select) + (opt_target_list + (target_list + (target_el + (a_expr + (a_expr + (c_expr + (columnref + (ColId + (quoted_identifier))))))))) + (from_clause + (kw_from) + (from_list + (table_ref + (relation_expr + (qualified_name + (ColId + (quoted_identifier)) + (indirection + (indirection_el + (attr_name + (ColLabel + (quoted_identifier)))))))))))))))) + +================================================================================ +SELECT quoted schema with unquoted table +================================================================================ + +SELECT x FROM "Quoted".unquoted; + +-------------------------------------------------------------------------------- + 
+(source_file + (toplevel_stmt + (stmt + (SelectStmt + (select_no_parens + (simple_select + (kw_select) + (opt_target_list + (target_list + (target_el + (a_expr + (a_expr + (c_expr + (columnref + (ColId + (identifier))))))))) + (from_clause + (kw_from) + (from_list + (table_ref + (relation_expr + (qualified_name + (ColId + (quoted_identifier)) + (indirection + (indirection_el + (attr_name + (ColLabel + (identifier)))))))))))))))) diff --git a/script/codegen.js b/script/codegen.js index 811b176..ca49189 100644 --- a/script/codegen.js +++ b/script/codegen.js @@ -47,8 +47,8 @@ const MODE_TOKENS = new Set([ * Base (non-keyword) tokens from gram.y — map to tree-sitter rule references. */ const BASE_TOKEN_MAP = { - IDENT: '$.identifier', - UIDENT: '$.identifier', // reduced to IDENT in PG lexer + IDENT: '$._ident', // unquoted or double-quoted identifier + UIDENT: '$._ident', // reduced to IDENT in PG lexer FCONST: '$.float_literal', SCONST: 'choice($.string_literal, $.dollar_quoted_string)', USCONST: 'choice($.string_literal, $.dollar_quoted_string)', // reduced to SCONST in PG lexer @@ -509,6 +509,12 @@ function generateLexerRules() { // scanner would be needed to handle these correctly. quoted_identifier: _ => token(/"([^"]|"")*"/), + // gram.y's IDENT terminal matches either identifier form (scan.l lexes + // quoted identifiers into IDENT). Hidden so the CST surfaces the + // (identifier) / (quoted_identifier) leaf directly. 'word' below must + // stay on the bare identifier token, so this cannot replace it there. + _ident: $ => choice($.identifier, $.quoted_identifier), + // Positional parameter: $1, $2, ... param: _ => /\\$[0-9]+/, From 528eee3e4de911475cfc289385dbae7a3ed82458 Mon Sep 17 00:00:00 2001 From: "Gavin M. 
Roy" Date: Thu, 11 Jun 2026 19:52:19 -0400 Subject: [PATCH 2/2] fix(build): generate-postgres recipe wrote parser to wrong directory With the current tree-sitter CLI, `tree-sitter generate postgres/grammar.js` from the repo root writes output to ./src (gitignored) instead of postgres/src, silently leaving the committed parser stale. Run `tree-sitter generate` from inside postgres/ instead, matching the generate-plpgsql recipe; the recipe line becomes `cd postgres && ../node_modules/.bin/tree-sitter generate`, so it no longer goes through the {{ts}} variable. Co-authored-by: Claude --- justfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/justfile b/justfile index 2ed94b0..d4329aa 100644 --- a/justfile +++ b/justfile @@ -15,7 +15,7 @@ test: # Generate the postgres grammar from PostgreSQL source generate-postgres pg_dir=env("PG_SOURCE_DIR"): node script/generate-grammar.js {{pg_dir}} - {{ts}} generate postgres/grammar.js + cd postgres && ../node_modules/.bin/tree-sitter generate # Generate postgres language injection queries generate-injections: