Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ jobs:
id: set-matrix
run: |
# List of all available parsers
ALL_PARSERS="redshift postgresql cql snowflake tsql doris trino plsql googlesql mysql partiql tidb bq mariadb"
ALL_PARSERS="redshift postgresql cql snowflake tsql doris trino plsql googlesql mysql partiql tidb bq mariadb cosmosdb"
# Add more parsers here as they are added to the repository
# ALL_PARSERS="redshift mysql postgresql"

Expand Down
129 changes: 129 additions & 0 deletions cosmosdb/CosmosDBLexer.g4
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
// Lexer for the Cosmos DB SQL dialect; produces the token vocabulary consumed by the parser grammar.
lexer grammar CosmosDBLexer;

options {
// Every literal and character set in this grammar matches regardless of letter case,
// so 'SELECT' also matches "select" and [a-z] also matches upper-case letters.
caseInsensitive = true;
}

// One fragment per Latin letter. No rule in this lexer references them; the
// grammar-wide caseInsensitive option already makes every literal match both cases.
fragment A: 'a';
fragment B: 'b';
fragment C: 'c';
fragment D: 'd';
fragment E: 'e';
fragment F: 'f';
fragment G: 'g';
fragment H: 'h';
fragment I: 'i';
fragment J: 'j';
fragment K: 'k';
fragment L: 'l';
fragment M: 'm';
fragment N: 'n';
fragment O: 'o';
fragment P: 'p';
fragment Q: 'q';
fragment R: 'r';
fragment S: 's';
fragment T: 't';
fragment U: 'u';
fragment V: 'v';
fragment W: 'w';
fragment X: 'x';
fragment Y: 'y';
fragment Z: 'z';

// Wildcard in SELECT lists; also listed as a binary operator by the parser.
MULTIPLY_OPERATOR: '*';

// Keywords. They are declared before IDENTIFIER on purpose: when a keyword and
// IDENTIFIER match the same text, ANTLR picks the rule that is defined first.
AS_SYMBOL: 'AS';
SELECT_SYMBOL: 'SELECT';
FROM_SYMBOL: 'FROM';
DISTINCT_SYMBOL: 'DISTINCT';
UNDEFINED_SYMBOL: 'UNDEFINED';
NULL_SYMBOL: 'NULL';
FALSE_SYMBOL: 'FALSE';
TRUE_SYMBOL: 'TRUE';
NOT_SYMBOL: 'NOT';
UDF_SYMBOL: 'UDF';
WHERE_SYMBOL: 'WHERE';
AND_SYMBOL: 'AND';
OR_SYMBOL: 'OR';

// Punctuation and operator tokens.
// Naming: LC/RC = left/right curly, LS/RS = left/right square, LR/RR = left/right round.
AT_SYMBOL: '@';
LC_BRACKET_SYMBOL: '{';
RC_BRACKET_SYMBOL: '}';
LS_BRACKET_SYMBOL: '[';
RS_BRACKET_SYMBOL: ']';
LR_BRACKET_SYMBOL: '(';
RR_BRACKET_SYMBOL: ')';
// The two quote tokens are also reused as delimiters by the string literal rules.
SINGLE_QUOTE_SYMBOL: '\'';
DOUBLE_QUOTE_SYMBOL: '"';
COMMA_SYMBOL: ',';
DOT_SYMBOL: '.';
QUESTION_MARK_SYMBOL: '?';
COLON_SYMBOL: ':';
PLUS_SYMBOL: '+';
MINUS_SYMBOL: '-';
BIT_NOT_SYMBOL: '~';
DIVIDE_SYMBOL: '/';
MODULO_SYMBOL: '%';
BIT_AND_SYMBOL: '&';
BIT_OR_SYMBOL: '|';
// Longest match wins, so "||" becomes this token rather than two BIT_OR_SYMBOL tokens.
DOUBLE_BAR_SYMBOL: '||';
BIT_XOR_SYMBOL: '^';
EQUAL_SYMBOL: '=';

/* Identifiers */
// An identifier starts with a letter or an underscore and continues with letters,
// digits or underscores. The leading underscore is required for Cosmos DB system
// properties such as _ts, _etag, _rid and _self (e.g. "SELECT c._ts FROM c").
// Letter case is irrelevant because the grammar sets caseInsensitive = true.
IDENTIFIER: [a-z_] [a-z_0-9]*;

// Blanks, tabs, form feeds and line breaks are routed to the hidden channel,
// one character per token, so the parser never sees them.
WHITESPACE: (' ' | '\t' | '\f' | '\r' | '\n') -> channel(HIDDEN);

// Numeric literals in base 10.
fragment DEC_DIGIT: '0' ..'9';

// A number containing a decimal point; digits may be missing on one side, not both.
fragment DEC_DOT_DEC:
	DEC_DIGIT+ '.' DEC_DIGIT*
	| '.' DEC_DIGIT+;

// Plain integer, e.g. 42.
DECIMAL: DEC_DIGIT+;
// Integer or fractional mantissa followed by a mandatory exponent, e.g. 1e10 or 2.5E-3.
REAL: (DEC_DIGIT+ | DEC_DOT_DEC) 'E' ('+' | '-')? DEC_DIGIT+;
// Fractional number without exponent, e.g. 3.14, 3. or .14.
FLOAT: DEC_DOT_DEC;

// Base-16 literal: the prefix 0x followed by at least one hex digit.
fragment HEX_DIGIT: [0-9] | [A-F];
HEXADECIMAL: '0X' HEX_DIGIT+;

// Ranges of non-ASCII characters in the Basic Multilingual Plane.
// Case-insensitive matching is switched off for this rule only.
// NOTE(review): no rule in this lexer references this fragment — confirm whether
// IDENTIFIER was meant to use it.
fragment FullWidthLetter options {
caseInsensitive = false;
}:
'\u00c0' ..'\u00d6'
| '\u00d8' ..'\u00f6'
| '\u00f8' ..'\u00ff'
| '\u0100' ..'\u1fff'
| '\u2c00' ..'\u2fff'
| '\u3040' ..'\u318f'
| '\u3300' ..'\u337f'
| '\u3400' ..'\u3fff'
| '\u4e00' ..'\u9fff'
| '\ua000' ..'\ud7ff'
| '\uf900' ..'\ufaff'
| '\uff00' ..'\ufff0';
// Four-byte characters are not supported, so the ranges '\u10000'..'\u1F9FF' and
// '\u20000'..'\u2FA1F' are intentionally left out.

// Building blocks for string literals.

// A backslash followed either by one of b t n r f " ' \ / or by u and exactly
// four hex digits (\uXXXX).
fragment ESCAPE_SEQUENCE:
	'\\' (
		[btnrf"'\\/]
		| 'u' HEX_DIGIT HEX_DIGIT HEX_DIGIT HEX_DIGIT
	);

// One character of string content: anything except backslash, either quote
// character, carriage return and line feed — or a complete escape sequence.
fragment STRING_CHAR:
	~[\\"'\r\n]
	| ESCAPE_SEQUENCE;

// Quoted strings. The opening and closing delimiter must be the same quote
// character, and the content is a possibly empty run of STRING_CHAR.
SINGLE_QUOTE_STRING_LITERAL: '\'' STRING_CHAR* '\'';

DOUBLE_QUOTE_STRING_LITERAL: '"' STRING_CHAR* '"';
153 changes: 153 additions & 0 deletions cosmosdb/CosmosDBParser.g4
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
// Parser for Cosmos DB SQL queries of the form SELECT ... FROM ... [WHERE ...].
parser grammar CosmosDBParser;

options {
// Token names used in the rules below are taken from the CosmosDBLexer grammar.
tokenVocab = CosmosDBLexer;
}

// Entry rule: exactly one query, followed by end of input.
root
	: select EOF
	;

// A query is a SELECT clause and a FROM clause with an optional WHERE filter.
select
	: select_clause from_clause where_clause?
	;

select_clause
	: SELECT_SYMBOL select_specification
	;

// Either the wildcard, or a list of projected properties optionally preceded by DISTINCT.
select_specification
	: MULTIPLY_OPERATOR
	| DISTINCT_SYMBOL? object_property_list
	;

from_clause
	: FROM_SYMBOL from_specification
	;

where_clause
	: WHERE_SYMBOL scalar_expression_in_where
	;

// The next two rules are pass-through wrappers around container_expression.
from_specification
	: from_source
	;

from_source
	: container_expression
	;

// A container name with an optional alias; the AS keyword may be omitted.
container_expression
	: container_name (AS_SYMBOL? IDENTIFIER)?
	;

container_name
	: IDENTIFIER
	;

// Comma-separated projection list containing at least one entry.
object_property_list
	: object_property (COMMA_SYMBOL object_property)*
	;

// One projected expression with an optional alias; the AS keyword may be omitted.
object_property
	: scalar_expression (AS_SYMBOL? property_alias)?
	;

property_alias
	: IDENTIFIER
	;

// Expressions allowed in the SELECT list. Reference:
// https://learn.microsoft.com/en-us/azure/cosmos-db/nosql/query/scalar-expressions
//
// Supported forms: an alias, dotted property access, bracket access with a
// double-quoted name or a numeric index, and a unary operator prefix.
scalar_expression
	: input_alias
	| scalar_expression DOT_SYMBOL property_name
	| scalar_expression LS_BRACKET_SYMBOL (
		DOUBLE_QUOTE_STRING_LITERAL
		| array_index
	) RS_BRACKET_SYMBOL
	| unary_operator scalar_expression
	;

// TODO(zp): Merge scalar_expression and scalar_expression_in_where while supporting the project
// fully. https://learn.microsoft.com/en-us/azure/cosmos-db/nosql/query/scalar-expressions
//
// Expression grammar for the WHERE clause.
//
// In a left-recursive ANTLR 4 rule the operator alternatives are prioritised by
// their order: an alternative listed earlier binds tighter. The operator
// alternatives are therefore ordered from highest to lowest precedence:
//   1. member access      a.b   a["b"]   a[0]
//   2. unary operators    -a   +a   ~a
//   3. binary operators   a = b   a + b   ...
//   4. AND
//   5. OR
//   6. ternary            a ? b : c
// With AND/OR listed before the other operators, "c.a = 1 AND c.b = 2" would be
// grouped as "c.a = (1 AND c.b) = 2" instead of "(c.a = 1) AND (c.b = 2)".
//
// NOTE(review): all operators in binary_operator still share a single precedence
// level; splitting them (multiplicative / additive / comparison) is left for later.
scalar_expression_in_where:
	constant
	| input_alias
	| parameter_name
	| scalar_expression_in_where DOT_SYMBOL property_name
	| scalar_expression_in_where LS_BRACKET_SYMBOL (
		(DOUBLE_QUOTE_STRING_LITERAL)
		| (array_index)
	) RS_BRACKET_SYMBOL
	| unary_operator scalar_expression_in_where
	| scalar_expression_in_where binary_operator scalar_expression_in_where
	| scalar_expression_in_where AND_SYMBOL scalar_expression_in_where
	| scalar_expression_in_where OR_SYMBOL scalar_expression_in_where
	| scalar_expression_in_where QUESTION_MARK_SYMBOL scalar_expression_in_where COLON_SYMBOL
		scalar_expression_in_where
	| scalar_function_expression
	| create_object_expression
	| create_array_expression
	| LR_BRACKET_SYMBOL scalar_expression_in_where RR_BRACKET_SYMBOL;

// Array and object construction currently accept constant literals only.
create_array_expression
	: array_constant
	;

create_object_expression
	: object_constant
	;

// A function call is either a user-defined function (udf.name(...)) or a built-in one.
scalar_function_expression
	: udf_scalar_function_expression
	| builtin_function_expression
	;

// Call of a user-defined function: udf.<name>(<args>).
// The argument list is optional so that zero-argument calls such as udf.now() parse.
udf_scalar_function_expression:
	UDF_SYMBOL DOT_SYMBOL IDENTIFIER LR_BRACKET_SYMBOL (
		scalar_expression_in_where (
			COMMA_SYMBOL scalar_expression_in_where
		)*
	)? RR_BRACKET_SYMBOL;

// Call of a built-in function: <name>(<args>).
// The argument list is optional so that zero-argument built-ins such as
// GetCurrentDateTime() parse.
builtin_function_expression:
	IDENTIFIER LR_BRACKET_SYMBOL (
		scalar_expression_in_where (
			COMMA_SYMBOL scalar_expression_in_where
		)*
	)? RR_BRACKET_SYMBOL;

// Every infix operator accepted between two WHERE expressions
// (AND / OR are handled directly in scalar_expression_in_where).
binary_operator
	: MULTIPLY_OPERATOR
	| DIVIDE_SYMBOL
	| MODULO_SYMBOL
	| PLUS_SYMBOL
	| MINUS_SYMBOL
	| BIT_AND_SYMBOL
	| BIT_XOR_SYMBOL
	| BIT_OR_SYMBOL
	| DOUBLE_BAR_SYMBOL
	| EQUAL_SYMBOL
	;

// Prefix operators: bitwise complement and numeric sign.
unary_operator
	: BIT_NOT_SYMBOL
	| PLUS_SYMBOL
	| MINUS_SYMBOL
	;

// Query parameter reference, written as @ followed by an identifier.
parameter_name
	: AT_SYMBOL IDENTIFIER
	;

// Literal values. Reference:
// https://learn.microsoft.com/en-us/azure/cosmos-db/nosql/query/constants
constant
	: undefined_constant
	| null_constant
	| boolean_constant
	| number_constant
	| string_constant
	| array_constant
	| object_constant
	;

// Object literal: { name: constant, "name": constant, ... }.
// The field list is optional so that the empty object {} is accepted, in the same
// way array_constant accepts [].
object_constant:
	LC_BRACKET_SYMBOL (
		object_constant_field_pair (
			COMMA_SYMBOL object_constant_field_pair
		)*
	)? RC_BRACKET_SYMBOL;

// One "key: value" entry of an object literal.
//  - Key and value are separated by a colon; commas only separate entries.
//  - A quoted key is matched as DOUBLE_QUOTE_STRING_LITERAL because the lexer always
//    turns "name" into that single token, never into quote / identifier / quote.
object_constant_field_pair: (
		property_name
		| DOUBLE_QUOTE_STRING_LITERAL
	) COLON_SYMBOL constant;

// Array literal: a possibly empty, comma-separated list of constants in square brackets.
array_constant
	: LS_BRACKET_SYMBOL (constant (COMMA_SYMBOL constant)*)? RS_BRACKET_SYMBOL
	;

string_constant
	: string_literal
	;

undefined_constant
	: UNDEFINED_SYMBOL
	;

null_constant
	: NULL_SYMBOL
	;

boolean_constant
	: TRUE_SYMBOL
	| FALSE_SYMBOL
	;

number_constant
	: decimal_literal
	| hexadecimal_literal
	;

// Strings may be delimited by single or double quotes.
string_literal
	: SINGLE_QUOTE_STRING_LITERAL
	| DOUBLE_QUOTE_STRING_LITERAL
	;

// Integer, exponent-form and fractional numbers.
decimal_literal
	: DECIMAL
	| REAL
	| FLOAT
	;

hexadecimal_literal
	: HEXADECIMAL
	;

// The remaining rules give a role-specific name to a single token.
property_name
	: IDENTIFIER
	;

array_index
	: DECIMAL
	;

input_alias
	: IDENTIFIER
	;
7 changes: 7 additions & 0 deletions cosmosdb/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# None of these targets produces a file with the same name. Declaring them phony
# keeps make from skipping a target when a file or directory called "build" or
# "test" happens to exist.
.PHONY: all build test

# Default target: regenerate the parser, then run its tests.
all: build test

# Generate the Go lexer/parser (with visitor support) into this directory.
build:
	antlr -Dlanguage=Go -package cosmosdb -visitor -o . CosmosDBLexer.g4 CosmosDBParser.g4

# Run only the tests whose name matches TestCosmosDBParser.
test:
	go test -v -run TestCosmosDBParser
7 changes: 7 additions & 0 deletions cosmosdb/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# cosmosdb-parser

Cosmos DB SQL parser based on ANTLR4.

## References

- [Queries in Azure Cosmos DB for NoSQL](https://learn.microsoft.com/en-us/azure/cosmos-db/nosql/query/)
Loading