zorm/sqlParser.go at master · springrain/zorm · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */

package zorm

// sqlSpan is the half-open range [Start, End) of a fragment inside a SQL
// string, expressed as offsets into that string.
type sqlSpan struct {
	Start int // Start offset (inclusive)
	End   int // End offset (exclusive)
}

// sqlPart groups the spans of the clauses and set/DISTINCT keywords found in a
// SQL statement. Each field is a sqlSpan locating that fragment in the
// original SQL text.
type sqlPart struct {
	// TODO: the WITH clause is not handled for now.
	// With sqlSpan   // WITH clause

	Select    sqlSpan // SELECT clause
	From      sqlSpan // FROM clause
	Where     sqlSpan // WHERE clause
	GroupBy   sqlSpan // GROUP BY clause
	OrderBy   sqlSpan // ORDER BY clause
	Distinct  sqlSpan // DISTINCT keyword
	Union     sqlSpan // UNION keyword
	Intersect sqlSpan // INTERSECT keyword
	Except    sqlSpan // EXCEPT keyword
}

// sqlScanner holds the cursor state of a lexical scan that walks a SQL string
// one byte at a time.
type sqlScanner struct {
	sqlStr string // SQL text being scanned
	index  int    // Current scan offset into sqlStr
	sqlLen int    // Total length of sqlStr
	depth  int    // Parenthesis nesting depth, used to tell subqueries from the outer query
}

// ================= 基础能力 / Basic Capabilities =================

// isIdentChar reports whether c can be part of an identifier: an ASCII letter,
// an ASCII digit, or an underscore. Every other byte (including all bytes
// >= 0x80) yields false.
func isIdentChar(c byte) bool {
	switch {
	case 'a' <= c && c <= 'z':
		return true
	case 'A' <= c && c <= 'Z':
		return true
	case '0' <= c && c <= '9':
		return true
	case c == '_':
		return true
	default:
		return false
	}
}

// skipString advances sc.index past the quoted literal whose opening quote is
// at the current position. The opening byte determines the delimiter, so the
// same routine serves both '...' and "..." literals.
//
// Two escape forms are honoured inside the literal:
//   - a backslash followed by any byte (for example \') is skipped as a pair;
//   - a doubled delimiter (for example 'O''Brien') is skipped as a pair.
//
// On return the index sits just past the closing quote. If the literal is
// never closed, the index ends up at the end of the input and no error is
// reported.
func (sc *sqlScanner) skipString() {
	src, limit := sc.sqlStr, sc.sqlLen
	delim := src[sc.index]
	pos := sc.index + 1 // step over the opening quote

	for pos < limit {
		ch := src[pos]
		hasNext := pos+1 < limit

		switch {
		case ch == '\\' && hasNext:
			// Backslash escape: drop the backslash and the byte it protects.
			pos += 2
		case ch == delim && hasNext && src[pos+1] == delim:
			// Doubled delimiter: an escaped quote, not the end of the literal.
			pos += 2
		case ch == delim:
			// Closing quote.
			sc.index = pos + 1
			return
		default:
			pos++
		}
	}

	// Unterminated literal: everything up to the end of input was consumed.
	sc.index = pos
}

// skipComment advances sc.index past the SQL comment that starts at the
// current position and reports whether a comment was found there.
//
// Two comment forms are recognised:
//   - "--" line comments: the index stops on the terminating '\n' (the newline
//     itself is not consumed) or at the end of input;
//   - "/* ... */" block comments: the index stops just past the closing "*/".
//     An unterminated block comment consumes all remaining input, so the
//     caller never re-scans bytes that belong to the comment.
//
// When the current position does not start a comment, the index is left
// untouched and false is returned.
func (sc *sqlScanner) skipComment() bool {
	// Both comment openers are two bytes long.
	if sc.index+1 >= sc.sqlLen {
		return false
	}
	first, second := sc.sqlStr[sc.index], sc.sqlStr[sc.index+1]

	switch {
	case first == '-' && second == '-':
		sc.index += 2
		for sc.index < sc.sqlLen && sc.sqlStr[sc.index] != '\n' {
			sc.index++
		}
		return true

	case first == '/' && second == '*':
		sc.index += 2
		for sc.index+1 < sc.sqlLen {
			if sc.sqlStr[sc.index] == '*' && sc.sqlStr[sc.index+1] == '/' {
				sc.index += 2
				return true
			}
			sc.index++
		}
		// Unterminated comment: the loop above stops one byte short of the
		// end, so move to the end explicitly to swallow the final byte too.
		sc.index = sc.sqlLen
		return true
	}

	return false
}

// ================= 关键字匹配 / Keyword Matching =================

// matchKeyword reports whether the keyword word occurs in s at offset i as a
// whole word, comparing ASCII letters case-insensitively.
//
// word is compared against the lower-cased input, so it must be given in
// lower case. The occurrence only counts when neither the byte before it nor
// the byte after it is an identifier character: "from" is found in
// "x from y" but not in "from_addr" or "afrom".
func matchKeyword(s string, i int, word string) bool {
	total := len(s)
	stop := i + len(word)

	// The keyword must fit inside the remaining input.
	if stop > total {
		return false
	}

	// Left boundary: the keyword may not continue an identifier.
	if i > 0 && isIdentChar(s[i-1]) {
		return false
	}

	// Compare byte by byte, folding upper-case ASCII letters to lower case.
	for k := 0; k < len(word); k++ {
		ch := s[i+k]
		if 'A' <= ch && ch <= 'Z' {
			ch += 'a' - 'A'
		}
		if ch != word[k] {
			return false
		}
	}

	// Right boundary: the keyword may not be the prefix of an identifier.
	return !(stop < total && isIdentChar(s[stop]))
}

// matchTwoKeywords reports whether the two-word keyword "w1 w2" (for example
// "group by" or "order by") starts at offset i of s.
//
// Both words are matched with matchKeyword, so the comparison is
// case-insensitive and respects word boundaries. Any run of spaces, tabs,
// carriage returns and newlines may separate the two words.
func matchTwoKeywords(s string, i int, w1, w2 string) bool {
	if !matchKeyword(s, i, w1) {
		return false
	}

	// Move to the first byte after w1 that is not whitespace.
	next := i + len(w1)
	for next < len(s) && (s[next] == ' ' || s[next] == '\t' || s[next] == '\n' || s[next] == '\r') {
		next++
	}

	// If the input ended here, matchKeyword rejects it through its length check.
	return matchKeyword(s, next, w2)
}

// ================= 核心解析 / Core Parsing =================

// parseSQL scans a SQL statement once and returns the spans of its top-level
// clauses. It replaces a regex-based approach and is used when wrapping a
// query in COUNT(*) for pagination.
//
// Scanning rules:
//   - Quoted text ('...' and "..." literals, `...` identifiers) and comments
//     (-- and /* */) are skipped, so keyword-like text inside them is ignored.
//   - Parentheses are tracked; keywords are only recognised at nesting depth 0,
//     so a FROM inside a subquery does not affect the outer query.
//   - Keywords are matched case-insensitively and on word boundaries.
//
// Result layout:
//   - Select always starts at offset 0 and runs up to the first top-level
//     clause keyword (or to the end of the input when there is none).
//   - From, Where, GroupBy and OrderBy each start at their keyword and end
//     where the next recognised clause keyword starts, or at the end of input.
//   - Distinct, Union, Intersect and Except cover just the keyword itself.
//   - A span whose keyword never appears at the top level keeps its zero value.
//   - If a keyword appears several times at the top level, the last occurrence
//     determines that span's Start.
func parseSQL(sqlStr string) sqlPart {
	sc := &sqlScanner{sqlStr: sqlStr, sqlLen: len(sqlStr)}
	var parts sqlPart

	// current is the clause span still being extended. Everything before the
	// first top-level clause keyword belongs to SELECT, whose Start stays 0.
	current := &parts.Select

	for sc.index < sc.sqlLen {
		c := sqlStr[sc.index]

		switch c {
		case '\'', '"':
			// String literal / quoted identifier: skip it entirely.
			sc.skipString()
			continue

		case '`':
			// Backtick-quoted identifier (e.g. `from`): its content is a name,
			// never a keyword.
			sc.index = skipBacktickIdentifier(sqlStr, sc.index)
			continue

		case '-', '/':
			if sc.skipComment() {
				continue
			}
			// A lone '-' or '/' is an operator; handle it as an ordinary byte.

		case '(':
			sc.depth++
			sc.index++
			continue

		case ')':
			// Never go negative on unbalanced input.
			if sc.depth > 0 {
				sc.depth--
			}
			sc.index++
			continue
		}

		// Keywords only count at the outermost level, not inside subqueries.
		if sc.depth == 0 {
			pos := sc.index
			var next *sqlSpan // clause that starts at pos, if any

			switch c {
			case 'f', 'F':
				if matchKeyword(sqlStr, pos, "from") {
					next = &parts.From
				}
			case 'w', 'W':
				if matchKeyword(sqlStr, pos, "where") {
					next = &parts.Where
				}
			case 'g', 'G':
				if matchTwoKeywords(sqlStr, pos, "group", "by") {
					next = &parts.GroupBy
				}
			case 'o', 'O':
				if matchTwoKeywords(sqlStr, pos, "order", "by") {
					next = &parts.OrderBy
				}
			case 'd', 'D':
				if matchKeyword(sqlStr, pos, "distinct") {
					parts.Distinct = sqlSpan{Start: pos, End: pos + len("distinct")}
				}
			case 'u', 'U':
				if matchKeyword(sqlStr, pos, "union") {
					parts.Union = sqlSpan{Start: pos, End: pos + len("union")}
				}
			case 'i', 'I':
				if matchKeyword(sqlStr, pos, "intersect") {
					parts.Intersect = sqlSpan{Start: pos, End: pos + len("intersect")}
				}
			case 'e', 'E':
				if matchKeyword(sqlStr, pos, "except") {
					parts.Except = sqlSpan{Start: pos, End: pos + len("except")}
				}
			}

			if next != nil {
				// Close the running clause first: next may be the same span
				// when a keyword repeats, and its Start must win.
				current.End = pos
				next.Start = pos
				current = next
			}
		}

		sc.index++
	}

	// The clause still open at the end of input extends to the end. Every
	// earlier clause already received its End when the next one started.
	current.End = sc.sqlLen

	return parts
}

// skipBacktickIdentifier returns the offset just past the backtick-quoted
// identifier whose opening backtick is at s[i]. A doubled backtick inside the
// identifier is an escaped backtick. An unterminated identifier extends to the
// end of s.
func skipBacktickIdentifier(s string, i int) int {
	n := len(s)
	i++ // opening backtick
	for i < n {
		if s[i] != '`' {
			i++
			continue
		}
		if i+1 < n && s[i+1] == '`' {
			i += 2
			continue
		}
		return i + 1
	}
	return n
}