Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions docs/language/conformance.md
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,19 @@ code appears among the diagnostics for that file. Diagnostic codes are the ones
registered in `internal/diagnostics/registry.go` and documented in
`docs/reference/diagnostic-codes.md`.

## Scope and limits

The corpus uses single-file `CheckSource`, so it pins what one file can verify
without a project: package and metadata declarations, route forms, `view {}`
markup, `style {}`, literal `build {}`, slots, and the rejection contracts below.

It cannot cleanly cover constructs that require project context: reactive `g:`
directives (`g:if`/`g:on`/`g:bind`) reference a Go-typed `state` contract that
cannot be resolved from a single file; endpoint forms (`act`/`api`) need exported
Go handlers; and `layout`/`wasm`/`asset`/`css` need sibling files or config. Those
constructs are exercised by the package- and build-level tests instead. Expanding
the corpus into a project-level harness for them is tracked separately.

## Coverage

`TestConformanceCorpusCoversRejectionContracts` fails when a rejection contract
Expand Down
5 changes: 5 additions & 0 deletions docs/product/language-server.md
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,9 @@ Developers editing `.gwdk` files need live feedback from the same language tooli
missing GOWDK `use` aliases.
- Return full-document semantic tokens for `.gwdk` decorators, identifiers,
strings, and operators.
- Return a document outline (top-level package, metadata, imports, uses, blocks,
endpoints, and component/page declarations) from the recursive-descent outline
pass over the shared tokenizer.

### Non-Functional

Expand All @@ -78,6 +81,8 @@ Developers editing `.gwdk` files need live feedback from the same language tooli
- [x] `textDocument/references` returns open-document references for page IDs, routes, components, stores, and guards.
- [x] `textDocument/codeAction` returns quick fixes for old endpoint syntax and missing GOWDK use aliases.
- [x] `textDocument/semanticTokens/full` returns encoded token data for open `.gwdk` buffers.
- [x] `textDocument/documentSymbol` returns a top-level outline parsed by the
recursive-descent outline pass over the shared tokenizer (ADR 0010).
- [x] `go test ./...` and `go build ./cmd/gowdk` pass.

## Edge Cases
Expand Down
69 changes: 49 additions & 20 deletions internal/lang/lexer.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,19 +4,44 @@ import "unicode"

// Lex tokenizes .gwdk source for editor and CLI tooling.
//
// The source is scanned rune by rune, and a parallel table of byte offsets is
// handed to the scanner so positions can also be reported as 0-based byte
// offsets into the original string. It returns whatever the scanner's scan
// method produces.
func Lex(source string) ([]Token, Diagnostics) {
	runes := []rune(source)
	// byteOffsets[i] is the 0-based byte offset of rune i in the original
	// source; the final entry is the total byte length. Offsets are taken from
	// ranging the original string (which reports true byte positions) rather
	// than summing utf8.RuneLen, so malformed UTF-8 — where []rune turns each
	// bad byte into a 3-byte U+FFFD — does not drift token offsets.
	byteOffsets := make([]int, len(runes)+1)
	runeIndex := 0
	for byteIndex := range source {
		byteOffsets[runeIndex] = byteIndex
		runeIndex++
	}
	byteOffsets[len(runes)] = len(source)

	// Each field is set exactly once; repeating a key in a composite literal
	// is a compile error.
	lexer := scanner{
		source:      runes,
		byteOffsets: byteOffsets,
		line:        1,
		column:      1,
	}
	return lexer.scan()
}

// scanner holds the lexer's cursor state over a decoded source.
type scanner struct {
	// source is the input decoded to runes; index addresses into it.
	source []rune
	// byteOffsets maps a rune index to its 0-based byte offset in the
	// original string, with one extra trailing entry for the total byte
	// length.
	byteOffsets []int
	// index is the rune index of the cursor within source.
	index int
	// line and column are the cursor's position; Lex starts both at 1.
	line   int
	column int
}

// offset reports the 0-based byte offset, within the original source, of the
// rune the cursor is on. When the cursor index is at or beyond the end of the
// offset table, the table's final entry is reported instead.
func (scanner *scanner) offset() int {
	table := scanner.byteOffsets
	last := len(table) - 1
	if scanner.index > last {
		return table[last]
	}
	return table[scanner.index]
}

func (scanner *scanner) scan() ([]Token, Diagnostics) {
Expand All @@ -26,13 +51,14 @@ func (scanner *scanner) scan() ([]Token, Diagnostics) {
for !scanner.done() {
ch := scanner.peek()
pos := scanner.position()
offset := scanner.offset()

switch {
case ch == '\r':
scanner.advance()
case ch == '\n':
scanner.advance()
tokens = append(tokens, Token{Kind: TokenNewline, Lexeme: "\n", Pos: pos})
tokens = append(tokens, Token{Kind: TokenNewline, Lexeme: "\n", Pos: pos, Offset: offset})
case unicode.IsSpace(ch):
scanner.advance()
case ch == '/' && scanner.peekNext() == '/':
Expand All @@ -47,47 +73,49 @@ func (scanner *scanner) scan() ([]Token, Diagnostics) {
}
case ch == '{':
scanner.advance()
tokens = append(tokens, Token{Kind: TokenLBrace, Lexeme: "{", Pos: pos})
tokens = append(tokens, Token{Kind: TokenLBrace, Lexeme: "{", Pos: pos, Offset: offset})
case ch == '}':
scanner.advance()
tokens = append(tokens, Token{Kind: TokenRBrace, Lexeme: "}", Pos: pos})
tokens = append(tokens, Token{Kind: TokenRBrace, Lexeme: "}", Pos: pos, Offset: offset})
case ch == ',':
scanner.advance()
tokens = append(tokens, Token{Kind: TokenComma, Lexeme: ",", Pos: pos})
tokens = append(tokens, Token{Kind: TokenComma, Lexeme: ",", Pos: pos, Offset: offset})
case ch == ':':
scanner.advance()
tokens = append(tokens, Token{Kind: TokenColon, Lexeme: ":", Pos: pos})
tokens = append(tokens, Token{Kind: TokenColon, Lexeme: ":", Pos: pos, Offset: offset})
case ch == '?':
scanner.advance()
tokens = append(tokens, Token{Kind: TokenQuestion, Lexeme: "?", Pos: pos})
tokens = append(tokens, Token{Kind: TokenQuestion, Lexeme: "?", Pos: pos, Offset: offset})
case ch == '=' && scanner.peekNext() == '>':
scanner.advance()
scanner.advance()
tokens = append(tokens, Token{Kind: TokenArrow, Lexeme: "=>", Pos: pos})
tokens = append(tokens, Token{Kind: TokenArrow, Lexeme: "=>", Pos: pos, Offset: offset})
default:
tokens = append(tokens, scanner.text())
}
}

tokens = append(tokens, Token{Kind: TokenEOF, Pos: scanner.position()})
tokens = append(tokens, Token{Kind: TokenEOF, Pos: scanner.position(), Offset: scanner.offset()})
return tokens, diagnostics
}

// identifier consumes a run of identifier characters (as accepted by
// isIdentPart), plus '.' and '-', starting at the cursor and returns it as a
// single token. The token's Pos and Offset are those of the first consumed
// rune. A lexeme that is line-leading and recognised by isMetadataLexeme is
// returned as TokenMetadata; anything else is TokenIdentifier.
func (scanner *scanner) identifier() Token {
	// Capture the start position and byte offset before the cursor moves.
	pos := scanner.position()
	offset := scanner.offset()
	start := scanner.index
	for !scanner.done() && (isIdentPart(scanner.peek()) || scanner.peek() == '.' || scanner.peek() == '-') {
		scanner.advance()
	}
	lexeme := string(scanner.source[start:scanner.index])
	// Each branch has exactly one return, and it carries Offset; an earlier
	// return without Offset would shadow it and leave the offset at zero.
	if scanner.isLineLeading(start) && isMetadataLexeme(lexeme) {
		return Token{Kind: TokenMetadata, Lexeme: lexeme, Pos: pos, Offset: offset}
	}
	return Token{Kind: TokenIdentifier, Lexeme: lexeme, Pos: pos, Offset: offset}
}

func (scanner *scanner) quotedString() (Token, Diagnostic) {
pos := scanner.position()
offset := scanner.offset()
start := scanner.index
scanner.advance()
for !scanner.done() {
Expand All @@ -101,14 +129,14 @@ func (scanner *scanner) quotedString() (Token, Diagnostic) {
}
if ch == '"' {
scanner.advance()
return Token{Kind: TokenString, Lexeme: string(scanner.source[start:scanner.index]), Pos: pos}, Diagnostic{}
return Token{Kind: TokenString, Lexeme: string(scanner.source[start:scanner.index]), Pos: pos, Offset: offset}, Diagnostic{}
}
if ch == '\n' {
break
}
scanner.advance()
}
return Token{Kind: TokenIllegal, Lexeme: string(scanner.source[start:scanner.index]), Pos: pos}, Diagnostic{
return Token{Kind: TokenIllegal, Lexeme: string(scanner.source[start:scanner.index]), Pos: pos, Offset: offset}, Diagnostic{
Pos: pos,
Range: sourceRange(pos, scanner.position()),
Code: "unterminated_string",
Expand All @@ -129,6 +157,7 @@ func sourceRange(start, end Position) *Range {

func (scanner *scanner) text() Token {
pos := scanner.position()
offset := scanner.offset()
start := scanner.index
for !scanner.done() {
ch := scanner.peek()
Expand All @@ -140,7 +169,7 @@ func (scanner *scanner) text() Token {
}
scanner.advance()
}
return Token{Kind: TokenText, Lexeme: string(scanner.source[start:scanner.index]), Pos: pos}
return Token{Kind: TokenText, Lexeme: string(scanner.source[start:scanner.index]), Pos: pos, Offset: offset}
}

func (scanner *scanner) skipLineComment() {
Expand Down
76 changes: 76 additions & 0 deletions internal/lang/lexer_offset_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
package lang

import (
"testing"

"github.com/cssbruno/gowdk/internal/source"
)

// TestLexTokenOffsetsAreByteAccurate verifies the tokenizer records each token's
// 0-based byte offset and that it stays consistent with the token's line/column
// via the source conversion helpers, including across a multi-byte rune. This is
// the substrate contract the recursive-descent parser (ADR 0010) depends on.
func TestLexTokenOffsetsAreByteAccurate(t *testing.T) {
	// The euro sign is three bytes, so byte offsets and rune columns diverge
	// after it.
	src := "page home\ntitle \"€\"\nroute \"/\"\n"
	tokens, _ := Lex(src)

	buffer := []byte(src)
	for _, token := range tokens {
		if token.Kind == TokenEOF {
			continue
		}
		// The token's recorded byte offset must point at its lexeme in the
		// source buffer.
		if token.Offset < 0 || token.Offset > len(buffer) {
			t.Fatalf("token %q offset %d out of bounds", token.Lexeme, token.Offset)
		}
		if token.Kind != TokenNewline && token.Lexeme != "" {
			// Check the end of the span as well, so a wrong offset near the
			// end of the source fails with a message instead of panicking on
			// the slice expression below.
			end := token.Offset + len(token.Lexeme)
			if end > len(buffer) {
				t.Fatalf("token %q at offset %d runs past the %d-byte source", token.Lexeme, token.Offset, len(buffer))
			}
			got := string(buffer[token.Offset:end])
			if got != token.Lexeme {
				t.Fatalf("token %q at offset %d points at %q", token.Lexeme, token.Offset, got)
			}
		}
		// The byte offset and the line/column must describe the same position.
		want := source.SourcePosition{Line: token.Pos.Line, Column: token.Pos.Column}
		if off := source.OffsetOf(buffer, want); off != token.Offset {
			t.Fatalf("token %q: OffsetOf(line %d,col %d)=%d, token offset=%d",
				token.Lexeme, token.Pos.Line, token.Pos.Column, off, token.Offset)
		}
	}
}

// TestLexTokenOffsetsSurviveMalformedUTF8 pins token offsets to the original
// byte buffer when the input contains an invalid byte. Converting to []rune
// replaces a malformed byte with a 3-byte U+FFFD, so offsets computed from
// utf8.RuneLen would place every later token two bytes too far.
func TestLexTokenOffsetsSurviveMalformedUTF8(t *testing.T) {
	// Byte layout: 'x' at 0, a lone 0xff at 1, '\n' at 2, 'y' at 3.
	src := "x\xff\ny"
	raw := []byte(src)
	tokens, _ := Lex(src)

	// Pass 1: each token's offset must match what its line/column resolves to
	// in the real buffer.
	for i := range tokens {
		tok := tokens[i]
		at := source.SourcePosition{Line: tok.Pos.Line, Column: tok.Pos.Column}
		resolved := source.OffsetOf(raw, at)
		if resolved != tok.Offset {
			t.Fatalf("token %q (kind %s): OffsetOf=%d, token offset=%d", tok.Lexeme, tok.Kind, resolved, tok.Offset)
		}
	}

	// Pass 2: the valid identifier after the bad byte sits at byte 3; the
	// drifted value would be 5.
	sawTrailing := false
	for i := range tokens {
		tok := tokens[i]
		if tok.Kind != TokenIdentifier || tok.Lexeme != "y" {
			continue
		}
		sawTrailing = true
		if tok.Offset != 3 {
			t.Fatalf("trailing token y offset = %d, want 3", tok.Offset)
		}
	}
	if !sawTrailing {
		t.Fatal("expected to find the trailing identifier token y")
	}
}
Loading