cssbruno · cssbruno · Jun 12, 2026 · Jun 11, 2026 · Jun 11, 2026 · Jun 11, 2026
diff --git a/docs/language/conformance.md b/docs/language/conformance.md
@@ -42,6 +42,19 @@ code appears among the diagnostics for that file. Diagnostic codes are the ones
 registered in `internal/diagnostics/registry.go` and documented in
 `docs/reference/diagnostic-codes.md`.
 
+## Scope and limits
+
+The corpus uses single-file `CheckSource`, so it pins what one file can verify
+without a project: package and metadata declarations, route forms, `view {}`
+markup, `style {}`, literal `build {}`, slots, and the rejection contracts below.
+
+It cannot cleanly cover constructs that require project context: reactive `g:`
+directives (`g:if`/`g:on`/`g:bind`) reference a Go-typed `state` contract that
+cannot be resolved from a single file; endpoint forms (`act`/`api`) need
+exported Go handlers; and `layout`/`wasm`/`asset`/`css` need sibling files or
+config. Those are exercised by the package- and build-level tests instead.
+Expanding the corpus to a project-level harness for them is tracked separately.
+
 ## Coverage
 
 `TestConformanceCorpusCoversRejectionContracts` fails when a rejection contract

diff --git a/docs/product/language-server.md b/docs/product/language-server.md
@@ -55,6 +55,9 @@ Developers editing `.gwdk` files need live feedback from the same language tooli
   missing GOWDK `use` aliases.
 - Return full-document semantic tokens for `.gwdk` decorators, identifiers,
   strings, and operators.
+- Return a document outline (top-level package, metadata, imports, uses, blocks,
+  endpoints, and component/page declarations) from the recursive-descent outline
+  pass over the shared tokenizer.
 
 ### Non-Functional
 
@@ -78,6 +81,8 @@ Developers editing `.gwdk` files need live feedback from the same language tooli
 - [x] `textDocument/references` returns open-document references for page IDs, routes, components, stores, and guards.
 - [x] `textDocument/codeAction` returns quick fixes for old endpoint syntax and missing GOWDK use aliases.
 - [x] `textDocument/semanticTokens/full` returns encoded token data for open `.gwdk` buffers.
+- [x] `textDocument/documentSymbol` returns a top-level outline parsed by the
+      recursive-descent outline pass over the shared tokenizer (ADR 0010).
 - [x] `go test ./...` and `go build ./cmd/gowdk` pass.
 
 ## Edge Cases

diff --git a/internal/lang/lexer.go b/internal/lang/lexer.go
@@ -4,19 +4,44 @@ import "unicode"
 
 // Lex tokenizes .gwdk source for editor and CLI tooling.
 func Lex(source string) ([]Token, Diagnostics) {
+	runes := []rune(source)
+	// byteOffsets[i] is the 0-based byte offset of rune i in the original
+	// source; the final entry is the total byte length. Offsets are taken from
+	// ranging the original string (which reports true byte positions) rather
+	// than summing utf8.RuneLen, so malformed UTF-8 — where []rune turns each
+	// bad byte into a 3-byte U+FFFD — does not drift token offsets.
+	byteOffsets := make([]int, len(runes)+1)
+	runeIndex := 0
+	for byteIndex := range source {
+		byteOffsets[runeIndex] = byteIndex
+		runeIndex++
+	}
+	byteOffsets[len(runes)] = len(source)
+
 	lexer := scanner{
-		source: []rune(source),
-		line:   1,
-		column: 1,
+		source:      runes,
+		byteOffsets: byteOffsets,
+		line:        1,
+		column:      1,
 	}
 	return lexer.scan()
 }
 
 type scanner struct {
-	source []rune
-	index  int
-	line   int
-	column int
+	source      []rune
+	byteOffsets []int
+	index       int
+	line        int
+	column      int
+}
+
+// offset returns the 0-based byte offset of the current rune in the original
+// source, or the total byte length of the source once every rune is consumed.
+func (scanner *scanner) offset() int {
+	if scanner.index < len(scanner.byteOffsets) {
+		return scanner.byteOffsets[scanner.index]
+	}
+	return scanner.byteOffsets[len(scanner.byteOffsets)-1]
 }
 
 func (scanner *scanner) scan() ([]Token, Diagnostics) {
@@ -26,13 +51,14 @@ func (scanner *scanner) scan() ([]Token, Diagnostics) {
 	for !scanner.done() {
 		ch := scanner.peek()
 		pos := scanner.position()
+		offset := scanner.offset()
 
 		switch {
 		case ch == '\r':
 			scanner.advance()
 		case ch == '\n':
 			scanner.advance()
-			tokens = append(tokens, Token{Kind: TokenNewline, Lexeme: "\n", Pos: pos})
+			tokens = append(tokens, Token{Kind: TokenNewline, Lexeme: "\n", Pos: pos, Offset: offset})
 		case unicode.IsSpace(ch):
 			scanner.advance()
 		case ch == '/' && scanner.peekNext() == '/':
@@ -47,47 +73,49 @@ func (scanner *scanner) scan() ([]Token, Diagnostics) {
 			}
 		case ch == '{':
 			scanner.advance()
-			tokens = append(tokens, Token{Kind: TokenLBrace, Lexeme: "{", Pos: pos})
+			tokens = append(tokens, Token{Kind: TokenLBrace, Lexeme: "{", Pos: pos, Offset: offset})
 		case ch == '}':
 			scanner.advance()
-			tokens = append(tokens, Token{Kind: TokenRBrace, Lexeme: "}", Pos: pos})
+			tokens = append(tokens, Token{Kind: TokenRBrace, Lexeme: "}", Pos: pos, Offset: offset})
 		case ch == ',':
 			scanner.advance()
-			tokens = append(tokens, Token{Kind: TokenComma, Lexeme: ",", Pos: pos})
+			tokens = append(tokens, Token{Kind: TokenComma, Lexeme: ",", Pos: pos, Offset: offset})
 		case ch == ':':
 			scanner.advance()
-			tokens = append(tokens, Token{Kind: TokenColon, Lexeme: ":", Pos: pos})
+			tokens = append(tokens, Token{Kind: TokenColon, Lexeme: ":", Pos: pos, Offset: offset})
 		case ch == '?':
 			scanner.advance()
-			tokens = append(tokens, Token{Kind: TokenQuestion, Lexeme: "?", Pos: pos})
+			tokens = append(tokens, Token{Kind: TokenQuestion, Lexeme: "?", Pos: pos, Offset: offset})
 		case ch == '=' && scanner.peekNext() == '>':
 			scanner.advance()
 			scanner.advance()
-			tokens = append(tokens, Token{Kind: TokenArrow, Lexeme: "=>", Pos: pos})
+			tokens = append(tokens, Token{Kind: TokenArrow, Lexeme: "=>", Pos: pos, Offset: offset})
 		default:
 			tokens = append(tokens, scanner.text())
 		}
 	}
 
-	tokens = append(tokens, Token{Kind: TokenEOF, Pos: scanner.position()})
+	tokens = append(tokens, Token{Kind: TokenEOF, Pos: scanner.position(), Offset: scanner.offset()})
 	return tokens, diagnostics
 }
 
 func (scanner *scanner) identifier() Token {
 	pos := scanner.position()
+	offset := scanner.offset()
 	start := scanner.index
 	for !scanner.done() && (isIdentPart(scanner.peek()) || scanner.peek() == '.' || scanner.peek() == '-') {
 		scanner.advance()
 	}
 	lexeme := string(scanner.source[start:scanner.index])
 	if scanner.isLineLeading(start) && isMetadataLexeme(lexeme) {
-		return Token{Kind: TokenMetadata, Lexeme: lexeme, Pos: pos}
+		return Token{Kind: TokenMetadata, Lexeme: lexeme, Pos: pos, Offset: offset}
 	}
-	return Token{Kind: TokenIdentifier, Lexeme: lexeme, Pos: pos}
+	return Token{Kind: TokenIdentifier, Lexeme: lexeme, Pos: pos, Offset: offset}
 }
 
 func (scanner *scanner) quotedString() (Token, Diagnostic) {
 	pos := scanner.position()
+	offset := scanner.offset()
 	start := scanner.index
 	scanner.advance()
 	for !scanner.done() {
@@ -101,14 +129,14 @@ func (scanner *scanner) quotedString() (Token, Diagnostic) {
 		}
 		if ch == '"' {
 			scanner.advance()
-			return Token{Kind: TokenString, Lexeme: string(scanner.source[start:scanner.index]), Pos: pos}, Diagnostic{}
+			return Token{Kind: TokenString, Lexeme: string(scanner.source[start:scanner.index]), Pos: pos, Offset: offset}, Diagnostic{}
 		}
 		if ch == '\n' {
 			break
 		}
 		scanner.advance()
 	}
-	return Token{Kind: TokenIllegal, Lexeme: string(scanner.source[start:scanner.index]), Pos: pos}, Diagnostic{
+	return Token{Kind: TokenIllegal, Lexeme: string(scanner.source[start:scanner.index]), Pos: pos, Offset: offset}, Diagnostic{
 		Pos:      pos,
 		Range:    sourceRange(pos, scanner.position()),
 		Code:     "unterminated_string",
@@ -129,6 +157,7 @@ func sourceRange(start, end Position) *Range {
 
 func (scanner *scanner) text() Token {
 	pos := scanner.position()
+	offset := scanner.offset()
 	start := scanner.index
 	for !scanner.done() {
 		ch := scanner.peek()
@@ -140,7 +169,7 @@ func (scanner *scanner) text() Token {
 		}
 		scanner.advance()
 	}
-	return Token{Kind: TokenText, Lexeme: string(scanner.source[start:scanner.index]), Pos: pos}
+	return Token{Kind: TokenText, Lexeme: string(scanner.source[start:scanner.index]), Pos: pos, Offset: offset}
 }
 
 func (scanner *scanner) skipLineComment() {

diff --git a/internal/lang/lexer_offset_test.go b/internal/lang/lexer_offset_test.go
@@ -0,0 +1,76 @@
+package lang
+
+import (
+	"testing"
+
+	"github.com/cssbruno/gowdk/internal/source"
+)
+
+// TestLexTokenOffsetsAreByteAccurate verifies the tokenizer records each token's
+// 0-based byte offset and that it stays consistent with the token's line/column
+// via the source conversion helpers, including across a multi-byte rune. This is
+// the substrate contract the recursive-descent parser (ADR 0010) depends on.
+func TestLexTokenOffsetsAreByteAccurate(t *testing.T) {
+	// The euro sign is three bytes, so byte offsets and rune columns diverge
+	// after it.
+	src := "page home\ntitle \"€\"\nroute \"/\"\n"
+	tokens, _ := Lex(src)
+
+	buffer := []byte(src)
+	for _, token := range tokens {
+		if token.Kind == TokenEOF {
+			continue
+		}
+		// The recorded byte offset must point at the lexeme in the source buffer;
+		// the span end is checked too so the slice below cannot panic on a bad offset.
+		if token.Offset < 0 || token.Offset+len(token.Lexeme) > len(buffer) {
+			t.Fatalf("token %q offset %d out of bounds", token.Lexeme, token.Offset)
+		}
+		if token.Kind != TokenNewline && token.Lexeme != "" {
+			got := string(buffer[token.Offset : token.Offset+len(token.Lexeme)])
+			if got != token.Lexeme {
+				t.Fatalf("token %q at offset %d points at %q", token.Lexeme, token.Offset, got)
+			}
+		}
+		// The byte offset and the line/column must describe the same position.
+		want := source.SourcePosition{Line: token.Pos.Line, Column: token.Pos.Column}
+		if off := source.OffsetOf(buffer, want); off != token.Offset {
+			t.Fatalf("token %q: OffsetOf(line %d,col %d)=%d, token offset=%d",
+				token.Lexeme, token.Pos.Line, token.Pos.Column, off, token.Offset)
+		}
+	}
+}
+
+// TestLexTokenOffsetsSurviveMalformedUTF8 guards against offset drift after an
+// invalid byte: []rune turns a malformed byte into a 3-byte U+FFFD, so deriving
+// offsets from utf8.RuneLen would push every later token two bytes past its true
+// position. Offsets must stay anchored to the original byte buffer.
+func TestLexTokenOffsetsSurviveMalformedUTF8(t *testing.T) {
+	// "x" then a lone 0xff byte, a newline, then "y": bytes x=0, 0xff=1, \n=2, y=3.
+	src := "x\xff\ny"
+	buffer := []byte(src)
+	tokens, _ := Lex(src)
+
+	for _, token := range tokens {
+		// Offset and line/column must agree against the real buffer (OffsetOf
+		// ranges the bytes, so it reports true positions even past a bad byte).
+		want := source.SourcePosition{Line: token.Pos.Line, Column: token.Pos.Column}
+		if off := source.OffsetOf(buffer, want); off != token.Offset {
+			t.Fatalf("token %q (kind %s): OffsetOf=%d, token offset=%d", token.Lexeme, token.Kind, off, token.Offset)
+		}
+	}
+
+	// The trailing valid token must land at byte 3, not 5 (the drifted value).
+	var found bool
+	for _, token := range tokens {
+		if token.Kind == TokenIdentifier && token.Lexeme == "y" {
+			found = true
+			if token.Offset != 3 {
+				t.Fatalf("trailing token y offset = %d, want 3", token.Offset)
+			}
+		}
+	}
+	if !found {
+		t.Fatal("expected to find the trailing identifier token y")
+	}
+}