From 24bdc6cc37bdd9a3c91900cf61e477207517cb1f Mon Sep 17 00:00:00 2001 From: crazydi4mond <255249920+crazydi4mond@users.noreply.github.com> Date: Wed, 1 Apr 2026 22:15:08 +0200 Subject: [PATCH 01/19] feat: tune session defaults and hide session-check-interval flag MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - idle-timeout: 60s → 10s, keepalive: 10s → 2s, session-check-interval: 20s → 500ms - hide session-check-interval from help output, README, and man page - improve README explanation of keepalive/idle-timeout relationship - fix server queue-size minimum validation (0 → 32) --- README.md | 19 +++++++++---------- client/client.go | 6 +++--- man/vaydns-client.1 | 5 ----- vaydns-client/main.go | 18 +++++++++++++++++- vaydns-server/main.go | 8 ++++---- 5 files changed, 33 insertions(+), 23 deletions(-) diff --git a/README.md b/README.md index f0beeb5..bde923c 100644 --- a/README.md +++ b/README.md @@ -130,8 +130,8 @@ sudo ip6tables -t nat -I PREROUTING -i eth0 -p udp --dport 53 -j REDIRECT --to-p | `-privkey HEX` | Server private key as hex string | — | | `-gen-key` | Generate a new keypair and exit | — | | `-mtu N` | Max UDP payload size for responses | `1232` | -| `-idle-timeout D` | Session idle timeout (must match client) | `60s` | -| `-keepalive D` | Keepalive ping interval (must match client, must be < idle-timeout) | `10s` | +| `-idle-timeout D` | Session idle timeout (must match client) | `10s` | +| `-keepalive D` | Keepalive ping interval (must match client, must be < idle-timeout) | `2s` | | `-fallback ADDR` | UDP endpoint to forward non-DNS packets to (e.g. `127.0.0.1:8888`) | — | | `-dnstt-compat` | Use original dnstt wire format (8-byte ClientID, padding prefixes). Also sets `-idle-timeout` to 2m and `-keepalive` to 10s unless explicitly overridden. 
| `false` | | `-clientid-size N` | ClientID size in bytes (ignored when `-dnstt-compat` is set) | `2` | @@ -164,17 +164,16 @@ sudo ip6tables -t nat -I PREROUTING -i eth0 -p udp --dport 53 -j REDIRECT --to-p | Flag | Description | Default | | --------------------------- | -------------------------------------------------- | ------- | -| `-idle-timeout D` | Session idle timeout (must match server) | `60s` | -| `-keepalive D` | Keepalive ping interval (must match server, must be < idle-timeout) | `10s` | +| `-idle-timeout D` | Session idle timeout (must match server) | `10s` | +| `-keepalive D` | Keepalive ping interval (must match server, must be < idle-timeout) | `2s` | | `-max-streams N` | Max concurrent streams per session (0 = unlimited) | `0` | | `-open-stream-timeout D` | Timeout for opening an smux stream | `10s` | | `-reconnect-min D` | Initial backoff delay for session reconnect | `1s` | | `-reconnect-max D` | Max backoff delay (must be >= reconnect-min) | `30s` | -| `-session-check-interval D` | How often to check if the session is alive (should be shorter than idle-timeout) | `20s` | -> **Note:** `idle-timeout` and `keepalive` must be set to the same values on both client and server — mismatched values will cause one side to close the session before the other detects it. Keep `keepalive` well below `idle-timeout` (the default 6x ratio allows ~6 ping attempts before timeout). +> **Note:** `idle-timeout` and `keepalive` must be set to the same values on both client and server — mismatched values will cause one side to close the session before the other detects it. Keep `keepalive` well below `idle-timeout` (the default 5x ratio allows ~5 ping attempts before timeout). > -> `session-check-interval` controls how quickly the client detects a dead session and starts reconnecting — it does not affect when the session dies. A lower value means faster reconnection but can cause unnecessary churn on lossy networks. It does not need to match on client and server. 
+> **How they relate:** `keepalive` controls how often smux sends ping frames to prove the session is alive. `idle-timeout` is how long smux waits with no received data (including pings) before declaring the session dead — it applies symmetrically on both sides. #### UDP transport tuning @@ -393,14 +392,14 @@ On both client and server, `-dnstt-compat` switches to the original dnstt wire f | Setting | VayDNS default | With `-dnstt-compat` | Applies to | | ------- | -------------- | -------------------- | ---------- | | `-max-qname-len` | `101` | `253` | client | -| `-idle-timeout` | `60s` | `2m` | client and server | -| `-keepalive` | `10s` | `10s` | client and server | +| `-idle-timeout` | `10s` | `2m` | client and server | +| `-keepalive` | `2s` | `10s` | client and server | All three can be explicitly overridden even when `-dnstt-compat` is set — the flag only changes the defaults, it does not lock the values. For example, `-dnstt-compat -idle-timeout 30s` uses the dnstt wire format with a 30-second idle timeout. > **Note:** `-dnstt-compat` forces `-record-type` to `txt` (with a warning if another type was set). dnstt only supports TXT records, so other record types are incompatible. > -> The timeout defaults are critical for interop with original dnstt binaries. dnstt uses a 10-second keepalive interval (smux default) and a 2-minute idle timeout. Setting `-idle-timeout` below 10s in compat mode will cause sessions to churn because dnstt peers only send keepalives every 10 seconds. When mixing with dnstt, keep the compat defaults unless you know what you're doing. +> The timeout defaults are critical for interop with original dnstt binaries. dnstt uses a 10-second keepalive interval (smux default) and a 2-minute idle timeout. Setting `-idle-timeout` below 10s in compat mode will cause sessions to churn because dnstt peers only send keepalives every 10 seconds. When connecting to dnstt, keep the compat defaults unless you know what you're doing. 
### Record types diff --git a/client/client.go b/client/client.go index 5e1b754..5c4b713 100644 --- a/client/client.go +++ b/client/client.go @@ -41,12 +41,12 @@ import ( // Default timeouts for VayDNS mode. const ( - DefaultIdleTimeout = 60 * time.Second - DefaultKeepAlive = 10 * time.Second + DefaultIdleTimeout = 10 * time.Second + DefaultKeepAlive = 2 * time.Second DefaultOpenStreamTimeout = 10 * time.Second DefaultReconnectDelay = 1 * time.Second DefaultReconnectMaxDelay = 30 * time.Second - DefaultSessionCheckInterval = 20 * time.Second + DefaultSessionCheckInterval = 500 * time.Millisecond DefaultUDPResponseTimeout = 500 * time.Millisecond DefaultUDPWorkers = 100 DefaultMaxStreams = 0 // unlimited diff --git a/man/vaydns-client.1 b/man/vaydns-client.1 index c277d78..0b92691 100644 --- a/man/vaydns-client.1 +++ b/man/vaydns-client.1 @@ -182,11 +182,6 @@ Must be >= reconnect-min. Default: .Cm 30s . -.It Fl session-check-interval Ar DURATION -How often to check whether the current session is alive. -Default: -.Cm 500ms . 
- .It Fl rps Ar N Rate limit outgoing DNS queries to .Ar N diff --git a/vaydns-client/main.go b/vaydns-client/main.go index f6e9ac5..212e5bf 100644 --- a/vaydns-client/main.go +++ b/vaydns-client/main.go @@ -68,7 +68,23 @@ Examples: %[1]s -dot resolver.example:853 -pubkey-file server.pub -domain t.example.com -listen 127.0.0.1:7000 `, os.Args[0]) - flag.PrintDefaults() + flag.CommandLine.VisitAll(func(f *flag.Flag) { + if f.Name == "session-check-interval" { + return + } + fmt.Fprintf(flag.CommandLine.Output(), " -%s", f.Name) + name, usage := flag.UnquoteUsage(f) + if len(name) > 0 { + fmt.Fprintf(flag.CommandLine.Output(), " %s", name) + } + if len(f.DefValue) > 0 { + fmt.Fprintf(flag.CommandLine.Output(), " (default %s)", f.DefValue) + } + if len(usage) > 0 { + fmt.Fprintf(flag.CommandLine.Output(), "\n \t%s", usage) + } + fmt.Fprint(flag.CommandLine.Output(), "\n") + }) labels := make([]string, 0) labels = append(labels, "none") for _, entry := range client.UTLSClientHelloIDMap() { diff --git a/vaydns-server/main.go b/vaydns-server/main.go index ae8cbca..7fb3ec1 100644 --- a/vaydns-server/main.go +++ b/vaydns-server/main.go @@ -73,8 +73,8 @@ import ( ) const ( - defaultIdleTimeout = 60 * time.Second - defaultKeepAlive = 10 * time.Second + defaultIdleTimeout = 10 * time.Second + defaultKeepAlive = 2 * time.Second // Bound the pre-smux handshake so half-open KCP sessions cannot linger // indefinitely and consume server resources. 
defaultHandshakeTimeout = 15 * time.Second @@ -1395,8 +1395,8 @@ Example: fmt.Fprintf(os.Stderr, "-keepalive (%s) must be less than -idle-timeout (%s)\n", keepAlive, idleTimeout) os.Exit(1) } - if queueSize <= 0 { - fmt.Fprintf(os.Stderr, "-queue-size (%d) must be greater than 0\n", queueSize) + if queueSize < 32 { + fmt.Fprintf(os.Stderr, "-queue-size (%d) must be at least 32\n", queueSize) os.Exit(1) } if kcpWindowSize < 0 { From 280e389abd2f0a9cf351f68c94166e6b119eadcf Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Wed, 1 Apr 2026 22:27:45 +0200 Subject: [PATCH 02/19] chore(main): release 0.2.7 (#61) Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- .release-please-manifest.json | 2 +- CHANGELOG.md | 13 +++++++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/.release-please-manifest.json b/.release-please-manifest.json index 954b159..054e8bc 100644 --- a/.release-please-manifest.json +++ b/.release-please-manifest.json @@ -1,3 +1,3 @@ { - ".": "0.2.6" + ".": "0.2.7" } diff --git a/CHANGELOG.md b/CHANGELOG.md index 439342b..fb45f67 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,18 @@ # Changelog +## [0.2.7](https://github.com/net2share/vaydns/compare/v0.2.6...v0.2.7) (2026-04-01) + + +### Features + +* tune session defaults and hide session-check-interval flag ([24bdc6c](https://github.com/net2share/vaydns/commit/24bdc6cc37bdd9a3c91900cf61e477207517cb1f)) + + +### Bug Fixes + +* **client:** make max streams unlimited by default ([#63](https://github.com/net2share/vaydns/issues/63)) ([4cc8228](https://github.com/net2share/vaydns/commit/4cc8228190d415cc75a32f59d83f7b77ae2af68b)) +* **server:** clarify server accept session/stream warning logs ([#57](https://github.com/net2share/vaydns/issues/57)) ([654b2f1](https://github.com/net2share/vaydns/commit/654b2f1f3dbde6001e19c9f8391436c28e55eca9)) + ## 
[0.2.6](https://github.com/net2share/vaydns/compare/v0.2.5...v0.2.6) (2026-03-29) From c4150b272c230a261b4a19ac08057a7b2dcd7a71 Mon Sep 17 00:00:00 2001 From: crazydi4mond <255249920+crazydi4mond@users.noreply.github.com> Date: Wed, 1 Apr 2026 22:42:34 +0200 Subject: [PATCH 03/19] docs(server): update stale dnstt references in package doc comments --- vaydns-server/main.go | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/vaydns-server/main.go b/vaydns-server/main.go index 7fb3ec1..32b61c4 100644 --- a/vaydns-server/main.go +++ b/vaydns-server/main.go @@ -1,26 +1,26 @@ -// dnstt-server is the server end of a DNS tunnel. +// vaydns-server is the server end of a DNS tunnel. // // Usage: // -// dnstt-server -gen-key [-privkey-file PRIVKEYFILE] [-pubkey-file PUBKEYFILE] -// dnstt-server -udp ADDR [-privkey PRIVKEY|-privkey-file PRIVKEYFILE] [-fallback FALLBACKADDR] -domain DOMAIN -upstream UPSTREAMADDR +// vaydns-server -gen-key [-privkey-file PRIVKEYFILE] [-pubkey-file PUBKEYFILE] +// vaydns-server -udp ADDR [-privkey PRIVKEY|-privkey-file PRIVKEYFILE] [-fallback FALLBACKADDR] -domain DOMAIN -upstream UPSTREAMADDR // // Example: // -// dnstt-server -gen-key -privkey-file server.key -pubkey-file server.pub -// dnstt-server -udp :53 -privkey-file server.key -domain t.example.com -upstream 127.0.0.1:8000 +// vaydns-server -gen-key -privkey-file server.key -pubkey-file server.pub +// vaydns-server -udp :53 -privkey-file server.key -domain t.example.com -upstream 127.0.0.1:8000 // // With fallback for non-DNS traffic: // -// dnstt-server -udp :53 -privkey-file server.key -fallback 127.0.0.1:8888 -domain t.example.com -upstream 127.0.0.1:8000 +// vaydns-server -udp :53 -privkey-file server.key -fallback 127.0.0.1:8888 -domain t.example.com -upstream 127.0.0.1:8000 // // To generate a persistent server private key, first run with the -gen-key // option. By default the generated private and public keys are printed to // standard output. 
To save them to files instead, use the -privkey-file and // -pubkey-file options. // -// dnstt-server -gen-key -// dnstt-server -gen-key -privkey-file server.key -pubkey-file server.pub +// vaydns-server -gen-key +// vaydns-server -gen-key -privkey-file server.key -pubkey-file server.pub // // You can give the server's private key as a file or as a hex string. // From f0337b7daa907330dae8ca2e5a01b737ca70f95e Mon Sep 17 00:00:00 2001 From: crazydi4mond <255249920+crazydi4mond@users.noreply.github.com> Date: Fri, 3 Apr 2026 18:40:42 +0200 Subject: [PATCH 04/19] docs(client): add missing fields and fix stale defaults in client library doc - Add RoundTripper, DialerControl, UDPAcceptErrors, RecordType, OpenStreamTimeout - Fix SessionCheckInterval default (20s -> 500ms) and HandshakeTimeout (30s -> 15s) --- client/client.go | 4 ++-- docs/client-library.md | 11 ++++++++--- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/client/client.go b/client/client.go index 5c4b713..6bc6acc 100644 --- a/client/client.go +++ b/client/client.go @@ -200,8 +200,8 @@ type Tunnel struct { MaxStreams int // default: 0 (0 = unlimited) ReconnectMinDelay time.Duration // default: 1s ReconnectMaxDelay time.Duration // default: 30s - SessionCheckInterval time.Duration // default: 20s - HandshakeTimeout time.Duration // default: 30s + SessionCheckInterval time.Duration // default: 500ms + HandshakeTimeout time.Duration // default: 15s PacketQueueSize int // default: QueueSize (512) KCPWindowSize int // default: PacketQueueSize/2 QueueOverflowMode turbotunnel.QueueOverflowMode // default: drop diff --git a/docs/client-library.md b/docs/client-library.md index cc1f84d..25514ec 100644 --- a/docs/client-library.md +++ b/docs/client-library.md @@ -71,9 +71,12 @@ All configuration is done through struct fields before calling `Initiate*` or `L ```go // Resolver options r.UTLSClientHelloID = &utls.ClientHelloID{...} // TLS fingerprint +r.RoundTripper = customTransport // custom HTTP 
transport for DoH (overrides UTLSClientHelloID) +r.DialerControl = controlFunc // socket options callback (SO_MARK, SO_BINDTODEVICE, etc.) r.UDPWorkers = 200 // concurrent UDP workers -r.UDPSharedSocket = true // single socket mode -r.UDPTimeout = 500 * time.Millisecond // per-query timeout +r.UDPSharedSocket = true // single socket mode +r.UDPTimeout = 500 * time.Millisecond // per-query timeout +r.UDPAcceptErrors = true // accept non-NOERROR responses (disables forged filtering) // Tunnel server options ts.DnsttCompat = true // original dnstt wire format @@ -81,12 +84,14 @@ ts.ClientIDSize = 1 // smaller ClientID ts.MaxQnameLen = 101 // QNAME length constraint ts.MaxNumLabels = 2 // label count constraint ts.RPS = 200 // rate limit queries/second +ts.RecordType = "cname" // DNS record type for downstream data (default: "txt") // Session options t.IdleTimeout = 60 * time.Second t.KeepAlive = 10 * time.Second +t.OpenStreamTimeout = 10 * time.Second t.MaxStreams = 256 -t.SessionCheckInterval = 20 * time.Second +t.SessionCheckInterval = 500 * time.Millisecond t.ReconnectMinDelay = 1 * time.Second t.ReconnectMaxDelay = 30 * time.Second t.HandshakeTimeout = 15 * time.Second From 056810f63c5c479e20bd65c5de5da9d247f28e63 Mon Sep 17 00:00:00 2001 From: crazydi4mond <255249920+crazydi4mond@users.noreply.github.com> Date: Sat, 4 Apr 2026 19:33:20 +0200 Subject: [PATCH 05/19] build: add CGO_ENABLED=0 to CI and release workflows Dockerfile already had it; CI and release-please builds did not, producing dynamically linked binaries unnecessarily. --- .github/workflows/ci.yml | 3 ++- .github/workflows/release-please.yml | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 5356bef..5f81ef2 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -19,7 +19,7 @@ jobs: go-version: '1.24' - name: Build - run: go build -v ./... + run: CGO_ENABLED=0 go build -v ./... 
- name: Test run: go test -v ./... @@ -60,6 +60,7 @@ jobs: - name: Build env: + CGO_ENABLED: '0' GOOS: ${{ matrix.goos }} GOARCH: ${{ matrix.goarch }} GOARM: ${{ matrix.goarm }} diff --git a/.github/workflows/release-please.yml b/.github/workflows/release-please.yml index a968f49..0fd2657 100644 --- a/.github/workflows/release-please.yml +++ b/.github/workflows/release-please.yml @@ -68,6 +68,7 @@ jobs: - name: Build env: + CGO_ENABLED: '0' GOOS: ${{ matrix.goos }} GOARCH: ${{ matrix.goarch }} GOARM: ${{ matrix.goarm }} From 35331484bce1b628372dc37accae4e96b507841f Mon Sep 17 00:00:00 2001 From: crazydi4mond <255249920+crazydi4mond@users.noreply.github.com> Date: Sat, 4 Apr 2026 19:51:00 +0200 Subject: [PATCH 06/19] feat: add -v flag for printing version (#71) - inject version via ldflags in CI, release, and Docker builds - defaults to "dev" for local builds --- .github/workflows/ci.yml | 8 +++++--- .github/workflows/release-please.yml | 7 +++++-- Dockerfile | 5 +++-- vaydns-client/main.go | 9 +++++++++ vaydns-server/main.go | 9 +++++++++ 5 files changed, 31 insertions(+), 7 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 5f81ef2..9ded57a 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -19,7 +19,7 @@ jobs: go-version: '1.24' - name: Build - run: CGO_ENABLED=0 go build -v ./... + run: CGO_ENABLED=0 go build -v -ldflags="-X main.version=${GITHUB_SHA::7}" ./... - name: Test run: go test -v ./... 
@@ -70,8 +70,9 @@ jobs: ARCH="armv7" fi SUFFIX="${{ matrix.suffix }}" - go build -trimpath -ldflags="-s -w" -o "vaydns-client-${{ matrix.goos }}-${ARCH}${SUFFIX}" ./vaydns-client - go build -trimpath -ldflags="-s -w" -o "vaydns-server-${{ matrix.goos }}-${ARCH}${SUFFIX}" ./vaydns-server + VERSION="${GITHUB_SHA::7}" + go build -trimpath -ldflags="-s -w -X main.version=${VERSION}" -o "vaydns-client-${{ matrix.goos }}-${ARCH}${SUFFIX}" ./vaydns-client + go build -trimpath -ldflags="-s -w -X main.version=${VERSION}" -o "vaydns-server-${{ matrix.goos }}-${ARCH}${SUFFIX}" ./vaydns-server - name: Upload artifacts uses: actions/upload-artifact@v4 @@ -119,5 +120,6 @@ jobs: context: . file: Dockerfile push: true + build-args: VERSION=dev-${{ env.SHORT_SHA }} tags: ghcr.io/${{ github.repository }}:dev-${{ env.SHORT_SHA }} diff --git a/.github/workflows/release-please.yml b/.github/workflows/release-please.yml index 0fd2657..6b2ba35 100644 --- a/.github/workflows/release-please.yml +++ b/.github/workflows/release-please.yml @@ -78,8 +78,10 @@ jobs: ARCH="armv7" fi SUFFIX="${{ matrix.suffix }}" - go build -trimpath -ldflags="-s -w" -o "vaydns-client-${{ matrix.goos }}-${ARCH}${SUFFIX}" ./vaydns-client - go build -trimpath -ldflags="-s -w" -o "vaydns-server-${{ matrix.goos }}-${ARCH}${SUFFIX}" ./vaydns-server + TAG="${{ needs.release-please.outputs.tag_name }}" + VERSION="${TAG#v}" + go build -trimpath -ldflags="-s -w -X main.version=${VERSION}" -o "vaydns-client-${{ matrix.goos }}-${ARCH}${SUFFIX}" ./vaydns-client + go build -trimpath -ldflags="-s -w -X main.version=${VERSION}" -o "vaydns-server-${{ matrix.goos }}-${ARCH}${SUFFIX}" ./vaydns-server chmod +x vaydns-client-* vaydns-server-* - name: Upload artifact @@ -148,6 +150,7 @@ jobs: context: . 
file: Dockerfile push: true + build-args: VERSION=${{ steps.version.outputs.VERSION }} tags: | ghcr.io/${{ github.repository }}:${{ steps.version.outputs.VERSION }} ghcr.io/${{ github.repository }}:latest diff --git a/Dockerfile b/Dockerfile index bb2b842..d948aad 100644 --- a/Dockerfile +++ b/Dockerfile @@ -5,8 +5,9 @@ COPY go.mod go.sum ./ RUN go mod download COPY . . -RUN CGO_ENABLED=0 go build -trimpath -ldflags="-s -w" -o /vaydns-server ./vaydns-server -RUN CGO_ENABLED=0 go build -trimpath -ldflags="-s -w" -o /vaydns-client ./vaydns-client +ARG VERSION=dev +RUN CGO_ENABLED=0 go build -trimpath -ldflags="-s -w -X main.version=${VERSION}" -o /vaydns-server ./vaydns-server +RUN CGO_ENABLED=0 go build -trimpath -ldflags="-s -w -X main.version=${VERSION}" -o /vaydns-client ./vaydns-client FROM alpine RUN apk add --no-cache curl diff --git a/vaydns-client/main.go b/vaydns-client/main.go index 212e5bf..c37cff2 100644 --- a/vaydns-client/main.go +++ b/vaydns-client/main.go @@ -20,6 +20,8 @@ import ( log "github.com/sirupsen/logrus" ) +var version = "dev" + func readKeyFromFile(filename string) ([]byte, error) { f, err := os.Open(filename) if err != nil { @@ -30,6 +32,7 @@ func readKeyFromFile(filename string) ([]byte, error) { } func main() { + var showVersion bool var dohURL string var dotAddr string var domainArg string @@ -140,8 +143,14 @@ Known TLS fingerprints for -utls are: var logLevel string flag.StringVar(&logLevel, "log-level", "info", "log level (debug, info, warning, error)") + flag.BoolVar(&showVersion, "v", false, "print version and exit") flag.Parse() + if showVersion { + fmt.Println(version) + os.Exit(0) + } + level, err := log.ParseLevel(logLevel) if err != nil { fmt.Fprintf(os.Stderr, "invalid log level: %s\n", logLevel) diff --git a/vaydns-server/main.go b/vaydns-server/main.go index 32b61c4..e043251 100644 --- a/vaydns-server/main.go +++ b/vaydns-server/main.go @@ -1194,7 +1194,10 @@ func run(privkey []byte, domain dns.Name, upstream string, 
dnsConn net.PacketCon return recvLoop(domain, dnsConn, ttConn, ch, fallbackMgr, stats, wireConfig) } +var version = "dev" + func main() { + var showVersion bool var genKey bool var domainArg string var upstream string @@ -1250,8 +1253,14 @@ Example: var logLevel string flag.StringVar(&logLevel, "log-level", "info", "log level (debug, info, warning, error)") + flag.BoolVar(&showVersion, "v", false, "print version and exit") flag.Parse() + if showVersion { + fmt.Println(version) + os.Exit(0) + } + level, err := log.ParseLevel(logLevel) if err != nil { fmt.Fprintf(os.Stderr, "invalid log level: %s\n", logLevel) From 8354db2a080ce9543594bc4e71592f33e6489d82 Mon Sep 17 00:00:00 2001 From: ebpfx <262974424+ebpfx@users.noreply.github.com> Date: Sat, 11 Apr 2026 01:02:27 +0330 Subject: [PATCH 07/19] feat: add NULL and CAA record type support and fix server EDNS mtu advertisement (#70) - add `NULL` and `CAA` to record type parsing and CLI flags - add `NULL` and `CAA` encode/decode support on client and server - reuse the existing single-RR MTU calculation path for `TXT`, `NULL`, and `CAA` - keep `-mtu` warning only for name-limited record types: `CNAME`, `NS`, `MX`, and `SRV` - use the configured server UDP payload limit in the response OPT RR instead of always advertising `4096` --- README.md | 6 ++++-- client/dns.go | 20 ++++++++++++-------- dns/dns.go | 42 +++++++++++++++++++++++++++++++++++++++++- vaydns-client/main.go | 2 +- vaydns-server/main.go | 30 ++++++++++++++++++++++-------- 5 files changed, 80 insertions(+), 20 deletions(-) diff --git a/README.md b/README.md index bde923c..b002a90 100644 --- a/README.md +++ b/README.md @@ -135,7 +135,7 @@ sudo ip6tables -t nat -I PREROUTING -i eth0 -p udp --dport 53 -j REDIRECT --to-p | `-fallback ADDR` | UDP endpoint to forward non-DNS packets to (e.g. `127.0.0.1:8888`) | — | | `-dnstt-compat` | Use original dnstt wire format (8-byte ClientID, padding prefixes). 
Also sets `-idle-timeout` to 2m and `-keepalive` to 10s unless explicitly overridden. | `false` | | `-clientid-size N` | ClientID size in bytes (ignored when `-dnstt-compat` is set) | `2` | -| `-record-type TYPE` | DNS record type for downstream data: `txt`, `cname`, `a`, `aaaa`, `mx`, `ns`, `srv`. Must match the client. Ignored (forced to `txt`) when `-dnstt-compat` is set. | `txt` | +| `-record-type TYPE` | DNS record type for downstream data: `txt`, `null`, `cname`, `a`, `aaaa`, `mx`, `ns`, `srv`, `caa`. Must match the client. Ignored (forced to `txt`) when `-dnstt-compat` is set. | `txt` | | `-queue-size N` | Packet queue size for transport and DNS layers | `512` | | `-kcp-window-size N` | KCP send/receive window size in packets (0 = queue-size/2) | `0` | | `-queue-overflow MODE` | Queue overflow behavior: `drop` (silent discard) or `block` (backpressure) | `drop` | @@ -224,7 +224,7 @@ These reduce upstream throughput but improve compatibility. The minimum effectiv | `-rps N` | Rate limit outgoing DNS queries per second (0 = unlimited). Uses a token bucket with 1-second burst allowance. | `0` | | `-dnstt-compat` | Use original dnstt wire format (8-byte ClientID, padding prefixes). Sets `-max-qname-len` to 253 unless explicitly overridden. Forces `-record-type` to `txt` with a warning if another type is set. | `false` | | `-clientid-size N` | ClientID size in bytes (ignored when `-dnstt-compat` is set) | `2` | -| `-record-type TYPE` | DNS record type for downstream data: `txt`, `cname`, `a`, `aaaa`, `mx`, `ns`, `srv`. Must match the server. | `txt` | +| `-record-type TYPE` | DNS record type for downstream data: `txt`, `null`, `cname`, `a`, `aaaa`, `mx`, `ns`, `srv`, `caa`. Must match the server. | `txt` | | `-utls SPEC` | TLS fingerprint distribution (see below) | weighted random | | `-log-level LEVEL` | Log level: debug, info, warning, error | `info` | @@ -408,12 +408,14 @@ VayDNS supports multiple DNS record types for downstream data encoding. 
Both cli | Type | Description | Capacity | | ---- | ----------- | -------- | | `txt` | TXT record (default). Highest capacity, compatible with dnstt. | Bounded by UDP payload (~1200 bytes) | +| `null` | NULL record. Raw binary payload in a single RR. | Bounded by UDP payload | | `cname` | CNAME record. Data encoded as a DNS name under the tunnel domain. | Bounded by 255-byte DNS name limit | | `ns` | NS record. Same encoding as CNAME. | Same as CNAME | | `mx` | MX record. 2-byte preference header + name encoding. | Same as CNAME | | `srv` | SRV record. 6-byte header + name encoding. | Same as CNAME | | `a` | A records. Data split into 4-byte chunks across multiple answer RRs. | Bounded by UDP payload | | `aaaa` | AAAA records. Data split into 16-byte chunks across multiple answer RRs. | Bounded by UDP payload | +| `caa` | CAA record. Payload encoded in the value portion of a fixed `issue` property. | Bounded by UDP payload | > **Compatibility:** Old VayDNS clients (pre-record-type) only send TXT queries. A new server with the default `-record-type txt` is fully compatible with old clients. Using a non-TXT type requires updating both client and server. diff --git a/client/dns.go b/client/dns.go index 729dece..0453b78 100644 --- a/client/dns.go +++ b/client/dns.go @@ -150,8 +150,8 @@ func forgedInfoMilestone(total uint64) bool { // DNSPacketConn provides a packet-sending and -receiving interface over various // forms of DNS. It handles the details of how packets and padding are encoded -// as a DNS name in the Question section of an upstream query, and as a TXT RR -// in downstream responses. +// as a DNS name in the Question section of an upstream query, and as an RR in +// downstream responses. // // DNSPacketConn does not handle the mechanics of actually sending and receiving // encoded DNS messages. 
That is rather the responsibility of some other @@ -166,7 +166,8 @@ type DNSPacketConn struct { clientID turbotunnel.ClientID wireConfig turbotunnel.WireConfig domain dns.Name - // rrType is the DNS record type used for downstream data (TXT, CNAME, A, AAAA, MX, NS, or SRV). + // rrType is the DNS record type used for downstream data (TXT, NULL, CNAME, + // A, AAAA, MX, NS, SRV, or CAA). rrType uint16 // Sending on pollChan permits sendLoop to send an empty polling query. // sendLoop also does its own polling according to a time schedule. @@ -259,11 +260,10 @@ func (c *DNSPacketConn) TransportErrors() <-chan error { return c.transportErr } -// dnsResponsePayload extracts the downstream payload of a DNS response, encoded -// into the RDATA of a TXT or CNAME RR. It returns (nil, true) when the response -// has a non-NoError RCODE, indicating a forged or hijacked response. It returns -// (payload, false) on success or (nil, false) when the response doesn't pass -// format checks. +// dnsResponsePayload extracts the downstream payload of a DNS response. It +// returns (nil, true) when the response has a non-NoError RCODE, indicating a +// forged or hijacked response. It returns (payload, false) on success or +// (nil, false) when the response doesn't pass format checks. func dnsResponsePayload(resp *dns.Message, domain dns.Name, rrType uint16) ([]byte, bool) { if resp.Flags&0x8000 != 0x8000 { // QR != 1, this is not a response. 
@@ -318,6 +318,10 @@ func dnsResponsePayload(resp *dns.Message, domain dns.Name, rrType uint16) ([]by var payload []byte var err error switch rrType { + case dns.RRTypeNULL: + payload, err = dns.DecodeRDataNULL(answer.Data) + case dns.RRTypeCAA: + payload, err = dns.DecodeRDataCAA(answer.Data) case dns.RRTypeCNAME: payload, err = dns.DecodeRDataCNAME(answer.Data, domain) case dns.RRTypeNS: diff --git a/dns/dns.go b/dns/dns.go index 8f35a6a..909e054 100644 --- a/dns/dns.go +++ b/dns/dns.go @@ -50,10 +50,12 @@ const ( RRTypeA = 1 RRTypeNS = 2 RRTypeCNAME = 5 + RRTypeNULL = 10 RRTypeMX = 15 RRTypeTXT = 16 RRTypeAAAA = 28 RRTypeSRV = 33 + RRTypeCAA = 257 // https://tools.ietf.org/html/rfc6891#section-6.1.1 RRTypeOPT = 41 @@ -78,6 +80,8 @@ func ParseRecordType(s string) (uint16, error) { return RRTypeTXT, nil case "cname": return RRTypeCNAME, nil + case "null": + return RRTypeNULL, nil case "a": return RRTypeA, nil case "aaaa": @@ -88,8 +92,10 @@ func ParseRecordType(s string) (uint16, error) { return RRTypeNS, nil case "srv": return RRTypeSRV, nil + case "caa": + return RRTypeCAA, nil default: - return 0, fmt.Errorf("unknown record type %q: must be one of: txt, cname, a, aaaa, mx, ns, srv", s) + return 0, fmt.Errorf("unknown record type %q: must be one of: txt, cname, null, a, aaaa, mx, ns, srv, caa", s) } } @@ -692,6 +698,40 @@ func EncodeRDataTXT(p []byte) []byte { return buf.Bytes() } +// DecodeRDataNULL decodes NULL RDATA as a raw byte slice. +// https://tools.ietf.org/html/rfc1035#section-3.3.10 +func DecodeRDataNULL(p []byte) ([]byte, error) { return p, nil } + +// EncodeRDataNULL encodes a slice of bytes as NULL RDATA. +// https://tools.ietf.org/html/rfc1035#section-3.3.10 +func EncodeRDataNULL(p []byte) []byte { return p } + +// DecodeRDataCAA decodes CAA RDATA and returns the value portion. 
+// https://datatracker.ietf.org/doc/html/rfc8659 +func DecodeRDataCAA(p []byte) ([]byte, error) { + if len(p) < 2 { + return nil, io.ErrUnexpectedEOF + } + tagLen := int(p[1]) + p = p[2:] + if len(p) < tagLen { + return nil, io.ErrUnexpectedEOF + } + return p[tagLen:], nil +} + +// EncodeRDataCAA encodes a slice of bytes as CAA RDATA using a fixed +// "issue" tag so the payload lives entirely in the value portion. +// https://datatracker.ietf.org/doc/html/rfc8659 +func EncodeRDataCAA(p []byte) []byte { + const tag = "issue" + rdata := make([]byte, 2+len(tag)+len(p)) + rdata[1] = byte(len(tag)) + copy(rdata[2:], tag) + copy(rdata[2+len(tag):], p) + return rdata +} + // base32Encoding is a base32 encoding without padding, used for CNAME RDATA. var base32Encoding = base32.StdEncoding.WithPadding(base32.NoPadding) diff --git a/vaydns-client/main.go b/vaydns-client/main.go index c37cff2..fc7e621 100644 --- a/vaydns-client/main.go +++ b/vaydns-client/main.go @@ -136,7 +136,7 @@ Known TLS fingerprints for -utls are: flag.BoolVar(&udpAcceptErrors, "udp-accept-errors", false, "accept DNS error responses instead of filtering them (disables censorship evasion)") flag.BoolVar(&compatDnstt, "dnstt-compat", false, "use original dnstt wire format (8-byte ClientID, padding prefixes)") flag.IntVar(&clientIDSize, "clientid-size", 2, "client ID size in bytes (ignored when -dnstt-compat is set)") - flag.StringVar(&recordTypeStr, "record-type", "txt", "DNS record type for downstream data (txt, cname, a, aaaa, mx, ns, srv)") + flag.StringVar(&recordTypeStr, "record-type", "txt", "DNS record type for downstream data (txt, null, cname, a, aaaa, mx, ns, srv, caa)") flag.IntVar(&queueSize, "queue-size", turbotunnel.QueueSize, "packet queue size for transport and DNS layers") flag.IntVar(&kcpWindowSize, "kcp-window-size", 0, "KCP send/receive window size in packets (0 = queue-size/2)") flag.StringVar(&queueOverflowStr, "queue-overflow", string(turbotunnel.DefaultQueueOverflowMode), "queue 
overflow behavior: drop or block") diff --git a/vaydns-server/main.go b/vaydns-server/main.go index e043251..71e56ec 100644 --- a/vaydns-server/main.go +++ b/vaydns-server/main.go @@ -421,6 +421,11 @@ func nextPacketDnstt(r *bytes.Reader) ([]byte, error) { // this query. If the returned dns.Message has an Rcode() of dns.RcodeNoError, // the message is a candidate for for carrying downstream data in a TXT record. func responseFor(query *dns.Message, domain dns.Name) (*dns.Message, []byte) { + responsePayloadSize := uint16(maxUDPPayload) + if int(responsePayloadSize) != maxUDPPayload { + responsePayloadSize = 0xffff + } + resp := &dns.Message{ ID: query.ID, Flags: 0x8000, // QR = 1, RCODE = no error @@ -455,7 +460,7 @@ func responseFor(query *dns.Message, domain dns.Name) (*dns.Message, []byte) { resp.Additional = append(resp.Additional, dns.RR{ Name: dns.Name{}, Type: dns.RRTypeOPT, - Class: 4096, // responder's UDP payload size + Class: responsePayloadSize, // responder's UDP payload size TTL: 0, Data: []byte{}, }) @@ -778,6 +783,10 @@ func encodeResponsePayload(rec *record, data []byte, domain dns.Name) error { Data: chunk, } } + case dns.RRTypeNULL: + rec.Resp.Answer[0].Data = dns.EncodeRDataNULL(data) + case dns.RRTypeCAA: + rec.Resp.Answer[0].Data = dns.EncodeRDataCAA(data) case dns.RRTypeCNAME: rdata, err := dns.EncodeRDataCNAME(data, domain) if err != nil { @@ -941,15 +950,15 @@ func sendLoop(dnsConn net.PacketConn, ttConn *turbotunnel.QueuePacketConn, ch <- return nil } -// computeMaxEncodedPayload computes the maximum amount of downstream TXT RR -// data that keep the overall response size less than maxUDPPayload, in the +// computeMaxEncodedPayload computes the maximum amount of downstream single-RR +// payload that keeps the overall response size less than maxUDPPayload, in the // worst case when the response answers a query that has a maximum-length name // in its Question section. 
Returns 0 in the case that no amount of data makes // the overall response size small enough. // // This function needs to be kept in sync with sendLoop with regard to how it // builds candidate responses. -func computeMaxEncodedPayload(limit int) int { +func computeMaxEncodedPayload(limit int, encode func([]byte) []byte) int { // 64+64+64+62 octets, needs to be base32-decodable. maxLengthName, err := dns.NewName([][]byte{ []byte("AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"), @@ -1014,7 +1023,7 @@ func computeMaxEncodedPayload(limit int) int { high := 32768 for low+1 < high { mid := (low + high) / 2 - resp.Answer[0].Data = dns.EncodeRDataTXT(make([]byte, mid)) + resp.Answer[0].Data = encode(make([]byte, mid)) buf, err := resp.WireFormat() if err != nil { panic(err) @@ -1136,8 +1145,12 @@ func run(privkey []byte, domain dns.Name, upstream string, dnsConn net.PacketCon maxEncodedPayload = computeMaxEncodedPayloadMultiRR(maxUDPPayload, 4) case dns.RRTypeAAAA: maxEncodedPayload = computeMaxEncodedPayloadMultiRR(maxUDPPayload, 16) + case dns.RRTypeNULL: + maxEncodedPayload = computeMaxEncodedPayload(maxUDPPayload, dns.EncodeRDataNULL) + case dns.RRTypeCAA: + maxEncodedPayload = computeMaxEncodedPayload(maxUDPPayload, dns.EncodeRDataCAA) default: - maxEncodedPayload = computeMaxEncodedPayload(maxUDPPayload) + maxEncodedPayload = computeMaxEncodedPayload(maxUDPPayload, dns.EncodeRDataTXT) } // 2 bytes accounts for a packet length prefix. mtu := maxEncodedPayload - 2 @@ -1246,7 +1259,7 @@ Example: flag.StringVar(&keepAliveStr, "keepalive", defaultKeepAlive.String(), "keepalive ping interval (e.g. 
2s, 500ms); must be less than idle-timeout") flag.BoolVar(&compatDnstt, "dnstt-compat", false, "use original dnstt wire format (8-byte ClientID, padding prefixes)") flag.IntVar(&clientIDSize, "clientid-size", 2, "client ID size in bytes (ignored when -dnstt-compat is set)") - flag.StringVar(&recordTypeStr, "record-type", "txt", "DNS record type for downstream data (txt, cname, a, aaaa, mx, ns, srv)") + flag.StringVar(&recordTypeStr, "record-type", "txt", "DNS record type for downstream data (txt, null, cname, a, aaaa, mx, ns, srv, caa)") flag.IntVar(&queueSize, "queue-size", turbotunnel.QueueSize, "packet queue size for DNS tunnel transport") flag.IntVar(&kcpWindowSize, "kcp-window-size", 0, "KCP send/receive window size in packets (0 = queue-size/2)") flag.StringVar(&queueOverflowStr, "queue-overflow", string(turbotunnel.DefaultQueueOverflowMode), "queue overflow behavior: drop or block") @@ -1462,7 +1475,8 @@ Example: } log.Infof("wire config: clientid-size=%d compat=%v", wireConfig.ClientIDSize, wireConfig.Compat) - if recordType != dns.RRTypeTXT { + switch recordType { + case dns.RRTypeCNAME, dns.RRTypeNS, dns.RRTypeMX, dns.RRTypeSRV: explicitFlags := make(map[string]bool) flag.Visit(func(f *flag.Flag) { explicitFlags[f.Name] = true From 0d87441d53356f9f675ede19b7bc0f72b7a214e7 Mon Sep 17 00:00:00 2001 From: crazydi4mond <255249920+crazydi4mond@users.noreply.github.com> Date: Fri, 10 Apr 2026 23:40:42 +0200 Subject: [PATCH 08/19] test(dns): add NULL and CAA encode/decode unit tests - Add round-trip, identity, error-path, and wire-format tests - Add resolver filtering note for NULL record in README - Add CAA flags comment in EncodeRDataCAA - Update client library doc with new record type options --- README.md | 2 +- dns/dns.go | 1 + dns/dns_test.go | 96 ++++++++++++++++++++++++++++++++++++++++++ docs/client-library.md | 2 +- 4 files changed, 99 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index b002a90..7259542 100644 --- a/README.md +++ 
b/README.md @@ -408,7 +408,7 @@ VayDNS supports multiple DNS record types for downstream data encoding. Both cli | Type | Description | Capacity | | ---- | ----------- | -------- | | `txt` | TXT record (default). Highest capacity, compatible with dnstt. | Bounded by UDP payload (~1200 bytes) | -| `null` | NULL record. Raw binary payload in a single RR. | Bounded by UDP payload | +| `null` | NULL record. Raw binary payload in a single RR. Some recursive resolvers may filter or refuse to relay NULL records. | Bounded by UDP payload | | `cname` | CNAME record. Data encoded as a DNS name under the tunnel domain. | Bounded by 255-byte DNS name limit | | `ns` | NS record. Same encoding as CNAME. | Same as CNAME | | `mx` | MX record. 2-byte preference header + name encoding. | Same as CNAME | diff --git a/dns/dns.go b/dns/dns.go index 909e054..57eb1a3 100644 --- a/dns/dns.go +++ b/dns/dns.go @@ -726,6 +726,7 @@ func DecodeRDataCAA(p []byte) ([]byte, error) { func EncodeRDataCAA(p []byte) []byte { const tag = "issue" rdata := make([]byte, 2+len(tag)+len(p)) + // rdata[0] = 0 (flags; bit 7 is the "critical" flag per RFC 8659 §4.1) rdata[1] = byte(len(tag)) copy(rdata[2:], tag) copy(rdata[2+len(tag):], p) diff --git a/dns/dns_test.go b/dns/dns_test.go index 4c90b9e..59fe22f 100644 --- a/dns/dns_test.go +++ b/dns/dns_test.go @@ -805,6 +805,102 @@ func TestEncodeDecodeRDataAAAA(t *testing.T) { } } +func TestEncodeDecodeRDataNULL(t *testing.T) { + for _, p := range [][]byte{ + {}, + {0x00}, + {0x01, 0x02, 0x03}, + bytes.Repeat([]byte{0xab}, 100), + bytes.Repeat([]byte{0xff}, 1000), + } { + rdata := EncodeRDataNULL(p) + decoded, err := DecodeRDataNULL(rdata) + if err != nil { + t.Errorf("DecodeRDataNULL(%x): %v", rdata, err) + continue + } + if !bytes.Equal(decoded, p) { + t.Errorf("NULL round-trip failed for len=%d: got len=%d", len(p), len(decoded)) + } + } +} + +func TestRDataNULLIdentity(t *testing.T) { + // NULL encode/decode should be identity — no framing overhead. 
+ p := []byte{0x01, 0x02, 0x03} + if !bytes.Equal(EncodeRDataNULL(p), p) { + t.Error("EncodeRDataNULL should return input unchanged") + } + decoded, _ := DecodeRDataNULL(p) + if !bytes.Equal(decoded, p) { + t.Error("DecodeRDataNULL should return input unchanged") + } +} + +func TestDecodeRDataCAA(t *testing.T) { + for _, test := range []struct { + desc string + p []byte + decoded []byte + err error + }{ + {"empty input", []byte{}, nil, io.ErrUnexpectedEOF}, + {"single byte", []byte{0x00}, nil, io.ErrUnexpectedEOF}, + {"tag length exceeds data", []byte{0x00, 0x05, 'a'}, nil, io.ErrUnexpectedEOF}, + {"tag only, no value", []byte{0x00, 0x05, 'i', 's', 's', 'u', 'e'}, []byte{}, nil}, + {"tag + value", []byte{0x00, 0x05, 'i', 's', 's', 'u', 'e', 0xaa, 0xbb}, []byte{0xaa, 0xbb}, nil}, + {"zero-length tag", []byte{0x00, 0x00, 0x01, 0x02}, []byte{0x01, 0x02}, nil}, + {"flags byte ignored", []byte{0x80, 0x05, 'i', 's', 's', 'u', 'e', 0xff}, []byte{0xff}, nil}, + } { + decoded, err := DecodeRDataCAA(test.p) + if err != test.err { + t.Errorf("%s: got err %v, want %v", test.desc, err, test.err) + continue + } + if err == nil && !bytes.Equal(decoded, test.decoded) { + t.Errorf("%s: got %x, want %x", test.desc, decoded, test.decoded) + } + } +} + +func TestEncodeRDataCAA(t *testing.T) { + p := []byte{0x01, 0x02, 0x03} + rdata := EncodeRDataCAA(p) + // Expected: flags(0) + tagLen(5) + "issue" + payload + expected := append([]byte{0x00, 0x05, 'i', 's', 's', 'u', 'e'}, p...) + if !bytes.Equal(rdata, expected) { + t.Errorf("EncodeRDataCAA(%x) = %x, want %x", p, rdata, expected) + } +} + +func TestEncodeRDataCAAEmpty(t *testing.T) { + rdata := EncodeRDataCAA([]byte{}) + // Even with empty payload, should have flags + tagLen + tag. 
+ if len(rdata) != 7 { + t.Errorf("EncodeRDataCAA(empty) length = %d, want 7", len(rdata)) + } +} + +func TestRDataCAARoundTrip(t *testing.T) { + for _, p := range [][]byte{ + {}, + {0x00}, + {0x01, 0x02, 0x03}, + bytes.Repeat([]byte{0xab}, 100), + bytes.Repeat([]byte{0xff}, 1000), + } { + rdata := EncodeRDataCAA(p) + decoded, err := DecodeRDataCAA(rdata) + if err != nil { + t.Errorf("CAA round-trip decode error for len=%d: %v", len(p), err) + continue + } + if !bytes.Equal(decoded, p) { + t.Errorf("CAA round-trip failed for len=%d: got len=%d", len(p), len(decoded)) + } + } +} + func TestReadRRMXCompression(t *testing.T) { // DNS message with MX answer using compression pointer in exchange name. msg := []byte{ diff --git a/docs/client-library.md b/docs/client-library.md index 25514ec..391b6b0 100644 --- a/docs/client-library.md +++ b/docs/client-library.md @@ -84,7 +84,7 @@ ts.ClientIDSize = 1 // smaller ClientID ts.MaxQnameLen = 101 // QNAME length constraint ts.MaxNumLabels = 2 // label count constraint ts.RPS = 200 // rate limit queries/second -ts.RecordType = "cname" // DNS record type for downstream data (default: "txt") +ts.RecordType = "cname" // DNS record type for downstream data: txt, null, cname, a, aaaa, mx, ns, srv, caa (default: "txt") // Session options t.IdleTimeout = 60 * time.Second From a0ff70110d5e96686ab8f4c8e1c44cd09be07a75 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 10 Apr 2026 23:43:53 +0200 Subject: [PATCH 09/19] chore(main): release 0.2.8 (#65) Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- .release-please-manifest.json | 2 +- CHANGELOG.md | 8 ++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/.release-please-manifest.json b/.release-please-manifest.json index 054e8bc..e7cf9b3 100644 --- a/.release-please-manifest.json +++ b/.release-please-manifest.json @@ -1,3 +1,3 @@ { - ".": "0.2.7" + ".": 
"0.2.8" } diff --git a/CHANGELOG.md b/CHANGELOG.md index fb45f67..5b48bc8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,13 @@ # Changelog +## [0.2.8](https://github.com/net2share/vaydns/compare/v0.2.7...v0.2.8) (2026-04-10) + + +### Features + +* add -v flag for printing version ([#71](https://github.com/net2share/vaydns/issues/71)) ([3533148](https://github.com/net2share/vaydns/commit/35331484bce1b628372dc37accae4e96b507841f)) +* add NULL and CAA record type support and fix server EDNS mtu advertisement ([#70](https://github.com/net2share/vaydns/issues/70)) ([8354db2](https://github.com/net2share/vaydns/commit/8354db2a080ce9543594bc4e71592f33e6489d82)) + ## [0.2.7](https://github.com/net2share/vaydns/compare/v0.2.6...v0.2.7) (2026-04-01) From 4dcdee0310fe7729500c3c9be5b541dfbe93a9a4 Mon Sep 17 00:00:00 2001 From: crazydi4mond <255249920+crazydi4mond@users.noreply.github.com> Date: Sun, 12 Apr 2026 00:36:37 +0200 Subject: [PATCH 10/19] fix(multi-resolver): isolate single-entry transport errors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A read or write error on any one resolver entry previously propagated to DNSPacketConn via recvChan, tearing down the entire tunnel session — the opposite of what multi-resolver is meant to deliver. In practice only DoT triggered the cascade (UDP/DoH workers retry silently), but a mixed setup with one flaky DoT would reconnect in a loop. - Reader goroutines now mark their entry dead on transport error and exit, instead of pushing error-bearing results onto recvChan. - resolverEntry gains a sticky atomic dead flag; selection helpers and probe targeting skip dead entries; recomputeState short-circuits on them so the decay logic cannot flip Down back to Unknown. - MultiResolver gains an aliveCount/allDead pair; the last dying reader closes allDead, causing ReadFrom/WriteTo to return a terminal "all resolvers are down" error. 
This is the only condition that should cascade to tunnel reconnect. - WriteTo retries with the next alive entry on a writePacket failure, marking the failed entry dead. Error wrapping handles the empty case without a %w nil. - ReadFrom drains any buffered packet from a deceased reader before surfacing the terminal error, avoiding packet loss at teardown. - markDead holds mu during the transition and returns true only on the alive-to-dead edge, so concurrent observers (reader + writer seeing the same Close) cannot double-decrement aliveCount. Unit tests in client/multi_resolver_test.go exercise both symptoms with a fake net.PacketConn: - TestMultiResolver_DeadEntryDoesNotBreakReadFrom — ReadFrom must return bytes from the working entry after a peer errors. - TestMultiResolver_FailedEntryExcludedFromSelection — selectPrimary must stop returning the failed entry after its reader exits. --- client/multi_resolver.go | 186 +++++++++++++++++++++++---- client/multi_resolver_test.go | 229 ++++++++++++++++++++++++++++++++++ 2 files changed, 389 insertions(+), 26 deletions(-) create mode 100644 client/multi_resolver_test.go diff --git a/client/multi_resolver.go b/client/multi_resolver.go index c46f791..d1fea54 100644 --- a/client/multi_resolver.go +++ b/client/multi_resolver.go @@ -66,6 +66,13 @@ type resolverEntry struct { invalidCount atomic.Int64 timeoutCount atomic.Int64 + // dead is set when the entry's transport has reported an unrecoverable + // error and its reader goroutine has exited. A dead entry is permanently + // excluded from selection and never transitions back — the only way to + // recover is to rebuild the entire MultiResolver (which happens on a + // full tunnel reconnect). + dead atomic.Bool + mu sync.Mutex pending map[uint16]time.Time lastWrite time.Time @@ -74,6 +81,31 @@ type resolverEntry struct { state ResolverState } +// markDead transitions the entry to a permanent Down state. 
It is idempotent +// and returns true only when this call was the one that transitioned the +// entry — callers rely on this to avoid double-counting the entry in +// MultiResolver.aliveCount when multiple goroutines detect the same transport +// failure concurrently. +// +// The dead flag and the state field are set together under mu so that +// recomputeState (also holding mu) cannot observe an inconsistent snapshot +// where dead is set but state has not yet been transitioned to Down. +func (e *resolverEntry) markDead() bool { + e.mu.Lock() + defer e.mu.Unlock() + if e.dead.Load() { + return false + } + e.dead.Store(true) + e.state = ResolverStateDown + return true +} + +// isDead reports whether markDead has been called on this entry. +func (e *resolverEntry) isDead() bool { + return e.dead.Load() +} + func (e *resolverEntry) writePacket(b []byte) (int, error) { now := time.Now() e.trackOutgoingID(b, now) @@ -156,8 +188,17 @@ func (e *resolverEntry) expirePending(now time.Time) { } func (e *resolverEntry) recomputeState(now time.Time) { + // Dead is sticky — never recompute it back to any other state. We + // re-check under the lock after acquiring it, in case markDead raced + // in between the outer check and acquiring the lock. + if e.isDead() { + return + } e.mu.Lock() defer e.mu.Unlock() + if e.dead.Load() { + return + } timeouts := e.timeoutCount.Load() invalid := e.invalidCount.Load() @@ -238,6 +279,15 @@ type MultiResolver struct { recvChan chan multiReadResult closed chan struct{} closeOnce sync.Once + + // aliveCount tracks how many entries still have a live reader + // goroutine. When it reaches zero, allDead is closed, causing pending + // ReadFrom/WriteTo calls to return an "all resolvers down" error, + // which is the only condition that should propagate a transport + // error to the upper tunnel session. 
+ aliveCount atomic.Int32 + allDead chan struct{} + allDeadOnce sync.Once } // NewMultiResolver creates a MultiResolver from a slice of Resolver configs. @@ -269,25 +319,57 @@ func NewMultiResolver(resolvers []Resolver, mode SelectionMode, queueSize int, o mode: mode, recvChan: make(chan multiReadResult, len(entries)*4), closed: make(chan struct{}), + allDead: make(chan struct{}), } - for _, e := range entries { + mr.aliveCount.Store(int32(len(entries))) + mr.startReaders() + go mr.healthWorker() + return mr, nil +} + +// entryDied is called whenever a transport error is observed for an entry, +// either from its reader goroutine or from a failed write. It marks the entry +// dead and, if this was the transition from alive to dead (as opposed to a +// second observer of the same failure), decrements aliveCount. When the last +// alive entry dies, allDead is closed so pending ReadFrom/WriteTo callers can +// unblock with a terminal error. +func (mr *MultiResolver) entryDied(entry *resolverEntry, err error) { + if !entry.markDead() { + return + } + log.Warnf("multi-resolver: entry %s transport error: %v; marking down", entry.name, err) + if mr.aliveCount.Add(-1) == 0 { + mr.allDeadOnce.Do(func() { close(mr.allDead) }) + } +} + +// startReaders launches one reader goroutine per entry. Each goroutine reads +// packets from its entry's transport and pushes the result onto recvChan. +// Extracted so tests can construct MultiResolver with synthetic entries. +// +// On a transport error, the reader calls entryDied to mark the entry dead and +// signal allDead when the last entry dies. It never pushes error-bearing +// results onto recvChan: doing so would surface a single resolver's failure +// to DNSPacketConn.recvLoop, which would tear down the entire tunnel session +// and defeat the point of having multiple resolvers. 
+func (mr *MultiResolver) startReaders() { + for _, e := range mr.entries { entry := e go func() { for { res := entry.readPacket() + if res.err != nil { + mr.entryDied(entry, res.err) + return + } select { case mr.recvChan <- res: case <-mr.closed: return } - if res.err != nil { - return - } } }() } - go mr.healthWorker() - return mr, nil } func (mr *MultiResolver) healthWorker() { @@ -350,32 +432,66 @@ func isRateLimitedResponse(resp dns.Message) bool { } // ReadFrom receives a packet from whichever resolver responds first. +// It only returns an error when the MultiResolver has been closed or every +// entry has died; a single resolver's transport error is isolated at the +// reader goroutine and does not propagate here. func (mr *MultiResolver) ReadFrom(b []byte) (n int, addr net.Addr, err error) { select { case <-mr.closed: return 0, nil, net.ErrClosed case res := <-mr.recvChan: - if res.err != nil { - return 0, res.addr, res.err - } n = copy(b, res.buf[:res.n]) return n, turbotunnel.DummyAddr{}, nil + case <-mr.allDead: + // Drain any packet that was buffered by a reader before it + // died, so packets already delivered by the transport are not + // discarded in favour of the terminal error. No more readers + // are pushing (allDead is closed only after every reader has + // exited), so a non-blocking read here races only with the + // mr.closed case above, which is handled on the next call. + select { + case res := <-mr.recvChan: + n = copy(b, res.buf[:res.n]) + return n, turbotunnel.DummyAddr{}, nil + default: + return 0, nil, fmt.Errorf("multi-resolver: all resolvers are down") + } } } // WriteTo sends b to the selected primary resolver and may duplicate b to one -// unhealthy resolver as a probe to detect recovery. +// unhealthy resolver as a probe to detect recovery. If the primary's write +// fails, the entry is marked dead and WriteTo retries with the next alive +// entry. 
An error is returned only when the MultiResolver is closed or every +// entry has been marked dead. func (mr *MultiResolver) WriteTo(b []byte, _ net.Addr) (n int, err error) { select { case <-mr.closed: return 0, net.ErrClosed + case <-mr.allDead: + return 0, fmt.Errorf("multi-resolver: all resolvers are down") default: } - primary := mr.selectPrimary() - n, err = primary.writePacket(b) - if err != nil { - return n, err + // Try entries until one accepts the write or all alive entries have + // been exhausted. Each write error marks the entry dead. + var primary *resolverEntry + for attempts := 0; attempts < len(mr.entries); attempts++ { + primary = mr.selectPrimary() + if primary == nil { + break + } + n, err = primary.writePacket(b) + if err == nil { + break + } + mr.entryDied(primary, err) + } + switch { + case err != nil: + return 0, fmt.Errorf("multi-resolver: all write attempts failed: %w", err) + case primary == nil: + return 0, fmt.Errorf("multi-resolver: no alive resolver for write") } if probe := mr.selectProbeTarget(primary); probe != nil { @@ -385,17 +501,17 @@ func (mr *MultiResolver) WriteTo(b []byte, _ net.Addr) (n int, err error) { return n, nil } +// selectPrimary returns the entry that should receive the next outgoing +// query, or nil if every entry has been marked dead. The caller must handle +// nil (e.g., return an "all resolvers down" error). func (mr *MultiResolver) selectPrimary() *resolverEntry { if mr.mode == SelectionRoundRobin { - if e := mr.selectRoundRobinHealthy(); e != nil { - return e - } - return mr.entries[0] + return mr.selectRoundRobinHealthy() } if e := mr.selectBestScore(); e != nil { return e } - return mr.entries[0] + return mr.selectRoundRobinHealthy() } func (mr *MultiResolver) selectRoundRobinHealthy() *resolverEntry { @@ -407,28 +523,43 @@ func (mr *MultiResolver) selectRoundRobinHealthy() *resolverEntry { } start := mr.rrIndex + // First pass: prefer Healthy or Unknown, skipping dead entries. 
for i := 0; i < len(mr.entries); i++ { idx := (start + i) % len(mr.entries) + if mr.entries[idx].isDead() { + continue + } state := mr.entries[idx].stateSnapshot() if state == ResolverStateHealthy || state == ResolverStateUnknown { mr.rrIndex = (idx + 1) % len(mr.entries) return mr.entries[idx] } } - idx := start % len(mr.entries) - mr.rrIndex = (idx + 1) % len(mr.entries) - return mr.entries[idx] + // Second pass: accept any non-dead entry even if RateLimited/Down. + for i := 0; i < len(mr.entries); i++ { + idx := (start + i) % len(mr.entries) + if mr.entries[idx].isDead() { + continue + } + mr.rrIndex = (idx + 1) % len(mr.entries) + return mr.entries[idx] + } + // Every entry is dead. + return nil } func (mr *MultiResolver) selectBestScore() *resolverEntry { if len(mr.entries) == 0 { return nil } - best := mr.entries[0] - bestScore := resolverScore(best) - for _, e := range mr.entries[1:] { + var best *resolverEntry + var bestScore int64 + for _, e := range mr.entries { + if e.isDead() { + continue + } s := resolverScore(e) - if s > bestScore { + if best == nil || s > bestScore { best = e bestScore = s } @@ -462,6 +593,9 @@ func (mr *MultiResolver) selectProbeTarget(primary *resolverEntry) *resolverEntr if e == primary { continue } + if e.isDead() { + continue + } state := e.stateSnapshot() if state == ResolverStateHealthy { continue diff --git a/client/multi_resolver_test.go b/client/multi_resolver_test.go new file mode 100644 index 0000000..ec31411 --- /dev/null +++ b/client/multi_resolver_test.go @@ -0,0 +1,229 @@ +package client + +import ( + "bytes" + "net" + "sync" + "testing" + "time" + + "github.com/net2share/vaydns/turbotunnel" +) + +// fakePacketConn is a controllable net.PacketConn for MultiResolver tests. +// Pushed responses are consumed in order by successive ReadFrom calls; each +// response can be either data bytes or an error. 
+type fakePacketConn struct { + name string + readCh chan fakeReadResp + closed chan struct{} + once sync.Once +} + +type fakeReadResp struct { + data []byte + err error +} + +func newFakePacketConn(name string) *fakePacketConn { + return &fakePacketConn{ + name: name, + readCh: make(chan fakeReadResp, 16), + closed: make(chan struct{}), + } +} + +func (f *fakePacketConn) pushData(data []byte) { + cp := make([]byte, len(data)) + copy(cp, data) + f.readCh <- fakeReadResp{data: cp} +} + +func (f *fakePacketConn) pushError(err error) { + f.readCh <- fakeReadResp{err: err} +} + +func (f *fakePacketConn) ReadFrom(p []byte) (int, net.Addr, error) { + select { + case r, ok := <-f.readCh: + if !ok { + return 0, nil, net.ErrClosed + } + if r.err != nil { + return 0, nil, r.err + } + return copy(p, r.data), turbotunnel.DummyAddr{}, nil + case <-f.closed: + return 0, nil, net.ErrClosed + } +} + +func (f *fakePacketConn) WriteTo(p []byte, _ net.Addr) (int, error) { + return len(p), nil +} + +func (f *fakePacketConn) Close() error { + f.once.Do(func() { close(f.closed) }) + return nil +} + +func (f *fakePacketConn) LocalAddr() net.Addr { return turbotunnel.DummyAddr{} } +func (f *fakePacketConn) SetDeadline(time.Time) error { return nil } +func (f *fakePacketConn) SetReadDeadline(time.Time) error { return nil } +func (f *fakePacketConn) SetWriteDeadline(time.Time) error { return nil } + +// newFakeEntry builds a resolverEntry around a fakePacketConn, suitable for +// injection into a hand-constructed MultiResolver in tests. +func newFakeEntry(name string, conn *fakePacketConn) *resolverEntry { + return &resolverEntry{ + name: name, + addr: turbotunnel.DummyAddr{}, + conn: conn, + pending: make(map[uint16]time.Time), + state: ResolverStateUnknown, + } +} + +// newTestMultiResolver mirrors what NewMultiResolver does after +// GetResolverConnection returns, but with pre-built synthetic entries. 
It does +// not spawn the healthWorker — tests don't need it, and omitting it keeps them +// hermetic with respect to timing. +func newTestMultiResolver(entries []*resolverEntry, mode SelectionMode) *MultiResolver { + mr := &MultiResolver{ + entries: entries, + mode: mode, + recvChan: make(chan multiReadResult, len(entries)*4), + closed: make(chan struct{}), + allDead: make(chan struct{}), + } + mr.aliveCount.Store(int32(len(entries))) + mr.startReaders() + return mr +} + +// TestMultiResolver_DeadEntryDoesNotBreakReadFrom exercises C1: when one +// resolver entry's transport returns a fatal read error, MultiResolver.ReadFrom +// must continue to deliver packets from healthy entries instead of propagating +// that error to the upper DNSPacketConn layer (which would tear down the whole +// tunnel session). +// +// Pre-fix expectation: this test FAILS — the reader goroutine pushes the error +// onto recvChan, ReadFrom returns it, and the test sees net.ErrClosed instead +// of the bytes from the working entry. +// +// Post-fix expectation: this test PASSES — the reader goroutine handles the +// error internally (e.g. marks the entry Down and exits), leaving recvChan to +// deliver only real response bytes from the surviving entries. +func TestMultiResolver_DeadEntryDoesNotBreakReadFrom(t *testing.T) { + failing := newFakePacketConn("failing") + working := newFakePacketConn("working") + + entries := []*resolverEntry{ + newFakeEntry("failing", failing), + newFakeEntry("working", working), + } + mr := newTestMultiResolver(entries, SelectionRoundRobin) + defer mr.Close() + + // Step 1: make the failing entry's ReadFrom return a fatal error. + // The reader goroutine will pick this up immediately. + failing.pushError(net.ErrClosed) + + // Give the reader goroutine time to handle the error. 
Under the buggy + // implementation it pushes an error-bearing multiReadResult onto + // recvChan and exits; under a correct implementation it would quietly + // mark the entry down and exit without poisoning recvChan. + time.Sleep(100 * time.Millisecond) + + // Step 2: push a valid response to the working entry. This lands on + // recvChan strictly AFTER any push from the failing entry, so the + // ordering is deterministic. + wantResponse := []byte("valid-response-bytes-from-working-resolver") + working.pushData(wantResponse) + + // Step 3: ReadFrom must return the bytes from the working entry, not + // the error from the failing one. + type readResult struct { + n int + err error + buf []byte + } + done := make(chan readResult, 1) + go func() { + buf := make([]byte, 4096) + n, _, err := mr.ReadFrom(buf) + done <- readResult{n: n, err: err, buf: buf} + }() + + select { + case r := <-done: + if r.err != nil { + t.Fatalf("MultiResolver.ReadFrom returned error %v; expected it to ignore the failed entry and return the bytes from the working entry. This is the C1 bug: a single resolver's read error tears down the tunnel.", r.err) + } + if !bytes.Equal(r.buf[:r.n], wantResponse) { + t.Fatalf("MultiResolver.ReadFrom returned wrong bytes.\n got: %x\n want: %x", r.buf[:r.n], wantResponse) + } + case <-time.After(3 * time.Second): + t.Fatal("MultiResolver.ReadFrom timed out; expected it to return bytes from the working entry within 3s") + } +} + +// TestMultiResolver_FailedEntryExcludedFromSelection exercises the selection +// side of C1: after a reader goroutine has processed a fatal read error and +// exited, the entry must no longer be returned by selectPrimary. Otherwise +// the round-robin scheduler will keep sending queries to a resolver whose +// reader goroutine is gone (so responses will never come back), defeating the +// health state machine. 
+// +// Pre-fix expectation: this test FAILS — the reader goroutine exits without +// updating the entry's state, so it remains ResolverStateUnknown, which +// selectRoundRobinHealthy treats as an acceptable target. The "failing" entry +// is returned roughly half the time. +// +// Post-fix expectation: this test PASSES — the reader goroutine transitions +// the entry to ResolverStateDown before exiting, and selectPrimary skips it. +func TestMultiResolver_FailedEntryExcludedFromSelection(t *testing.T) { + failing := newFakePacketConn("failing") + working := newFakePacketConn("working") + + entries := []*resolverEntry{ + newFakeEntry("failing", failing), + newFakeEntry("working", working), + } + mr := newTestMultiResolver(entries, SelectionRoundRobin) + defer mr.Close() + + // Kill the failing entry's reader; leave working blocked in ReadFrom. + failing.pushError(net.ErrClosed) + + // Let the reader goroutine process the error and (ideally) mark the + // entry down before it exits. + time.Sleep(100 * time.Millisecond) + + // Drain any error result that the buggy reader may have pushed onto + // recvChan. A correct reader wouldn't push anything here, so this + // drain is a no-op post-fix. Without it, the buggy pre-fix state + // machine would leave an error lingering, which isn't what this + // specific assertion is about. +drain: + for { + select { + case <-mr.recvChan: + default: + break drain + } + } + + // Call selectPrimary repeatedly. It must never return the failing + // entry — otherwise real queries will be sent to a resolver whose + // reader is gone, so responses will never come back. + for i := range 10 { + selected := mr.selectPrimary() + if selected == nil { + t.Fatalf("iter %d: selectPrimary returned nil", i) + } + if selected.name == "failing" { + t.Fatalf("iter %d: selectPrimary returned the failed entry %q; expected it to be excluded after its reader exited on error. 
State is %s.", i, selected.name, selected.stateSnapshot()) + } + } +} From 922faf0ff6c706d6330df9eb824d8ae8071c4afa Mon Sep 17 00:00:00 2001 From: crazydi4mond <255249920+crazydi4mond@users.noreply.github.com> Date: Sun, 12 Apr 2026 01:06:27 +0200 Subject: [PATCH 11/19] test(e2e): add multi-resolver integration tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Exercise the multi-resolver client path end-to-end with three Docker-based scenarios covering the happy path, a mid-flight resolver kill, and a forging resolver: - multi-resolver: two healthy CoreDNS forwarders, both serving the tunnel domain; verifies the client works with -udp flag repeated. - multi-resolver-runtime-failure: two healthy resolvers, then docker kill one mid-flight; asserts HTTP traffic keeps flowing through the survivor AND that no new tunnel sessions are created — proving the single-entry failure is isolated, not escalated to a full session reconnect. - multi-resolver-forge: one healthy resolver plus one CoreDNS instance using the template plugin to inject NXDOMAIN for every query; asserts the tunnel still delivers while the UDP worker's forged-response filter absorbs the injections, and sanity-checks the forging path by grepping dns-forge's own logs for NXDOMAIN to rule out a round-robin false pass. All three tests use explicit -idle-timeout 10s -keepalive 2s on both client and server (matching recovery/transport-recovery) so they stay anchored to a production-like timing profile and won't drift when the branch's defaults are later reconciled with main. The full e2e suite runner (run-test.sh) is updated to include the three new tests after transport-recovery. 
--- e2e/multi-resolver-forge/Corefile.forge | 6 ++ e2e/multi-resolver-forge/docker-compose.yml | 82 +++++++++++++++++++ e2e/multi-resolver-forge/run.sh | 59 +++++++++++++ .../docker-compose.yml | 82 +++++++++++++++++++ e2e/multi-resolver-runtime-failure/run.sh | 82 +++++++++++++++++++ e2e/multi-resolver/docker-compose.yml | 82 +++++++++++++++++++ e2e/multi-resolver/run.sh | 29 +++++++ e2e/run-test.sh | 2 +- 8 files changed, 423 insertions(+), 1 deletion(-) create mode 100644 e2e/multi-resolver-forge/Corefile.forge create mode 100644 e2e/multi-resolver-forge/docker-compose.yml create mode 100755 e2e/multi-resolver-forge/run.sh create mode 100644 e2e/multi-resolver-runtime-failure/docker-compose.yml create mode 100755 e2e/multi-resolver-runtime-failure/run.sh create mode 100644 e2e/multi-resolver/docker-compose.yml create mode 100755 e2e/multi-resolver/run.sh diff --git a/e2e/multi-resolver-forge/Corefile.forge b/e2e/multi-resolver-forge/Corefile.forge new file mode 100644 index 0000000..224cbc9 --- /dev/null +++ b/e2e/multi-resolver-forge/Corefile.forge @@ -0,0 +1,6 @@ +. { + template IN ANY . { + rcode NXDOMAIN + } + log +} diff --git a/e2e/multi-resolver-forge/docker-compose.yml b/e2e/multi-resolver-forge/docker-compose.yml new file mode 100644 index 0000000..b5be9f2 --- /dev/null +++ b/e2e/multi-resolver-forge/docker-compose.yml @@ -0,0 +1,82 @@ +networks: + dns-net: + ipam: + config: + - subnet: 172.28.0.0/24 + backend-net: + +volumes: + keys: + +services: + keygen: + build: + context: ../.. 
+ dockerfile: Dockerfile + volumes: + - keys:/keys + command: > + sh -c "vaydns-server -gen-key -privkey-file /keys/server.key -pubkey-file /keys/server.pub" + + dns-good: + image: coredns/coredns + networks: + dns-net: + ipv4_address: 172.28.0.10 + volumes: + - ../Corefile:/Corefile + command: ["-conf", "/Corefile"] + + dns-forge: + image: coredns/coredns + networks: + dns-net: + ipv4_address: 172.28.0.11 + volumes: + - ./Corefile.forge:/Corefile + command: ["-conf", "/Corefile"] + + backend: + image: nginx:alpine + networks: + - backend-net + + server: + build: + context: ../.. + dockerfile: Dockerfile + networks: + dns-net: + ipv4_address: 172.28.0.20 + backend-net: + volumes: + - keys:/keys + command: > + vaydns-server -udp :53 -privkey-file /keys/server.key + -domain t.example.com -upstream backend:80 + -idle-timeout 10s -keepalive 2s + depends_on: + keygen: + condition: service_completed_successfully + dns-good: + condition: service_started + dns-forge: + condition: service_started + + client: + build: + context: ../.. + dockerfile: Dockerfile + networks: + - dns-net + volumes: + - keys:/keys + command: > + vaydns-client -udp 172.28.0.10:53 -udp 172.28.0.11:53 + -pubkey-file /keys/server.pub + -domain t.example.com -listen 0.0.0.0:7000 + -idle-timeout 10s -keepalive 2s -session-check-interval 500ms + -reconnect-min 1s -reconnect-max 5s -log-level info + depends_on: + server: + condition: service_started diff --git a/e2e/multi-resolver-forge/run.sh b/e2e/multi-resolver-forge/run.sh new file mode 100755 index 0000000..418fa3c --- /dev/null +++ b/e2e/multi-resolver-forge/run.sh @@ -0,0 +1,59 @@ +#!/usr/bin/env bash +# Test: one resolver returns forged responses while another works. +# dns-forge always replies with NXDOMAIN (CoreDNS template plugin), simulating +# a censor or broken resolver injecting fake responses. dns-good behaves +# normally. 
The tunnel must work through dns-good, with the per-query UDP +# worker's forged-response filter absorbing dns-forge's NXDOMAINs and +# MultiResolver's health state machine eventually routing around it. +set -euo pipefail +cd "$(dirname "$0")" + +cleanup() { docker compose down -v 2>/dev/null; } +trap cleanup EXIT + +fetch() { + docker compose exec -T client wget -q -O- http://localhost:7000 2>/dev/null | grep -q "Welcome to nginx" +} + +echo "--- Building and starting services ---" +docker compose up -d --build + +echo "--- Waiting for tunnel through dns-good while dns-forge injects NXDOMAINs (up to 60s) ---" +ok_count=0 +for i in $(seq 1 60); do + if fetch; then + ok_count=$((ok_count + 1)) + # Require two consecutive successes so one lucky query doesn't pass + # the test while the forging resolver is still in rotation. + if [ "$ok_count" -ge 2 ]; then + echo "" + # Sanity check: make sure dns-forge actually saw queries, so we + # know the tunnel was exercising the forging code path and didn't + # only hit dns-good by chance. CoreDNS's log plugin may buffer + # output briefly, so give it a moment to flush before grepping. + sleep 2 + forge_logs=$(docker compose logs dns-forge 2>&1) + if ! grep -q 'NXDOMAIN' <<<"$forge_logs"; then + echo "--- dns-forge never served NXDOMAIN; the forging code path may not have been exercised ---" + echo "$forge_logs" + echo "=== FAIL (forging path not exercised) ===" + exit 1 + fi + nxdomain_count=$(grep -c 'NXDOMAIN' <<<"$forge_logs" || true) + echo "--- dns-forge served $nxdomain_count NXDOMAIN responses; forging path exercised ---" + echo "--- Tunnel delivers consistent responses despite forged NXDOMAINs ---" + echo "=== PASS ===" + exit 0 + fi + else + ok_count=0 + fi + printf "." 
+ sleep 1 +done + +echo "" +echo "--- Tunnel did not come up through dns-good ---" +docker compose logs client server dns-good dns-forge +echo "=== FAIL ===" +exit 1 diff --git a/e2e/multi-resolver-runtime-failure/docker-compose.yml b/e2e/multi-resolver-runtime-failure/docker-compose.yml new file mode 100644 index 0000000..d456182 --- /dev/null +++ b/e2e/multi-resolver-runtime-failure/docker-compose.yml @@ -0,0 +1,82 @@ +networks: + dns-net: + ipam: + config: + - subnet: 172.28.0.0/24 + backend-net: + +volumes: + keys: + +services: + keygen: + build: + context: ../.. + dockerfile: Dockerfile + volumes: + - keys:/keys + command: > + sh -c "vaydns-server -gen-key -privkey-file /keys/server.key -pubkey-file /keys/server.pub" + + dns1: + image: coredns/coredns + networks: + dns-net: + ipv4_address: 172.28.0.10 + volumes: + - ../Corefile:/Corefile + command: ["-conf", "/Corefile"] + + dns2: + image: coredns/coredns + networks: + dns-net: + ipv4_address: 172.28.0.11 + volumes: + - ../Corefile:/Corefile + command: ["-conf", "/Corefile"] + + backend: + image: nginx:alpine + networks: + - backend-net + + server: + build: + context: ../.. + dockerfile: Dockerfile + networks: + dns-net: + ipv4_address: 172.28.0.20 + backend-net: + volumes: + - keys:/keys + command: > + vaydns-server -udp :53 -privkey-file /keys/server.key + -domain t.example.com -upstream backend:80 + -idle-timeout 10s -keepalive 2s + depends_on: + keygen: + condition: service_completed_successfully + dns1: + condition: service_started + dns2: + condition: service_started + + client: + build: + context: ../.. 
+ dockerfile: Dockerfile + networks: + - dns-net + volumes: + - keys:/keys + command: > + vaydns-client -udp 172.28.0.10:53 -udp 172.28.0.11:53 + -pubkey-file /keys/server.pub + -domain t.example.com -listen 0.0.0.0:7000 + -idle-timeout 10s -keepalive 2s -session-check-interval 500ms + -reconnect-min 1s -reconnect-max 5s -log-level info + depends_on: + server: + condition: service_started diff --git a/e2e/multi-resolver-runtime-failure/run.sh b/e2e/multi-resolver-runtime-failure/run.sh new file mode 100755 index 0000000..a863b79 --- /dev/null +++ b/e2e/multi-resolver-runtime-failure/run.sh @@ -0,0 +1,82 @@ +#!/usr/bin/env bash +# Test: one of several UDP resolvers dies mid-flight. +# Start the tunnel with two working DNS resolvers, verify HTTP traffic works, +# kill one resolver, and verify traffic continues through the remaining one. +# +# A failing UDP resolver does not surface an error to MultiResolver (the +# per-query worker retries silently), so this test validates that the health +# state machine + round-robin selection routes around it without tearing +# down the tunnel session. +set -euo pipefail +cd "$(dirname "$0")" + +cleanup() { docker compose down -v 2>/dev/null; } +trap cleanup EXIT + +fetch() { + docker compose exec -T client wget -q -O- http://localhost:7000 2>/dev/null | grep -q "Welcome to nginx" +} + +echo "--- Building and starting services ---" +docker compose up -d --build + +echo "--- Waiting for initial tunnel (up to 30s) ---" +for i in $(seq 1 30); do + if fetch; then + echo "" + echo "--- Initial tunnel is up ---" + break + fi + if [ "$i" -eq 30 ]; then + echo "" + docker compose logs client server dns1 dns2 + echo "=== FAIL (initial tunnel not ready) ===" + exit 1 + fi + printf "." + sleep 1 +done + +# Snapshot the number of 'session ... ready' lines in the client log before the kill, so a later increase reveals a reconnect.
+pre_kill_sessions=$(docker compose logs client 2>&1 | grep -c 'session .* ready' || true) + +echo "--- Killing dns1 (half the queries will start dropping) ---" +docker compose kill dns1 + +# Give the client a moment to notice and start routing around dns1. +sleep 3 + +echo "--- Verifying tunnel still works through dns2 (up to 45s) ---" +ok_count=0 +for i in $(seq 1 45); do + if fetch; then + ok_count=$((ok_count + 1)) + # Require two consecutive successes so we don't declare victory on + # a lucky query that happened to go to dns2. + if [ "$ok_count" -ge 2 ]; then + post_kill_sessions=$(docker compose logs client 2>&1 | grep -c 'session .* ready' || true) + new_sessions=$((post_kill_sessions - pre_kill_sessions)) + if [ "$new_sessions" -gt 0 ]; then + echo "" + echo "--- Tunnel recovered but triggered $new_sessions new session(s) — resolver failure should be isolated without a full reconnect ---" + docker compose logs client | tail -30 + echo "=== FAIL (session was rebuilt instead of isolated) ===" + exit 1 + fi + echo "" + echo "--- Tunnel survived with 0 new sessions (single-entry failure isolated) ---" + echo "=== PASS ===" + exit 0 + fi + else + ok_count=0 + fi + printf "." + sleep 1 +done + +echo "" +echo "--- Tunnel did not survive dns1 kill ---" +docker compose logs client server dns1 dns2 +echo "=== FAIL ===" +exit 1 diff --git a/e2e/multi-resolver/docker-compose.yml b/e2e/multi-resolver/docker-compose.yml new file mode 100644 index 0000000..36f9222 --- /dev/null +++ b/e2e/multi-resolver/docker-compose.yml @@ -0,0 +1,82 @@ +networks: + dns-net: + ipam: + config: + - subnet: 172.28.0.0/24 + backend-net: + +volumes: + keys: + +services: + keygen: + build: + context: ../.. 
+ dockerfile: Dockerfile + volumes: + - keys:/keys + command: > + sh -c "vaydns-server -gen-key -privkey-file /keys/server.key -pubkey-file /keys/server.pub" + + dns1: + image: coredns/coredns + networks: + dns-net: + ipv4_address: 172.28.0.10 + volumes: + - ../Corefile:/Corefile + command: ["-conf", "/Corefile"] + + dns2: + image: coredns/coredns + networks: + dns-net: + ipv4_address: 172.28.0.11 + volumes: + - ../Corefile:/Corefile + command: ["-conf", "/Corefile"] + + backend: + image: nginx:alpine + networks: + - backend-net + + server: + build: + context: ../.. + dockerfile: Dockerfile + networks: + dns-net: + ipv4_address: 172.28.0.20 + backend-net: + volumes: + - keys:/keys + command: > + vaydns-server -udp :53 -privkey-file /keys/server.key + -domain t.example.com -upstream backend:80 + -idle-timeout 10s -keepalive 2s + depends_on: + keygen: + condition: service_completed_successfully + dns1: + condition: service_started + dns2: + condition: service_started + + client: + build: + context: ../.. + dockerfile: Dockerfile + networks: + - dns-net + volumes: + - keys:/keys + command: > + vaydns-client -udp 172.28.0.10:53 -udp 172.28.0.11:53 + -pubkey-file /keys/server.pub + -domain t.example.com -listen 0.0.0.0:7000 + -idle-timeout 10s -keepalive 2s -session-check-interval 500ms + -reconnect-min 1s -reconnect-max 5s + depends_on: + server: + condition: service_started diff --git a/e2e/multi-resolver/run.sh b/e2e/multi-resolver/run.sh new file mode 100755 index 0000000..c64921d --- /dev/null +++ b/e2e/multi-resolver/run.sh @@ -0,0 +1,29 @@ +#!/usr/bin/env bash +# Test: multi-resolver smoke test. +# Verifies that vaydns-client works when configured with two UDP resolvers, +# and that an HTTP request through the tunnel succeeds. 
+set -euo pipefail +cd "$(dirname "$0")" + +cleanup() { docker compose down -v 2>/dev/null; } +trap cleanup EXIT + +echo "--- Building and starting services ---" +docker compose up -d --build + +echo "--- Waiting for tunnel (up to 30s) ---" +for i in $(seq 1 30); do + if docker compose exec -T client wget -q -O- http://localhost:7000 2>/dev/null | grep -q "Welcome to nginx"; then + echo "" + echo "=== PASS ===" + exit 0 + fi + printf "." + sleep 1 +done + +echo "" +echo "--- Tunnel did not come up. Dumping logs ---" +docker compose logs client server dns1 dns2 +echo "=== FAIL ===" +exit 1 diff --git a/e2e/run-test.sh b/e2e/run-test.sh index b07c0b7..9784ae1 100755 --- a/e2e/run-test.sh +++ b/e2e/run-test.sh @@ -20,7 +20,7 @@ for rt in txt cname a aaaa mx ns srv; do fi done -for test_dir in socks-download recovery transport-recovery; do +for test_dir in socks-download recovery transport-recovery multi-resolver multi-resolver-runtime-failure multi-resolver-forge; do total=$((total + 1)) echo "" echo "========================================" From fcd228a2b98bcf371fa984b4735604ec3f31c2d8 Mon Sep 17 00:00:00 2001 From: crazydi4mond <255249920+crazydi4mond@users.noreply.github.com> Date: Sun, 12 Apr 2026 01:31:44 +0200 Subject: [PATCH 12/19] fix(client): apply uTLS fingerprint to DoT resolvers in multi mode The multi-resolver refactor split the single "resolver" variable into three per-type loops but only re-added the UTLSClientHelloID assignment to the DoH loop, silently dropping fingerprint camouflage for every DoT resolver. Pre-refactor main set this field unconditionally on the single configured resolver, so single-resolver DoT users on main got their fingerprint; multi-resolver DoT users on this branch do not. Restore the assignment in the DoT loop, mirroring the DoH loop's pattern. 
--- vaydns-client/main.go | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/vaydns-client/main.go b/vaydns-client/main.go index 0924f7e..a93d35f 100644 --- a/vaydns-client/main.go +++ b/vaydns-client/main.go @@ -238,10 +238,12 @@ Known TLS fingerprints for -utls are: resolvers = append(resolvers, resolver) } for _, dotAddr := range dotAddrs { - resolvers = append(resolvers, client.Resolver{ + resolver := client.Resolver{ ResolverType: client.ResolverTypeDOT, ResolverAddr: dotAddr, - }) + } + resolver.UTLSClientHelloID = utlsClientHelloID + resolvers = append(resolvers, resolver) } for _, udpAddr := range udpAddrs { resolver := client.Resolver{ From 6f35ceb1fa292d807410186321348ba53256c3de Mon Sep 17 00:00:00 2001 From: crazydi4mond <255249920+crazydi4mond@users.noreply.github.com> Date: Sun, 12 Apr 2026 01:43:48 +0200 Subject: [PATCH 13/19] fix(multi-resolver): move health counters under mu to close decay race MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The decay logic in recomputeState read each counter with atomic Load, then wrote back Load-1 with atomic Store. Concurrent Adds from writePacket, evaluateIncoming, and expirePending (all running in different goroutines) could land between the Load and Store and be silently overwritten — classic lost-update on the invalid/timeout counters. timeoutCount.Store(0) in the valid-response path had the same shape. The Go race detector does not catch this: mixed atomic operations on the same field are considered synchronized, and the bug is a logic lost-update rather than an unsynchronized memory access. Move validCount, invalidCount, and timeoutCount from atomic.Int64 to plain int64 fields protected by the entry's existing mu, and consolidate the several small mu acquisitions in evaluateIncoming, expirePending, and writePacket into one per branch. recomputeState's decay becomes a simple "if count > 0 { count-- }" under the lock. 
dead and MultiResolver.aliveCount remain atomic: they are read from selection hot paths that should not acquire e.mu on every iteration. resolverScore now acquires e.mu instead of calling stateSnapshot() and reading counters separately — side benefit: eliminates a minor pre-existing inconsistency where the state and the counter values could come from different snapshots. --- client/multi_resolver.go | 106 +++++++++++++++++++++------------------ 1 file changed, 56 insertions(+), 50 deletions(-) diff --git a/client/multi_resolver.go b/client/multi_resolver.go index d1fea54..1b34535 100644 --- a/client/multi_resolver.go +++ b/client/multi_resolver.go @@ -62,23 +62,28 @@ type resolverEntry struct { addr net.Addr conn net.PacketConn - validCount atomic.Int64 - invalidCount atomic.Int64 - timeoutCount atomic.Int64 - // dead is set when the entry's transport has reported an unrecoverable // error and its reader goroutine has exited. A dead entry is permanently // excluded from selection and never transitions back — the only way to // recover is to rebuild the entire MultiResolver (which happens on a - // full tunnel reconnect). + // full tunnel reconnect). Kept as an atomic.Bool so selection helpers + // can read it without acquiring mu on the hot path. dead atomic.Bool - mu sync.Mutex - pending map[uint16]time.Time - lastWrite time.Time - lastValid time.Time - lastProbe time.Time - state ResolverState + // mu protects every field below. Counters are deliberately not atomic: + // the decay path in recomputeState would lose concurrent Adds if the + // counters were atomic and decremented via Load→Store, and splitting + // decay across a CAS loop is uglier than just holding mu for the + // short window each counter update requires. 
+ mu sync.Mutex + validCount int64 + invalidCount int64 + timeoutCount int64 + pending map[uint16]time.Time + lastWrite time.Time + lastValid time.Time + lastProbe time.Time + state ResolverState } // markDead transitions the entry to a permanent Down state. It is idempotent @@ -110,11 +115,11 @@ func (e *resolverEntry) writePacket(b []byte) (int, error) { now := time.Now() e.trackOutgoingID(b, now) n, err := e.conn.WriteTo(b, e.addr) - if err != nil { - e.invalidCount.Add(1) - } e.mu.Lock() e.lastWrite = now + if err != nil { + e.invalidCount++ + } e.mu.Unlock() return n, err } @@ -140,50 +145,51 @@ func (e *resolverEntry) readPacket() multiReadResult { } func (e *resolverEntry) evaluateIncoming(packet []byte) { + now := time.Now() resp, err := dns.MessageFromWireFormat(packet) if err != nil { - e.invalidCount.Add(1) - e.recomputeState(time.Now()) + e.mu.Lock() + e.invalidCount++ + e.mu.Unlock() + e.recomputeState(now) return } - e.mu.Lock() - delete(e.pending, resp.ID) - e.mu.Unlock() - if isValidDNSResponse(resp) { - e.validCount.Add(1) - e.timeoutCount.Store(0) e.mu.Lock() - e.lastValid = time.Now() + delete(e.pending, resp.ID) + e.validCount++ + e.timeoutCount = 0 + e.lastValid = now e.state = ResolverStateHealthy e.mu.Unlock() return } - e.invalidCount.Add(1) + e.mu.Lock() + delete(e.pending, resp.ID) + e.invalidCount++ if isRateLimitedResponse(resp) { - e.mu.Lock() e.state = ResolverStateRateLimited - e.mu.Unlock() } - e.recomputeState(time.Now()) + e.mu.Unlock() + e.recomputeState(now) } func (e *resolverEntry) expirePending(now time.Time) { - expired := int64(0) e.mu.Lock() + expired := int64(0) for id, t := range e.pending { if now.Sub(t) >= pendingResponseTimeout { delete(e.pending, id) expired++ } } - e.mu.Unlock() if expired > 0 { - e.timeoutCount.Add(expired) - e.invalidCount.Add(expired) + e.timeoutCount += expired + e.invalidCount += expired } + e.mu.Unlock() e.recomputeState(now) } @@ -200,29 +206,25 @@ func (e *resolverEntry) recomputeState(now 
time.Time) { return } - timeouts := e.timeoutCount.Load() - invalid := e.invalidCount.Load() - valid := e.validCount.Load() - switch { - case timeouts >= downTimeoutThreshold: + case e.timeoutCount >= downTimeoutThreshold: e.state = ResolverStateDown - case invalid >= rateLimitThreshold && valid == 0: + case e.invalidCount >= rateLimitThreshold && e.validCount == 0: e.state = ResolverStateRateLimited - case valid > 0 && now.Sub(e.lastValid) <= 30*time.Second: + case e.validCount > 0 && now.Sub(e.lastValid) <= 30*time.Second: e.state = ResolverStateHealthy - case valid == 0: + case e.validCount == 0: e.state = ResolverStateUnknown default: e.state = ResolverStateUnknown } // Slow decay to avoid sticky penalties. - if invalid > 0 { - e.invalidCount.Store(invalid - 1) + if e.invalidCount > 0 { + e.invalidCount-- } - if timeouts > 0 { - e.timeoutCount.Store(timeouts - 1) + if e.timeoutCount > 0 { + e.timeoutCount-- } } @@ -250,9 +252,9 @@ func (e *resolverEntry) snapshot() ResolverStat { return ResolverStat{ Address: e.name, State: e.state, - ValidCount: e.validCount.Load(), - InvalidCount: e.invalidCount.Load(), - TimeoutCount: e.timeoutCount.Load(), + ValidCount: e.validCount, + InvalidCount: e.invalidCount, + TimeoutCount: e.timeoutCount, LastWrite: e.lastWrite, LastValid: e.lastValid, } @@ -568,8 +570,10 @@ func (mr *MultiResolver) selectBestScore() *resolverEntry { } func resolverScore(e *resolverEntry) int64 { - statePenalty := int64(0) - switch e.stateSnapshot() { + e.mu.Lock() + defer e.mu.Unlock() + var statePenalty int64 + switch e.state { case ResolverStateHealthy: statePenalty = 0 case ResolverStateUnknown: @@ -579,7 +583,7 @@ func resolverScore(e *resolverEntry) int64 { case ResolverStateDown: statePenalty = 30 } - return e.validCount.Load()*4 - e.invalidCount.Load()*2 - e.timeoutCount.Load()*3 - statePenalty + return e.validCount*4 - e.invalidCount*2 - e.timeoutCount*3 - statePenalty } func (mr *MultiResolver) selectProbeTarget(primary *resolverEntry) 
*resolverEntry { @@ -621,7 +625,9 @@ func (mr *MultiResolver) ResolverStats() []ResolverStat { func (mr *MultiResolver) ValidInvalidCounts() map[string][2]int64 { out := make(map[string][2]int64, len(mr.entries)) for _, e := range mr.entries { - out[e.name] = [2]int64{e.validCount.Load(), e.invalidCount.Load()} + e.mu.Lock() + out[e.name] = [2]int64{e.validCount, e.invalidCount} + e.mu.Unlock() } return out } From 04d4811a3f11293dbed6e079fdfa2d28f7de9faf Mon Sep 17 00:00:00 2001 From: crazydi4mond <255249920+crazydi4mond@users.noreply.github.com> Date: Sun, 12 Apr 2026 03:49:44 +0200 Subject: [PATCH 14/19] fix(multi-resolver): per-resolver forged response tracking with labeled logs Each resolver entry now owns a labeled ForgedStats instance so operators can see exactly which resolver is targeted by DNS injection. Milestone logs include the resolver address: forged DNS responses from 8.8.8.8:53: total=10 SERVFAIL=0 NXDOMAIN=10 other=0 In multi-resolver mode, the reader goroutine filters forged responses (QR=1, RCODE != NoError) at the MultiResolver layer and records them per-entry, preventing double-counting at the DNSPacketConn safety net. In single-resolver mode, ForgedStats is created by the Tunnel and shared between UDPPacketConn and DNSPacketConn, matching pre-PR behavior with the new labeled format. - ForgedStats gains a Label field; Record() includes "from