From e42bd42ed6db22d0e8f11cb8dac6244a39bcd0f8 Mon Sep 17 00:00:00 2001 From: Jingwen Wu Date: Thu, 23 Apr 2026 10:36:39 -0700 Subject: [PATCH 1/5] perf: optimize localdns provisioning polling and iptables setup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reduce polling intervals from 1s to 0.1s in start_localdns() and wait_for_localdns_ready() since CoreDNS typically starts in <100ms. Batch iptables rules using iptables-restore instead of individual calls to avoid repeated xtables lock acquisition. Estimated total savings: ~1.3–2.7s during node provisioning. Co-Authored-By: Claude Opus 4.6 --- parts/linux/cloud-init/artifacts/localdns.sh | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/parts/linux/cloud-init/artifacts/localdns.sh b/parts/linux/cloud-init/artifacts/localdns.sh index 7141f0ea914..e036022b7d5 100644 --- a/parts/linux/cloud-init/artifacts/localdns.sh +++ b/parts/linux/cloud-init/artifacts/localdns.sh @@ -403,11 +403,12 @@ start_localdns() { ${COREDNS_COMMAND} & # Wait until the PID file is created. + # Use 0.1s polling interval since CoreDNS typically creates the PID file in <100ms. local elapsed=0 while [ ! -f "${LOCALDNS_PID_FILE}" ]; do - sleep 1 + sleep 0.1 elapsed=$((elapsed + 1)) - if [ "$elapsed" -ge "$START_LOCALDNS_TIMEOUT" ]; then + if [ "$elapsed" -ge "$((START_LOCALDNS_TIMEOUT * 10))" ]; then echo "Timed out waiting for CoreDNS to create PID file at ${LOCALDNS_PID_FILE}." return 1 fi @@ -438,7 +439,7 @@ wait_for_localdns_ready() { echo "Localdns failed to come online after $timeout_duration seconds (timeout)." return 1 fi - sleep 1 + sleep 0.1 ((attempts++)) done echo "Localdns is online and ready to serve traffic." @@ -459,10 +460,16 @@ add_iptable_rules_to_skip_conntrack_from_pods(){ ip addr add ${LOCALDNS_CLUSTER_LISTENER_IP}/32 dev localdns # Add IPtables rules that skip conntrack for DNS connections coming from pods. + # Use iptables-restore to batch all rules in a single lock acquisition for performance. echo "Adding iptables rules to skip conntrack for queries to localdns." + local restore_input="*raw" for RULE in "${IPTABLES_RULES[@]}"; do - eval "${IPTABLES}" -A "${RULE}" + restore_input="${restore_input} +-A ${RULE} -m comment --comment \"localdns: skip conntrack\"" done + restore_input="${restore_input} +COMMIT" + echo "${restore_input}" | iptables-restore -w --noflush } # Wait for localdns IP to be removed from resolv.conf after networkctl reload. @@ -822,7 +829,6 @@ replace_azurednsip_in_corefile || exit $ERR_LOCALDNS_FAIL # Build IPtable rules. # --------------------------------------------------------------------------------------------------------------------- -IPTABLES='iptables -w -t raw -m comment --comment "localdns: skip conntrack"' IPTABLES_RULES=() build_localdns_iptable_rules @@ -852,7 +858,7 @@ fi start_localdns || exit $ERR_LOCALDNS_FAIL # Wait to direct traffic to localdns until it's ready. -wait_for_localdns_ready 60 60 || exit $ERR_LOCALDNS_FAIL +wait_for_localdns_ready 600 60 || exit $ERR_LOCALDNS_FAIL # Disable DNS from DHCP and point the system at localdns. # -------------------------------------------------------------------------------------------------------------------- From 52c9699d3f23468a431088dc57ae37b927a060cb Mon Sep 17 00:00:00 2001 From: Jingwen Wu Date: Fri, 24 Apr 2026 13:53:12 -0700 Subject: [PATCH 2/5] test: update shellspec tests for iptables-restore batching Update add_iptable_rules_to_skip_conntrack_from_pods tests to mock iptables-restore via PATH instead of the old IPTABLES variable, and assert on the new iptables-restore input format (*raw, -A rules, COMMIT). Co-Authored-By: Claude Opus 4.6 --- .../linux/cloud-init/artifacts/localdns_spec.sh | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/spec/parts/linux/cloud-init/artifacts/localdns_spec.sh b/spec/parts/linux/cloud-init/artifacts/localdns_spec.sh index f8a0973a399..a250e14e13e 100644 --- a/spec/parts/linux/cloud-init/artifacts/localdns_spec.sh +++ b/spec/parts/linux/cloud-init/artifacts/localdns_spec.sh @@ -749,7 +749,14 @@ EOF LOCALDNS_NODE_LISTENER_IP="10.0.0.1" LOCALDNS_CLUSTER_LISTENER_IP="10.0.0.2" IPTABLES_RULES=("raw -t raw -p udp --dport 53 -j NOTRACK" "raw -t raw -p tcp --dport 53 -j NOTRACK") - IPTABLES="echo iptables" + MOCK_BIN_DIR=$(mktemp -d) + cat > "${MOCK_BIN_DIR}/iptables-restore" << 'MOCK' +#!/bin/sh +echo "iptables-restore called with args: $*" +cat +MOCK + chmod +x "${MOCK_BIN_DIR}/iptables-restore" + export PATH="${MOCK_BIN_DIR}:${PATH}" } BeforeEach 'setup' #------------------------- add_iptable_rules_to_skip_conntrack_from_pods ------------------------------------- @@ -773,11 +780,12 @@ EOF ;; esac } - Path prepend "$(pwd)" When call add_iptable_rules_to_skip_conntrack_from_pods The output should include "Adding iptables rules to skip conntrack for queries to localdns." - The output should include "iptables -A raw -t raw -p udp --dport 53 -j NOTRACK" - The output should include "iptables -A raw -t raw -p tcp --dport 53 -j NOTRACK" + The output should include "*raw" + The output should include "-A raw -t raw -p udp --dport 53 -j NOTRACK" + The output should include "-A raw -t raw -p tcp --dport 53 -j NOTRACK" + The output should include "COMMIT" End It 'should delete existing localdns interface' @@ -795,7 +803,6 @@ EOF esac } - Path prepend "$(pwd)" When call add_iptable_rules_to_skip_conntrack_from_pods The output should include "Interface localdns already exists, deleting it." The output should include "Deleting interface: link delete localdns" From 569886a291632adb975a393e2b979246fe2d3e6b Mon Sep 17 00:00:00 2001 From: Jingwen Wu Date: Fri, 24 Apr 2026 13:55:33 -0700 Subject: [PATCH 3/5] fix: place -m comment before -j target in iptables-restore rules iptables requires -j (jump target) to be last in the rule. The comment match module must come before it, otherwise iptables-restore rejects the rule as invalid syntax. Co-Authored-By: Claude Opus 4.6 --- parts/linux/cloud-init/artifacts/localdns.sh | 5 ++++- spec/parts/linux/cloud-init/artifacts/localdns_spec.sh | 5 +++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/parts/linux/cloud-init/artifacts/localdns.sh b/parts/linux/cloud-init/artifacts/localdns.sh index e036022b7d5..fde9a7bde59 100644 --- a/parts/linux/cloud-init/artifacts/localdns.sh +++ b/parts/linux/cloud-init/artifacts/localdns.sh @@ -464,8 +464,11 @@ add_iptable_rules_to_skip_conntrack_from_pods(){ echo "Adding iptables rules to skip conntrack for queries to localdns." local restore_input="*raw" for RULE in "${IPTABLES_RULES[@]}"; do + # Insert comment match before -j target since -j must be last. + local rule_prefix="${RULE% -j *}" + local rule_target="${RULE#"$rule_prefix" }" restore_input="${restore_input} --A ${RULE} -m comment --comment \"localdns: skip conntrack\"" +-A ${rule_prefix} -m comment --comment \"localdns: skip conntrack\" ${rule_target}" done restore_input="${restore_input} COMMIT" diff --git a/spec/parts/linux/cloud-init/artifacts/localdns_spec.sh b/spec/parts/linux/cloud-init/artifacts/localdns_spec.sh index a250e14e13e..485bed273d2 100644 --- a/spec/parts/linux/cloud-init/artifacts/localdns_spec.sh +++ b/spec/parts/linux/cloud-init/artifacts/localdns_spec.sh @@ -783,8 +783,9 @@ MOCK When call add_iptable_rules_to_skip_conntrack_from_pods The output should include "Adding iptables rules to skip conntrack for queries to localdns." The output should include "*raw" - The output should include "-A raw -t raw -p udp --dport 53 -j NOTRACK" - The output should include "-A raw -t raw -p tcp --dport 53 -j NOTRACK" + The output should include "-A raw -t raw -p udp --dport 53 -m comment --comment" + The output should include "-A raw -t raw -p tcp --dport 53 -m comment --comment" + The output should include "-j NOTRACK" The output should include "COMMIT" End From c9fb2a8dcf0c4b2c16ffc33e561dc659e3bbb17d Mon Sep 17 00:00:00 2001 From: Jingwen Wu Date: Fri, 24 Apr 2026 15:50:06 -0700 Subject: [PATCH 4/5] fix: correct iptables-restore comment ordering and add e2e validation Place -m comment immediately after chain name in iptables-restore input so that iptables -S displays comment before protocol match, matching the Cilium eBPF host routing regex. The previous ordering placed comment after --dport which caused nft backend to display it after the protocol match extension. Add ValidateLocalDNSIptablesRules e2e validator that checks: - localdns.sh uses iptables-restore (batched rules) - NOTRACK rules exist in both OUTPUT and PREROUTING chains - Comment tag is present for cleanup logic - NOTRACK is functional (no conntrack entries for localdns DNS traffic) Co-Authored-By: Claude Opus 4.6 --- e2e/validation.go | 1 + e2e/validators.go | 59 +++++++++++++++++++ parts/linux/cloud-init/artifacts/localdns.sh | 8 +-- .../cloud-init/artifacts/localdns_spec.sh | 5 +- 4 files changed, 67 insertions(+), 6 deletions(-) diff --git a/e2e/validation.go b/e2e/validation.go index 2d02b4d127e..5eac3848ccf 100644 --- a/e2e/validation.go +++ b/e2e/validation.go @@ -77,6 +77,7 @@ func ValidateCommonLinux(ctx context.Context, s *Scenario) { if !s.VHD.UnsupportedLocalDns && !config.Config.TestPreProvision && !s.VHDCaching { ValidateLocalDNSService(ctx, s, "enabled") ValidateLocalDNSResolution(ctx, s, "169.254.10.10") + ValidateLocalDNSIptablesRules(ctx, s) ValidateLocalDNSExporterMetrics(ctx, s) } diff --git a/e2e/validators.go b/e2e/validators.go index 293ad401974..1dba0b1367c 100644 --- a/e2e/validators.go +++ b/e2e/validators.go @@ -1482,6 +1482,65 @@ func ValidateLocalDNSResolution(ctx context.Context, s *Scenario, server string) assert.Contains(s.T, execResult.stdout, fmt.Sprintf("SERVER: %s", server)) } +// ValidateLocalDNSIptablesRules checks that the NOTRACK iptables rules for localdns are correctly +// applied in the raw table. These rules skip connection tracking for DNS traffic to localdns IPs +// to prevent conntrack table exhaustion on busy nodes. +func ValidateLocalDNSIptablesRules(ctx context.Context, s *Scenario) { + s.T.Helper() + script := `set -euo pipefail +echo "Checking iptables raw table for localdns NOTRACK rules..." +rules=$(sudo iptables -w -t raw -S 2>&1) +echo "$rules" + +# Verify the localdns script uses iptables-restore (not legacy individual iptables calls) +if grep -q "iptables-restore" /opt/azure/containers/localdns/localdns.sh; then + echo "PASS: localdns.sh uses iptables-restore (batched rules)" +else + echo "FAIL: localdns.sh does not use iptables-restore — VHD may be outdated" + exit 1 +fi + +# Verify rules exist in both OUTPUT and PREROUTING chains for both protocols +for chain in OUTPUT PREROUTING; do + chain_rules=$(sudo iptables -w -t raw -S "$chain" 2>&1) + for proto in tcp udp; do + if ! echo "$chain_rules" | grep -q "\-p ${proto}.*--dport 53.*NOTRACK"; then + echo "FAIL: missing NOTRACK rule for $proto in $chain chain" + exit 1 + fi + done +done + +# Verify the comment tag is present (used by cleanup logic) +if ! sudo iptables -w -t raw -S | grep -q "localdns: skip conntrack"; then + echo "FAIL: localdns comment tag not found in iptables rules" + exit 1 +fi + +echo "PASS: all localdns NOTRACK iptables rules verified" + +# Verify NOTRACK rules are functional by doing DNS lookups and checking no conntrack entries exist +echo "Verifying NOTRACK rules are functional..." +dig bing.com @169.254.10.10 +short +timeout=2 +tries=1 > /dev/null 2>&1 || true +dig bing.com @169.254.10.11 +short +timeout=2 +tries=1 > /dev/null 2>&1 || true + +# Check that no conntrack entries exist for localdns IPs on port 53 +for ip in 169.254.10.10 169.254.10.11; do + ct_count=$(sudo conntrack -C 2>/dev/null || echo "0") + ct_dns=$(sudo conntrack -L -d "$ip" -p udp --dport 53 2>/dev/null | wc -l) + if [ "$ct_dns" -gt 0 ]; then + echo "FAIL: found $ct_dns conntrack entries for $ip:53 — NOTRACK rules not working" + sudo conntrack -L -d "$ip" -p udp --dport 53 2>/dev/null + exit 1 + fi + echo "PASS: no conntrack entries for $ip:53 (NOTRACK working, total ct entries: $ct_count)" +done + +echo "PASS: NOTRACK rules are functional — DNS traffic bypasses conntrack" +` + execScriptOnVMForScenarioValidateExitCode(ctx, s, script, 0, "localdns iptables NOTRACK rules validation failed") +} + // ValidateJournalctlOutput checks if specific content exists in the systemd service logs func ValidateJournalctlOutput(ctx context.Context, s *Scenario, serviceName string, expectedContent string) { s.T.Helper() diff --git a/parts/linux/cloud-init/artifacts/localdns.sh b/parts/linux/cloud-init/artifacts/localdns.sh index fde9a7bde59..ca5faaa20d6 100644 --- a/parts/linux/cloud-init/artifacts/localdns.sh +++ b/parts/linux/cloud-init/artifacts/localdns.sh @@ -464,11 +464,11 @@ add_iptable_rules_to_skip_conntrack_from_pods(){ echo "Adding iptables rules to skip conntrack for queries to localdns." local restore_input="*raw" for RULE in "${IPTABLES_RULES[@]}"; do - # Insert comment match before -j target since -j must be last. - local rule_prefix="${RULE% -j *}" - local rule_target="${RULE#"$rule_prefix" }" + # Extract chain name and remainder, insert comment after chain to match legacy display order. + local chain="${RULE%% *}" + local rule_rest="${RULE#"$chain" }" restore_input="${restore_input} --A ${rule_prefix} -m comment --comment \"localdns: skip conntrack\" ${rule_target}" +-A ${chain} -m comment --comment \"localdns: skip conntrack\" ${rule_rest}" done restore_input="${restore_input} COMMIT" diff --git a/spec/parts/linux/cloud-init/artifacts/localdns_spec.sh b/spec/parts/linux/cloud-init/artifacts/localdns_spec.sh index 485bed273d2..8f1c8f59a93 100644 --- a/spec/parts/linux/cloud-init/artifacts/localdns_spec.sh +++ b/spec/parts/linux/cloud-init/artifacts/localdns_spec.sh @@ -783,8 +783,9 @@ MOCK When call add_iptable_rules_to_skip_conntrack_from_pods The output should include "Adding iptables rules to skip conntrack for queries to localdns." The output should include "*raw" - The output should include "-A raw -t raw -p udp --dport 53 -m comment --comment" - The output should include "-A raw -t raw -p tcp --dport 53 -m comment --comment" + The output should include "-A raw -m comment --comment" + The output should include "-p udp" + The output should include "-p tcp" The output should include "-j NOTRACK" The output should include "COMMIT" End From c11058cba973b938a4c6ee997fbd4bcd5e234658 Mon Sep 17 00:00:00 2001 From: Jingwen Wu Date: Fri, 24 Apr 2026 15:56:45 -0700 Subject: [PATCH 5/5] test: add negative test for NOTRACK conntrack bypass validation Drop NOTRACK rules temporarily, do a DNS lookup, and verify conntrack entries appear. This proves the conntrack check is actually capable of detecting entries and isn't silently passing. Rules are restored after. Co-Authored-By: Claude Opus 4.6 --- e2e/validators.go | 46 +++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 43 insertions(+), 3 deletions(-) diff --git a/e2e/validators.go b/e2e/validators.go index 1dba0b1367c..980dcbfabaf 100644 --- a/e2e/validators.go +++ b/e2e/validators.go @@ -1521,21 +1521,61 @@ echo "PASS: all localdns NOTRACK iptables rules verified" # Verify NOTRACK rules are functional by doing DNS lookups and checking no conntrack entries exist echo "Verifying NOTRACK rules are functional..." + +# First, flush any stale conntrack entries for localdns IPs +for ip in 169.254.10.10 169.254.10.11; do + sudo conntrack -D -d "$ip" -p udp --dport 53 2>/dev/null || true + sudo conntrack -D -d "$ip" -p tcp --dport 53 2>/dev/null || true +done + +# Do DNS lookups with NOTRACK rules in place — should create NO conntrack entries dig bing.com @169.254.10.10 +short +timeout=2 +tries=1 > /dev/null 2>&1 || true dig bing.com @169.254.10.11 +short +timeout=2 +tries=1 > /dev/null 2>&1 || true -# Check that no conntrack entries exist for localdns IPs on port 53 for ip in 169.254.10.10 169.254.10.11; do - ct_count=$(sudo conntrack -C 2>/dev/null || echo "0") ct_dns=$(sudo conntrack -L -d "$ip" -p udp --dport 53 2>/dev/null | wc -l) if [ "$ct_dns" -gt 0 ]; then echo "FAIL: found $ct_dns conntrack entries for $ip:53 — NOTRACK rules not working" sudo conntrack -L -d "$ip" -p udp --dport 53 2>/dev/null exit 1 fi - echo "PASS: no conntrack entries for $ip:53 (NOTRACK working, total ct entries: $ct_count)" + echo "PASS: no conntrack entries for $ip:53 with NOTRACK rules active" +done + +# Negative test: temporarily drop NOTRACK rules, do a DNS lookup, and verify conntrack entries DO appear. +# This proves our conntrack check is actually capable of detecting entries. +echo "Negative test: verifying conntrack entries appear WITHOUT NOTRACK rules..." +saved_rules=$(sudo iptables -w -t raw -S | grep "localdns: skip conntrack") +sudo iptables -w -t raw -S | grep "localdns: skip conntrack" | while IFS= read -r rule; do + # Convert -A to -D to delete the rule + sudo iptables -w -t raw $(echo "$rule" | sed 's/^-A/-D/') 2>/dev/null || true +done + +# Flush any leftover conntrack entries before the negative test +for ip in 169.254.10.10 169.254.10.11; do + sudo conntrack -D -d "$ip" -p udp --dport 53 2>/dev/null || true done +# Do a DNS lookup without NOTRACK — this SHOULD create conntrack entries +dig bing.com @169.254.10.10 +short +timeout=2 +tries=1 > /dev/null 2>&1 || true + +ct_dns_neg=$(sudo conntrack -L -d 169.254.10.10 -p udp --dport 53 2>/dev/null | wc -l) +echo "Conntrack entries for 169.254.10.10:53 without NOTRACK: $ct_dns_neg" + +# Restore NOTRACK rules +echo "$saved_rules" | while IFS= read -r rule; do + sudo iptables -w -t raw $rule 2>/dev/null || true +done + +# Clean up conntrack entries created during negative test +sudo conntrack -D -d 169.254.10.10 -p udp --dport 53 2>/dev/null || true + +if [ "$ct_dns_neg" -eq 0 ]; then + echo "FAIL: no conntrack entries appeared even without NOTRACK rules — conntrack check may be broken" + exit 1 +fi +echo "PASS: conntrack entries appeared without NOTRACK, confirming NOTRACK enforcement is real" + echo "PASS: NOTRACK rules are functional — DNS traffic bypasses conntrack" ` execScriptOnVMForScenarioValidateExitCode(ctx, s, script, 0, "localdns iptables NOTRACK rules validation failed")