From 04226092ffff76ad9097ccad55cb2746ac9e9fc6 Mon Sep 17 00:00:00 2001 From: Gavin Elder Date: Thu, 30 Apr 2026 17:58:02 +0100 Subject: [PATCH] Fix K8s projected service account token expiry (#6918) Re-read the SA token from disk with a 30s TTL cache rather than snapshotting it at startup. Kubelet rotates projected tokens at ~80% of their lifetime (default ~48min), so the old approach caused 401s on pipelines running longer than ~1hr. The previous workaround (PR #6742 / #6925) periodically recreated K8sClient via a Guava cache, but K8sTaskHandler cached the client instance at construction time so the cache was never consulted. The root fix is in ClientConfig.getBearerToken(), which reads the token file fresh when the 30s TTL expires rather than at any higher layer. The Guava cache and clientRefreshInterval config option are removed as they are no longer needed. Co-Authored-By: Claude Sonnet 4.6 Signed-off-by: Gavin Elder --- docs/reference/config.md | 6 --- plugins/nf-k8s/changelog.txt | 1 + .../src/main/nextflow/k8s/K8sConfig.groovy | 7 --- .../src/main/nextflow/k8s/K8sExecutor.groovy | 24 ++-------- .../nextflow/k8s/client/ClientConfig.groovy | 32 ++++++++++++- .../k8s/client/ConfigDiscovery.groovy | 4 +- .../main/nextflow/k8s/client/K8sClient.groovy | 5 +- .../test/nextflow/k8s/K8sConfigTest.groovy | 17 ------- .../test/nextflow/k8s/K8sExecutorTest.groovy | 32 +++---------- .../k8s/client/ConfigDiscoveryTest.groovy | 5 +- .../nextflow/k8s/client/K8sClientTest.groovy | 47 +++++++++++++++++++ 11 files changed, 98 insertions(+), 82 deletions(-) diff --git a/docs/reference/config.md b/docs/reference/config.md index f2bb7efa83..c1d5ba3149 100644 --- a/docs/reference/config.md +++ b/docs/reference/config.md @@ -1075,12 +1075,6 @@ The following settings are available: - `clientKey` - `clientKeyFile` -`k8s.clientRefreshInterval` -: :::{versionadded} 26.04.0 - ::: -: The interval after which the Kubernetes client configuration is refreshed (default: `50m`). -: This setting is useful when the Kubernetes authentication token has a limited lifespan and needs to be periodically refreshed. The client configuration will be automatically reloaded after the specified interval, allowing Nextflow to obtain fresh credentials from the Kubernetes configuration. - `k8s.computeResourceType` : Whether to use Kubernetes `Pod` or `Job` resource type to carry out Nextflow tasks (default: `Pod`). diff --git a/plugins/nf-k8s/changelog.txt b/plugins/nf-k8s/changelog.txt index bbebc43024..5068f00fd0 100644 --- a/plugins/nf-k8s/changelog.txt +++ b/plugins/nf-k8s/changelog.txt @@ -1,6 +1,7 @@ nf-k8s changelog =================== 1.5.2 - 20 Apr 2026 +- Fix projected service account token expiry by re-reading token file from disk with 30s TTL cache (#6918) - Bump org.bouncycastle:bcpkix-jdk18on from 1.79 to 1.84 (#7042) [59d847d52] - Allow running pipeline from URL and main script path (#6602) [83196d4be] diff --git a/plugins/nf-k8s/src/main/nextflow/k8s/K8sConfig.groovy b/plugins/nf-k8s/src/main/nextflow/k8s/K8sConfig.groovy index 0761b37834..54052fe80d 100644 --- a/plugins/nf-k8s/src/main/nextflow/k8s/K8sConfig.groovy +++ b/plugins/nf-k8s/src/main/nextflow/k8s/K8sConfig.groovy @@ -80,12 +80,6 @@ class K8sConfig implements ConfigScope { """) final Map client - @ConfigOption - @Description(""" - The interval after which the Kubernetes client configuration is refreshed (default: `50m`). - """) - final Duration clientRefreshInterval - @ConfigOption @Description(""" The Kubernetes [configuration context](https://kubernetes.io/docs/tasks/access-application-cluster/configure-access-multiple-clusters/) to use. @@ -220,7 +214,6 @@ class K8sConfig implements ConfigScope { autoMountHostPaths = opts.autoMountHostPaths as boolean cleanup = opts.cleanup as Boolean client = opts.client as Map - clientRefreshInterval = opts.clientRefreshInterval as Duration ?: Duration.of('50m') computeResourceType = opts.computeResourceType context = opts.context cpuLimits = opts.cpuLimits as boolean diff --git a/plugins/nf-k8s/src/main/nextflow/k8s/K8sExecutor.groovy b/plugins/nf-k8s/src/main/nextflow/k8s/K8sExecutor.groovy index f03eb88742..05dc0875a1 100644 --- a/plugins/nf-k8s/src/main/nextflow/k8s/K8sExecutor.groovy +++ b/plugins/nf-k8s/src/main/nextflow/k8s/K8sExecutor.groovy @@ -16,10 +16,6 @@ package nextflow.k8s -import java.util.concurrent.TimeUnit - -import com.google.common.cache.Cache -import com.google.common.cache.CacheBuilder import groovy.transform.CompileStatic import groovy.transform.Memoized import groovy.util.logging.Slf4j @@ -44,19 +40,9 @@ import org.pf4j.ExtensionPoint @ServiceName('k8s') class K8sExecutor extends Executor implements ExtensionPoint { - /** - * Cache for the Kubernetes HTTP client. The client is refreshed periodically - * so that the service account token is re-read when it expires. - */ - private Cache clientCache + private K8sClient client - /** - * @return The Kubernetes HTTP client. Delegates to a Guava cache that refreshes - * the client (including the service account token) when the configured interval expires. - */ - protected K8sClient getClient() { - clientCache.get('client', () -> new K8sClient(k8sConfig.getClient())) - } + protected K8sClient getClient() { client } /** * @return The `k8s` configuration scope in the nextflow configuration object @@ -73,11 +59,7 @@ class K8sExecutor extends Executor implements ExtensionPoint { protected void register() { super.register() final k8sConfig = getK8sConfig() - final refreshInterval = k8sConfig.clientRefreshInterval - this.clientCache = CacheBuilder.newBuilder() - .expireAfterWrite(refreshInterval.toMillis(), TimeUnit.MILLISECONDS) - .build() - final client = getClient() + this.client = new K8sClient(k8sConfig.getClient()) log.debug "[K8s] config=$k8sConfig; API client config=$client.config" } diff --git a/plugins/nf-k8s/src/main/nextflow/k8s/client/ClientConfig.groovy b/plugins/nf-k8s/src/main/nextflow/k8s/client/ClientConfig.groovy index aed479e130..bc67e16633 100644 --- a/plugins/nf-k8s/src/main/nextflow/k8s/client/ClientConfig.groovy +++ b/plugins/nf-k8s/src/main/nextflow/k8s/client/ClientConfig.groovy @@ -20,6 +20,7 @@ import groovy.util.logging.Slf4j import nextflow.util.Duration import javax.net.ssl.KeyManager +import java.nio.file.Files import java.nio.file.Path import java.nio.file.Paths @@ -49,6 +50,12 @@ class ClientConfig { String token + /** + * Path to the service account token file. When set, {@link #getBearerToken()} re-reads + * the file periodically so that kubelet-rotated projected tokens are always current. + */ + Path tokenFile + byte[] sslCert byte[] clientCert @@ -76,6 +83,9 @@ class ClientConfig { */ boolean isFromCluster + private volatile String cachedToken + private volatile long tokenCachedAt + String getNamespace() { namespace ?: 'default' } ClientConfig() { @@ -83,7 +93,27 @@ class ClientConfig { } String toString() { - "${this.class.getSimpleName()}[ server=$server, namespace=$namespace, serviceAccount=$serviceAccount, token=${cut(token)}, sslCert=${cut(sslCert)}, clientCert=${cut(clientCert)}, clientKey=${cut(clientKey)}, verifySsl=$verifySsl, fromFile=$isFromCluster, httpReadTimeout=$httpReadTimeout, httpConnectTimeout=$httpConnectTimeout, retryConfig=$retryConfig ]" + "${this.class.getSimpleName()}[ server=$server, namespace=$namespace, serviceAccount=$serviceAccount, token=${cut(token)}, tokenFile=$tokenFile, sslCert=${cut(sslCert)}, clientCert=${cut(clientCert)}, clientKey=${cut(clientKey)}, verifySsl=$verifySsl, fromFile=$isFromCluster, httpReadTimeout=$httpReadTimeout, httpConnectTimeout=$httpConnectTimeout, retryConfig=$retryConfig ]" + } + + /** + * @return The current bearer token, re-read from {@link #tokenFile} periodically. + * Falls back to the last known-good value on read failure. + */ + String getBearerToken() { + if( !tokenFile ) + return token + final now = System.currentTimeMillis() + if( cachedToken == null || now - tokenCachedAt >= 30_000 ) { + try { + cachedToken = Files.readString(tokenFile).trim() + tokenCachedAt = now + } + catch( Exception e ) { + log.warn "Failed to read Kubernetes service account token from $tokenFile -- ${e.message}" + } + } + return cachedToken } private String cut(String str) { diff --git a/plugins/nf-k8s/src/main/nextflow/k8s/client/ConfigDiscovery.groovy b/plugins/nf-k8s/src/main/nextflow/k8s/client/ConfigDiscovery.groovy index 9f85dfc535..0093d4eeac 100644 --- a/plugins/nf-k8s/src/main/nextflow/k8s/client/ConfigDiscovery.groovy +++ b/plugins/nf-k8s/src/main/nextflow/k8s/client/ConfigDiscovery.groovy @@ -84,12 +84,12 @@ class ConfigDiscovery { final server = formatHostName(host, port) final cert = path('/var/run/secrets/kubernetes.io/serviceaccount/ca.crt').bytes - final token = path('/var/run/secrets/kubernetes.io/serviceaccount/token').text + final tokenFile = path('/var/run/secrets/kubernetes.io/serviceaccount/token') final namespace = path('/var/run/secrets/kubernetes.io/serviceaccount/namespace').text return new ClientConfig( server: server, - token: token, + tokenFile: tokenFile, namespace: cfgNamespace ?: namespace, serviceAccount: serviceAccount, sslCert: cert, diff --git a/plugins/nf-k8s/src/main/nextflow/k8s/client/K8sClient.groovy b/plugins/nf-k8s/src/main/nextflow/k8s/client/K8sClient.groovy index 48c95929c4..6b163d8701 100644 --- a/plugins/nf-k8s/src/main/nextflow/k8s/client/K8sClient.groovy +++ b/plugins/nf-k8s/src/main/nextflow/k8s/client/K8sClient.groovy @@ -642,8 +642,9 @@ class K8sClient { final prefix = config.server.contains("://") ? config.server : "https://$config.server" final conn = createConnection0(prefix + path) conn.setRequestProperty("Content-Type", "application/json") - if( config.token ) { - conn.setRequestProperty("Authorization", "Bearer $config.token") + final bearerToken = config.getBearerToken() + if( bearerToken ) { + conn.setRequestProperty("Authorization", "Bearer $bearerToken") } if( conn instanceof HttpsURLConnection ) { diff --git a/plugins/nf-k8s/src/test/nextflow/k8s/K8sConfigTest.groovy b/plugins/nf-k8s/src/test/nextflow/k8s/K8sConfigTest.groovy index f1eb2e2fee..572e18c7fd 100644 --- a/plugins/nf-k8s/src/test/nextflow/k8s/K8sConfigTest.groovy +++ b/plugins/nf-k8s/src/test/nextflow/k8s/K8sConfigTest.groovy @@ -467,21 +467,4 @@ class K8sConfigTest extends Specification { cfg.fetchNodeName() == false } - def 'should set clientRefreshInterval' () { - when: - def cfg = new K8sConfig() - then: - cfg.clientRefreshInterval == Duration.of('50m') - - when: - cfg = new K8sConfig(clientRefreshInterval: '30m') - then: - cfg.clientRefreshInterval == Duration.of('30m') - - when: - cfg = new K8sConfig(clientRefreshInterval: '1h') - then: - cfg.clientRefreshInterval == Duration.of('1h') - } - } diff --git a/plugins/nf-k8s/src/test/nextflow/k8s/K8sExecutorTest.groovy b/plugins/nf-k8s/src/test/nextflow/k8s/K8sExecutorTest.groovy index 9a0630cc0c..2130c04d42 100644 --- a/plugins/nf-k8s/src/test/nextflow/k8s/K8sExecutorTest.groovy +++ b/plugins/nf-k8s/src/test/nextflow/k8s/K8sExecutorTest.groovy @@ -16,10 +16,6 @@ package nextflow.k8s -import java.util.concurrent.TimeUnit - -import com.google.common.cache.CacheBuilder -import nextflow.k8s.client.ClientConfig import nextflow.k8s.client.K8sClient import spock.lang.Specification @@ -28,39 +24,25 @@ import spock.lang.Specification */ class K8sExecutorTest extends Specification { - def 'should cache k8s client and refresh after expiration' () { + def 'should return the same k8s client instance on repeated calls' () { given: def CONFIG = new K8sConfig( client: [server: 'http://k8s-server'], namespace: 'test-ns', - serviceAccount: 'test-sa', - clientRefreshInterval: '100ms' + serviceAccount: 'test-sa' ) and: def executor = Spy(K8sExecutor) executor.getK8sConfig() >> CONFIG - // use a short-lived cache for the test - executor.@clientCache = CacheBuilder.newBuilder() - .expireAfterWrite(100, TimeUnit.MILLISECONDS) - .build() + executor.@client = new K8sClient(CONFIG.getClient()) - when: 'first call to getClient' + when: def client1 = executor.getClient() - then: 'a new K8sClient is created' - client1 instanceof K8sClient - client1.config.server == 'http://k8s-server' - - when: 'second call within cache interval' def client2 = executor.getClient() - then: 'returns the same cached instance' - client2.is(client1) - when: 'call after cache expiration' - sleep(150) - def client3 = executor.getClient() - then: 'a new K8sClient instance is created' - client3 instanceof K8sClient - !client3.is(client1) + then: + client1 instanceof K8sClient + client2.is(client1) } } diff --git a/plugins/nf-k8s/src/test/nextflow/k8s/client/ConfigDiscoveryTest.groovy b/plugins/nf-k8s/src/test/nextflow/k8s/client/ConfigDiscoveryTest.groovy index 38ac946839..8cbaf1d311 100644 --- a/plugins/nf-k8s/src/test/nextflow/k8s/client/ConfigDiscoveryTest.groovy +++ b/plugins/nf-k8s/src/test/nextflow/k8s/client/ConfigDiscoveryTest.groovy @@ -386,7 +386,9 @@ class ConfigDiscoveryTest extends Specification { and: config.server == 'foo.com:4343' config.namespace == 'foo-namespace' - config.token == 'my-token' + config.token == null + config.tokenFile == TOKEN_FILE + config.getBearerToken() == 'my-token' config.sslCert == CERT_FILE.text.bytes config.isFromCluster @@ -400,6 +402,7 @@ class ConfigDiscoveryTest extends Specification { and: config.server == 'https://host.com' config.namespace == 'my-namespace' + config.tokenFile == TOKEN_FILE } def 'should create key managers' () { diff --git a/plugins/nf-k8s/src/test/nextflow/k8s/client/K8sClientTest.groovy b/plugins/nf-k8s/src/test/nextflow/k8s/client/K8sClientTest.groovy index 263a6e3cad..0a12bf8338 100644 --- a/plugins/nf-k8s/src/test/nextflow/k8s/client/K8sClientTest.groovy +++ b/plugins/nf-k8s/src/test/nextflow/k8s/client/K8sClientTest.groovy @@ -16,6 +16,8 @@ package nextflow.k8s.client +import java.nio.file.Files + import nextflow.exception.K8sOutOfCpuException import nextflow.exception.K8sOutOfMemoryException @@ -68,6 +70,51 @@ class K8sClientTest extends Specification { } + def 'should cache token from file and refresh after TTL' () { + + given: + def tokenFile = Files.createTempFile('k8s-token', null) + tokenFile.text = 'token-v1' + def config = new ClientConfig(server: 'host.com:443', tokenFile: tokenFile) + + when: 'first read primes the cache' + def t1 = config.getBearerToken() + then: + t1 == 'token-v1' + + when: 'token file rotated but TTL not yet expired — cached value returned' + tokenFile.text = 'token-v2' + def t2 = config.getBearerToken() + then: + t2 == 'token-v1' + + when: 'TTL expires — next read picks up the rotated token' + config.@tokenCachedAt = System.currentTimeMillis() - 31_000 + def t3 = config.getBearerToken() + then: + t3 == 'token-v2' + + cleanup: + Files.deleteIfExists(tokenFile) + } + + def 'should return last good token when file read fails' () { + + given: + def tokenFile = Files.createTempFile('k8s-token', null) + tokenFile.text = 'good-token' + def config = new ClientConfig(server: 'host.com:443', tokenFile: tokenFile) + config.getBearerToken() // prime the cache + + when: 'file is deleted (simulates a transient read error) and TTL expires' + Files.delete(tokenFile) + config.@tokenCachedAt = System.currentTimeMillis() - 31_000 + def result = config.getBearerToken() + + then: 'last known-good token is returned rather than null' + result == 'good-token' + } + def 'should make a get request' () { given: