diff --git a/.gcloudignore b/.gcloudignore new file mode 100644 index 000000000..87d27134b --- /dev/null +++ b/.gcloudignore @@ -0,0 +1,5 @@ +.git +downloads/ +scratch/ +*.md +.claude/ diff --git a/cmd/server_foreground.go b/cmd/server_foreground.go index bd667e528..7ce07bfff 100644 --- a/cmd/server_foreground.go +++ b/cmd/server_foreground.go @@ -859,6 +859,17 @@ func parseAdminEmails(cfg *config.GlobalConfig) []string { return adminEmailList } +// resolveSessionSecret resolves the deployment-wide session secret from the +// --session-secret flag, falling back to the SCION_SERVER_SESSION_SECRET env +// var. The same value backs both the web session cookie store and the hub JWT +// signing keys so that all replicas behind the load balancer agree. +func resolveSessionSecret() string { + if webSessionSecret != "" { + return webSessionSecret + } + return os.Getenv("SCION_SERVER_SESSION_SECRET") +} + // initHubServer creates and configures the Hub server. func initHubServer(ctx context.Context, cfg *config.GlobalConfig, s store.Store, hubEndpoint, devAuthToken string, adminEmailList []string, adminMode bool, maintenanceMessage string, requestLogger, messageLogger *slog.Logger, globalDir string, pluginMgr *scionplugin.Manager, secretBackend secret.SecretBackend) (*hub.Server, error) { hubCfg := hub.ServerConfig{ @@ -929,6 +940,12 @@ func initHubServer(ctx context.Context, cfg *config.GlobalConfig, s store.Store, MaintenanceConfig: resolveMaintenanceConfig(cfg), SecretBackend: secretBackend, GCPProjectID: cfg.Hub.GCPProjectID, + // Derive the agent/user JWT signing keys from the same shared session + // secret the web cookie store uses, so every replica behind the load + // balancer agrees on the signing key regardless of its host-derived + // HubID. Without this, a JWT minted by one replica fails validation on + // another (cross-replica "session_expired" login loop). 
+ SharedSigningSecret: resolveSessionSecret(), } hubSrv, err := hub.New(hubCfg, s) @@ -1123,10 +1140,7 @@ func initWebServer(ctx context.Context, cfg *config.GlobalConfig, hubSrv *hub.Se } // Allow env var overrides for session/OAuth config - sessionSecret := webSessionSecret - if sessionSecret == "" { - sessionSecret = os.Getenv("SCION_SERVER_SESSION_SECRET") - } + sessionSecret := resolveSessionSecret() baseURL := webBaseURL if baseURL == "" { baseURL = os.Getenv("SCION_SERVER_BASE_URL") diff --git a/pkg/hub/auth.go b/pkg/hub/auth.go index ac506bffd..b916b06c5 100644 --- a/pkg/hub/auth.go +++ b/pkg/hub/auth.go @@ -286,7 +286,7 @@ func extractBearerToken(r *http.Request) string { // isHealthEndpoint returns true if the path is a health check endpoint. func isHealthEndpoint(path string) bool { - return path == "/healthz" || path == "/readyz" + return path == "/healthz" || path == "/health" || path == "/readyz" } // isUnauthenticatedEndpoint returns true if the path does not require authentication. diff --git a/pkg/hub/web.go b/pkg/hub/web.go index a7fdc896c..268bebb19 100644 --- a/pkg/hub/web.go +++ b/pkg/hub/web.go @@ -670,6 +670,7 @@ func (ws *WebServer) sessionToBearerMiddleware(next http.Handler) http.Handler { // registerRoutes sets up the web server routes. func (ws *WebServer) registerRoutes() { ws.mux.HandleFunc("/healthz", ws.handleHealthz) + ws.mux.HandleFunc("/health", ws.handleHealthz) ws.mux.Handle("/assets/", ws.staticHandler()) ws.mux.Handle("/shoelace/", ws.staticHandler()) // Auth routes (no session auth required) @@ -1097,7 +1098,7 @@ func isAllowedSubjectChar(c rune) bool { // isPublicRoute returns true for routes that do not require authentication. 
func isPublicRoute(path string) bool { switch { - case path == "/healthz": + case path == "/healthz" || path == "/health": return true case strings.HasPrefix(path, "/assets/"): return true diff --git a/pkg/hub/web_test.go b/pkg/hub/web_test.go index 2853400da..5dafa97eb 100644 --- a/pkg/hub/web_test.go +++ b/pkg/hub/web_test.go @@ -27,7 +27,6 @@ import ( "time" "github.com/GoogleCloudPlatform/scion/pkg/store" - "github.com/gorilla/securecookie" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" ) @@ -1288,20 +1287,85 @@ func TestSessionStore_CookieConfiguration(t *testing.T) { "HTTP base URL should produce non-secure cookies") } -func TestSessionStore_NoMaxLengthLimit(t *testing.T) { - // The FilesystemStore stores data on disk, not in cookies, so the default - // securecookie 4096-byte limit must be removed. JWT tokens in the session - // regularly exceed that limit after gob+base64 encoding. - ws := newTestWebServer(t, WebServerConfig{}) - for _, codec := range ws.sessionStore.Codecs { - if sc, ok := codec.(*securecookie.SecureCookie); ok { - // Encode a large value — if MaxLength were still 4096 this would fail. - large := make(map[interface{}]interface{}) - large["token"] = string(make([]byte, 8000)) - _, err := securecookie.EncodeMulti("test", large, sc) - assert.NoError(t, err, "session store should allow values larger than 4096 bytes") - } +func TestSessionStore_CrossReplicaRoundTrip(t *testing.T) { + // Behind a load balancer the OAuth login, the provider callback, and every + // follow-up API request can each land on a different replica. With a + // cookie-backed session store, any replica configured with the same + // SESSION_SECRET must be able to read a session cookie minted by another + // replica. This is the regression test for the "state_mismatch" login + // failures (and dropped post-login sessions) caused by the previous + // filesystem-backed, process-local store. 
+ const secret = "test-shared-session-secret-value-1234567890" + + replicaA := newTestWebServer(t, WebServerConfig{SessionSecret: secret}) + replicaB := newTestWebServer(t, WebServerConfig{SessionSecret: secret}) + + // A realistic post-login payload: identity plus access/refresh JWTs, in + // addition to the short-lived OAuth CSRF state. + svc, err := NewUserTokenService(UserTokenConfig{}) + require.NoError(t, err) + access, refresh, _, err := svc.GenerateTokenPair("user_123", "user@example.com", "Test User", "admin", ClientTypeWeb) + require.NoError(t, err) + + // Replica A writes the session and emits the cookie (e.g. during /auth/login + // and the callback that completes login). + reqA := httptest.NewRequest(http.MethodGet, "/auth/login/google", nil) + recA := httptest.NewRecorder() + sessA, err := replicaA.sessionStore.Get(reqA, webSessionName) + require.NoError(t, err) + sessA.Values[sessKeyOAuthState] = "state-token-abc123" + sessA.Values[sessKeyUserID] = "user_123" + sessA.Values[sessKeyUserEmail] = "user@example.com" + sessA.Values[sessKeyHubAccessToken] = access + sessA.Values[sessKeyHubRefreshToken] = refresh + require.NoError(t, sessA.Save(reqA, recA)) + + cookies := recA.Result().Cookies() + require.NotEmpty(t, cookies, "replica A should set a session cookie") + + // Replica B receives the cookie minted by replica A and must decode it. 
+ reqB := httptest.NewRequest(http.MethodGet, "/auth/callback/google", nil) + for _, c := range cookies { + reqB.AddCookie(c) + } + sessB, err := replicaB.sessionStore.Get(reqB, webSessionName) + require.NoError(t, err) + assert.False(t, sessB.IsNew, "replica B must decode the session cookie minted by replica A") + assert.Equal(t, "state-token-abc123", sessB.Values[sessKeyOAuthState], + "OAuth state must survive across replicas (fixes state_mismatch)") + assert.Equal(t, "user_123", sessB.Values[sessKeyUserID]) + assert.Equal(t, access, sessB.Values[sessKeyHubAccessToken], + "post-login access token must survive across replicas") + assert.Equal(t, refresh, sessB.Values[sessKeyHubRefreshToken]) +} + +func TestSessionStore_DifferentSecretCannotDecode(t *testing.T) { + // A replica configured with a different SESSION_SECRET must NOT be able to + // read another replica's session cookie — the cookie is authenticated and + // encrypted with keys derived from the shared secret. + replicaA := newTestWebServer(t, WebServerConfig{SessionSecret: "secret-A-1234567890-abcdefghijklmnop"}) + replicaC := newTestWebServer(t, WebServerConfig{SessionSecret: "secret-C-1234567890-abcdefghijklmnop"}) + + reqA := httptest.NewRequest(http.MethodGet, "/auth/login/google", nil) + recA := httptest.NewRecorder() + sessA, err := replicaA.sessionStore.Get(reqA, webSessionName) + require.NoError(t, err) + sessA.Values[sessKeyOAuthState] = "state-token-abc123" + require.NoError(t, sessA.Save(reqA, recA)) + + reqC := httptest.NewRequest(http.MethodGet, "/auth/callback/google", nil) + for _, c := range recA.Result().Cookies() { + reqC.AddCookie(c) + } + sessC, err := replicaC.sessionStore.Get(reqC, webSessionName) + // A cookie authenticated/encrypted with a different secret fails to decode: + // gorilla returns a decode error together with a fresh, empty session. + // Either way, the state must not leak across mismatched secrets. 
+	if err == nil {
+		assert.True(t, sessC.IsNew, "session from a mismatched secret should be new/empty")
 	}
+	assert.Nil(t, sessC.Values[sessKeyOAuthState],
+		"OAuth state must not decode under a different secret")
 }
 
 func TestSetters(t *testing.T) {
diff --git a/pkg/hubclient/client.go b/pkg/hubclient/client.go
index f0bfb7d2d..3a7bd0aab 100644
--- a/pkg/hubclient/client.go
+++ b/pkg/hubclient/client.go
@@ -344,6 +344,13 @@ func (c *client) Health(ctx context.Context) (*HealthResponse, error) {
 	if err != nil {
 		return nil, err
 	}
+	if resp.StatusCode == 404 {
+		resp.Body.Close()
+		resp, err = c.get(ctx, "/health", nil)
+		if err != nil {
+			return nil, err
+		}
+	}
 	return apiclient.DecodeResponse[HealthResponse](resp)
 }
 
diff --git a/pkg/sciontool/hub/client.go b/pkg/sciontool/hub/client.go
index 20834adc5..bc07143fc 100644
--- a/pkg/sciontool/hub/client.go
+++ b/pkg/sciontool/hub/client.go
@@ -175,7 +175,7 @@ func NewClient() *Client {
 		return nil
 	}
 
-	return &Client{
+	c := &Client{
 		hubURL:         hubURL,
 		token:          token,
 		agentID:        agentID,
@@ -186,11 +186,13 @@ func NewClient() *Client {
 			Timeout: DefaultTimeout,
 		},
 	}
+	c.maybeConfigureOIDC()
+	return c
 }
 
 // NewClientWithConfig creates a new Hub client with explicit configuration.
 func NewClientWithConfig(hubURL, token, agentID string) *Client {
-	return &Client{
+	c := &Client{
 		hubURL:         hubURL,
 		token:          token,
 		agentID:        agentID,
@@ -201,6 +203,8 @@ func NewClientWithConfig(hubURL, token, agentID string) *Client {
 			Timeout: DefaultTimeout,
 		},
 	}
+	c.maybeConfigureOIDC()
+	return c
 }
 
 // IsConfigured returns true if the client is properly configured.
diff --git a/pkg/sciontool/hub/oidc.go b/pkg/sciontool/hub/oidc.go
new file mode 100644
index 000000000..4ec51ccb3
--- /dev/null
+++ b/pkg/sciontool/hub/oidc.go
@@ -0,0 +1,173 @@
+// Copyright 2026 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package hub
+
+import (
+	"fmt"
+	"io"
+	"net/http"
+	"net/url"
+	"os"
+	"strings"
+	"sync"
+	"time"
+
+	"cloud.google.com/go/compute/metadata"
+
+	"github.com/GoogleCloudPlatform/scion/pkg/sciontool/log"
+)
+
+const (
+	// EnvHubOIDCAudience overrides the audience claim in the OIDC identity token.
+	EnvHubOIDCAudience = "SCION_HUB_OIDC_AUDIENCE"
+
+	gcpMetadataBaseURL = "http://metadata.google.internal"
+
+	oidcRefreshMargin = 5 * time.Minute
+	oidcDefaultTTL    = 1 * time.Hour
+	oidcFetchTimeout  = 2 * time.Second
+)
+
+// isOnGCPFunc detects whether we're running on GCP. Override in tests.
+var isOnGCPFunc = func() bool { return metadata.OnGCE() }
+
+// oidcTokenSource fetches and caches Google OIDC identity tokens from the
+// GCE metadata server.
+type oidcTokenSource struct {
+	audience        string
+	metadataBaseURL string
+	httpClient      *http.Client
+
+	mu        sync.RWMutex
+	token     string
+	expiresAt time.Time
+}
+
+// getToken returns an identity token for s.audience. A cached token is reused
+// until it is within oidcRefreshMargin of its expiry; after that a fresh token
+// is fetched from the metadata server while holding the write lock, so
+// concurrent callers queue behind one refresh instead of fetching in parallel.
+// On failure nothing is cached and the returned token is empty.
+func (s *oidcTokenSource) getToken() (string, error) {
+	s.mu.RLock()
+	if s.token != "" && time.Now().Before(s.expiresAt.Add(-oidcRefreshMargin)) {
+		tok := s.token
+		s.mu.RUnlock()
+		return tok, nil
+	}
+	s.mu.RUnlock()
+
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	// Double-check after acquiring write lock.
+	if s.token != "" && time.Now().Before(s.expiresAt.Add(-oidcRefreshMargin)) {
+		return s.token, nil
+	}
+
+	// Escape the audience: it is usually a full URL, and characters such as
+	// "&", "#" or "+" would otherwise corrupt the metadata query string.
+	endpoint := fmt.Sprintf("%s/computeMetadata/v1/instance/service-accounts/default/identity?audience=%s&format=full",
+		s.metadataBaseURL, url.QueryEscape(s.audience))
+
+	req, err := http.NewRequest(http.MethodGet, endpoint, nil)
+	if err != nil {
+		return "", fmt.Errorf("oidc: build request: %w", err)
+	}
+	req.Header.Set("Metadata-Flavor", "Google")
+
+	resp, err := s.httpClient.Do(req)
+	if err != nil {
+		return "", fmt.Errorf("oidc: metadata fetch: %w", err)
+	}
+	defer resp.Body.Close()
+
+	if resp.StatusCode != http.StatusOK {
+		return "", fmt.Errorf("oidc: metadata server returned %d", resp.StatusCode)
+	}
+
+	body, err := io.ReadAll(resp.Body)
+	if err != nil {
+		return "", fmt.Errorf("oidc: read response: %w", err)
+	}
+
+	// Never cache or hand out an empty token: the transport would otherwise
+	// send a bare "Authorization: Bearer " header to the hub.
+	tok := strings.TrimSpace(string(body))
+	if tok == "" {
+		return "", fmt.Errorf("oidc: metadata server returned an empty token for audience %q", s.audience)
+	}
+
+	expiry, err := ParseTokenExpiry(tok)
+	if err != nil {
+		// Expiry could not be parsed from the token; cache it for the default TTL.
+		expiry = time.Now().Add(oidcDefaultTTL)
+	}
+
+	s.token = tok
+	s.expiresAt = expiry
+	return tok, nil
+}
+
+// oidcTransport is an http.RoundTripper that injects a Google OIDC identity
+// token as an Authorization header on outgoing requests.
+type oidcTransport struct { + base http.RoundTripper + source *oidcTokenSource +} + +func (t *oidcTransport) RoundTrip(req *http.Request) (*http.Response, error) { + if req.Header.Get("Authorization") == "" { + tok, err := t.source.getToken() + if err != nil { + log.Debug("OIDC token fetch failed, skipping Authorization header: %v", err) + } else { + req = req.Clone(req.Context()) + req.Header.Set("Authorization", "Bearer "+tok) + } + } + return t.base.RoundTrip(req) +} + +func newOIDCTransport(base http.RoundTripper, audience, metadataBaseURL string) *oidcTransport { + if base == nil { + base = http.DefaultTransport + } + return &oidcTransport{ + base: base, + source: &oidcTokenSource{ + audience: audience, + metadataBaseURL: metadataBaseURL, + httpClient: &http.Client{Timeout: oidcFetchTimeout}, + }, + } +} + +// maybeConfigureOIDC wraps the client's HTTP transport with an OIDC token +// injector when running on GCP. This enables transparent authentication +// against Cloud Run-hosted hubs. +func (c *Client) maybeConfigureOIDC() { + if !isOnGCPFunc() { + return + } + + audience := os.Getenv(EnvHubOIDCAudience) + if audience == "" { + audience = c.hubURL + } + + c.client.Transport = newOIDCTransport(c.client.Transport, audience, gcpMetadataBaseURL) + log.Debug("Configured OIDC transport for Cloud Run auth (audience=%s)", audience) +} diff --git a/pkg/sciontool/hub/oidc_test.go b/pkg/sciontool/hub/oidc_test.go new file mode 100644 index 000000000..3f84ee947 --- /dev/null +++ b/pkg/sciontool/hub/oidc_test.go @@ -0,0 +1,311 @@ +// Copyright 2026 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package hub + +import ( + "context" + "encoding/base64" + "encoding/json" + "fmt" + "net/http" + "net/http/httptest" + "os" + "sync/atomic" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// makeTestJWT builds a minimal JWT with the given expiry for testing. +func makeTestJWT(exp time.Time) string { + header := base64.RawURLEncoding.EncodeToString([]byte(`{"alg":"none","typ":"JWT"}`)) + payload, _ := json.Marshal(map[string]interface{}{"exp": exp.Unix(), "iss": "test"}) + payloadB64 := base64.RawURLEncoding.EncodeToString(payload) + sig := base64.RawURLEncoding.EncodeToString([]byte("fakesig")) + return fmt.Sprintf("%s.%s.%s", header, payloadB64, sig) +} + +func overrideGCPDetection(val bool) func() { + orig := isOnGCPFunc + isOnGCPFunc = func() bool { return val } + return func() { isOnGCPFunc = orig } +} + +func TestOIDCTokenSource_FetchAndCache(t *testing.T) { + var fetchCount atomic.Int32 + token := makeTestJWT(time.Now().Add(1 * time.Hour)) + + metaSrv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + assert.Equal(t, "Google", r.Header.Get("Metadata-Flavor")) + assert.Contains(t, r.URL.Query().Get("audience"), "https://hub.example.com") + assert.Equal(t, "full", r.URL.Query().Get("format")) + fetchCount.Add(1) + fmt.Fprint(w, token) + })) + defer metaSrv.Close() + + src := &oidcTokenSource{ + audience: "https://hub.example.com", + metadataBaseURL: metaSrv.URL, + httpClient: &http.Client{Timeout: 2 * time.Second}, + } + + tok1, err := 
src.getToken() + require.NoError(t, err) + assert.Equal(t, token, tok1) + + tok2, err := src.getToken() + require.NoError(t, err) + assert.Equal(t, token, tok2) + + assert.Equal(t, int32(1), fetchCount.Load(), "second call should use cache") +} + +func TestOIDCTokenSource_RefreshExpired(t *testing.T) { + var fetchCount atomic.Int32 + token1 := makeTestJWT(time.Now().Add(1 * time.Hour)) + token2 := makeTestJWT(time.Now().Add(2 * time.Hour)) + + metaSrv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if fetchCount.Add(1) == 1 { + fmt.Fprint(w, token1) + } else { + fmt.Fprint(w, token2) + } + })) + defer metaSrv.Close() + + src := &oidcTokenSource{ + audience: "https://hub.example.com", + metadataBaseURL: metaSrv.URL, + httpClient: &http.Client{Timeout: 2 * time.Second}, + } + + tok, err := src.getToken() + require.NoError(t, err) + assert.Equal(t, token1, tok) + + // Simulate expiry by setting expiresAt to the past. + src.mu.Lock() + src.expiresAt = time.Now().Add(-1 * time.Minute) + src.mu.Unlock() + + tok, err = src.getToken() + require.NoError(t, err) + assert.Equal(t, token2, tok) + assert.Equal(t, int32(2), fetchCount.Load()) +} + +func TestOIDCTokenSource_RefreshWithinMargin(t *testing.T) { + var fetchCount atomic.Int32 + token1 := makeTestJWT(time.Now().Add(1 * time.Hour)) + token2 := makeTestJWT(time.Now().Add(2 * time.Hour)) + + metaSrv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if fetchCount.Add(1) == 1 { + fmt.Fprint(w, token1) + } else { + fmt.Fprint(w, token2) + } + })) + defer metaSrv.Close() + + src := &oidcTokenSource{ + audience: "https://hub.example.com", + metadataBaseURL: metaSrv.URL, + httpClient: &http.Client{Timeout: 2 * time.Second}, + } + + tok, err := src.getToken() + require.NoError(t, err) + assert.Equal(t, token1, tok) + + // Set expiry to 3 minutes from now (within 5-minute margin). 
+	src.mu.Lock()
+	src.expiresAt = time.Now().Add(3 * time.Minute)
+	src.mu.Unlock()
+
+	tok, err = src.getToken()
+	require.NoError(t, err)
+	assert.Equal(t, token2, tok, "should re-fetch when within refresh margin")
+}
+
+func TestOIDCTransport_InjectsHeader(t *testing.T) {
+	var receivedAuth string
+	hubSrv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		receivedAuth = r.Header.Get("Authorization")
+		w.WriteHeader(http.StatusOK)
+	}))
+	defer hubSrv.Close()
+
+	token := makeTestJWT(time.Now().Add(1 * time.Hour))
+	metaSrv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		fmt.Fprint(w, token)
+	}))
+	defer metaSrv.Close()
+
+	transport := newOIDCTransport(http.DefaultTransport, "https://hub.example.com", metaSrv.URL)
+	client := &http.Client{Transport: transport}
+
+	req, _ := http.NewRequest("GET", hubSrv.URL+"/test", nil)
+	resp, err := client.Do(req)
+	require.NoError(t, err)
+	resp.Body.Close()
+
+	assert.Equal(t, "Bearer "+token, receivedAuth)
+}
+
+func TestOIDCTransport_DoesNotOverrideExistingAuth(t *testing.T) {
+	var receivedAuth string
+	hubSrv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		receivedAuth = r.Header.Get("Authorization")
+		w.WriteHeader(http.StatusOK)
+	}))
+	defer hubSrv.Close()
+
+	metaSrv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		t.Error("metadata server should not be called when Authorization is already set") // t.Fatal is only valid on the test goroutine
+	}))
+	defer metaSrv.Close()
+
+	transport := newOIDCTransport(http.DefaultTransport, "https://hub.example.com", metaSrv.URL)
+	client := &http.Client{Transport: transport}
+
+	req, _ := http.NewRequest("GET", hubSrv.URL+"/test", nil)
+	req.Header.Set("Authorization", "Bearer existing-token")
+	resp, err := client.Do(req)
+	require.NoError(t, err)
+	resp.Body.Close()
+
+	assert.Equal(t, "Bearer existing-token", receivedAuth)
+}
+
+func 
TestOIDCTransport_GracefulDegradation(t *testing.T) { + var requestReceived bool + hubSrv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + requestReceived = true + assert.Empty(t, r.Header.Get("Authorization"), "no auth header when metadata fails") + w.WriteHeader(http.StatusOK) + })) + defer hubSrv.Close() + + // Point at an unreachable metadata server. + transport := newOIDCTransport(http.DefaultTransport, "https://hub.example.com", "http://127.0.0.1:1") + transport.source.httpClient.Timeout = 100 * time.Millisecond + client := &http.Client{Transport: transport} + + req, _ := http.NewRequest("GET", hubSrv.URL+"/test", nil) + resp, err := client.Do(req) + require.NoError(t, err) + resp.Body.Close() + + assert.True(t, requestReceived, "request should proceed even when metadata fetch fails") +} + +func TestMaybeConfigureOIDC_NotOnGCP(t *testing.T) { + cleanup := overrideGCPDetection(false) + defer cleanup() + + c := &Client{ + hubURL: "https://hub.example.com", + client: &http.Client{Timeout: DefaultTimeout}, + } + + c.maybeConfigureOIDC() + + assert.Nil(t, c.client.Transport, "transport should not be wrapped when not on GCP") +} + +func TestMaybeConfigureOIDC_OnGCP(t *testing.T) { + cleanup := overrideGCPDetection(true) + defer cleanup() + + c := &Client{ + hubURL: "https://hub.example.com", + client: &http.Client{Timeout: DefaultTimeout}, + } + + c.maybeConfigureOIDC() + + require.NotNil(t, c.client.Transport) + ot, ok := c.client.Transport.(*oidcTransport) + require.True(t, ok, "transport should be oidcTransport") + assert.Equal(t, "https://hub.example.com", ot.source.audience) +} + +func TestMaybeConfigureOIDC_AudienceOverride(t *testing.T) { + cleanup := overrideGCPDetection(true) + defer cleanup() + + origAud := os.Getenv(EnvHubOIDCAudience) + os.Setenv(EnvHubOIDCAudience, "https://custom-audience.example.com") + defer os.Setenv(EnvHubOIDCAudience, origAud) + + c := &Client{ + hubURL: "https://hub.example.com", + client: 
&http.Client{Timeout: DefaultTimeout}, + } + + c.maybeConfigureOIDC() + + require.NotNil(t, c.client.Transport) + ot := c.client.Transport.(*oidcTransport) + assert.Equal(t, "https://custom-audience.example.com", ot.source.audience) +} + +func TestOIDC_EndToEnd_BothHeaders(t *testing.T) { + cleanup := overrideGCPDetection(true) + defer cleanup() + + token := makeTestJWT(time.Now().Add(1 * time.Hour)) + metaSrv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + fmt.Fprint(w, token) + })) + defer metaSrv.Close() + + var gotAuth, gotAgentToken string + hubSrv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + gotAuth = r.Header.Get("Authorization") + gotAgentToken = r.Header.Get("X-Scion-Agent-Token") + w.WriteHeader(http.StatusOK) + json.NewEncoder(w).Encode(map[string]string{"status": "ok"}) + })) + defer hubSrv.Close() + + // Override GCP metadata URL by directly constructing the client with OIDC transport. + c := &Client{ + hubURL: hubSrv.URL, + token: "test-agent-token", + agentID: "test-agent-123", + maxRetries: 1, + retryBaseDelay: 10 * time.Millisecond, + retryMaxDelay: 10 * time.Millisecond, + client: &http.Client{ + Timeout: DefaultTimeout, + }, + } + c.client.Transport = newOIDCTransport(c.client.Transport, hubSrv.URL, metaSrv.URL) + + err := c.UpdateStatus(context.Background(), StatusUpdate{ + Status: "running", + Message: "test", + }) + require.NoError(t, err) + + assert.Equal(t, "Bearer "+token, gotAuth, "OIDC Authorization header should be set") + assert.Equal(t, "test-agent-token", gotAgentToken, "X-Scion-Agent-Token should still be set") +} diff --git a/scripts/cloudrun/Dockerfile b/scripts/cloudrun/Dockerfile new file mode 100644 index 000000000..6c5dad500 --- /dev/null +++ b/scripts/cloudrun/Dockerfile @@ -0,0 +1,67 @@ +# Scion Hub — Cloud Run container image +# Multi-stage build: web frontend → Go binary → slim runtime + +# 
--------------------------------------------------------------------------- +# Stage 1: Build web frontend +# --------------------------------------------------------------------------- +FROM node:20-slim AS web-builder + +WORKDIR /src/web +COPY web/package.json web/package-lock.json ./ +RUN npm ci --ignore-scripts +COPY web/ ./ +RUN npm run build + +# --------------------------------------------------------------------------- +# Stage 2: Build Go binary (with embedded web assets) +# --------------------------------------------------------------------------- +FROM golang:1.25 AS go-builder + +WORKDIR /src +COPY go.mod go.sum ./ +RUN go mod download + +COPY . . +COPY --from=web-builder /src/web/dist/client ./web/dist/client + +RUN CGO_ENABLED=0 GOOS=linux GOARCH=amd64 \ + go build -buildvcs=false \ + -ldflags "-X github.com/GoogleCloudPlatform/scion/pkg/version.BuildTime=$(date -u +%Y-%m-%dT%H:%M:%SZ)" \ + -o /scion ./cmd/scion + +# --------------------------------------------------------------------------- +# Stage 3: Runtime +# --------------------------------------------------------------------------- +FROM debian:bookworm-slim + +RUN apt-get update && apt-get install -y --no-install-recommends \ + ca-certificates \ + git \ + openssh-client \ + curl \ + apt-transport-https \ + gnupg \ + && echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https://packages.cloud.google.com/apt cloud-sdk main" \ + > /etc/apt/sources.list.d/google-cloud-sdk.list \ + && curl -fsSL https://packages.cloud.google.com/apt/doc/apt-key.gpg \ + | gpg --dearmor -o /usr/share/keyrings/cloud.google.gpg \ + && apt-get update \ + && apt-get install -y --no-install-recommends google-cloud-cli-gke-gcloud-auth-plugin \ + && apt-get clean && rm -rf /var/lib/apt/lists/* + +RUN useradd -m -d /home/scion -s /bin/bash -u 1000 scion \ + && mkdir -p /home/scion/.kube /run/secrets \ + && chown -R scion:scion /home/scion /run/secrets + +COPY --from=go-builder /scion /usr/local/bin/scion +COPY 
scripts/cloudrun/entrypoint.sh /usr/local/bin/entrypoint.sh + +ENV HOME=/home/scion +ENV KUBECONFIG=/home/scion/.kube/config + +USER scion +WORKDIR /home/scion + +EXPOSE 8080 + +ENTRYPOINT ["/usr/local/bin/entrypoint.sh"] diff --git a/scripts/cloudrun/README.md b/scripts/cloudrun/README.md new file mode 100644 index 000000000..04fc336ed --- /dev/null +++ b/scripts/cloudrun/README.md @@ -0,0 +1,99 @@ +# Scion Hub — Cloud Run Deployment + +Deploys the Scion hub as a single Cloud Run instance with a co-located GKE +broker targeting `scion-demo-cluster`. + +## Architecture + +``` +Cloud Run (min=max=1) +┌──────────────────────────┐ +│ scion server (combo) │ +│ ├─ Hub API :8080 │ +│ ├─ Web UI :8080 │ +│ └─ Broker :9810 │──▶ GKE Autopilot (scion-demo-cluster) +│ SQLite: /tmp/scion.db│ namespace: scion-agents +└──────────────────────────┘ +``` + +- **Authenticated HTTPS only** (`--no-allow-unauthenticated`) +- **SQLite (ephemeral)** — lost on instance restart, acceptable for demo +- **GKE auth via ADC** — Cloud Run service account → Workload Identity → GKE + +## Prerequisites + +- `gcloud` CLI, authenticated with project `deploy-demo-test` +- `docker` CLI, authenticated to Artifact Registry +- `kubectl` with access to `scion-demo-cluster` (for namespace creation only) +- `openssl` (for session secret generation) + +## Quick Start + +```bash +# Full deploy (build + push + secrets + Cloud Run service) +./scripts/cloudrun/deploy.sh + +# Redeploy without rebuilding the image +./scripts/cloudrun/deploy.sh --skip-build +``` + +## Configuration + +Environment variables override defaults: + +| Variable | Default | Description | +|------------------------|----------------------|---------------------------------| +| `SCION_PROJECT` | `deploy-demo-test` | GCP project ID | +| `SCION_REGION` | `us-central1` | GCP region | +| `SCION_SERVICE` | `scion-hub` | Cloud Run service name | +| `SCION_GKE_CLUSTER` | `scion-demo-cluster` | Target GKE cluster | +| `SCION_SA_NAME` | `scion-hub-sa` 
| Service account name | +| `SCION_REPO` | `scion` | Artifact Registry repo name | +| `SCION_SESSION_SECRET` | *(auto-generated)* | JWT session secret (hex string) | + +## What the Deploy Script Does + +1. Creates a dedicated service account with `container.admin` and + `secretmanager.secretAccessor` roles (if it doesn't exist) +2. Builds and pushes the container image to Artifact Registry +3. Fetches GKE cluster endpoint + CA cert and generates a kubeconfig +4. Generates hub settings from the template (injects session secret) +5. Stores kubeconfig and settings as Secret Manager secrets +6. Ensures the `scion-agents` namespace exists in GKE +7. Deploys the Cloud Run service with secrets mounted as files + +## Verification + +```bash +# Get the service URL +URL=$(gcloud run services describe scion-hub \ + --region us-central1 --project deploy-demo-test \ + --format="value(status.url)") + +# Health check (requires IAM authentication) +curl -H "Authorization: Bearer $(gcloud auth print-identity-token)" "${URL}/healthz" + +# Point the scion CLI at the Cloud Run hub +scion hub set --url "${URL}" --auth gcloud +``` + +## Files + +| File | Purpose | +|-------------------------------|---------------------------------------------| +| `Dockerfile` | Multi-stage build: web + Go → slim runtime | +| `deploy.sh` | End-to-end deploy script | +| `hub-settings-template.yaml` | Hub settings (session secret placeholder) | +| `README.md` | This file | + +## Notes + +- The Cloud Run instance uses `--timeout 3600` for long-lived WebSocket + connections from agent control channels. +- `--min-instances 1` keeps the instance warm. SQLite state is lost on cold + starts, so a warm instance is critical. +- The `gke-gcloud-auth-plugin` is installed in the image for robustness, but + `pkg/k8s/client.go` also has a `fallbackToGCEAuth()` path that uses ADC + directly if the plugin fails. 
+- Session secret is stored in Secret Manager and injected into settings at
+  deploy time, so it survives instance restarts.
diff --git a/scripts/cloudrun/deploy.sh b/scripts/cloudrun/deploy.sh
new file mode 100755
index 000000000..bb7dd44c2
--- /dev/null
+++ b/scripts/cloudrun/deploy.sh
@@ -0,0 +1,219 @@
+#!/usr/bin/env bash
+# Deploy Scion hub as a Cloud Run service with co-located GKE broker.
+#
+# Prerequisites:
+#   - gcloud CLI authenticated with sufficient permissions
+#   - docker CLI authenticated to Artifact Registry
+#   - kubectl configured for scion-demo-cluster (for namespace setup only)
+#   - openssl (only needed when SCION_SESSION_SECRET is not set)
+#
+# Usage:
+#   ./scripts/cloudrun/deploy.sh               # full deploy (build + push + secrets + service)
+#   ./scripts/cloudrun/deploy.sh --skip-build  # redeploy without rebuilding image
+#
+# Environment overrides (defaults are in the Configuration section):
+#   SCION_PROJECT, SCION_REGION, SCION_SERVICE, SCION_GKE_CLUSTER,
+#   SCION_SA_NAME, SCION_REPO, SCION_SESSION_SECRET
+#
+# NOTE: hub-settings-template.yaml hard-codes the default project (in
+# image_registry) and the default cluster name (in context). Overriding
+# SCION_PROJECT or SCION_GKE_CLUSTER requires editing the template to match.
+
+set -euo pipefail
+
+# ── Configuration ────────────────────────────────────────────────────────────
+
+PROJECT="${SCION_PROJECT:-deploy-demo-test}"
+REGION="${SCION_REGION:-us-central1}"
+SERVICE_NAME="${SCION_SERVICE:-scion-hub}"
+GKE_CLUSTER="${SCION_GKE_CLUSTER:-scion-demo-cluster}"
+SA_NAME="${SCION_SA_NAME:-scion-hub-sa}"
+REPO="${SCION_REPO:-scion}"
+# The registry host follows REGION because the Artifact Registry repository
+# below is created with --location="$REGION"; a fixed host would make the push
+# target a different location whenever SCION_REGION is overridden.
+IMAGE="${REGION}-docker.pkg.dev/${PROJECT}/${REPO}/hub:latest"
+K8S_NAMESPACE="scion-agents"
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"
+
+SKIP_BUILD=false
+[[ "${1:-}" == "--skip-build" ]] && SKIP_BUILD=true
+
+# ── Helpers ──────────────────────────────────────────────────────────────────
+
+log() { echo "==> $*"; }
+warn() { echo "WARNING: $*" >&2; }
+die() { echo "ERROR: $*" >&2; exit 1; }
+
+# ensure_secret NAME DATA
+# Adds DATA as a new version of Secret Manager secret NAME in $PROJECT,
+# creating the secret (automatic replication) if it does not exist yet.
+ensure_secret() {
+  local name="$1"
+  local data="$2"
+  # printf rather than echo so data such as "-n" is never taken as an option;
+  # the trailing newline matches what echo appended.
+  if gcloud secrets describe "$name" --project="$PROJECT" &>/dev/null; then
+    log "Updating secret ${name}"
+    printf '%s\n' "$data" | gcloud secrets versions add "$name" --data-file=- --project="$PROJECT"
+  else
+    log "Creating secret ${name}"
+    printf '%s\n' "$data" | gcloud secrets create "$name" --data-file=- --project="$PROJECT" \
+      --replication-policy=automatic
+  fi
+}
+
+# ── 0. Validate ──────────────────────────────────────────────────────────────
+
+command -v gcloud >/dev/null || die "gcloud CLI not found"
+command -v docker >/dev/null || die "docker CLI not found"
+
+# ── 1. Service account ──────────────────────────────────────────────────────
+
+SA_EMAIL="${SA_NAME}@${PROJECT}.iam.gserviceaccount.com"
+
+# Roles are bound only when the account is first created; an existing account
+# is used as-is.
+if ! gcloud iam service-accounts describe "$SA_EMAIL" --project="$PROJECT" &>/dev/null; then
+  log "Creating service account ${SA_NAME}"
+  gcloud iam service-accounts create "$SA_NAME" \
+    --display-name="Scion Hub (Cloud Run)" \
+    --project="$PROJECT"
+
+  for role in roles/container.admin roles/secretmanager.secretAccessor; do
+    gcloud projects add-iam-policy-binding "$PROJECT" \
+      --member="serviceAccount:${SA_EMAIL}" \
+      --role="$role" \
+      --condition=None \
+      --quiet
+  done
+fi
+
+# ── 2. Create Artifact Registry repo (if needed) ────────────────────────────
+
+# Runs before the build so that the push in step 3 has a repository to target
+# on a first deploy.
+if ! gcloud artifacts repositories describe "$REPO" \
+    --location="$REGION" --project="$PROJECT" &>/dev/null; then
+  log "Creating Artifact Registry repository ${REPO}"
+  gcloud artifacts repositories create "$REPO" \
+    --repository-format=docker \
+    --location="$REGION" \
+    --project="$PROJECT"
+fi
+
+# ── 3. Build & push image ───────────────────────────────────────────────────
+
+if [[ "$SKIP_BUILD" == false ]]; then
+  log "Building container image"
+  docker build -f "${SCRIPT_DIR}/Dockerfile" -t "$IMAGE" "$REPO_ROOT"
+
+  log "Pushing image to Artifact Registry"
+  docker push "$IMAGE"
+else
+  log "Skipping build (--skip-build)"
+fi
+
+# ── 4. Generate kubeconfig from live cluster info ────────────────────────────
+
+log "Fetching GKE cluster details"
+ENDPOINT=$(gcloud container clusters describe "$GKE_CLUSTER" \
+  --region "$REGION" --project "$PROJECT" \
+  --format="value(endpoint)")
+CA_CERT=$(gcloud container clusters describe "$GKE_CLUSTER" \
+  --region "$REGION" --project "$PROJECT" \
+  --format="value(masterAuth.clusterCaCertificate)")
+
+[[ -n "$ENDPOINT" ]] || die "Could not fetch cluster endpoint"
+[[ -n "$CA_CERT" ]] || die "Could not fetch cluster CA certificate"
+
+# The generated kubeconfig carries only cluster and context entries; it has no
+# users section.
+KUBECONFIG_CONTENT="apiVersion: v1
+kind: Config
+clusters:
+- cluster:
+    certificate-authority-data: ${CA_CERT}
+    server: https://${ENDPOINT}
+  name: ${GKE_CLUSTER}
+contexts:
+- context:
+    cluster: ${GKE_CLUSTER}
+    namespace: ${K8S_NAMESPACE}
+  name: ${GKE_CLUSTER}
+current-context: ${GKE_CLUSTER}"
+
+# ── 5. Generate hub settings ────────────────────────────────────────────────
+
+SETTINGS_TEMPLATE="${SCRIPT_DIR}/hub-settings-template.yaml"
+[[ -f "$SETTINGS_TEMPLATE" ]] || die "Settings template not found: ${SETTINGS_TEMPLATE}"
+
+# A fresh secret is generated on every run unless SCION_SESSION_SECRET is set.
+if [[ -n "${SCION_SESSION_SECRET:-}" ]]; then
+  SESSION_SECRET="$SCION_SESSION_SECRET"
+else
+  command -v openssl >/dev/null || die "openssl not found (or set SCION_SESSION_SECRET)"
+  SESSION_SECRET="$(openssl rand -hex 32)"
+fi
+
+# The secret lands inside a double-quoted YAML scalar, so characters that would
+# end or escape that scalar are rejected up front.
+case "$SESSION_SECRET" in
+  *'"'* | *'\'* | *$'\n'*)
+    die "SCION_SESSION_SECRET must not contain double quotes, backslashes, or newlines"
+    ;;
+esac
+
+# Escape "/" and "&", which sed would otherwise read as the delimiter and the
+# matched text (e.g. in a base64-encoded secret).
+SESSION_SECRET_ESCAPED=$(printf '%s' "$SESSION_SECRET" | sed 's|[/&]|\\&|g')
+SETTINGS_CONTENT=$(sed "s/__SESSION_SECRET__/${SESSION_SECRET_ESCAPED}/" "$SETTINGS_TEMPLATE")
+
+# ── 6. Store secrets ────────────────────────────────────────────────────────
+
+log "Storing secrets in Secret Manager"
+ensure_secret "${SERVICE_NAME}-kubeconfig" "$KUBECONFIG_CONTENT"
+ensure_secret "${SERVICE_NAME}-settings" "$SETTINGS_CONTENT"
+
+# ── 7. Ensure K8s namespace ─────────────────────────────────────────────────
+
+# Best effort: a missing kubectl or a failed apply is reported but does not
+# abort the deploy. kubectl acts on its current context.
+log "Ensuring namespace ${K8S_NAMESPACE} exists in ${GKE_CLUSTER}"
+if ! command -v kubectl >/dev/null; then
+  warn "kubectl not found; skipping namespace setup"
+elif ! kubectl create namespace "$K8S_NAMESPACE" --dry-run=client -o yaml | kubectl apply -f -; then
+  warn "Could not ensure namespace ${K8S_NAMESPACE}; continuing"
+fi
+
+# ── 8. Deploy Cloud Run service ─────────────────────────────────────────────
+
+log "Deploying Cloud Run service ${SERVICE_NAME}"
+gcloud run deploy "$SERVICE_NAME" \
+  --image "$IMAGE" \
+  --region "$REGION" \
+  --project "$PROJECT" \
+  --min-instances 1 \
+  --max-instances 1 \
+  --no-allow-unauthenticated \
+  --service-account "$SA_EMAIL" \
+  --port 8080 \
+  --memory 1Gi \
+  --cpu 1 \
+  --timeout 3600 \
+  --set-secrets "/home/scion/.kube/config=${SERVICE_NAME}-kubeconfig:latest,/run/secrets/settings.yaml=${SERVICE_NAME}-settings:latest" \
+  --set-env-vars "HOME=/home/scion,KUBECONFIG=/home/scion/.kube/config"
+
+# ── 9. Print service URL ────────────────────────────────────────────────────
+
+SERVICE_URL=$(gcloud run services describe "$SERVICE_NAME" \
+  --region "$REGION" --project "$PROJECT" \
+  --format="value(status.url)")
+
+log "Deployment complete"
+echo ""
+echo " Service URL: ${SERVICE_URL}"
+echo " Health check: curl -H \"Authorization: Bearer \$(gcloud auth print-identity-token)\" ${SERVICE_URL}/healthz"
+echo ""
diff --git a/scripts/cloudrun/entrypoint.sh b/scripts/cloudrun/entrypoint.sh
new file mode 100755
index 000000000..5ba111f38
--- /dev/null
+++ b/scripts/cloudrun/entrypoint.sh
@@ -0,0 +1,13 @@
+#!/bin/sh
+set -e
+# Copy secret-mounted settings into ~/.scion/ so the runtime discovery finds them.
+# Cloud Run secret volumes use symlink-based atomic updates, so cp may fail.
+# Use cat to read through the symlink safely.
+mkdir -p "$HOME/.scion/storage" "$HOME/.scion/templates"  # also creates ~/.scion itself
+if [ -f /run/secrets/settings.yaml ]; then  # skip the copy when no settings file is mounted
+  cat /run/secrets/settings.yaml > "$HOME/.scion/settings.yaml"
+fi  # the exec below replaces this shell with the server process
+exec scion server start \
+  --foreground --production --dev-auth \
+  --enable-hub --enable-runtime-broker --enable-web --web-port 8080 \
+  --auto-provide --global
diff --git a/scripts/cloudrun/hub-settings-template.yaml b/scripts/cloudrun/hub-settings-template.yaml
new file mode 100644
index 000000000..43152c98b
--- /dev/null
+++ b/scripts/cloudrun/hub-settings-template.yaml
@@ -0,0 +1,21 @@
+schema_version: "1"
+image_registry: "us-central1-docker.pkg.dev/deploy-demo-test/public-docker"  # project is hard-coded; deploy.sh's SCION_PROJECT override does not reach this value
+active_profile: default
+server:
+  database:
+    driver: sqlite
+    url: /tmp/scion.db  # NOTE(review): presumably ephemeral on Cloud Run; confirm that losing the DB on restart is acceptable
+  auth:
+    session_secret: "__SESSION_SECRET__"  # placeholder; deploy.sh substitutes the real secret with sed before storing the file
+  runtimeBroker:
+    port: 9810
+profiles:
+  default:
+    runtime: kubernetes
+runtimes:
+  kubernetes:
+    type: kubernetes
+    gke: true
+    context: scion-demo-cluster  # must match the context name in the kubeconfig deploy.sh generates (its GKE_CLUSTER value)
+    namespace: scion-agents  # same value as K8S_NAMESPACE in deploy.sh
+    list_all_namespaces: false