Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,12 @@ All notable changes to kage are recorded here. The format follows

## [Unreleased]

### Fixed

- Chrome no longer downloads a file to your Downloads folder when a crawl follows a link that turns out to be a binary (reported in #32).
An extensionless link is queued as a page, so the page worker used to navigate to it in Chrome; when such a link served a zip or a CSV, Chrome saved the file to `~/Downloads`, a surprising side effect of a clone.
kage now denies Chrome-initiated downloads browser-wide, since every asset is fetched through kage's own downloader. It also detects a navigation whose response is not HTML and reroutes that URL to the asset downloader, where the size and media policy decides whether to localise it or leave it on the live web.

## [0.3.2] - 2026-06-16

### Fixed
Expand Down
117 changes: 115 additions & 2 deletions browser/pool.go
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,21 @@ type RenderResult struct {
Title string
}

// ErrNotHTML is the error Render returns when the URL it was asked to render
// as a page answered with a non-HTML content type (a zip, a CSV, a PDF, a bare
// image). That happens when a link has no file extension to classify it by, so
// it is queued as a page. Callers detect it with errors.As and reroute the URL
// to the asset downloader, whose policy decides whether to localise it or leave
// it remote, rather than saving an empty or broken page or letting Chrome
// download the file (issue #32).
type ErrNotHTML struct {
	// URL is the address that was navigated to.
	URL string
	// ContentType is the content type reported for the main document.
	ContentType string
}

// Error implements the error interface, naming the offending content type and
// the URL that produced it.
func (e *ErrNotHTML) Error() string {
	const format = "not HTML (%s): %s"
	contentType, target := e.ContentType, e.URL
	return fmt.Sprintf(format, contentType, target)
}

// Render navigates to rawURL, lets it settle, and returns the final rendered
// HTML. It acquires a page slot from the pool and releases it when done.
func (p *Pool) Render(ctx context.Context, rawURL string) (RenderResult, error) {
Expand All @@ -90,8 +105,23 @@ func (p *Pool) Render(ctx context.Context, rawURL string) (RenderResult, error)

page = page.Context(ctx).Timeout(p.opts.RenderTimeout)

if err := page.Navigate(rawURL); err != nil {
return RenderResult{}, fmt.Errorf("navigate %s: %w", rawURL, err)
// Watch the main document's response so a navigation that turns out to be a
// non-HTML resource (a zip, a CSV, a bare image) is caught and handed back for
// the asset downloader, rather than rendered as a broken page or, with downloads
// denied, left as an aborted navigation (issue #32). The content type arrives in
// the response headers whether Chrome renders the body or aborts it as a denied
// download, so this catches both.
mainContentType := watchMainDocument(page)

navErr := page.Navigate(rawURL)
// A denied download aborts the navigation, so inspect the captured content type
// before treating a navigation error as a failure. waitFor gives the response
// event a brief moment to be processed; for an HTML page it returns at once.
if ct := waitFor(ctx, mainContentType, 2*time.Second); ct != "" && !isHTML(ct) {
return RenderResult{}, &ErrNotHTML{URL: rawURL, ContentType: ct}
}
if navErr != nil {
return RenderResult{}, fmt.Errorf("navigate %s: %w", rawURL, navErr)
}
if err := page.WaitLoad(); err != nil {
return RenderResult{}, fmt.Errorf("wait load %s: %w", rawURL, err)
Expand Down Expand Up @@ -169,6 +199,21 @@ func (p *Pool) getBrowser() (*rod.Browser, error) {
if err := b.Connect(); err != nil {
return nil, fmt.Errorf("connect Chrome: %w", err)
}

// kage never wants Chrome to write a file to disk. Every asset is fetched
// through kage's own downloader, which applies the size and media policy, so a
// Chrome-initiated download is only ever an accident: navigating an <a> link
// that turns out to be a binary (a zip, an installer, a CSV) makes Chrome save
// it to the user's Downloads folder, a surprise side effect of a crawl
// (issue #32). Denying downloads browser-wide stops that. The navigation is
// aborted instead, and Render's non-HTML detection reroutes the URL through the
// asset downloader, where the asset policy decides its fate. This is
// best-effort: if the call is unsupported, the non-HTML detection still keeps
// the binary out of the saved mirror.
_ = proto.BrowserSetDownloadBehavior{
Behavior: proto.BrowserSetDownloadBehaviorBehaviorDeny,
}.Call(b)

p.browser = b
return b, nil
}
Expand Down Expand Up @@ -321,6 +366,74 @@ func envBool(name string) (val, ok bool) {
}
}

// watchMainDocument subscribes to network responses on page and returns an
// accessor for the main document's content type.
//
// The first Document-type response is taken to be the main frame's navigation;
// later Document responses are sub-frames (iframes), whose type kage does not
// police, so only the first is kept. "First" is tracked with an explicit flag
// rather than by testing the stored type for emptiness: a main document that
// reports an empty MIME type must not let a later sub-frame response be
// mistaken for it.
//
// The accessor is safe to call from another goroutine. It returns "" until the
// main document's response has been seen, and also when the response carried
// no MIME type or when enabling the network domain failed; the caller reads ""
// as "unknown, render normally".
func watchMainDocument(page *rod.Page) func() string {
	var (
		mu   sync.Mutex
		ct   string
		seen bool // true once the first Document response has been recorded
	)
	if err := (proto.NetworkEnable{}).Call(page); err != nil {
		return func() string { return "" }
	}
	// A callback that returns true makes rod resolve the wait function, so the
	// subscription (and the goroutine below) ends as soon as the main document
	// has been seen instead of inspecting every response for the page's
	// lifetime.
	wait := page.EachEvent(func(e *proto.NetworkResponseReceived) (stop bool) {
		if e.Type != proto.NetworkResourceTypeDocument || e.Response == nil {
			return false
		}
		mu.Lock()
		defer mu.Unlock()
		if !seen {
			seen = true
			ct = e.Response.MIMEType
		}
		return true
	})
	// wait blocks, draining events as they arrive, until the callback asks to
	// stop or the page context ends, so a navigation that never yields a
	// Document response is still released when the page goes away.
	go wait()
	return func() string {
		mu.Lock()
		defer mu.Unlock()
		return ct
	}
}

// waitFor polls get until it returns a non-empty value, the deadline passes, or
// the context is cancelled, then returns whatever get reports at that moment.
//
// It exists because the network response is processed on another goroutine, so
// the value may not be set the instant Navigate returns; an HTML page sets it
// within a few milliseconds, while a never-arriving response simply waits out
// the deadline.
//
// get is called once up front, so an already-available value (or a
// non-positive deadline) returns without waiting. After that it is polled every
// 20ms. The deadline is measured against the wall clock with a single timer,
// not by counting poll steps, so the total wait does not round up to a whole
// number of steps or drift with the cost of get.
func waitFor(ctx context.Context, get func() string, deadline time.Duration) string {
	const step = 20 * time.Millisecond

	if v := get(); v != "" || deadline <= 0 {
		return v
	}

	expire := time.NewTimer(deadline)
	defer expire.Stop()
	// One ticker for the whole wait avoids allocating a timer per poll.
	tick := time.NewTicker(step)
	defer tick.Stop()

	for {
		select {
		case <-ctx.Done():
			return get()
		case <-expire.C:
			return get()
		case <-tick.C:
			if v := get(); v != "" {
				return v
			}
		}
	}
}

// isHTML reports whether contentType names a document kage renders and saves
// as a page. The comparison ignores case, surrounding whitespace and any
// parameters after the first ';' (such as a charset). HTML and XHTML qualify,
// and so does an empty type, so that an unlabelled response still renders.
// Everything else (a zip, a CSV, a PDF, a bare image, JSON) is an asset that
// reached the page worker only because its link had no file extension to
// classify it by.
func isHTML(contentType string) bool {
	normalised := strings.ToLower(strings.TrimSpace(contentType))
	// Keep only the media type, dropping parameters like "; charset=utf-8".
	mediaType := strings.TrimSpace(strings.SplitN(normalised, ";", 2)[0])
	switch mediaType {
	case "", "text/html", "application/xhtml+xml":
		return true
	default:
		return false
	}
}

// settle waits for the network to go quiet for d, recovering from any rod
// panic and capping the wait so a chatty page can never hang the worker.
func settle(page *rod.Page, d time.Duration) {
Expand Down
84 changes: 84 additions & 0 deletions browser/pool_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package browser

import (
"context"
"errors"
"net/http"
"net/http/httptest"
"os"
Expand Down Expand Up @@ -115,3 +116,86 @@ func TestRenderCapturesFinalDOM(t *testing.T) {
t.Errorf("render did not capture the JS-built DOM:\n%s", res.HTML)
}
}

// TestIsHTML pins the content types isHTML accepts as a renderable page: HTML
// and XHTML regardless of case, padding or parameters, plus the empty type;
// every other type must be rejected.
func TestIsHTML(t *testing.T) {
	type row struct {
		contentType string
		html        bool
	}
	table := []row{
		{contentType: "text/html", html: true},
		{contentType: "text/html; charset=utf-8", html: true},
		{contentType: "TEXT/HTML", html: true},
		{contentType: " text/html ", html: true},
		{contentType: "application/xhtml+xml", html: true},
		{contentType: "", html: true}, // unknown: render rather than misclassify
		{contentType: "application/zip", html: false},
		{contentType: "text/csv", html: false},
		{contentType: "application/pdf", html: false},
		{contentType: "image/png", html: false},
		{contentType: "application/json", html: false},
		{contentType: "application/octet-stream", html: false},
	}
	for i := range table {
		input, expected := table[i].contentType, table[i].html
		actual := isHTML(input)
		if actual == expected {
			continue
		}
		t.Errorf("isHTML(%q) = %v, want %v", input, actual, expected)
	}
}

// TestRenderRoutesNonHTML drives a real Chrome against a local server and
// checks that Render still returns the DOM of an HTML page, while navigation
// targets served as a zip or a CSV come back as *ErrNotHTML carrying the
// served content type (issue #32). It is skipped under -short and when no
// Chrome/Chromium is installed.
func TestRenderRoutesNonHTML(t *testing.T) {
	if testing.Short() {
		t.Skip("render test drives Chrome; skipped under -short")
	}
	if _, found := LookChrome(); !found {
		t.Skip("no Chrome/Chromium found; skipping render test")
	}

	// reply sends body labelled with the given Content-Type.
	reply := func(w http.ResponseWriter, contentType, body string) {
		w.Header().Set("Content-Type", contentType)
		_, _ = w.Write([]byte(body))
	}
	handler := func(w http.ResponseWriter, r *http.Request) {
		switch r.URL.Path {
		case "/page":
			reply(w, "text/html; charset=utf-8", `<!doctype html><html><body><p>a real page</p></body></html>`)
		case "/file.zip", "/download":
			// A binary behind a link such as /download, whose path has no
			// extension to classify it by: the shape that makes Chrome download
			// to ~/Downloads when navigated to (issue #32).
			reply(w, "application/zip", "PK\x03\x04 not really a zip")
		case "/data":
			reply(w, "text/csv", "a,b\n1,2\n")
		default:
			http.NotFound(w, r)
		}
	}
	server := httptest.NewServer(http.HandlerFunc(handler))
	defer server.Close()

	pool := New(Options{
		Headless:      true,
		Workers:       1,
		Settle:        300 * time.Millisecond,
		RenderTimeout: 20 * time.Second,
	})
	defer func() { _ = pool.Close() }()

	ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
	defer cancel()

	// A genuine HTML page must keep rendering as it did before.
	rendered, err := pool.Render(ctx, server.URL+"/page")
	switch {
	case err != nil:
		t.Errorf("render HTML page: %v", err)
	case !strings.Contains(rendered.HTML, "a real page"):
		t.Errorf("HTML page did not render:\n%s", rendered.HTML)
	}

	// Non-HTML targets must surface as *ErrNotHTML so the caller can hand them
	// to the asset downloader instead of saving a broken page or downloading.
	targets := []struct{ path, wantCT string }{
		{path: "/download", wantCT: "application/zip"},
		{path: "/data", wantCT: "text/csv"},
	}
	for _, target := range targets {
		_, renderErr := pool.Render(ctx, server.URL+target.path)
		var rerouted *ErrNotHTML
		if !errors.As(renderErr, &rerouted) {
			t.Errorf("Render(%s) error = %v, want *ErrNotHTML", target.path, renderErr)
			continue
		}
		if strings.Contains(rerouted.ContentType, target.wantCT) {
			continue
		}
		t.Errorf("Render(%s) content type = %q, want %q", target.path, rerouted.ContentType, target.wantCT)
	}
}
16 changes: 16 additions & 0 deletions clone/cloner.go
Original file line number Diff line number Diff line change
Expand Up @@ -255,6 +255,22 @@ func (c *Cloner) processPage(ctx context.Context, j pageItem) {

res, err := c.pool.Render(ctx, j.u.String())
if err != nil {
var notHTML *browser.ErrNotHTML
if errors.As(err, &notHTML) {
// The URL is not a page but a file (a zip, a CSV, a bare image) that
// reached the page worker through an extensionless link. Hand it to the
// asset downloader, where the size and media policy decides whether to
// localise it or leave it remote, rather than saving a broken page or
// letting Chrome download it to the user's Downloads folder (issue #32).
c.front.markVisited(key)
if c.wantAsset(j.u) {
c.enqueueAsset(ctx, j.u, "")
c.logf("not a page, fetching as asset (%s): %s", notHTML.ContentType, j.u.String())
} else {
c.logf("not a page, left on the live web (%s): %s", notHTML.ContentType, j.u.String())
}
return
}
c.failPage(j.u.String(), fmt.Errorf("render: %w", err))
return
}
Expand Down
73 changes: 73 additions & 0 deletions clone/cloner_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -261,6 +261,79 @@ func TestCloneRefreshReRenders(t *testing.T) {
}
}

// TestCloneRoutesNonHTMLToAsset is the end-to-end regression test for issue
// #32. The home page links to /download, which has no extension and is
// therefore queued as a page, but the server answers it with a zip. The clone
// must write the home page, must not write download/index.html, must fetch at
// least one asset into the reserved tree under a name containing "download",
// and must not count the rerouted URL as a page error.
func TestCloneRoutesNonHTMLToAsset(t *testing.T) {
	if testing.Short() {
		t.Skip("clone test drives Chrome; skipped under -short")
	}
	if _, found := browser.LookChrome(); !found {
		t.Skip("no Chrome/Chromium found; skipping clone test")
	}

	serveHome := func(w http.ResponseWriter, r *http.Request) {
		// "/" is the mux catch-all, so anything but the root itself is a 404.
		if r.URL.Path != "/" {
			http.NotFound(w, r)
			return
		}
		w.Header().Set("Content-Type", "text/html; charset=utf-8")
		// The link has no extension, so it is queued as a page; the server
		// then answers it with a zip.
		_, _ = w.Write([]byte(`<!doctype html><html><body>
<h1>Home</h1><a href="/download">grab the bundle</a></body></html>`))
	}
	serveBundle := func(w http.ResponseWriter, r *http.Request) {
		w.Header().Set("Content-Type", "application/zip")
		_, _ = w.Write([]byte("PK\x03\x04 pretend bundle"))
	}

	mux := http.NewServeMux()
	mux.HandleFunc("/", serveHome)
	mux.HandleFunc("/download", serveBundle)
	server := httptest.NewServer(mux)
	defer server.Close()

	seed, err := urlx.ParseSeed(server.URL)
	if err != nil {
		t.Fatalf("parse seed: %v", err)
	}

	outDir := t.TempDir()
	cfg := DefaultConfig()
	cfg.OutDir = outDir
	cfg.Settle = 300 * time.Millisecond
	cfg.RenderTimeout = 20 * time.Second
	cfg.Timeout = 10 * time.Second

	ctx, cancel := context.WithTimeout(context.Background(), 90*time.Second)
	defer cancel()

	cloner := New(seed, cfg, t.Logf)
	result, err := cloner.Run(ctx)
	if err != nil {
		t.Fatalf("run: %v", err)
	}

	mirror := result.OutDir
	homePage := filepath.Join(mirror, "index.html")
	bogusPage := filepath.Join(mirror, "download", "index.html")
	reservedTree := filepath.Join(mirror, cfg.Reserved)

	// The home page is a real page and is written.
	if !fileExists(homePage) {
		t.Error("home page was not written")
	}
	// The zip is NOT saved as a page: no download/index.html exists.
	if fileExists(bogusPage) {
		t.Error("non-HTML target was saved as a page")
	}
	// The zip is fetched as an asset under the reserved tree instead.
	if result.Assets < 1 {
		t.Errorf("expected the zip to be fetched as an asset, assets=%d", result.Assets)
	}
	if !anyFileUnder(t, reservedTree, "download") {
		t.Error("the zip was not downloaded into the reserved asset tree")
	}
	if result.PageErrors != 0 {
		t.Errorf("a rerouted non-HTML target must not count as a page error, got %d", result.PageErrors)
	}
}

func readFile(t *testing.T, path string) string {
t.Helper()
b, err := os.ReadFile(path)
Expand Down
Loading