From 09389ebb6f036d530a01f0aa6ea8c92ab7b4d07f Mon Sep 17 00:00:00 2001 From: valentino Date: Thu, 25 Jun 2026 20:23:28 +0000 Subject: [PATCH 1/2] feat: Faster scanning --- go.mod | 2 +- internal/catalog/cataloger.go | 52 ++++++++++--- internal/catalog/concurrency_test.go | 106 ++++++++++++++++++++++++++ internal/catalog/npm_cataloger.go | 18 +++-- internal/catalog/setuppy_cataloger.go | 10 ++- internal/catalog/uv_cataloger.go | 12 ++- 6 files changed, 181 insertions(+), 19 deletions(-) create mode 100644 internal/catalog/concurrency_test.go diff --git a/go.mod b/go.mod index 61bed5a..e9e22e5 100644 --- a/go.mod +++ b/go.mod @@ -6,6 +6,7 @@ require ( github.com/BurntSushi/toml v1.6.0 github.com/anchore/syft v1.44.0 github.com/spf13/cobra v1.10.2 + golang.org/x/sync v0.20.0 ) require ( @@ -74,7 +75,6 @@ require ( github.com/wagoodman/go-progress v0.0.0-20260303201901-10176f79b2c0 // indirect github.com/xo/terminfo v0.0.0-20220910002029-abceb7e1c41e // indirect go.yaml.in/yaml/v3 v3.0.4 // indirect - golang.org/x/sync v0.20.0 // indirect golang.org/x/sys v0.43.0 // indirect golang.org/x/term v0.42.0 // indirect golang.org/x/text v0.36.0 // indirect diff --git a/internal/catalog/cataloger.go b/internal/catalog/cataloger.go index ee51ae8..5231f2e 100644 --- a/internal/catalog/cataloger.go +++ b/internal/catalog/cataloger.go @@ -2,13 +2,25 @@ package catalog import ( "fmt" + "os" "path/filepath" + "sort" + "strconv" "strings" + "sync" "github.com/anchore/syft/syft/file" "github.com/anchore/syft/syft/pkg" + "golang.org/x/sync/errgroup" ) +func catalogConcurrency() int { + if n, err := strconv.Atoi(os.Getenv("OSSPREY_SCAN_CONCURRENCY")); err == nil && n > 0 { + return n + } + return 8 +} + // fileParser converts one matched manifest into syft packages. type fileParser func(absPath string, loc file.Location) ([]pkg.Package, error) @@ -25,18 +37,40 @@ func catalogByGlob(resolver file.Resolver, root, glob, label string, parse fileP if err != nil { return nil, fmt.Errorf("%s cataloger: glob: %w", label, err) } - seen := make(map[string]struct{}) - var out []pkg.Package - for _, loc := range locs { - // Skip vendored dependencies. + + type result struct { + idx int + pkgs []pkg.Package + } + var ( + mu sync.Mutex + results []result + ) + g := new(errgroup.Group) + g.SetLimit(catalogConcurrency()) + for i, loc := range locs { if isVendoredPath(loc.RealPath) { continue } - pkgs, err := parse(filepath.Join(root, loc.RealPath), loc) - if err != nil || len(pkgs) == 0 { - continue - } - for _, p := range pkgs { + i, loc := i, loc + g.Go(func() error { + pkgs, err := parse(filepath.Join(root, loc.RealPath), loc) + if err != nil || len(pkgs) == 0 { + return nil // per-file errors are non-fatal, as before + } + mu.Lock() + results = append(results, result{idx: i, pkgs: pkgs}) + mu.Unlock() + return nil + }) + } + _ = g.Wait() // workers never return non-nil; Wait is just the barrier + + sort.Slice(results, func(a, b int) bool { return results[a].idx < results[b].idx }) + seen := make(map[string]struct{}) + var out []pkg.Package + for _, r := range results { + for _, p := range r.pkgs { key := p.Name + "@" + p.Version if _, ok := seen[key]; ok { continue diff --git a/internal/catalog/concurrency_test.go b/internal/catalog/concurrency_test.go new file mode 100644 index 0000000..9732832 --- /dev/null +++ b/internal/catalog/concurrency_test.go @@ -0,0 +1,106 @@ +package catalog + +import ( + "fmt" + "path/filepath" + "strconv" + "testing" + "time" + + "github.com/anchore/syft/syft/file" + "github.com/anchore/syft/syft/pkg" + "github.com/anchore/syft/syft/source" + "github.com/anchore/syft/syft/source/directorysource" +) + +func TestCatalogConcurrency(t *testing.T) { + t.Setenv("OSSPREY_SCAN_CONCURRENCY", "") + if got := catalogConcurrency(); got != 8 { + t.Errorf("default = %d, want 8", got) + } + t.Setenv("OSSPREY_SCAN_CONCURRENCY", "3") + if got := catalogConcurrency(); got != 3 { + t.Errorf("override = %d, want 3", got) + } + for _, bad := range []string{"0", "-1", "abc"} { + t.Setenv("OSSPREY_SCAN_CONCURRENCY", bad) + if got := catalogConcurrency(); got != 8 { + t.Errorf("invalid(%q) = %d, want default 8", bad, got) + } + } +} + +func buildResolver(t *testing.T, dir string) file.Resolver { + t.Helper() + src, err := directorysource.NewFromPath(dir) + if err != nil { + t.Fatalf("source: %v", err) + } + t.Cleanup(func() { src.Close() }) + r, err := src.FileResolver(source.SquashedScope) + if err != nil { + t.Fatalf("resolver: %v", err) + } + return r +} + +func TestCatalogByGlob_DeterministicUnderConcurrency(t *testing.T) { + dir := t.TempDir() + const n = 24 + for i := 0; i < n; i++ { + writeFile(t, dir, fmt.Sprintf("m%02d.dep", i), "x") + } + resolver := buildResolver(t, dir) + + parse := func(absPath string, loc file.Location) ([]pkg.Package, error) { + base := filepath.Base(loc.RealPath) + idx, _ := strconv.Atoi(base[1:3]) + time.Sleep(time.Duration(n-idx) * time.Millisecond) + return []pkg.Package{{ + Name: fmt.Sprintf("pkg%02d", idx), + Version: "1.0.0", + Type: pkg.NpmPkg, + Locations: file.NewLocationSet(loc), + }}, nil + } + + t.Setenv("OSSPREY_SCAN_CONCURRENCY", "1") + seq, err := catalogByGlob(resolver, dir, "**/*.dep", "test", parse) + if err != nil { + t.Fatal(err) + } + t.Setenv("OSSPREY_SCAN_CONCURRENCY", "8") + conc, err := catalogByGlob(resolver, dir, "**/*.dep", "test", parse) + if err != nil { + t.Fatal(err) + } + + if len(seq) != n || len(conc) != n { + t.Fatalf("len seq=%d conc=%d, want %d", len(seq), len(conc), n) + } + for i := range seq { + if seq[i].Name != conc[i].Name || seq[i].Version != conc[i].Version { + t.Fatalf("order/content differs at %d: seq=%s conc=%s", i, seq[i].Name, conc[i].Name) + } + } +} + +// Duplicate (name,version) across files collapses to one entry under concurrency. +func TestCatalogByGlob_DedupUnderConcurrency(t *testing.T) { + dir := t.TempDir() + for _, name := range []string{"a.dep", "b.dep", "c.dep"} { + writeFile(t, dir, name, "x") + } + resolver := buildResolver(t, dir) + parse := func(absPath string, loc file.Location) ([]pkg.Package, error) { + return []pkg.Package{{Name: "dup", Version: "1.0.0", Type: pkg.NpmPkg, Locations: file.NewLocationSet(loc)}}, nil + } + t.Setenv("OSSPREY_SCAN_CONCURRENCY", "8") + out, err := catalogByGlob(resolver, dir, "**/*.dep", "test", parse) + if err != nil { + t.Fatal(err) + } + if len(out) != 1 { + t.Fatalf("got %d packages, want 1 (deduped): %v", len(out), out) + } +} diff --git a/internal/catalog/npm_cataloger.go b/internal/catalog/npm_cataloger.go index 930bf4c..5e6cb4e 100644 --- a/internal/catalog/npm_cataloger.go +++ b/internal/catalog/npm_cataloger.go @@ -36,12 +36,19 @@ func (c *NpmResolveCataloger) Catalog(ctx context.Context, resolver file.Resolve if err != nil { return nil, nil, nil // no npm on PATH — silently skip } + // One shared cache for the whole scan + cache, err := os.MkdirTemp("", "ossprey-npm-cache-") + if err != nil { + return nil, nil, fmt.Errorf("npm cache: %w", err) + } + defer os.RemoveAll(cache) + parse := func(absPath string, loc file.Location) ([]pkg.Package, error) { dir := filepath.Dir(absPath) if hasNpmLockfile(dir) { return nil, nil // syft's lock cataloger already resolves this project } - return runNpmResolve(ctx, npm, absPath, loc) + return runNpmResolve(ctx, npm, cache, absPath, loc) } out, err := catalogByGlob(resolver, c.root, "**/package.json", "npm", parse) return out, nil, err @@ -61,8 +68,9 @@ func hasNpmLockfile(dir string) bool { // with `npm install --package-lock-only` (resolution only — no node_modules, // no install scripts), and parses the resolved versions. The temp dir keeps the // user's working tree clean; --ignore-scripts guarantees we never execute code -// from the (potentially malicious) dependency tree we are about to scan. -func runNpmResolve(ctx context.Context, npm, packageJSON string, loc file.Location) ([]pkg.Package, error) { +// from the (potentially malicious) dependency tree we are about to scan. cache +// is the scan-wide shared npm cache. +func runNpmResolve(ctx context.Context, npm, cache, packageJSON string, loc file.Location) ([]pkg.Package, error) { tmp, err := os.MkdirTemp("", "ossprey-npm-") if err != nil { return nil, err @@ -85,9 +93,7 @@ func runNpmResolve(ctx context.Context, npm, packageJSON string, loc file.Locati "--no-update-notifier", ) cmd.Dir = tmp - // Keep npm's cache inside the temp dir so we neither touch the user's cache - // nor fail when HOME is read-only (Lambda). - cmd.Env = append(os.Environ(), "npm_config_cache="+filepath.Join(tmp, ".npm-cache")) + cmd.Env = append(os.Environ(), "npm_config_cache="+cache) if out, err := cmd.CombinedOutput(); err != nil { return nil, fmt.Errorf("npm install --package-lock-only: %w: %s", err, strings.TrimSpace(string(out))) } diff --git a/internal/catalog/setuppy_cataloger.go b/internal/catalog/setuppy_cataloger.go index 6be0d99..5f7e67f 100644 --- a/internal/catalog/setuppy_cataloger.go +++ b/internal/catalog/setuppy_cataloger.go @@ -2,6 +2,8 @@ package catalog import ( "context" + "fmt" + "os" "os/exec" "path/filepath" @@ -31,6 +33,12 @@ func (c *SetupPyCataloger) Catalog(ctx context.Context, resolver file.Resolver) if err != nil { return nil, nil, nil // no uv on PATH — silently skip } + cache, err := os.MkdirTemp("", "ossprey-uv-cache-") + if err != nil { + return nil, nil, fmt.Errorf("uv cache: %w", err) + } + defer os.RemoveAll(cache) + parse := func(absPath string, loc file.Location) ([]pkg.Package, error) { dir := filepath.Dir(absPath) // Skip if pyproject.toml in same dir AND has [project] table — UVCataloger @@ -40,7 +48,7 @@ func (c *SetupPyCataloger) Catalog(ctx context.Context, resolver file.Resolver) return nil, nil } args := []string{"pip", "compile", "--universal", "--no-progress", absPath} - return runUV(ctx, uv, dir, args, loc) + return runUV(ctx, uv, cache, dir, args, loc) } out, err := catalogByGlob(resolver, c.root, "**/setup.py", "setup.py", parse) return out, nil, err diff --git a/internal/catalog/uv_cataloger.go b/internal/catalog/uv_cataloger.go index a145d97..6feb90c 100644 --- a/internal/catalog/uv_cataloger.go +++ b/internal/catalog/uv_cataloger.go @@ -38,10 +38,17 @@ func (c *UVCataloger) Catalog(ctx context.Context, resolver file.Resolver) ([]pk if err != nil { return nil, nil, nil // no uv on PATH — silently skip } + + cache, err := os.MkdirTemp("", "ossprey-uv-cache-") + if err != nil { + return nil, nil, fmt.Errorf("uv cache: %w", err) + } + defer os.RemoveAll(cache) + parse := func(absPath string, loc file.Location) ([]pkg.Package, error) { dir := filepath.Dir(absPath) args := uvArgsForPyProject(dir) - return runUV(ctx, uv, dir, args, loc) + return runUV(ctx, uv, cache, dir, args, loc) } out, err := catalogByGlob(resolver, c.root, "**/pyproject.toml", "uv", parse) return out, nil, err @@ -69,8 +76,9 @@ func uvArgsForPyProject(dir string) []string { } } -func runUV(ctx context.Context, uv, dir string, args []string, loc file.Location) ([]pkg.Package, error) { +func runUV(ctx context.Context, uv, cache, dir string, args []string, loc file.Location) ([]pkg.Package, error) { cmd := exec.CommandContext(ctx, uv, args...) + cmd.Env = append(os.Environ(), "UV_CACHE_DIR="+cache) stdout, err := cmd.Output() if err != nil { var ee *exec.ExitError From 162d1ca31ece8325d1c694f0b330e9ed15c30334 Mon Sep 17 00:00:00 2001 From: valentino Date: Thu, 25 Jun 2026 20:34:12 +0000 Subject: [PATCH 2/2] chore: Copilot comment --- internal/catalog/cataloger.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/catalog/cataloger.go b/internal/catalog/cataloger.go index 5231f2e..ab49024 100644 --- a/internal/catalog/cataloger.go +++ b/internal/catalog/cataloger.go @@ -15,7 +15,7 @@ import ( ) func catalogConcurrency() int { - if n, err := strconv.Atoi(os.Getenv("OSSPREY_SCAN_CONCURRENCY")); err == nil && n > 0 { + if n, err := strconv.Atoi(strings.TrimSpace(os.Getenv("OSSPREY_SCAN_CONCURRENCY"))); err == nil && n > 0 { return n } return 8