diff --git a/.claude/settings.local.json b/.claude/settings.local.json index adc11d8..3575f6d 100644 --- a/.claude/settings.local.json +++ b/.claude/settings.local.json @@ -4,7 +4,10 @@ "Bash(find:*)", "Bash(touch:*)", "Bash(gh pr checks:*)", - "Bash(grep:*)" + "Bash(grep:*)", + "Bash(git commit:*)", + "Bash(git push:*)", + "Bash(go:*)" ], "deny": [] } diff --git a/cmd/urlmap/main.go b/cmd/urlmap/main.go index 4479ebd..5d2f1d5 100644 --- a/cmd/urlmap/main.go +++ b/cmd/urlmap/main.go @@ -44,6 +44,7 @@ var ( jsAuto bool jsAutoStrict bool jsThreshold float64 + jsPoolSize int // Robots.txt flags respectRobots bool @@ -101,6 +102,9 @@ func init() { rootCmd.Flags().BoolVar(&jsAutoStrict, "js-auto-strict", false, "Enable strict automatic detection with dynamic verification") rootCmd.Flags().Float64Var(&jsThreshold, "js-threshold", 0.5, "SPA detection threshold (0.0-1.0)") + // Browser pool flags + rootCmd.Flags().IntVar(&jsPoolSize, "js-pool-size", 2, "Number of browser instances in the pool") + // Robots.txt flags rootCmd.Flags().BoolVar(&respectRobots, "respect-robots", false, "Respect robots.txt rules and crawl delays") @@ -145,6 +149,7 @@ func runCrawl(cmd *cobra.Command, args []string) error { AutoDetect: jsAuto || jsAutoStrict, StrictMode: jsAutoStrict, Threshold: jsThreshold, + PoolSize: jsPoolSize, } } diff --git a/internal/client/browser_pool.go b/internal/client/browser_pool.go index 5c5bf93..733a9e8 100644 --- a/internal/client/browser_pool.go +++ b/internal/client/browser_pool.go @@ -11,10 +11,13 @@ import ( "github.com/playwright-community/playwright-go" ) +// Global initialization lock to prevent concurrent Playwright installations +var globalInitMu sync.Mutex + // BrowserPool manages shared browser instances for JavaScript rendering type BrowserPool struct { playwright *playwright.Playwright - browser playwright.Browser + browsers []playwright.Browser // Multiple browser instances config *JSConfig logger *slog.Logger @@ -23,6 +26,10 @@ type BrowserPool struct { maxContexts int mu sync.RWMutex + // Browser instance management + currentBrowserIdx int + browserMu sync.Mutex + // Lifecycle management initialized bool closed bool @@ -48,11 +55,18 @@ func NewBrowserPool(config *JSConfig, logger *slog.Logger) (*BrowserPool, error) logger = slog.Default() } + poolSize := config.PoolSize + if poolSize <= 0 { + poolSize = 2 // Default pool size + } + pool := &BrowserPool{ - config: config, - logger: logger, - maxContexts: 10, // Default max contexts - contextPool: make(chan *BrowserContext, 10), + config: config, + logger: logger, + maxContexts: 10, // Default max contexts per browser + contextPool: make(chan *BrowserContext, 10*poolSize), + browsers: make([]playwright.Browser, 0, poolSize), + currentBrowserIdx: 0, } // Initialize the pool if JS rendering is enabled @@ -74,13 +88,19 @@ func (p *BrowserPool) initialize() error { return nil } + // Use global lock for Playwright initialization + globalInitMu.Lock() + defer globalInitMu.Unlock() + p.logger.Debug("Initializing browser pool", "browser_type", p.config.BrowserType) // Install Playwright (this will be a no-op if already installed) + p.logger.Debug("Installing Playwright browsers...") err := playwright.Install() if err != nil { return fmt.Errorf("failed to install Playwright: %w", err) } + p.logger.Debug("Playwright browsers installed") // Run Playwright pw, err := playwright.Run() @@ -89,7 +109,7 @@ func (p *BrowserPool) initialize() error { } p.playwright = pw - // Launch browser + // Get browser type var browserType playwright.BrowserType switch p.config.BrowserType { case "chromium": @@ -102,23 +122,92 @@ func (p *BrowserPool) initialize() error { return fmt.Errorf("unsupported browser type: %s", p.config.BrowserType) } + // Launch the first browser instance browser, err := browserType.Launch(playwright.BrowserTypeLaunchOptions{ Headless: playwright.Bool(p.config.Headless), }) if err != nil { - return fmt.Errorf("failed to launch browser: %w", err) + return fmt.Errorf("failed to launch first browser: %w", err) } - p.browser = browser + p.browsers = append(p.browsers, browser) p.initialized = true p.logger.Info("Browser pool initialized successfully", "browser_type", p.config.BrowserType, "headless", p.config.Headless, - "max_contexts", p.maxContexts) + "pool_size", len(p.browsers), + "max_pool_size", cap(p.browsers), + "max_contexts_per_browser", p.maxContexts) return nil } +// launchNewBrowserLocked creates and launches a new browser instance (must be called with browserMu held) +func (p *BrowserPool) launchNewBrowserLocked() (int, error) { + + // Check if we've reached the pool size limit + if len(p.browsers) >= cap(p.browsers) { + return -1, fmt.Errorf("browser pool is at capacity: %d", cap(p.browsers)) + } + + // Get browser type + var browserType playwright.BrowserType + switch p.config.BrowserType { + case "chromium": + browserType = p.playwright.Chromium + case "firefox": + browserType = p.playwright.Firefox + case "webkit": + browserType = p.playwright.WebKit + default: + return -1, fmt.Errorf("unsupported browser type: %s", p.config.BrowserType) + } + + // Launch new browser + browser, err := browserType.Launch(playwright.BrowserTypeLaunchOptions{ + Headless: playwright.Bool(p.config.Headless), + }) + if err != nil { + return -1, fmt.Errorf("failed to launch browser: %w", err) + } + + // Add to browsers slice + idx := len(p.browsers) + p.browsers = append(p.browsers, browser) + + p.logger.Info("Launched new browser instance", + "browser_index", idx, + "total_browsers", len(p.browsers)) + + return idx, nil +} + +// getBrowser gets a browser using round-robin +func (p *BrowserPool) getBrowser() (playwright.Browser, error) { + p.browserMu.Lock() + defer p.browserMu.Unlock() + + // If no browsers exist, create the first one + if len(p.browsers) == 0 { + return nil, fmt.Errorf("no browsers initialized") + } + + // Create more browsers if needed and under limit + if len(p.browsers) < cap(p.browsers) { + _, err := p.launchNewBrowserLocked() + if err != nil { + // Log but don't fail - use existing browsers + p.logger.Debug("Failed to launch additional browser", "error", err) + } + } + + // Use round-robin to distribute load + browser := p.browsers[p.currentBrowserIdx] + p.currentBrowserIdx = (p.currentBrowserIdx + 1) % len(p.browsers) + + return browser, nil +} + // AcquireContext gets a browser context from the pool func (p *BrowserPool) AcquireContext() (*BrowserContext, error) { p.closeMu.Lock() @@ -170,11 +259,13 @@ func (p *BrowserContext) ReleaseContext() { // createNewContext creates a new browser context func (p *BrowserPool) createNewContext() (*BrowserContext, error) { - if p.browser == nil { - return nil, fmt.Errorf("browser not initialized") + // Get a browser from the pool + browser, err := p.getBrowser() + if err != nil { + return nil, fmt.Errorf("failed to get browser from pool: %w", err) } - context, err := p.browser.NewContext(playwright.BrowserNewContextOptions{ + context, err := browser.NewContext(playwright.BrowserNewContextOptions{ UserAgent: playwright.String(p.config.UserAgent), }) if err != nil { @@ -295,11 +386,14 @@ func (p *BrowserPool) GetPoolStats() map[string]interface{} { defer p.mu.RUnlock() return map[string]interface{}{ - "initialized": p.initialized, - "closed": p.closed, - "pool_size": cap(p.contextPool), - "max_contexts": p.maxContexts, - "available": len(p.contextPool), + "initialized": p.initialized, + "closed": p.closed, + "browsers_active": len(p.browsers), + "browsers_available": len(p.browsers), // All browsers are available via round-robin + "browser_pool_size": cap(p.browsers), + "context_pool_size": cap(p.contextPool), + "contexts_available": len(p.contextPool), + "max_contexts": p.maxContexts, } } @@ -322,13 +416,18 @@ func (p *BrowserPool) Close() error { } } - // Close browser - if p.browser != nil { - if err := p.browser.Close(); err != nil { - p.logger.Warn("Failed to close browser", "error", err) + // Close all browsers + var closeErrors []error + for i, browser := range p.browsers { + if browser != nil { + p.logger.Debug("Closing browser", "browser_index", i) + if err := browser.Close(); err != nil { + p.logger.Warn("Failed to close browser", "browser_index", i, "error", err) + closeErrors = append(closeErrors, fmt.Errorf("browser %d: %w", i, err)) + } } - p.browser = nil } + p.browsers = nil // Stop Playwright if p.playwright != nil { diff --git a/internal/client/browser_pool_test.go b/internal/client/browser_pool_test.go index 34dd267..0b65445 100644 --- a/internal/client/browser_pool_test.go +++ b/internal/client/browser_pool_test.go @@ -3,8 +3,11 @@ package client import ( "context" "log/slog" + "strings" "testing" "time" + + "github.com/aoshimash/urlmap/test/shared" ) func TestNewBrowserPool(t *testing.T) { @@ -22,7 +25,7 @@ func TestNewBrowserPool(t *testing.T) { Enabled: true, BrowserType: "chromium", Headless: true, - Timeout: 60 * time.Second, + Timeout: 90 * time.Second, } pool2, err := NewBrowserPool(config, logger) @@ -38,7 +41,7 @@ func TestBrowserPool_AcquireContext(t *testing.T) { Enabled: true, BrowserType: "chromium", Headless: true, - Timeout: 60 * time.Second, + Timeout: 90 * time.Second, } pool, err := NewBrowserPool(config, logger) @@ -78,12 +81,16 @@ func TestBrowserPool_AcquireContext(t *testing.T) { } func TestBrowserPool_RenderPage(t *testing.T) { + // Create test server for more reliable testing in CI + testServer := shared.CreateBasicTestServer() + defer testServer.Close() + logger := slog.Default() config := &JSConfig{ Enabled: true, BrowserType: "chromium", Headless: true, - Timeout: 60 * time.Second, + Timeout: 30 * time.Second, // Reduced timeout since local server is faster } pool, err := NewBrowserPool(config, logger) @@ -92,9 +99,9 @@ func TestBrowserPool_RenderPage(t *testing.T) { } defer pool.Close() - // Test rendering a simple page + // Test rendering a simple page from local test server ctx := context.Background() - content, err := pool.RenderPage(ctx, "https://example.com") + content, err := pool.RenderPage(ctx, testServer.URL) if err != nil { // The browser pool now logs debug info automatically t.Fatalf("Failed to render page: %v", err) @@ -104,9 +111,13 @@ func TestBrowserPool_RenderPage(t *testing.T) { t.Error("Rendered content is empty") } - // Check that content contains expected elements - if len(content) < 100 { - t.Error("Rendered content seems too short") + // Check that content contains expected elements from test server + if !strings.Contains(content, "Test Home Page") { + t.Error("Rendered content does not contain expected title") + } + + if !strings.Contains(content, "Page 1") || !strings.Contains(content, "Page 2") { + t.Error("Rendered content does not contain expected links") } } @@ -116,7 +127,7 @@ func TestBrowserPool_GetPoolStats(t *testing.T) { Enabled: true, BrowserType: "chromium", Headless: true, - Timeout: 60 * time.Second, + Timeout: 90 * time.Second, WaitFor: "networkidle", } @@ -129,7 +140,7 @@ func TestBrowserPool_GetPoolStats(t *testing.T) { stats := pool.GetPoolStats() // Check required fields - requiredFields := []string{"initialized", "closed", "pool_size", "max_contexts", "available"} + requiredFields := []string{"initialized", "closed", "browsers_active", "browsers_available", "browser_pool_size", "context_pool_size", "contexts_available", "max_contexts"} for _, field := range requiredFields { if _, exists := stats[field]; !exists { t.Errorf("Stats missing required field: %s", field) @@ -156,7 +167,7 @@ func TestBrowserPool_ConcurrentAccess(t *testing.T) { Enabled: true, BrowserType: "chromium", Headless: true, - Timeout: 60 * time.Second, + Timeout: 90 * time.Second, WaitFor: "networkidle", } @@ -192,7 +203,7 @@ func TestBrowserPool_ConcurrentAccess(t *testing.T) { // Check pool stats after concurrent access stats := pool.GetPoolStats() - if stats["available"].(int) > 10 { + if stats["contexts_available"].(int) > 10 { t.Error("Pool should not exceed max contexts") } } @@ -203,7 +214,7 @@ func TestBrowserPool_Close(t *testing.T) { Enabled: true, BrowserType: "chromium", Headless: true, - Timeout: 60 * time.Second, + Timeout: 90 * time.Second, WaitFor: "networkidle", } @@ -237,7 +248,7 @@ func TestBrowserContext_ReleaseContext(t *testing.T) { Enabled: true, BrowserType: "chromium", Headless: true, - Timeout: 60 * time.Second, + Timeout: 90 * time.Second, WaitFor: "networkidle", } @@ -261,8 +272,101 @@ func TestBrowserContext_ReleaseContext(t *testing.T) { // Check that context is properly released stats := pool.GetPoolStats() - available := stats["available"].(int) + available := stats["contexts_available"].(int) if available < 0 || available > 10 { t.Errorf("Available contexts should be between 0 and 10, got %d", available) } } + +func TestBrowserPool_MultipleBrowsers(t *testing.T) { + logger := slog.Default() + config := &JSConfig{ + Enabled: true, + BrowserType: "chromium", + Headless: true, + Timeout: 90 * time.Second, + WaitFor: "networkidle", + PoolSize: 3, + } + + pool, err := NewBrowserPool(config, logger) + if err != nil { + t.Fatalf("Failed to create browser pool: %v", err) + } + defer pool.Close() + + // Check initial stats + stats := pool.GetPoolStats() + if stats["browsers_active"].(int) != 1 { + t.Errorf("Expected 1 browser initially, got %d", stats["browsers_active"].(int)) + } + + // Acquire multiple contexts to trigger browser creation + contexts := make([]*BrowserContext, 0) + for i := 0; i < 3; i++ { + ctx, err := pool.AcquireContext() + if err != nil { + t.Fatalf("Failed to acquire context %d: %v", i, err) + } + contexts = append(contexts, ctx) + } + + // Check stats after acquiring contexts + stats = pool.GetPoolStats() + browsersActive := stats["browsers_active"].(int) + if browsersActive < 1 || browsersActive > 3 { + t.Errorf("Expected 1-3 browsers active, got %d", browsersActive) + } + + // Release all contexts + for _, ctx := range contexts { + ctx.ReleaseContext() + } +} + +func TestBrowserPool_OnDemandCreation(t *testing.T) { + logger := slog.Default() + config := &JSConfig{ + Enabled: true, + BrowserType: "chromium", + Headless: true, + Timeout: 90 * time.Second, + WaitFor: "networkidle", + PoolSize: 2, + } + + pool, err := NewBrowserPool(config, logger) + if err != nil { + t.Fatalf("Failed to create browser pool: %v", err) + } + defer pool.Close() + + // The pool should start with one browser + initialStats := pool.GetPoolStats() + if initialStats["browsers_active"].(int) != 1 { + t.Errorf("Expected 1 browser initially, got %d", initialStats["browsers_active"].(int)) + } + + // Acquire a context when no browsers are available + ctx1, err := pool.AcquireContext() + if err != nil { + t.Fatalf("Failed to acquire first context: %v", err) + } + + // Force creation of a second browser by not releasing the first context + ctx2, err := pool.AcquireContext() + if err != nil { + t.Fatalf("Failed to acquire second context: %v", err) + } + + // Check that a new browser might have been created + stats := pool.GetPoolStats() + browsersActive := stats["browsers_active"].(int) + if browsersActive > 2 { + t.Errorf("Should not exceed pool size of 2, got %d browsers", browsersActive) + } + + // Clean up + ctx1.ReleaseContext() + ctx2.ReleaseContext() +} diff --git a/internal/client/js_client_test.go b/internal/client/js_client_test.go index 534f982..7357332 100644 --- a/internal/client/js_client_test.go +++ b/internal/client/js_client_test.go @@ -3,8 +3,11 @@ package client import ( "context" "log/slog" + "strings" "testing" "time" + + "github.com/aoshimash/urlmap/test/shared" ) func TestNewJSClient(t *testing.T) { @@ -22,8 +25,9 @@ func TestNewJSClient(t *testing.T) { Enabled: true, BrowserType: "chromium", Headless: true, - Timeout: 60 * time.Second, + Timeout: 90 * time.Second, WaitFor: "networkidle", + PoolSize: 2, } client2, err := NewJSClient(config, logger) @@ -34,13 +38,18 @@ func TestNewJSClient(t *testing.T) { } func TestJSClient_RenderPage(t *testing.T) { + // Create test server for more reliable testing in CI + testServer := shared.CreateBasicTestServer() + defer testServer.Close() + logger := slog.Default() config := &JSConfig{ Enabled: true, BrowserType: "chromium", Headless: true, - Timeout: 60 * time.Second, + Timeout: 30 * time.Second, WaitFor: "networkidle", + PoolSize: 2, } client, err := NewJSClient(config, logger) @@ -51,7 +60,7 @@ func TestJSClient_RenderPage(t *testing.T) { // Test rendering a simple page ctx := context.Background() - content, err := client.RenderPage(ctx, "https://example.com") + content, err := client.RenderPage(ctx, testServer.URL) if err != nil { t.Fatalf("Failed to render page: %v", err) } @@ -60,20 +69,29 @@ func TestJSClient_RenderPage(t *testing.T) { t.Error("Rendered content is empty") } - // Check that content contains expected elements - if len(content) < 100 { - t.Error("Rendered content seems too short") + // Check that content contains expected elements from test server + if !strings.Contains(content, "Test Home Page") { + t.Error("Rendered content does not contain expected title") + } + + if !strings.Contains(content, "Page 1") || !strings.Contains(content, "Page 2") { + t.Error("Rendered content does not contain expected links") } } func TestJSClient_Get(t *testing.T) { + // Create test server for more reliable testing in CI + testServer := shared.CreateBasicTestServer() + defer testServer.Close() + logger := slog.Default() config := &JSConfig{ Enabled: true, BrowserType: "chromium", Headless: true, - Timeout: 60 * time.Second, + Timeout: 30 * time.Second, WaitFor: "networkidle", + PoolSize: 2, } client, err := NewJSClient(config, logger) @@ -84,7 +102,7 @@ func TestJSClient_Get(t *testing.T) { // Test getting a page ctx := context.Background() - response, err := client.Get(ctx, "https://example.com") + response, err := client.Get(ctx, testServer.URL) if err != nil { t.Fatalf("Failed to get page: %v", err) } @@ -93,8 +111,8 @@ func TestJSClient_Get(t *testing.T) { t.Fatal("Response is nil") } - if response.URL != "https://example.com" { - t.Errorf("Expected URL 'https://example.com', got '%s'", response.URL) + if response.URL != testServer.URL { + t.Errorf("Expected URL '%s', got '%s'", testServer.URL, response.URL) } if response.Content == "" { @@ -105,8 +123,9 @@ func TestJSClient_Get(t *testing.T) { t.Errorf("Expected status 200, got %d", response.Status) } - if response.Host != "example.com" { - t.Errorf("Expected host 'example.com', got '%s'", response.Host) + // Check that content contains expected elements from test server + if !strings.Contains(response.Content, "Test Home Page") { + t.Error("Response content does not contain expected title") } } @@ -116,8 +135,9 @@ func TestJSClient_GetPoolStats(t *testing.T) { Enabled: true, BrowserType: "chromium", Headless: true, - Timeout: 60 * time.Second, + Timeout: 90 * time.Second, WaitFor: "networkidle", + PoolSize: 2, } client, err := NewJSClient(config, logger) @@ -132,7 +152,7 @@ func TestJSClient_GetPoolStats(t *testing.T) { } // Check required fields - requiredFields := []string{"initialized", "closed", "pool_size", "max_contexts", "available"} + requiredFields := []string{"initialized", "closed", "browsers_active", "browsers_available", "browser_pool_size", "context_pool_size", "contexts_available", "max_contexts"} for _, field := range requiredFields { if _, exists := stats[field]; !exists { t.Errorf("Stats missing required field: %s", field) @@ -146,7 +166,8 @@ func TestJSClient_Disabled(t *testing.T) { Enabled: false, BrowserType: "chromium", Headless: true, - Timeout: 60 * time.Second, + Timeout: 90 * time.Second, + PoolSize: 2, } client, err := NewJSClient(config, logger) diff --git a/internal/client/js_config.go b/internal/client/js_config.go index af7b050..57202f3 100644 --- a/internal/client/js_config.go +++ b/internal/client/js_config.go @@ -37,6 +37,9 @@ type JSConfig struct { // Fallback indicates whether to fallback to HTTP client on JS errors Fallback bool + + // PoolSize specifies the number of browser instances in the pool + PoolSize int } // DefaultJSConfig returns a default JavaScript configuration @@ -52,6 +55,7 @@ func DefaultJSConfig() *JSConfig { StrictMode: false, Threshold: 0.5, Fallback: true, + PoolSize: 2, } } @@ -92,5 +96,10 @@ func (c *JSConfig) Validate() error { return fmt.Errorf("timeout must be positive, got: %v", c.Timeout) } + // Validate pool size + if c.PoolSize <= 0 { + return fmt.Errorf("pool size must be positive, got: %v", c.PoolSize) + } + return nil }