diff --git a/Example/Sources/DemarkExample/ContentView-iOS.swift b/Example/Sources/DemarkExample/ContentView-iOS.swift index 0c86ba4..f3e7e4e 100644 --- a/Example/Sources/DemarkExample/ContentView-iOS.swift +++ b/Example/Sources/DemarkExample/ContentView-iOS.swift @@ -10,28 +10,90 @@ import SwiftUI #if os(iOS) extension ContentView { + private var hasValidInputForIOS: Bool { + switch inputMode { + case .html: + return !htmlInput.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty + case .url: + let trimmed = urlInput.trimmingCharacters(in: .whitespacesAndNewlines) + return !trimmed.isEmpty + } + } + + private var iOSContentSelectorBinding: Binding { + Binding( + get: { urlLoadingOptions.contentSelector ?? "" }, + set: { urlLoadingOptions.contentSelector = $0.isEmpty ? nil : $0 } + ) + } + var iOSLayout: some View { NavigationStack { ScrollView { VStack(spacing: 20) { - // HTML Input Section + // Input Mode Picker + Picker("Input Mode", selection: $inputMode) { + ForEach(InputMode.allCases, id: \.self) { mode in + Label(mode.rawValue, systemImage: mode.icon).tag(mode) + } + } + .pickerStyle(.segmented) + .padding(.horizontal) + + // Input Section VStack(alignment: .leading, spacing: 12) { HStack { - Label("HTML Input", systemImage: "chevron.left.forwardslash.chevron.right") - .font(.headline) + Label( + inputMode == .html ? "HTML Input" : "URL Input", + systemImage: inputMode.icon + ) + .font(.headline) Spacer() - sampleHTMLMenu + if inputMode == .html { + sampleHTMLMenu + } } - TextEditor(text: $htmlInput) - .font(.system(.body, design: .monospaced)) - .frame(minHeight: 200) - .overlay( - RoundedRectangle(cornerRadius: 8) - .stroke(Color.secondary.opacity(0.3), lineWidth: 1) - ) + if inputMode == .html { + TextEditor(text: $htmlInput) + .font(.system(.body, design: .monospaced)) + .frame(minHeight: 200) + .overlay( + RoundedRectangle(cornerRadius: 8) + .stroke(Color.secondary.opacity(0.3), lineWidth: 1) + ) + } else { + VStack(alignment: .leading, spacing: 16) { + VStack(alignment: .leading, spacing: 8) { + Text("URL") + .font(.caption) + .foregroundColor(.secondary) + + TextField("https://example.com", text: $urlInput) + .font(.system(.body, design: .monospaced)) + .textFieldStyle(.roundedBorder) + .autocapitalization(.none) + .keyboardType(.URL) + } + + VStack(alignment: .leading, spacing: 8) { + Text("Content Selector (optional)") + .font(.caption) + .foregroundColor(.secondary) + + TextField("e.g., article, main", text: iOSContentSelectorBinding) + .font(.system(.body, design: .monospaced)) + .textFieldStyle(.roundedBorder) + .autocapitalization(.none) + + Text("CSS selector to extract specific content") + .font(.caption2) + .foregroundColor(.secondary) + } + } + } } .padding() .background(Color(.secondarySystemBackground)) @@ -108,18 +170,18 @@ import SwiftUI .cornerRadius(12) // Convert Button - Button(action: convertHTML) { + Button(action: performConversion) { HStack { Image(systemName: "arrow.right.circle.fill") Text("Convert to Markdown") } .frame(maxWidth: .infinity) .padding() - .background(isConverting || htmlInput.isEmpty ? Color.gray : Color.accentColor) + .background(isConverting || !hasValidInputForIOS ? Color.gray : Color.accentColor) .foregroundColor(.white) .cornerRadius(12) } - .disabled(isConverting || htmlInput.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty) + .disabled(isConverting || !hasValidInputForIOS) // Output Section if !markdownOutput.isEmpty || conversionError != nil { @@ -168,3 +230,4 @@ import SwiftUI } } #endif + diff --git a/Example/Sources/DemarkExample/ContentView.swift b/Example/Sources/DemarkExample/ContentView.swift index fc0384b..4dfd1f2 100644 --- a/Example/Sources/DemarkExample/ContentView.swift +++ b/Example/Sources/DemarkExample/ContentView.swift @@ -14,16 +14,31 @@ import SwiftUI #endif struct ContentView: View { + @State var inputMode: InputMode = .html @State var htmlInput: String = SampleHTML.defaultHTML + @State var urlInput: String = "https://example.com" @State var markdownOutput: String = "" @State var isConverting: Bool = false @State var conversionError: String? @State var selectedTab: OutputTab = .source @State var options = DemarkOptions() @State var selectedEngine: ConversionEngine = .turndown + @State var urlLoadingOptions = URLLoadingOptions() private let demark = Demark() + enum InputMode: String, CaseIterable { + case html = "HTML" + case url = "URL" + + var icon: String { + switch self { + case .html: "chevron.left.forwardslash.chevron.right" + case .url: "link" + } + } + } + enum OutputTab: String, CaseIterable { case source = "Source" case rendered = "Rendered" @@ -49,23 +64,47 @@ struct ContentView: View { var inputHeader: some View { VStack(alignment: .leading, spacing: 8) { HStack { - Label("HTML Input", systemImage: "chevron.left.forwardslash.chevron.right") + Label(inputMode == .html ? "HTML Input" : "URL Input", systemImage: inputMode.icon) .font(.title2) .fontWeight(.semibold) Spacer() - sampleHTMLMenu + inputModePicker + + if inputMode == .html { + sampleHTMLMenu + } } - Text("Paste or type your HTML content below") + Text(inputMode == .html ? "Paste or type your HTML content below" : "Enter a URL to fetch and convert") .font(.caption) .foregroundColor(.secondary) } .padding() } + var inputModePicker: some View { + Picker("Input Mode", selection: $inputMode) { + ForEach(InputMode.allCases, id: \.self) { mode in + Label(mode.rawValue, systemImage: mode.icon).tag(mode) + } + } + .pickerStyle(.segmented) + .frame(width: 140) + } + + @ViewBuilder var inputEditor: some View { + switch inputMode { + case .html: + htmlInputEditor + case .url: + urlInputEditor + } + } + + var htmlInputEditor: some View { ScrollView { TextEditor(text: $htmlInput) .font(.system(.body, design: .monospaced)) @@ -81,6 +120,51 @@ struct ContentView: View { .padding(.horizontal) } + var urlInputEditor: some View { + VStack(alignment: .leading, spacing: 16) { + VStack(alignment: .leading, spacing: 8) { + Text("URL") + .font(.caption) + .foregroundColor(.secondary) + + TextField("https://example.com", text: $urlInput) + .font(.system(.body, design: .monospaced)) + .textFieldStyle(.roundedBorder) + } + + VStack(alignment: .leading, spacing: 8) { + Text("Content Selector (optional)") + .font(.caption) + .foregroundColor(.secondary) + + TextField("e.g., article, main, .content", text: contentSelectorBinding) + .font(.system(.body, design: .monospaced)) + .textFieldStyle(.roundedBorder) + + Text("CSS selector to extract specific content from the page") + .font(.caption2) + .foregroundColor(.secondary) + } + + Spacer() + } + .padding() + .background(platformBackgroundColor) + .cornerRadius(8) + .overlay( + RoundedRectangle(cornerRadius: 8) + .stroke(Color.secondary.opacity(0.3), lineWidth: 1) + ) + .padding(.horizontal) + } + + private var contentSelectorBinding: Binding { + Binding( + get: { urlLoadingOptions.contentSelector ?? "" }, + set: { urlLoadingOptions.contentSelector = $0.isEmpty ? nil : $0 } + ) + } + var sampleHTMLMenu: some View { Menu("Sample HTML") { ForEach(SampleHTML.allCases, id: \.self) { sample in @@ -312,14 +396,23 @@ struct ContentView: View { // MARK: - Action Buttons var convertButton: some View { - Button(action: convertHTML) { + Button(action: performConversion) { HStack { Image(systemName: "arrow.right.circle.fill") Text("Convert") } } .keyboardShortcut(.return, modifiers: .command) - .disabled(isConverting || htmlInput.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty) + .disabled(isConverting || !hasValidInput) + } + + private var hasValidInput: Bool { + switch inputMode { + case .html: + return !htmlInput.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty + case .url: + return URL(string: urlInput) != nil && !urlInput.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty + } } private var copyButton: some View { @@ -332,6 +425,16 @@ struct ContentView: View { // MARK: - Actions + @MainActor + func performConversion() { + switch inputMode { + case .html: + convertHTML() + case .url: + convertURL() + } + } + @MainActor func convertHTML() { guard !htmlInput.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty else { return } @@ -353,6 +456,34 @@ struct ContentView: View { } } + @MainActor + func convertURL() { + guard let url = URL(string: urlInput) else { + conversionError = "Invalid URL" + return + } + + isConverting = true + conversionError = nil + + Task { + do { + let result = try await demark.convertToMarkdown( + url: url, + options: options, + loadingOptions: urlLoadingOptions + ) + markdownOutput = result + conversionError = nil + } catch { + conversionError = error.localizedDescription + markdownOutput = "" + } + + isConverting = false + } + } + func copyMarkdown() { #if os(macOS) let pasteboard = NSPasteboard.general diff --git a/Sources/Demark/Demark.swift b/Sources/Demark/Demark.swift index 32403e2..d2b6922 100644 --- a/Sources/Demark/Demark.swift +++ b/Sources/Demark/Demark.swift @@ -16,6 +16,7 @@ final class ConversionRuntime { private let logger = Logger(subsystem: "com.demark", category: "conversion") private let turndownRuntime = TurndownRuntime() private let htmlToMdRuntime = HTMLToMdRuntime() + private lazy var urlLoadingRuntime = URLLoadingRuntime() // MARK: - Public Methods @@ -57,6 +58,22 @@ final class ConversionRuntime { return normalizeMarkdown(rawMarkdown, bulletMarker: options.bulletListMarker) } + /// Load URL and convert to Markdown + func urlToMarkdown(_ url: URL, options: DemarkOptions, loadingOptions: URLLoadingOptions) async throws -> String { + logger.info("Loading URL for conversion: \(url.absoluteString)") + + // Validate URL scheme + guard url.scheme == "http" || url.scheme == "https" else { + throw DemarkError.invalidURLScheme("Only http and https URLs are supported, got: \(url.scheme ?? "nil")") + } + + // Load and extract HTML + let html = try await urlLoadingRuntime.loadAndExtract(url: url, options: loadingOptions) + + // Convert using existing pipeline + return try await htmlToMarkdown(html, options: options) + } + // MARK: - Normalization helpers /// Normalize list markers to match expectations in tests (single space after marker) @@ -196,4 +213,59 @@ public final class Demark { public func convertToMarkdown(_ html: String, options: DemarkOptions = DemarkOptions()) async throws -> String { try await conversionRuntime.htmlToMarkdown(html, options: options) } + + /// Convert a website URL to Markdown format + /// + /// Loads the URL in a WebView, waits for JavaScript execution to complete, + /// extracts the rendered HTML, and converts it to Markdown. + /// + /// - Parameters: + /// - url: The URL to load and convert + /// - options: Configuration options for the HTML to Markdown conversion process + /// - loadingOptions: Configuration options for URL loading behavior + /// - Returns: The converted Markdown string + /// - Throws: DemarkError if loading or conversion fails + /// + /// ## Example + /// + /// ```swift + /// let demark = Demark() + /// let url = URL(string: "https://example.com")! + /// + /// // Basic usage with defaults + /// let markdown = try await demark.convertToMarkdown(url: url) + /// + /// // Extract only article content with custom timeout + /// let loadingOptions = URLLoadingOptions( + /// timeout: 60, + /// contentSelector: "article" + /// ) + /// let markdown = try await demark.convertToMarkdown( + /// url: url, + /// loadingOptions: loadingOptions + /// ) + /// ``` + /// + /// ## Security + /// + /// Uses an ephemeral WebView with non-persistent storage for security. + /// Each URL load creates a fresh WebView to prevent cookie/cache pollution. + /// + /// ## Network Requirements + /// + /// Plain HTTP URLs may require App Transport Security exceptions. + /// Only `http` and `https` URL schemes are supported. + /// + /// ## See Also + /// + /// - `URLLoadingOptions`: Configuration options for URL loading + /// - `DemarkOptions`: Configuration options for HTML to Markdown conversion + /// - `DemarkError`: Error types that can be thrown during loading or conversion + public func convertToMarkdown( + url: URL, + options: DemarkOptions = DemarkOptions(), + loadingOptions: URLLoadingOptions = URLLoadingOptions() + ) async throws -> String { + try await conversionRuntime.urlToMarkdown(url, options: options, loadingOptions: loadingOptions) + } } diff --git a/Sources/Demark/DemarkTypes.swift b/Sources/Demark/DemarkTypes.swift index 17c1db5..071e912 100644 --- a/Sources/Demark/DemarkTypes.swift +++ b/Sources/Demark/DemarkTypes.swift @@ -174,6 +174,68 @@ public struct DemarkOptions: Sendable { } } +// MARK: - URL Loading Options + +/// Options for loading URLs in a WebView before conversion +/// +/// Controls how Demark loads web pages, including timeout behavior, +/// JavaScript idle detection, and content extraction. +/// +/// ## Example Usage +/// +/// ```swift +/// // Basic usage with defaults +/// let markdown = try await demark.convertToMarkdown(url: url) +/// +/// // Extract only article content with custom timeout +/// let options = URLLoadingOptions( +/// timeout: 60, +/// contentSelector: "article" +/// ) +/// let markdown = try await demark.convertToMarkdown(url: url, loadingOptions: options) +/// ``` +public struct URLLoadingOptions: Sendable { + /// Default configuration with sensible settings + public static let `default` = URLLoadingOptions() + + /// Maximum time to wait for page load (seconds) + public var timeout: TimeInterval + + /// Wait for JavaScript to settle after page load + public var waitForIdle: Bool + + /// Additional delay after page appears loaded (seconds) + public var idleDelay: TimeInterval + + /// CSS selector to extract specific content (e.g., "article", "main") + public var contentSelector: String? + + /// Custom user agent string + public var userAgent: String? + + /// Create URL loading options with custom configuration + /// + /// - Parameters: + /// - timeout: Maximum time to wait for page load (default: 30 seconds) + /// - waitForIdle: Wait for JavaScript to settle after page load (default: true) + /// - idleDelay: Additional delay after page appears loaded (default: 0.5 seconds) + /// - contentSelector: CSS selector to extract specific content (default: nil, extracts full page) + /// - userAgent: Custom user agent string (default: nil, uses system default) + public init( + timeout: TimeInterval = 30, + waitForIdle: Bool = true, + idleDelay: TimeInterval = 0.5, + contentSelector: String? = nil, + userAgent: String? = nil + ) { + self.timeout = timeout + self.waitForIdle = waitForIdle + self.idleDelay = idleDelay + self.contentSelector = contentSelector + self.userAgent = userAgent + } +} + // MARK: - Error Types /// Errors that can occur during HTML to Markdown conversion. @@ -189,6 +251,10 @@ public enum DemarkError: LocalizedError, Sendable { case jsException(String) case bundleResourceMissing(String) case webViewInitializationFailed + case urlLoadingTimeout(String) + case urlNavigationFailed(String) + case invalidURLScheme(String) + case contentSelectorNotFound(String) // MARK: Public @@ -217,6 +283,14 @@ public enum DemarkError: LocalizedError, Sendable { "Required bundle resource missing: \(resource)" case .webViewInitializationFailed: "Failed to initialize WKWebView" + case let .urlLoadingTimeout(details): + "URL loading timed out: \(details)" + case let .urlNavigationFailed(details): + "URL navigation failed: \(details)" + case let .invalidURLScheme(details): + "Invalid URL scheme: \(details)" + case let .contentSelectorNotFound(selector): + "Content selector '\(selector)' matched no elements" } } } diff --git a/Sources/Demark/TurndownRuntime.swift b/Sources/Demark/TurndownRuntime.swift index 38d51f6..96ee8e1 100644 --- a/Sources/Demark/TurndownRuntime.swift +++ b/Sources/Demark/TurndownRuntime.swift @@ -89,11 +89,23 @@ final class TurndownRuntime { guard try await turndownIsAvailable(in: webView) else { logger.warning("TurndownService missing, reinitializing WKWebView...") + + // Clean up old WebView before creating new one + self.webView = nil isInitialized = false + try await initializeJavaScriptEnvironment() + guard let refreshedWebView = self.webView else { throw DemarkError.jsEnvironmentInitializationFailed } + + // Verify TurndownService is available after reinit + guard try await turndownIsAvailable(in: refreshedWebView) else { + logger.error("TurndownService still not available after reinitialization") + throw DemarkError.jsEnvironmentInitializationFailed + } + return refreshedWebView } @@ -249,8 +261,8 @@ final class TurndownRuntime { // Load a blank page first webView.loadHTMLString("", baseURL: nil) - // Wait for page to load - try await Task.sleep(nanoseconds: 100_000_000) // 100ms + // Wait for page to actually be ready (poll document.readyState) + try await waitForDocumentReady(webView: webView) // Load Turndown library logger.info("Loading Turndown from: \(turndownPath)") @@ -261,28 +273,14 @@ final class TurndownRuntime { _ = try await webView.evaluateJavaScript(turndownScript) logger.info("Successfully loaded Turndown JavaScript library") - // Wait a bit for the script to fully initialize - try await Task.sleep(nanoseconds: 50_000_000) // 50ms - - // Check what's available in the global scope - let globalCheck = try await webView.evaluateJavaScript(""" - JSON.stringify({ - hasTurndownService: typeof TurndownService !== 'undefined', - hasTurndown: typeof Turndown !== 'undefined', - hasWindowTurndownService: typeof window.TurndownService !== 'undefined', - hasWindowTurndown: typeof window.Turndown !== 'undefined' - }) - """) - - if let checkResult = globalCheck as? String { - logger.info("Global scope check: \(checkResult)") + // Verify TurndownService is actually available + guard try await turndownIsAvailable(in: webView) else { + logger.error("TurndownService not available after loading script") + throw DemarkError.libraryLoadingFailed("TurndownService not available in global scope") } - // Since TurndownService is available, we don't need to do anything else - // The global scope check confirmed it's there - isInitialized = true - logger.info("WKWebView runtime ready with Turndown 🎉") + logger.info("WKWebView runtime ready with Turndown") } catch let error as DemarkError { throw error } catch { @@ -290,4 +288,30 @@ final class TurndownRuntime { throw DemarkError.libraryLoadingFailed(error.localizedDescription) } } + + /// Wait for document to be ready by polling document.readyState + private func waitForDocumentReady(webView: WKWebView) async throws { + let maxAttempts = 50 // 5 seconds max + var attempts = 0 + + while attempts < maxAttempts { + try Task.checkCancellation() + + do { + let readyState = try await webView.evaluateJavaScript("document.readyState") as? String + logger.debug("Document readyState: \(readyState ?? "unknown")") + if readyState == "complete" || readyState == "interactive" { + return + } + } catch { + // If we can't even evaluate JS, the page isn't ready yet + logger.debug("Waiting for document... (\(error.localizedDescription))") + } + + try await Task.sleep(nanoseconds: 100_000_000) // 100ms between polls + attempts += 1 + } + + logger.warning("Document never reached ready state, proceeding anyway") + } } diff --git a/Sources/Demark/URLLoadingRuntime.swift b/Sources/Demark/URLLoadingRuntime.swift new file mode 100644 index 0000000..8d687a3 --- /dev/null +++ b/Sources/Demark/URLLoadingRuntime.swift @@ -0,0 +1,284 @@ +// +// URLLoadingRuntime.swift +// Demark +// +// Copyright © 2026 atacan. All rights reserved. +// + +import Foundation +import os.log +import WebKit + +/// WebView-based URL loading runtime for fetching JavaScript-rendered content +/// +/// This implementation uses WKWebView to load URLs and extract rendered HTML: +/// - Real browser DOM environment +/// - JavaScript execution and rendering +/// - Ephemeral storage for security isolation +/// - Main thread execution required for WKWebView +/// - Cross-platform support (iOS, macOS, tvOS, watchOS, visionOS) +@MainActor +final class URLLoadingRuntime { + // MARK: - Properties + + private let logger = Logger(subsystem: "com.demark", category: "url-loading") + private var activeDelegates: [ObjectIdentifier: URLNavigationDelegate] = [:] + + // MARK: - Lifecycle + + deinit { + logger.info("URLLoadingRuntime being deallocated") + } + + // MARK: - Public Methods + + /// Load a URL in a WebView and extract rendered HTML + /// + /// Creates an ephemeral WebView for each load to ensure isolation between + /// untrusted pages. Supports waiting for JavaScript to settle and extracting + /// specific content via CSS selectors. + /// + /// This method supports concurrent calls - each invocation uses its own webView + /// and cleanup is isolated to that specific request. + /// + /// - Parameters: + /// - url: The URL to load + /// - options: Loading configuration options + /// - Returns: The rendered HTML content + /// - Throws: DemarkError if loading fails + func loadAndExtract(url: URL, options: URLLoadingOptions) async throws -> String { + // Create fresh ephemeral WebView for each load + let webView = createWebView(userAgent: options.userAgent) + + defer { + webView.stopLoading() + self.activeDelegates.removeValue(forKey: ObjectIdentifier(webView)) + } + + return try await withTaskCancellationHandler { + try await performLoad(webView: webView, url: url, options: options) + } onCancel: { + Task { @MainActor in + webView.stopLoading() + if let delegate = self.activeDelegates[ObjectIdentifier(webView)] { + delegate.cancel() + } + } + } + } + + // MARK: - Private Methods + + private func createWebView(userAgent: String?) -> WKWebView { + let config = WKWebViewConfiguration() + config.userContentController = WKUserContentController() + + // Use ephemeral storage - no cookies/cache pollution between loads + config.websiteDataStore = .nonPersistent() + + // Platform-specific configuration + #if os(macOS) + config.preferences.javaScriptCanOpenWindowsAutomatically = false + #elseif os(iOS) || os(visionOS) + config.allowsInlineMediaPlayback = false + config.mediaTypesRequiringUserActionForPlayback = .all + #endif + + let webView: WKWebView + #if os(watchOS) || os(tvOS) + webView = WKWebView(frame: CGRect(x: 0, y: 0, width: 100, height: 100), configuration: config) + #else + webView = WKWebView(frame: .zero, configuration: config) + #endif + + // Set user agent before loading + if let userAgent { + webView.customUserAgent = userAgent + } + + return webView + } + + private func performLoad(webView: WKWebView, url: URL, options: URLLoadingOptions) async throws -> String { + try Task.checkCancellation() + + return try await withCheckedThrowingContinuation { continuation in + let delegate = URLNavigationDelegate( + url: url, + options: options, + logger: logger, + continuation: continuation + ) + self.activeDelegates[ObjectIdentifier(webView)] = delegate + webView.navigationDelegate = delegate + + var request = URLRequest(url: url) + if options.timeout.isFinite, options.timeout > 0 { + request.timeoutInterval = options.timeout + } + webView.load(request) + + // Set up timeout (delegate will cancel this task on completion) + if let nanoseconds = clampedNanoseconds(options.timeout) { + delegate.timeoutTask = Task { + try? await Task.sleep(nanoseconds: nanoseconds) + delegate.handleTimeout() + } + } + } + } +} + +// MARK: - Navigation Delegate + +@MainActor +private final class URLNavigationDelegate: NSObject, WKNavigationDelegate { + private let url: URL + private let options: URLLoadingOptions + private let logger: Logger + private var continuation: CheckedContinuation? + private var hasCompleted = false + + /// Timeout task - cancelled on successful completion to prevent leaks + var timeoutTask: Task? + + init(url: URL, options: URLLoadingOptions, logger: Logger, continuation: CheckedContinuation) { + self.url = url + self.options = options + self.logger = logger + self.continuation = continuation + super.init() + } + + func webView(_ webView: WKWebView, didFinish navigation: WKNavigation!) { + logger.info("Navigation finished for: \(self.url.absoluteString)") + + Task { @MainActor in + do { + try Task.checkCancellation() + + if options.waitForIdle { + try await waitForIdle(webView: webView) + } + + if let nanoseconds = clampedNanoseconds(options.idleDelay), nanoseconds > 0 { + try await Task.sleep(nanoseconds: nanoseconds) + } + + try Task.checkCancellation() + let html = try await extractHTML(from: webView) + complete(with: .success(html)) + } catch { + complete(with: .failure(error)) + } + } + } + + func webView(_ webView: WKWebView, didFail navigation: WKNavigation!, withError error: Error) { + logger.error("Navigation failed: \(error.localizedDescription)") + complete(with: .failure(DemarkError.urlNavigationFailed("\(url.absoluteString): \(error.localizedDescription)"))) + } + + func webView(_ webView: WKWebView, didFailProvisionalNavigation navigation: WKNavigation!, withError error: Error) { + logger.error("Provisional navigation failed: \(error.localizedDescription)") + complete(with: .failure(DemarkError.urlNavigationFailed("\(url.absoluteString): \(error.localizedDescription)"))) + } + + func handleTimeout() { + guard !hasCompleted else { return } + logger.warning("Page load timed out for: \(self.url.absoluteString)") + let secondsDescription: String + if let nanoseconds = clampedNanoseconds(options.timeout) { + secondsDescription = String(nanoseconds / 1_000_000_000) + } else { + secondsDescription = "∞" + } + complete(with: .failure(DemarkError.urlLoadingTimeout("\(url.absoluteString) after \(secondsDescription) seconds"))) + } + + func cancel() { + guard !hasCompleted else { return } + complete(with: .failure(CancellationError())) + } + + private func waitForIdle(webView: WKWebView) async throws { + var attempts = 0 + let maxAttempts = 50 // 5 seconds max polling + + while attempts < maxAttempts { + try Task.checkCancellation() + let readyState = try await webView.evaluateJavaScript("document.readyState") as? String + logger.debug("Document readyState: \(readyState ?? "unknown")") + if readyState == "complete" { + return + } + try await Task.sleep(nanoseconds: 100_000_000) // 100ms + attempts += 1 + } + logger.warning("Document never reached 'complete' state, proceeding anyway") + } + + private func extractHTML(from webView: WKWebView) async throws -> String { + let script: String + if let selector = options.contentSelector { + // Use JSON serialization for proper escaping (handles quotes, newlines, special chars) + let escapedSelector = try escapeForJS(selector) + script = """ + (function() { + var el = document.querySelector(\(escapedSelector)); + return el ? el.outerHTML : null; + })(); + """ + } else { + script = "document.documentElement.outerHTML" + } + + let result = try await webView.evaluateJavaScript(script) + + if options.contentSelector != nil, result == nil || (result as? NSNull) != nil { + throw DemarkError.contentSelectorNotFound(options.contentSelector!) + } + + guard let html = result as? String else { + throw DemarkError.conversionFailed + } + + logger.info("Extracted HTML length: \(html.count) characters") + return html + } + + /// Escape string for JavaScript using JSON serialization (handles all special characters) + private func escapeForJS(_ string: String) throws -> String { + // Wrap in array since JSONSerialization requires a collection as top-level object + let data = try JSONSerialization.data(withJSONObject: [string]) + guard let arrayString = String(data: data, encoding: .utf8) else { + throw DemarkError.invalidInput("Failed to escape selector: \(string)") + } + // Extract the quoted string from the array: ["value"] -> "value" + let startIndex = arrayString.index(after: arrayString.startIndex) // Skip [ + let endIndex = arrayString.index(before: arrayString.endIndex) // Skip ] + return String(arrayString[startIndex ..< endIndex]) + } + + private func complete(with result: Result) { + guard !hasCompleted else { return } + hasCompleted = true + + // Cancel timeout task to prevent leak + timeoutTask?.cancel() + timeoutTask = nil + + switch result { + case let .success(html): continuation?.resume(returning: html) + case let .failure(error): continuation?.resume(throwing: error) + } + continuation = nil + } +} + +private func clampedNanoseconds(_ seconds: TimeInterval) -> UInt64? { + guard seconds.isFinite else { return nil } + let maxSeconds = Double(UInt64.max) / 1_000_000_000 + let clampedSeconds = max(0, min(seconds, maxSeconds)) + return UInt64(clampedSeconds * 1_000_000_000) +} diff --git a/Tests/DemarkTests/DemarkURLLoadingTests.swift b/Tests/DemarkTests/DemarkURLLoadingTests.swift new file mode 100644 index 0000000..8c208a7 --- /dev/null +++ b/Tests/DemarkTests/DemarkURLLoadingTests.swift @@ -0,0 +1,392 @@ +import Foundation +import Testing +@testable import Demark + +@MainActor +struct DemarkURLLoadingTests { + // MARK: - Unit Tests: Invalid URL Schemes + + @Test("Invalid URL scheme - file:// rejected") + func invalidURLSchemeFile() async { + let service = Demark() + let url = URL(string: "file:///tmp/test.html")! + + do { + _ = try await service.convertToMarkdown(url: url) + #expect(Bool(false), "Expected DemarkError.invalidURLScheme for file:// URL") + } catch DemarkError.invalidURLScheme(let details) { + #expect(details.contains("file")) + #expect(details.contains("Only http and https")) + } catch { + #expect(Bool(false), "Unexpected error: \(error)") + } + } + + @Test("Invalid URL scheme - ftp:// rejected") + func invalidURLSchemeFTP() async { + let service = Demark() + let url = URL(string: "ftp://example.com/file.txt")! + + do { + _ = try await service.convertToMarkdown(url: url) + #expect(Bool(false), "Expected DemarkError.invalidURLScheme for ftp:// URL") + } catch DemarkError.invalidURLScheme(let details) { + #expect(details.contains("ftp")) + } catch { + #expect(Bool(false), "Unexpected error: \(error)") + } + } + + @Test("Invalid URL scheme - custom scheme rejected") + func invalidURLSchemeCustom() async { + let service = Demark() + let url = URL(string: "myapp://page/content")! + + do { + _ = try await service.convertToMarkdown(url: url) + #expect(Bool(false), "Expected DemarkError.invalidURLScheme for custom scheme") + } catch DemarkError.invalidURLScheme(let details) { + #expect(details.contains("myapp")) + } catch { + #expect(Bool(false), "Unexpected error: \(error)") + } + } + + // MARK: - Unit Tests: URLLoadingOptions + + @Test("URLLoadingOptions default values") + func urlLoadingOptionsDefaults() { + let options = URLLoadingOptions() + + #expect(options.timeout == 30) + #expect(options.waitForIdle == true) + #expect(options.idleDelay == 0.5) + #expect(options.contentSelector == nil) + #expect(options.userAgent == nil) + } + + @Test("URLLoadingOptions custom values") + func urlLoadingOptionsCustom() { + let options = URLLoadingOptions( + timeout: 60, + waitForIdle: false, + idleDelay: 1.0, + contentSelector: "article", + userAgent: "TestBot/1.0" + ) + + #expect(options.timeout == 60) + #expect(options.waitForIdle == false) + #expect(options.idleDelay == 1.0) + #expect(options.contentSelector == "article") + #expect(options.userAgent == "TestBot/1.0") + } + + @Test("URLLoadingOptions.default matches init()") + func urlLoadingOptionsDefaultStatic() { + let defaultOptions = URLLoadingOptions.default + let initOptions = URLLoadingOptions() + + #expect(defaultOptions.timeout == initOptions.timeout) + #expect(defaultOptions.waitForIdle == initOptions.waitForIdle) + #expect(defaultOptions.idleDelay == initOptions.idleDelay) + #expect(defaultOptions.contentSelector == initOptions.contentSelector) + #expect(defaultOptions.userAgent == initOptions.userAgent) + } + + // MARK: - Unit Tests: Error Descriptions + + @Test("Error description - urlLoadingTimeout") + func errorDescriptionTimeout() { + let error = DemarkError.urlLoadingTimeout("https://example.com after 30 seconds") + let description = error.errorDescription ?? "" + + #expect(description.contains("timed out")) + #expect(description.contains("https://example.com")) + #expect(description.contains("30 seconds")) + } + + @Test("Error description - urlNavigationFailed") + func errorDescriptionNavigation() { + let error = DemarkError.urlNavigationFailed("https://example.com: Connection refused") + let description = error.errorDescription ?? "" + + #expect(description.contains("navigation failed")) + #expect(description.contains("https://example.com")) + } + + @Test("Error description - invalidURLScheme") + func errorDescriptionScheme() { + let error = DemarkError.invalidURLScheme("Only http and https URLs are supported, got: file") + let description = error.errorDescription ?? "" + + #expect(description.contains("Invalid URL scheme")) + #expect(description.contains("file")) + } + + @Test("Error description - contentSelectorNotFound") + func errorDescriptionSelector() { + let error = DemarkError.contentSelectorNotFound("article.main-content") + let description = error.errorDescription ?? "" + + #expect(description.contains("selector")) + #expect(description.contains("article.main-content")) + #expect(description.contains("matched no elements")) + } +} + +// MARK: - Integration Tests + +@MainActor +struct DemarkURLLoadingIntegrationTests { + @Test("Load example.com and convert to markdown") + func loadExampleDotCom() async throws { + let service = Demark() + let url = URL(string: "https://example.com")! + + let loadingOptions = URLLoadingOptions( + timeout: 30, + waitForIdle: true, + idleDelay: 0.5 + ) + + let markdown = try await service.convertToMarkdown(url: url, loadingOptions: loadingOptions) + + // example.com has a simple page with "Example Domain" heading + #expect(markdown.contains("Example Domain")) + #expect(!markdown.isEmpty) + } + + @Test("Load with content selector extracts specific element") + func loadWithContentSelector() async throws { + let service = Demark() + let url = URL(string: "https://example.com")! + + // example.com has a
container with the main content + let loadingOptions = URLLoadingOptions( + timeout: 30, + contentSelector: "div" + ) + + let markdown = try await service.convertToMarkdown(url: url, loadingOptions: loadingOptions) + + #expect(markdown.contains("Example Domain")) + } + + @Test("Content selector not found throws error") + func contentSelectorNotFound() async { + let service = Demark() + let url = URL(string: "https://example.com")! + + let loadingOptions = URLLoadingOptions( + timeout: 30, + contentSelector: "article.nonexistent-class-xyz" + ) + + do { + _ = try await service.convertToMarkdown(url: url, loadingOptions: loadingOptions) + #expect(Bool(false), "Expected DemarkError.contentSelectorNotFound") + } catch DemarkError.contentSelectorNotFound(let selector) { + #expect(selector == "article.nonexistent-class-xyz") + } catch { + #expect(Bool(false), "Unexpected error: \(error)") + } + } + + @Test("Custom user agent is applied") + func customUserAgent() async throws { + let service = Demark() + let url = URL(string: "https://example.com")! + + let loadingOptions = URLLoadingOptions( + timeout: 30, + userAgent: "DemarkTest/1.0" + ) + + // Just verify the request succeeds with custom user agent + let markdown = try await service.convertToMarkdown(url: url, loadingOptions: loadingOptions) + #expect(!markdown.isEmpty) + } + + @Test("Short timeout with slow request") + func shortTimeoutError() async { + let service = Demark() + // Use a URL that will definitely timeout with 0.1s timeout + let url = URL(string: "https://example.com")! + + let loadingOptions = URLLoadingOptions( + timeout: 0.001, // 1ms - will timeout + waitForIdle: false, + idleDelay: 0 + ) + + do { + _ = try await service.convertToMarkdown(url: url, loadingOptions: loadingOptions) + // If it succeeds (very fast network), that's fine too + } catch DemarkError.urlLoadingTimeout(let details) { + #expect(details.contains("example.com")) + } catch { + // Other network errors are acceptable + } + } +} + +// MARK: - Edge Case Tests + +@MainActor +struct DemarkURLLoadingEdgeCaseTests { + @Test("URL with query parameters") + func urlWithQueryParams() async throws { + let service = Demark() + // example.com ignores query params but we're testing URL handling + let url = URL(string: "https://example.com/?foo=bar&baz=123")! + + let markdown = try await service.convertToMarkdown(url: url) + #expect(markdown.contains("Example Domain")) + } + + @Test("URL with fragment") + func urlWithFragment() async throws { + let service = Demark() + let url = URL(string: "https://example.com/#section")! + + let markdown = try await service.convertToMarkdown(url: url) + #expect(markdown.contains("Example Domain")) + } + + @Test("URL with encoded characters") + func urlWithEncodedChars() async throws { + let service = Demark() + // %20 is space, example.com will handle this gracefully + let url = URL(string: "https://example.com/path%20with%20spaces")! + + do { + let markdown = try await service.convertToMarkdown(url: url) + // May get error page but shouldn't crash + #expect(!markdown.isEmpty) + } catch DemarkError.urlNavigationFailed { + // 404 or similar is acceptable + } + } + + @Test("Selector with attribute") + func selectorWithAttribute() async throws { + let service = Demark() + let url = URL(string: "https://example.com")! + + // Test that attribute selectors work + let loadingOptions = URLLoadingOptions( + contentSelector: "a[href]" + ) + + let markdown = try await service.convertToMarkdown(url: url, loadingOptions: loadingOptions) + // example.com has a link - verify we got a markdown link + #expect(markdown.contains("[") && markdown.contains("](")) + } + + @Test("Selector with quotes in attribute") + func selectorWithQuotes() async throws { + let service = Demark() + let url = URL(string: "https://example.com")! + + // Test selector with quoted attribute value - use the actual IANA link + let loadingOptions = URLLoadingOptions( + contentSelector: "a[href*=\"iana.org\"]" + ) + + let markdown = try await service.convertToMarkdown(url: url, loadingOptions: loadingOptions) + // Verify we got the link content + #expect(markdown.contains("iana.org")) + } + + @Test("Minimal timeout and idle settings") + func minimalDelays() async throws { + let service = Demark() + let url = URL(string: "https://example.com")! + + let loadingOptions = URLLoadingOptions( + timeout: 30, + waitForIdle: false, + idleDelay: 0 + ) + + let markdown = try await service.convertToMarkdown(url: url, loadingOptions: loadingOptions) + #expect(!markdown.isEmpty) + } + + @Test("HTTP URL scheme accepted") + func httpSchemeAccepted() async { + let service = Demark() + // Note: May fail due to ATS, but should not throw invalidURLScheme + let url = URL(string: "http://example.com")! + + do { + let markdown = try await service.convertToMarkdown(url: url) + #expect(!markdown.isEmpty) + } catch DemarkError.invalidURLScheme { + #expect(Bool(false), "http:// should be accepted, not rejected as invalid scheme") + } catch { + // Network errors (ATS, connection issues) are acceptable + } + } +} + +// MARK: - Cancellation Tests + +@MainActor +struct DemarkURLLoadingCancellationTests { + @Test("Task cancellation stops loading") + func taskCancellation() async { + let service = Demark() + let url = URL(string: "https://example.com")! + + let loadingOptions = URLLoadingOptions( + timeout: 60, // Long timeout + waitForIdle: true, + idleDelay: 5 // Long delay to ensure we can cancel + ) + + let task = Task { + try await service.convertToMarkdown(url: url, loadingOptions: loadingOptions) + } + + // Cancel quickly + try? await Task.sleep(nanoseconds: 100_000_000) // 100ms + task.cancel() + + let result = await task.result + switch result { + case .success: + // Fast completion before cancel is OK + break + case .failure(let error): + // CancellationError or wrapped version is expected + let isCancellation = error is CancellationError || + String(describing: error).contains("cancel") + #expect(isCancellation || error is DemarkError, "Expected cancellation or Demark error, got: \(error)") + } + } +} + +// MARK: - Concurrent Load Tests + +@MainActor +struct DemarkURLLoadingConcurrencyTests { + @Test("Concurrent URL loads don't cross-cancel") + func concurrentURLLoads() async throws { + let service = Demark() + let urls = [ + URL(string: "https://example.com")!, + URL(string: "https://example.org")!, + ] + + try await withThrowingTaskGroup(of: String.self) { group in + for url in urls { + group.addTask { try await service.convertToMarkdown(url: url) } + } + for try await result in group { + #expect(!result.isEmpty) + } + } + } +}