diff --git a/src/cli.rs b/src/cli.rs index 961537b..c3a6400 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -1,81 +1,67 @@ use clap::{Arg, Command}; -use std::collections::HashSet; use url::Url; #[derive(Debug, Clone)] pub struct CliArgs { - pub links: Vec, - pub verbose: bool, - pub template: bool, + pub domain: String, + pub prep: bool, } impl CliArgs { pub fn parse() -> Result { let matches = Command::new("smart-crawler") - .version("0.3.2") + .version("0.4.1") .about("A web crawler that uses WebDriver to extract and parse HTML content") .arg( - Arg::new("link") - .long("link") - .value_name("URL") - .help("URL to crawl (can be specified multiple times)") - .action(clap::ArgAction::Append) + Arg::new("domain") + .long("domain") + .value_name("DOMAIN") + .help("Domain to crawl. Can be a URL or domain name") .required(true), ) .arg( - Arg::new("verbose") - .long("verbose") - .help("Enable verbose output showing filtered HTML node tree") - .action(clap::ArgAction::SetTrue), - ) - .arg( - Arg::new("template") - .long("template") - .help("Enable template detection mode to identify patterns like '{count} comments' in HTML content") + Arg::new("prep") + .long("prep") + .help( + "Enable preparation mode to discover template patterns across domain pages", + ) .action(clap::ArgAction::SetTrue), ) .get_matches(); - let links: Vec = matches - .get_many::("link") - .unwrap_or_default() - .cloned() - .collect(); + let domain_input = matches + .get_one::("domain") + .ok_or("Domain argument is required")?; - let validated_links = Self::validate_and_deduplicate_links(links)?; - let verbose = matches.get_flag("verbose"); - let template = matches.get_flag("template"); + let validated_domain = Self::extract_domain(domain_input)?; + let prep = matches.get_flag("prep"); Ok(CliArgs { - links: validated_links, - verbose, - template, + domain: validated_domain, + prep, }) } - fn validate_and_deduplicate_links(links: Vec) -> Result, String> { - let mut seen_urls = HashSet::new(); - let mut validated_links = Vec::new(); - - for link in links { - match Url::parse(&link) { - Ok(url) => { - let normalized_url = url.to_string(); - if seen_urls.insert(normalized_url.clone()) { - validated_links.push(normalized_url); - } - } - Err(_) => { - return Err(format!("Invalid URL: {link}")); + fn extract_domain(input: &str) -> Result { + let trimmed = input.trim(); + + // Always try to parse as URL to validate the domain + let url_str = if trimmed.starts_with("http://") || trimmed.starts_with("https://") { + trimmed.to_string() + } else { + format!("https://{trimmed}") + }; + + match Url::parse(&url_str) { + Ok(url) => { + if let Some(domain) = url.host_str() { + Ok(domain.to_string()) + } else { + Err(format!("Could not extract domain from: {input}")) } } + Err(_) => Err(format!("Invalid domain or URL: {input}")), } - - if validated_links.is_empty() { - return Err("No valid URLs provided".to_string()); - } - - Ok(validated_links) } } @@ -84,47 +70,64 @@ mod tests { use super::*; #[test] - fn test_validate_and_deduplicate_links() { - let links = vec![ - "https://example.com".to_string(), - "https://example.org".to_string(), - "https://example.com".to_string(), // duplicate - ]; - - let result = CliArgs::validate_and_deduplicate_links(links).unwrap(); - assert_eq!(result.len(), 2); - assert!(result.contains(&"https://example.com/".to_string())); - assert!(result.contains(&"https://example.org/".to_string())); + fn test_single_domain_parsing() { + // Test that single domain parsing works correctly + let args = CliArgs { + domain: "example.com".to_string(), + prep: false, + }; + + assert_eq!(args.domain, "example.com"); + assert!(!args.prep); } #[test] - fn test_validate_invalid_url() { - let links = vec!["invalid-url".to_string()]; - let result = CliArgs::validate_and_deduplicate_links(links); - assert!(result.is_err()); - assert!(result.unwrap_err().contains("Invalid URL")); + fn test_extract_domain() { + // Test URL with protocol + assert_eq!( + CliArgs::extract_domain("https://example.com").unwrap(), + "example.com" + ); + assert_eq!( + CliArgs::extract_domain("http://example.com/path").unwrap(), + "example.com" + ); + + // Test domain without protocol + assert_eq!( + CliArgs::extract_domain("example.com").unwrap(), + "example.com" + ); + assert_eq!( + CliArgs::extract_domain(" example.com ").unwrap(), + "example.com" + ); + + // Test edge case - the URL crate behavior with multiple dots + assert_eq!( + CliArgs::extract_domain("invalid..domain").unwrap(), + "invalid..domain" + ); } #[test] - fn test_validate_empty_links() { - let links = vec![]; - let result = CliArgs::validate_and_deduplicate_links(links); + fn test_extract_domain_error() { + // Test that invalid domain extraction returns error + let result = CliArgs::extract_domain("://invalid"); assert!(result.is_err()); - assert!(result.unwrap_err().contains("No valid URLs provided")); + assert!(result.unwrap_err().contains("Invalid domain or URL")); } #[test] - fn test_cli_template_flag() { - // Test that template flag is properly parsed (this is a simplified test + fn test_cli_prep_flag() { + // Test that prep flag is properly parsed (this is a simplified test // since we can't easily test the full CLI parsing in unit tests) let args = CliArgs { - links: vec!["https://example.com".to_string()], - verbose: true, - template: true, + domain: "example.com".to_string(), + prep: true, }; - assert!(args.template); - assert!(args.verbose); - assert_eq!(args.links.len(), 1); + assert!(args.prep); + assert_eq!(args.domain, "example.com"); } } diff --git a/src/main.rs b/src/main.rs index e15ff62..7cfb33d 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,4 +1,6 @@ -use smart_crawler::{Browser, CliArgs, FetchStatus, HtmlParser, UrlStorage}; +use smart_crawler::{ + Browser, CliArgs, FetchStatus, HtmlParser, TemplateDetector, TemplatePathStore, UrlStorage, +}; use std::collections::{HashMap, HashSet}; use tracing::{debug, error, info}; @@ -19,12 +21,18 @@ async fn main() { } }; - info!("Starting SmartCrawler with {} URLs", args.links.len()); + info!("Starting SmartCrawler with domain: {}", args.domain); let mut storage = UrlStorage::new(); - for link in &args.links { - storage.add_url(link.clone()); - } + let mut domain_urls: HashMap> = HashMap::new(); + + // Convert domain to initial URL + let root_url = smart_crawler::utils::construct_root_url(&args.domain); + storage.add_url(root_url.clone()); + domain_urls + .entry(args.domain.clone()) + .or_default() + .insert(root_url); let mut browser = Browser::new(4444); @@ -46,95 +54,70 @@ async fn main() { let parser = HtmlParser::new(); - // Phase 1: Preparation stage - fetch additional URLs from same domains - info!("Starting preparation stage to collect URLs from same domains"); + // Phase 1: URL Discovery - find additional URLs for each domain + info!("Starting URL discovery for domains"); - let mut domain_urls: HashMap> = HashMap::new(); + let max_urls_per_domain = if args.prep { 10 } else { 3 }; - // Group initial URLs by domain - for url in &args.links { - if let Some(domain) = smart_crawler::utils::extract_domain_from_url(url) { - domain_urls.entry(domain).or_default().insert(url.clone()); - } - } + // Discover additional URLs for the domain + let domain = &args.domain; + let urls = domain_urls.get_mut(domain).unwrap(); - // Add root URLs for each domain if not already present - for (domain, urls) in &mut domain_urls { - let root_url = smart_crawler::utils::construct_root_url(domain); - if !urls.contains(&root_url) { - urls.insert(root_url.clone()); - storage.add_url(root_url); - info!( - "Added root URL for domain {}: {}", - domain, - smart_crawler::utils::construct_root_url(domain) - ); - } - } + if urls.len() < max_urls_per_domain { + info!( + "Domain {} has {} URL(s), searching for more (max: {})...", + domain, + urls.len(), + max_urls_per_domain + ); - // For each domain, try to find additional URLs - for (domain, urls) in &mut domain_urls { - if urls.len() < 3 { - info!( - "Domain {} has only {} URL(s), searching for more...", - domain, - urls.len() - ); + // Pick the first URL to extract links from + if let Some(first_url) = urls.iter().next() { + match process_url(&mut browser, &parser, &mut storage, first_url, true).await { + Ok(html_source) => { + let additional_urls = parser.extract_links(&html_source, domain); + let mut added_count = 0; - // Pick the first URL to extract links from - if let Some(first_url) = urls.iter().next() { - match process_url(&mut browser, &parser, &mut storage, first_url, true).await { - Ok(html_source) => { - let additional_urls = parser.extract_links(&html_source, domain); - let mut added_count = 0; - - for additional_url in additional_urls { - if urls.len() >= 3 { - break; - } - if urls.insert(additional_url.clone()) { - storage.add_url(additional_url); - added_count += 1; - } + for additional_url in additional_urls { + if urls.len() >= max_urls_per_domain { + break; + } + if urls.insert(additional_url.clone()) { + storage.add_url(additional_url); + added_count += 1; } - - info!( - "Found {} additional URLs for domain {}", - added_count, domain - ); - } - Err(e) => { - error!("Failed to extract links from {}: {}", first_url, e); } + + info!( + "Found {} additional URLs for domain {}", + added_count, domain + ); + } + Err(e) => { + error!("Failed to extract links from {}: {}", first_url, e); } } } } - // Phase 2: Process all URLs (initial + discovered) with root URL prioritization - info!("Processing all URLs with root URL prioritization"); + // Phase 2: Process all discovered URLs + info!("Processing all discovered URLs"); let mut all_urls: Vec = Vec::new(); - // First, add all user-specified URLs - for url in &args.links { - all_urls.push(url.clone()); - } + // Collect all URLs with root URL prioritized + let domain = &args.domain; + let urls = domain_urls.get(domain).unwrap(); + let root_url = smart_crawler::utils::construct_root_url(domain); - // Then, add root URLs for each domain (if not already in user-specified URLs) - for domain in domain_urls.keys() { - let root_url = smart_crawler::utils::construct_root_url(domain); - if !args.links.contains(&root_url) { - all_urls.push(root_url); - } + // Add root URL first + if urls.contains(&root_url) { + all_urls.push(root_url.clone()); } - - // Finally, add all other discovered URLs - for urls in domain_urls.values() { - for url in urls { - if !args.links.contains(url) && !smart_crawler::utils::is_root_url(url) { - all_urls.push(url.clone()); - } + // Then add other URLs + for url in urls { + if url != &root_url { + all_urls.push(url.clone()); } } @@ -151,72 +134,100 @@ async fn main() { } } - // Phase 2.5: Apply template detection if enabled - if args.template { - info!("Applying template detection to HTML content"); - apply_template_detection_to_storage(&mut storage); - } - - // Phase 3: Analyze domain duplicates (skip if template mode is enabled) - if !args.template { - info!("Analyzing domain-level duplicate nodes"); - - for domain in domain_urls.keys() { - storage.analyze_domain_duplicates(domain); - if let Some(duplicates) = storage.get_domain_duplicates(domain) { - let duplicate_count = duplicates.get_duplicate_count(); - if duplicate_count > 0 { - info!( - "Found {} duplicate node patterns for domain {}", - duplicate_count, domain - ); - } else { - info!( - "No duplicate patterns found for domain {} (likely insufficient pages)", - domain - ); + // Phase 3: Template analysis (prep mode) or standard duplicate analysis + if args.prep { + info!("Running template detection analysis in prep mode"); + let mut combined_store = TemplatePathStore::new(); + let template_detector = TemplateDetector::new(); + + // Process each completed URL to extract template paths + let completed_urls = storage.get_completed_urls(); + for url_data in &completed_urls { + if let Some(html_tree) = &url_data.html_tree { + let url_store = template_detector.extract_templates_with_paths(html_tree); + for path in url_store.get_paths() { + combined_store.add_path(path.clone()); } } } + + info!( + "Template analysis complete, found {} unique template paths", + combined_store.get_paths().len() + ); } else { - info!("Skipping domain duplicate analysis in template mode"); + info!("Running standard duplicate analysis"); + + let domain = &args.domain; + storage.analyze_domain_duplicates(domain); + if let Some(duplicates) = storage.get_domain_duplicates(domain) { + let duplicate_count = duplicates.get_duplicate_count(); + if duplicate_count > 0 { + info!( + "Found {} duplicate node patterns for domain {}", + duplicate_count, domain + ); + } else { + info!( + "No duplicate patterns found for domain {} (likely insufficient pages)", + domain + ); + } + } } let _ = browser.close().await; - println!("\n=== Crawling Results ==="); - let completed_urls = storage.get_completed_urls(); - - if completed_urls.is_empty() { - println!("No URLs were successfully processed."); - } else { - for url_data in completed_urls { - let title = url_data.title.as_deref().unwrap_or("No title found"); - println!("URL: {}", url_data.url); - println!("Title: {title}"); - println!("Domain: {}", url_data.domain); + if args.prep { + // In prep mode, output detected template paths in serialized format + println!("\n=== Template Path Detection Results ==="); + + let mut combined_store = TemplatePathStore::new(); + let template_detector = TemplateDetector::new(); + + // Process each completed URL to extract template paths + let completed_urls = storage.get_completed_urls(); + if completed_urls.is_empty() { + println!("No URLs were successfully processed."); + } else { + println!( + "Processed {} URLs for domain {}:", + completed_urls.len(), + args.domain + ); + for url_data in &completed_urls { + println!( + " - {} ({})", + url_data.url, + url_data.title.as_deref().unwrap_or("No title") + ); - if args.verbose { if let Some(html_tree) = &url_data.html_tree { - if args.template { - // In template mode, show HTML tree with template patterns (no duplicate filtering) - println!("HTML Tree with Template Patterns:"); - print_html_tree_with_template(html_tree, 0, false); - } else if let Some(domain_duplicates) = - storage.get_domain_duplicates(&url_data.domain) - { - let filtered_tree = - HtmlParser::filter_domain_duplicates(html_tree, domain_duplicates); - println!("Filtered HTML Tree (showing complete structure with duplicate marking):"); - print_html_tree_with_template(&filtered_tree, 0, false); - } else { - println!("HTML Tree (no duplicates to filter):"); - print_html_tree_with_template(html_tree, 0, false); + let url_store = template_detector.extract_templates_with_paths(html_tree); + for path in url_store.get_paths() { + combined_store.add_path(path.clone()); } } } - println!("---"); + println!("\nDetected Template Paths (Rust-serializable format):"); + println!("{}", combined_store.to_serialized_string()); + } + } else { + // Regular mode - show crawling results + println!("\n=== Crawling Results ==="); + let completed_urls = storage.get_completed_urls(); + + if completed_urls.is_empty() { + println!("No URLs were successfully processed."); + } else { + for url_data in completed_urls { + let title = url_data.title.as_deref().unwrap_or("No title found"); + println!("URL: {}", url_data.url); + println!("Title: {title}"); + println!("Domain: {}", url_data.domain); + println!("---"); + } } } @@ -274,72 +285,3 @@ async fn process_url( } } } - -/// Apply template detection to all HTML trees in storage -fn apply_template_detection_to_storage(storage: &mut smart_crawler::UrlStorage) { - let detector = smart_crawler::TemplateDetector::new(); - - // Get all URLs to process - let all_urls: Vec = storage - .get_all_urls() - .iter() - .map(|url_data| url_data.url.clone()) - .collect(); - - for url in &all_urls { - if let Some(url_data) = storage.get_url_data_mut(url) { - if let Some(html_tree) = &mut url_data.html_tree { - apply_template_to_node(html_tree, &detector); - } - } - } -} - -/// Recursively apply template detection to HTML node content -fn apply_template_to_node( - node: &mut smart_crawler::HtmlNode, - detector: &smart_crawler::TemplateDetector, -) { - // Apply template detection to this node's content - if !node.content.is_empty() { - node.content = detector.apply_template(&node.content); - } - - // Recursively apply to all children - for child in &mut node.children { - apply_template_to_node(child, detector); - } -} - -fn print_html_tree_with_template( - node: &smart_crawler::HtmlNode, - indent: usize, - _use_template: bool, -) { - let indent_str = " ".repeat(indent); - - // Build the element info string with tag, id, and classes - let mut element_info = node.tag.clone(); - if let Some(id) = &node.id { - element_info.push_str(&format!("#{id}")); - } - if !node.classes.is_empty() { - element_info.push_str(&format!("[{}]", node.classes.join(" "))); - } - - if !node.content.is_empty() { - // Content already contains template patterns if template mode was enabled - println!( - "{}{}: {}", - indent_str, - element_info, - node.content.chars().take(100).collect::() - ); - } else { - println!("{indent_str}{element_info}"); - } - - for child in &node.children { - print_html_tree_with_template(child, indent + 1, _use_template); - } -} diff --git a/src/template_detection.rs b/src/template_detection.rs index b43e412..d3d9516 100644 --- a/src/template_detection.rs +++ b/src/template_detection.rs @@ -1,5 +1,52 @@ use regex::Regex; -use std::collections::HashMap; +use serde::{Deserialize, Serialize}; +use std::collections::{HashMap, HashSet}; + +/// Represents an element in the path from HTML root to a template-containing element +#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)] +pub struct ElementPathComponent { + pub tag: String, + pub classes: Vec, +} + +/// Complete path from HTML root to a template-containing element +#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)] +pub struct ElementPath { + pub components: Vec, + pub template_pattern: String, +} + +/// Store for tracking detected template paths across pages +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct TemplatePathStore { + pub detected_paths: HashSet, +} + +impl TemplatePathStore { + pub fn new() -> Self { + Self { + detected_paths: HashSet::new(), + } + } + + pub fn add_path(&mut self, path: ElementPath) { + self.detected_paths.insert(path); + } + + pub fn get_paths(&self) -> &HashSet { + &self.detected_paths + } + + pub fn to_serialized_string(&self) -> String { + serde_json::to_string_pretty(self).unwrap_or_default() + } +} + +impl Default for TemplatePathStore { + fn default() -> Self { + Self::new() + } +} /// Template variable types that can be detected #[derive(Debug, Clone, PartialEq)] @@ -254,6 +301,50 @@ impl TemplateDetector { content.to_string() } } + + /// Extract templates with their element paths from an HTML tree + pub fn extract_templates_with_paths(&self, root: &crate::HtmlNode) -> TemplatePathStore { + let mut store = TemplatePathStore::new(); + let mut current_path = Vec::new(); + self.extract_templates_recursive(root, &mut current_path, &mut store); + store + } + + fn extract_templates_recursive( + &self, + node: &crate::HtmlNode, + current_path: &mut Vec, + store: &mut TemplatePathStore, + ) { + // Add current node to path (excluding root html node if tag is empty) + if !node.tag.is_empty() { + current_path.push(ElementPathComponent { + tag: node.tag.clone(), + classes: node.classes.clone(), + }); + } + + // Check if current node has template-detectable content + if !node.content.is_empty() { + if let Some(template) = self.detect_template(&node.content) { + let element_path = ElementPath { + components: current_path.clone(), + template_pattern: template.pattern, + }; + store.add_path(element_path); + } + } + + // Recursively process children + for child in &node.children { + self.extract_templates_recursive(child, current_path, store); + } + + // Remove current node from path when backtracking + if !node.tag.is_empty() { + current_path.pop(); + } + } } impl Default for TemplateDetector {