diff --git a/src/cli.rs b/src/cli.rs index b65dd82..961537b 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -6,6 +6,7 @@ use url::Url; pub struct CliArgs { pub links: Vec, pub verbose: bool, + pub template: bool, } impl CliArgs { @@ -27,6 +28,12 @@ impl CliArgs { .help("Enable verbose output showing filtered HTML node tree") .action(clap::ArgAction::SetTrue), ) + .arg( + Arg::new("template") + .long("template") + .help("Enable template detection mode to identify patterns like '{count} comments' in HTML content") + .action(clap::ArgAction::SetTrue), + ) .get_matches(); let links: Vec = matches @@ -37,10 +44,12 @@ impl CliArgs { let validated_links = Self::validate_and_deduplicate_links(links)?; let verbose = matches.get_flag("verbose"); + let template = matches.get_flag("template"); Ok(CliArgs { links: validated_links, verbose, + template, }) } @@ -103,4 +112,19 @@ mod tests { assert!(result.is_err()); assert!(result.unwrap_err().contains("No valid URLs provided")); } + + #[test] + fn test_cli_template_flag() { + // Test that template flag is properly parsed (this is a simplified test + // since we can't easily test the full CLI parsing in unit tests) + let args = CliArgs { + links: vec!["https://example.com".to_string()], + verbose: true, + template: true, + }; + + assert!(args.template); + assert!(args.verbose); + assert_eq!(args.links.len(), 1); + } } diff --git a/src/lib.rs b/src/lib.rs index 79c1630..40d0fdd 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -2,10 +2,12 @@ pub mod browser; pub mod cli; pub mod html_parser; pub mod storage; +pub mod template_detection; pub mod utils; pub use browser::*; pub use cli::*; pub use html_parser::*; pub use storage::*; +pub use template_detection::*; pub use utils::*; diff --git a/src/main.rs b/src/main.rs index 120c5fc..e15ff62 100644 --- a/src/main.rs +++ b/src/main.rs @@ -151,25 +151,35 @@ async fn main() { } } - // Phase 3: Analyze domain duplicates - info!("Analyzing domain-level duplicate nodes"); + // Phase 2.5: Apply template detection if enabled + if args.template { + info!("Applying template detection to HTML content"); + apply_template_detection_to_storage(&mut storage); + } - for domain in domain_urls.keys() { - storage.analyze_domain_duplicates(domain); - if let Some(duplicates) = storage.get_domain_duplicates(domain) { - let duplicate_count = duplicates.get_duplicate_count(); - if duplicate_count > 0 { - info!( - "Found {} duplicate node patterns for domain {}", - duplicate_count, domain - ); - } else { - info!( - "No duplicate patterns found for domain {} (likely insufficient pages)", - domain - ); + // Phase 3: Analyze domain duplicates (skip if template mode is enabled) + if !args.template { + info!("Analyzing domain-level duplicate nodes"); + + for domain in domain_urls.keys() { + storage.analyze_domain_duplicates(domain); + if let Some(duplicates) = storage.get_domain_duplicates(domain) { + let duplicate_count = duplicates.get_duplicate_count(); + if duplicate_count > 0 { + info!( + "Found {} duplicate node patterns for domain {}", + duplicate_count, domain + ); + } else { + info!( + "No duplicate patterns found for domain {} (likely insufficient pages)", + domain + ); + } } } + } else { + info!("Skipping domain duplicate analysis in template mode"); } let _ = browser.close().await; @@ -188,15 +198,20 @@ async fn main() { if args.verbose { if let Some(html_tree) = &url_data.html_tree { - if let Some(domain_duplicates) = storage.get_domain_duplicates(&url_data.domain) + if args.template { + // In template mode, show HTML tree with template patterns (no duplicate filtering) + println!("HTML Tree with Template Patterns:"); + print_html_tree_with_template(html_tree, 0, false); + } else if let Some(domain_duplicates) = + storage.get_domain_duplicates(&url_data.domain) { let filtered_tree = HtmlParser::filter_domain_duplicates(html_tree, domain_duplicates); println!("Filtered HTML Tree (showing complete structure with duplicate marking):"); - print_html_tree(&filtered_tree, 0); + print_html_tree_with_template(&filtered_tree, 0, false); } else { println!("HTML Tree (no duplicates to filter):"); - print_html_tree(html_tree, 0); + print_html_tree_with_template(html_tree, 0, false); } } } @@ -260,7 +275,47 @@ async fn process_url( } } -fn print_html_tree(node: &smart_crawler::HtmlNode, indent: usize) { +/// Apply template detection to all HTML trees in storage +fn apply_template_detection_to_storage(storage: &mut smart_crawler::UrlStorage) { + let detector = smart_crawler::TemplateDetector::new(); + + // Get all URLs to process + let all_urls: Vec = storage + .get_all_urls() + .iter() + .map(|url_data| url_data.url.clone()) + .collect(); + + for url in &all_urls { + if let Some(url_data) = storage.get_url_data_mut(url) { + if let Some(html_tree) = &mut url_data.html_tree { + apply_template_to_node(html_tree, &detector); + } + } + } +} + +/// Recursively apply template detection to HTML node content +fn apply_template_to_node( + node: &mut smart_crawler::HtmlNode, + detector: &smart_crawler::TemplateDetector, +) { + // Apply template detection to this node's content + if !node.content.is_empty() { + node.content = detector.apply_template(&node.content); + } + + // Recursively apply to all children + for child in &mut node.children { + apply_template_to_node(child, detector); + } +} + +fn print_html_tree_with_template( + node: &smart_crawler::HtmlNode, + indent: usize, + _use_template: bool, +) { let indent_str = " ".repeat(indent); // Build the element info string with tag, id, and classes @@ -273,6 +328,7 @@ fn print_html_tree(node: &smart_crawler::HtmlNode, indent: usize) { } if !node.content.is_empty() { + // Content already contains template patterns if template mode was enabled println!( "{}{}: {}", indent_str, @@ -284,6 +340,6 @@ fn print_html_tree(node: &smart_crawler::HtmlNode, indent: usize) { } for child in &node.children { - print_html_tree(child, indent + 1); + print_html_tree_with_template(child, indent + 1, _use_template); } } diff --git a/src/template_detection.rs b/src/template_detection.rs new file mode 100644 index 0000000..b43e412 --- /dev/null +++ b/src/template_detection.rs @@ -0,0 +1,658 @@ +use regex::Regex; +use std::collections::HashMap; + +/// Template variable types that can be detected +#[derive(Debug, Clone, PartialEq)] +pub enum VariableType { + Number, // Integer numbers + Float, // Floating point numbers +} + +/// Represents a template pattern with variable placeholders +#[derive(Debug, Clone, PartialEq)] +pub struct Template { + pub pattern: String, // The template pattern like "{count} comments" + pub variables: Vec<(String, VariableType)>, // Variable names and their types +} + +/// Template detector that can identify common patterns in text +pub struct TemplateDetector { + // Common time unit patterns + time_units: HashMap, + // Common count/quantity descriptors + count_descriptors: HashMap, + // Regex patterns for detection + number_regex: Regex, + float_regex: Regex, +} + +impl TemplateDetector { + pub fn new() -> Self { + let mut time_units = HashMap::new(); + time_units.insert("second".to_string(), "time".to_string()); + time_units.insert("seconds".to_string(), "time".to_string()); + time_units.insert("minute".to_string(), "time".to_string()); + time_units.insert("minutes".to_string(), "time".to_string()); + time_units.insert("hour".to_string(), "time".to_string()); + time_units.insert("hours".to_string(), "time".to_string()); + time_units.insert("day".to_string(), "time".to_string()); + time_units.insert("days".to_string(), "time".to_string()); + time_units.insert("week".to_string(), "time".to_string()); + time_units.insert("weeks".to_string(), "time".to_string()); + time_units.insert("month".to_string(), "time".to_string()); + time_units.insert("months".to_string(), "time".to_string()); + time_units.insert("year".to_string(), "time".to_string()); + time_units.insert("years".to_string(), "time".to_string()); + + let mut count_descriptors = HashMap::new(); + count_descriptors.insert("comment".to_string(), "count".to_string()); + count_descriptors.insert("comments".to_string(), "count".to_string()); + count_descriptors.insert("reply".to_string(), "count".to_string()); + count_descriptors.insert("replies".to_string(), "count".to_string()); + count_descriptors.insert("like".to_string(), "count".to_string()); + count_descriptors.insert("likes".to_string(), "count".to_string()); + count_descriptors.insert("view".to_string(), "count".to_string()); + count_descriptors.insert("views".to_string(), "count".to_string()); + count_descriptors.insert("share".to_string(), "count".to_string()); + count_descriptors.insert("shares".to_string(), "count".to_string()); + count_descriptors.insert("point".to_string(), "count".to_string()); + count_descriptors.insert("points".to_string(), "count".to_string()); + count_descriptors.insert("upvote".to_string(), "count".to_string()); + count_descriptors.insert("upvotes".to_string(), "count".to_string()); + count_descriptors.insert("item".to_string(), "count".to_string()); + count_descriptors.insert("items".to_string(), "count".to_string()); + + let number_regex = Regex::new(r"\b\d+\b").unwrap(); + let float_regex = Regex::new(r"\b\d+\.\d+\b").unwrap(); + + TemplateDetector { + time_units, + count_descriptors, + number_regex, + float_regex, + } + } + + /// Detect template pattern in given text content + pub fn detect_template(&self, content: &str) -> Option