Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions src/cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ use url::Url;
/// Command-line options for the crawler, as produced by the clap-based parser
/// in this file's `impl CliArgs`.
pub struct CliArgs {
/// URLs to process. When built by the parser this holds the output of
/// `validate_and_deduplicate_links`.
pub links: Vec<String>,
/// Set by the `verbose` flag ("Enable verbose output showing filtered HTML node tree").
pub verbose: bool,
/// Set by the `--template` flag, which enables template detection mode
/// (identifying patterns like '{count} comments' in HTML content).
pub template: bool,
}

impl CliArgs {
Expand All @@ -27,6 +28,12 @@ impl CliArgs {
.help("Enable verbose output showing filtered HTML node tree")
.action(clap::ArgAction::SetTrue),
)
.arg(
Arg::new("template")
.long("template")
.help("Enable template detection mode to identify patterns like '{count} comments' in HTML content")
.action(clap::ArgAction::SetTrue),
)
.get_matches();

let links: Vec<String> = matches
Expand All @@ -37,10 +44,12 @@ impl CliArgs {

let validated_links = Self::validate_and_deduplicate_links(links)?;
let verbose = matches.get_flag("verbose");
let template = matches.get_flag("template");

Ok(CliArgs {
links: validated_links,
verbose,
template,
})
}

Expand Down Expand Up @@ -103,4 +112,19 @@ mod tests {
assert!(result.is_err());
assert!(result.unwrap_err().contains("No valid URLs provided"));
}

/// Checks that a hand-built `CliArgs` exposes the `template` flag alongside
/// the other fields.
#[test]
fn test_cli_template_flag() {
    // Driving the real clap parser from a unit test is awkward, so this only
    // constructs the struct directly and reads the fields back.
    let single_link = vec![String::from("https://example.com")];
    let parsed = CliArgs {
        links: single_link,
        verbose: true,
        template: true,
    };

    assert!(parsed.template);
    assert!(parsed.verbose);
    assert_eq!(parsed.links.len(), 1);
}
}
2 changes: 2 additions & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,12 @@ pub mod browser;
pub mod cli;
pub mod html_parser;
pub mod storage;
pub mod template_detection;
pub mod utils;

pub use browser::*;
pub use cli::*;
pub use html_parser::*;
pub use storage::*;
pub use template_detection::*;
pub use utils::*;
98 changes: 77 additions & 21 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -151,25 +151,35 @@ async fn main() {
}
}

// Phase 3: Analyze domain duplicates
info!("Analyzing domain-level duplicate nodes");
// Phase 2.5: Apply template detection if enabled
if args.template {
info!("Applying template detection to HTML content");
apply_template_detection_to_storage(&mut storage);
}

for domain in domain_urls.keys() {
storage.analyze_domain_duplicates(domain);
if let Some(duplicates) = storage.get_domain_duplicates(domain) {
let duplicate_count = duplicates.get_duplicate_count();
if duplicate_count > 0 {
info!(
"Found {} duplicate node patterns for domain {}",
duplicate_count, domain
);
} else {
info!(
"No duplicate patterns found for domain {} (likely insufficient pages)",
domain
);
// Phase 3: Analyze domain duplicates (skip if template mode is enabled)
if !args.template {
info!("Analyzing domain-level duplicate nodes");

for domain in domain_urls.keys() {
storage.analyze_domain_duplicates(domain);
if let Some(duplicates) = storage.get_domain_duplicates(domain) {
let duplicate_count = duplicates.get_duplicate_count();
if duplicate_count > 0 {
info!(
"Found {} duplicate node patterns for domain {}",
duplicate_count, domain
);
} else {
info!(
"No duplicate patterns found for domain {} (likely insufficient pages)",
domain
);
}
}
}
} else {
info!("Skipping domain duplicate analysis in template mode");
}

let _ = browser.close().await;
Expand All @@ -188,15 +198,20 @@ async fn main() {

if args.verbose {
if let Some(html_tree) = &url_data.html_tree {
if let Some(domain_duplicates) = storage.get_domain_duplicates(&url_data.domain)
if args.template {
// In template mode, show HTML tree with template patterns (no duplicate filtering)
println!("HTML Tree with Template Patterns:");
print_html_tree_with_template(html_tree, 0, false);
} else if let Some(domain_duplicates) =
storage.get_domain_duplicates(&url_data.domain)
{
let filtered_tree =
HtmlParser::filter_domain_duplicates(html_tree, domain_duplicates);
println!("Filtered HTML Tree (showing complete structure with duplicate marking):");
print_html_tree(&filtered_tree, 0);
print_html_tree_with_template(&filtered_tree, 0, false);
} else {
println!("HTML Tree (no duplicates to filter):");
print_html_tree(html_tree, 0);
print_html_tree_with_template(html_tree, 0, false);
}
}
}
Expand Down Expand Up @@ -260,7 +275,47 @@ async fn process_url(
}
}

fn print_html_tree(node: &smart_crawler::HtmlNode, indent: usize) {
/// Runs template detection over every HTML tree currently held in `storage`,
/// rewriting node content in place.
fn apply_template_detection_to_storage(storage: &mut smart_crawler::UrlStorage) {
    let detector = smart_crawler::TemplateDetector::new();

    // Snapshot the URL keys up front so the shared borrow taken by
    // `get_all_urls` has ended before entries are borrowed mutably below.
    let mut pending_urls: Vec<String> = Vec::new();
    for entry in storage.get_all_urls().iter() {
        pending_urls.push(entry.url.clone());
    }

    for url in &pending_urls {
        // URLs that are no longer present, or that have no parsed tree, are skipped.
        if let Some(entry) = storage.get_url_data_mut(url) {
            if let Some(tree) = entry.html_tree.as_mut() {
                apply_template_to_node(tree, &detector);
            }
        }
    }
}

/// Rewrites the content of `node` and of every descendant with the output of
/// `detector.apply_template`, visiting the node first and then its children
/// in order.
fn apply_template_to_node(
    node: &mut smart_crawler::HtmlNode,
    detector: &smart_crawler::TemplateDetector,
) {
    // Nodes with empty content are left untouched; only their children are visited.
    let has_content = !node.content.is_empty();
    if has_content {
        let templated = detector.apply_template(&node.content);
        node.content = templated;
    }

    // Descend into each child in document order.
    for descendant in node.children.iter_mut() {
        apply_template_to_node(descendant, detector);
    }
}

fn print_html_tree_with_template(
node: &smart_crawler::HtmlNode,
indent: usize,
_use_template: bool,
) {
let indent_str = " ".repeat(indent);

// Build the element info string with tag, id, and classes
Expand All @@ -273,6 +328,7 @@ fn print_html_tree(node: &smart_crawler::HtmlNode, indent: usize) {
}

if !node.content.is_empty() {
// Content already contains template patterns if template mode was enabled
println!(
"{}{}: {}",
indent_str,
Expand All @@ -284,6 +340,6 @@ fn print_html_tree(node: &smart_crawler::HtmlNode, indent: usize) {
}

for child in &node.children {
print_html_tree(child, indent + 1);
print_html_tree_with_template(child, indent + 1, _use_template);
}
}
Loading