Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
159 changes: 81 additions & 78 deletions src/cli.rs
Original file line number Diff line number Diff line change
@@ -1,81 +1,67 @@
use clap::{Arg, Command};
use std::collections::HashSet;
use url::Url;

#[derive(Debug, Clone)]
pub struct CliArgs {
pub links: Vec<String>,
pub verbose: bool,
pub template: bool,
pub domain: String,
pub prep: bool,
}

impl CliArgs {
pub fn parse() -> Result<Self, String> {
let matches = Command::new("smart-crawler")
.version("0.3.2")
.version("0.4.1")
.about("A web crawler that uses WebDriver to extract and parse HTML content")
.arg(
Arg::new("link")
.long("link")
.value_name("URL")
.help("URL to crawl (can be specified multiple times)")
.action(clap::ArgAction::Append)
Arg::new("domain")
.long("domain")
.value_name("DOMAIN")
.help("Domain to crawl. Can be a URL or domain name")
.required(true),
)
.arg(
Arg::new("verbose")
.long("verbose")
.help("Enable verbose output showing filtered HTML node tree")
.action(clap::ArgAction::SetTrue),
)
.arg(
Arg::new("template")
.long("template")
.help("Enable template detection mode to identify patterns like '{count} comments' in HTML content")
Arg::new("prep")
.long("prep")
.help(
"Enable preparation mode to discover template patterns across domain pages",
)
.action(clap::ArgAction::SetTrue),
)
.get_matches();

let links: Vec<String> = matches
.get_many::<String>("link")
.unwrap_or_default()
.cloned()
.collect();
let domain_input = matches
.get_one::<String>("domain")
.ok_or("Domain argument is required")?;

let validated_links = Self::validate_and_deduplicate_links(links)?;
let verbose = matches.get_flag("verbose");
let template = matches.get_flag("template");
let validated_domain = Self::extract_domain(domain_input)?;
let prep = matches.get_flag("prep");

Ok(CliArgs {
links: validated_links,
verbose,
template,
domain: validated_domain,
prep,
})
}

fn validate_and_deduplicate_links(links: Vec<String>) -> Result<Vec<String>, String> {
let mut seen_urls = HashSet::new();
let mut validated_links = Vec::new();

for link in links {
match Url::parse(&link) {
Ok(url) => {
let normalized_url = url.to_string();
if seen_urls.insert(normalized_url.clone()) {
validated_links.push(normalized_url);
}
}
Err(_) => {
return Err(format!("Invalid URL: {link}"));
fn extract_domain(input: &str) -> Result<String, String> {
let trimmed = input.trim();

// Always try to parse as URL to validate the domain
let url_str = if trimmed.starts_with("http://") || trimmed.starts_with("https://") {
trimmed.to_string()
} else {
format!("https://{trimmed}")
};

match Url::parse(&url_str) {
Ok(url) => {
if let Some(domain) = url.host_str() {
Ok(domain.to_string())
} else {
Err(format!("Could not extract domain from: {input}"))
}
}
Err(_) => Err(format!("Invalid domain or URL: {input}")),
}

if validated_links.is_empty() {
return Err("No valid URLs provided".to_string());
}

Ok(validated_links)
}
}

Expand All @@ -84,47 +70,64 @@ mod tests {
use super::*;

#[test]
fn test_validate_and_deduplicate_links() {
let links = vec![
"https://example.com".to_string(),
"https://example.org".to_string(),
"https://example.com".to_string(), // duplicate
];

let result = CliArgs::validate_and_deduplicate_links(links).unwrap();
assert_eq!(result.len(), 2);
assert!(result.contains(&"https://example.com/".to_string()));
assert!(result.contains(&"https://example.org/".to_string()));
fn test_single_domain_parsing() {
// Test that single domain parsing works correctly
let args = CliArgs {
domain: "example.com".to_string(),
prep: false,
};

assert_eq!(args.domain, "example.com");
assert!(!args.prep);
}

#[test]
fn test_validate_invalid_url() {
let links = vec!["invalid-url".to_string()];
let result = CliArgs::validate_and_deduplicate_links(links);
assert!(result.is_err());
assert!(result.unwrap_err().contains("Invalid URL"));
fn test_extract_domain() {
// Test URL with protocol
assert_eq!(
CliArgs::extract_domain("https://example.com").unwrap(),
"example.com"
);
assert_eq!(
CliArgs::extract_domain("http://example.com/path").unwrap(),
"example.com"
);

// Test domain without protocol
assert_eq!(
CliArgs::extract_domain("example.com").unwrap(),
"example.com"
);
assert_eq!(
CliArgs::extract_domain(" example.com ").unwrap(),
"example.com"
);

// Test edge case - the URL crate behavior with multiple dots
assert_eq!(
CliArgs::extract_domain("invalid..domain").unwrap(),
"invalid..domain"
);
}

#[test]
fn test_validate_empty_links() {
let links = vec![];
let result = CliArgs::validate_and_deduplicate_links(links);
fn test_extract_domain_error() {
// Test that invalid domain extraction returns error
let result = CliArgs::extract_domain("://invalid");
assert!(result.is_err());
assert!(result.unwrap_err().contains("No valid URLs provided"));
assert!(result.unwrap_err().contains("Invalid domain or URL"));
}

#[test]
fn test_cli_template_flag() {
// Test that template flag is properly parsed (this is a simplified test
fn test_cli_prep_flag() {
// Test that prep flag is properly parsed (this is a simplified test
// since we can't easily test the full CLI parsing in unit tests)
let args = CliArgs {
links: vec!["https://example.com".to_string()],
verbose: true,
template: true,
domain: "example.com".to_string(),
prep: true,
};

assert!(args.template);
assert!(args.verbose);
assert_eq!(args.links.len(), 1);
assert!(args.prep);
assert_eq!(args.domain, "example.com");
}
}
Loading