pixlie · brainless · Jul 9, 2025 · Jul 9, 2025 · Jul 9, 2025
diff --git a/src/cli.rs b/src/cli.rs
@@ -1,81 +1,99 @@
 use clap::{Arg, Command};
-use std::collections::HashSet;
 use url::Url;
 
+/// Command-line options for the `smart-crawler` binary.
 #[derive(Debug, Clone)]
 pub struct CliArgs {
-    pub links: Vec<String>,
-    pub verbose: bool,
-    pub template: bool,
+    /// Host to crawl; `parse` fills this with the host extracted from `--domain`.
+    pub domain: String,
+    /// `true` when the `--prep` flag was passed.
+    pub prep: bool,
 }
 
 impl CliArgs {
+    /// Builds [`CliArgs`] from the process command line.
+    ///
+    /// Usage errors (for example a missing `--domain`) are handled by clap's
+    /// `get_matches`, which prints the message and exits the process.
+    ///
+    /// # Errors
+    ///
+    /// Returns a message when `--domain` has no value or cannot be reduced to a host.
     pub fn parse() -> Result<Self, String> {
         let matches = Command::new("smart-crawler")
-            .version("0.3.2")
+            .version("0.4.1")
             .about("A web crawler that uses WebDriver to extract and parse HTML content")
             .arg(
-                Arg::new("link")
-                    .long("link")
-                    .value_name("URL")
-                    .help("URL to crawl (can be specified multiple times)")
-                    .action(clap::ArgAction::Append)
+                Arg::new("domain")
+                    .long("domain")
+                    .value_name("DOMAIN")
+                    .help("Domain to crawl. Can be a URL or domain name")
                     .required(true),
             )
             .arg(
-                Arg::new("verbose")
-                    .long("verbose")
-                    .help("Enable verbose output showing filtered HTML node tree")
-                    .action(clap::ArgAction::SetTrue),
-            )
-            .arg(
-                Arg::new("template")
-                    .long("template")
-                    .help("Enable template detection mode to identify patterns like '{count} comments' in HTML content")
+                Arg::new("prep")
+                    .long("prep")
+                    .help(
+                        "Enable preparation mode to discover template patterns across domain pages",
+                    )
                     .action(clap::ArgAction::SetTrue),
             )
             .get_matches();
 
-        let links: Vec<String> = matches
-            .get_many::<String>("link")
-            .unwrap_or_default()
-            .cloned()
-            .collect();
+        // `--domain` is `required(true)`, so clap normally guarantees a value here.
+        let domain_input = matches
+            .get_one::<String>("domain")
+            .ok_or("Domain argument is required")?;
 
-        let validated_links = Self::validate_and_deduplicate_links(links)?;
-        let verbose = matches.get_flag("verbose");
-        let template = matches.get_flag("template");
+        let validated_domain = Self::extract_domain(domain_input)?;
+        let prep = matches.get_flag("prep");
 
         Ok(CliArgs {
-            links: validated_links,
-            verbose,
-            template,
+            domain: validated_domain,
+            prep,
         })
     }
 
-    fn validate_and_deduplicate_links(links: Vec<String>) -> Result<Vec<String>, String> {
-        let mut seen_urls = HashSet::new();
-        let mut validated_links = Vec::new();
-
-        for link in links {
-            match Url::parse(&link) {
-                Ok(url) => {
-                    let normalized_url = url.to_string();
-                    if seen_urls.insert(normalized_url.clone()) {
-                        validated_links.push(normalized_url);
-                    }
-                }
-                Err(_) => {
-                    return Err(format!("Invalid URL: {link}"));
+    /// Reduces a URL or bare host name to its host component.
+    ///
+    /// Input that does not start with `scheme://` is parsed as if prefixed with `https://`.
+    /// Any syntactically valid scheme is recognised, in any letter case, so `HTTP://host`
+    /// and `ftp://host` yield `host` rather than the scheme name.
+    ///
+    /// # Errors
+    ///
+    /// Returns a message when the text is not a parseable URL or the URL has no host.
+    fn extract_domain(input: &str) -> Result<String, String> {
+        let trimmed = input.trim();
+
+        // Accept any RFC 3986 scheme (ALPHA, then ALPHA / DIGIT / "+" / "-" / "."), not
+        // only a lowercase "http"/"https" literal: prepending "https://" to an input that
+        // already carries a scheme makes the parser report the scheme as the host.
+        let has_scheme = match trimmed.split_once("://") {
+            Some((scheme, _)) => {
+                scheme.starts_with(|c: char| c.is_ascii_alphabetic())
+                    && scheme
+                        .chars()
+                        .all(|c| c.is_ascii_alphanumeric() || matches!(c, '+' | '-' | '.'))
+            }
+            None => false,
+        };
+        let url_str = if has_scheme {
+            trimmed.to_string()
+        } else {
+            format!("https://{trimmed}")
+        };
+
+        match Url::parse(&url_str) {
+            Ok(url) => {
+                if let Some(domain) = url.host_str() {
+                    Ok(domain.to_string())
+                } else {
+                    Err(format!("Could not extract domain from: {input}"))
                 }
             }
+            Err(_) => Err(format!("Invalid domain or URL: {input}")),
         }
-
-        if validated_links.is_empty() {
-            return Err("No valid URLs provided".to_string());
-        }
-
-        Ok(validated_links)
     }
 }
 
@@ -84,47 +102,64 @@ mod tests {
     use super::*;
 
     #[test]
-    fn test_validate_and_deduplicate_links() {
-        let links = vec![
-            "https://example.com".to_string(),
-            "https://example.org".to_string(),
-            "https://example.com".to_string(), // duplicate
-        ];
-
-        let result = CliArgs::validate_and_deduplicate_links(links).unwrap();
-        assert_eq!(result.len(), 2);
-        assert!(result.contains(&"https://example.com/".to_string()));
-        assert!(result.contains(&"https://example.org/".to_string()));
+    fn test_single_domain_parsing() {
+        // Feed a bare domain through the same extraction step `CliArgs::parse` uses
+        let args = CliArgs {
+            domain: CliArgs::extract_domain("example.com").unwrap(),
+            prep: false,
+        };
+
+        assert_eq!(args.domain, "example.com");
+        assert!(!args.prep);
     }
 
     #[test]
-    fn test_validate_invalid_url() {
-        let links = vec!["invalid-url".to_string()];
-        let result = CliArgs::validate_and_deduplicate_links(links);
-        assert!(result.is_err());
-        assert!(result.unwrap_err().contains("Invalid URL"));
+    fn test_extract_domain() {
+        // (input, expected host) pairs
+        let cases = [
+            // URL with protocol
+            ("https://example.com", "example.com"),
+            ("http://example.com/path", "example.com"),
+            // Domain without protocol
+            ("example.com", "example.com"),
+            ("  example.com  ", "example.com"),
+            ("sub.example.com", "sub.example.com"),
+            // Port, path and query are dropped; host letters are lowercased
+            ("example.com:8080", "example.com"),
+            ("https://Example.COM:8443/a?b=1", "example.com"),
+            // Credentials are not part of the host
+            ("https://user:pw@example.com", "example.com"),
+            // Edge case - the URL crate accepts empty labels between dots
+            ("invalid..domain", "invalid..domain"),
+        ];
+
+        for (input, expected) in cases {
+            assert_eq!(
+                CliArgs::extract_domain(input).unwrap(),
+                expected,
+                "input: {input:?}"
+            );
+        }
     }
 
     #[test]
-    fn test_validate_empty_links() {
-        let links = vec![];
-        let result = CliArgs::validate_and_deduplicate_links(links);
-        assert!(result.is_err());
-        assert!(result.unwrap_err().contains("No valid URLs provided"));
+    fn test_extract_domain_error() {
+        // A string the URL parser rejects must surface as an error message
+        let message = CliArgs::extract_domain("://invalid")
+            .expect_err("\"://invalid\" should be rejected");
+        assert!(message.contains("Invalid domain or URL"));
     }
 
     #[test]
-    fn test_cli_template_flag() {
-        // Test that template flag is properly parsed (this is a simplified test
-        // since we can't easily test the full CLI parsing in unit tests)
-        let args = CliArgs {
-            links: vec!["https://example.com".to_string()],
-            verbose: true,
-            template: true,
+    fn test_cli_prep_flag() {
+        // Simplified check: `CliArgs::parse` gets its input from clap's `get_matches`,
+        // which is hard to drive from a unit test, so the struct is built by hand
+        let CliArgs { domain, prep } = CliArgs {
+            domain: String::from("example.com"),
+            prep: true,
         };
 
-        assert!(args.template);
-        assert!(args.verbose);
-        assert_eq!(args.links.len(), 1);
+        assert!(prep);
+        assert_eq!(domain, "example.com");
     }
 }