diff --git a/src/cli.rs b/src/cli.rs index df337a9..b65dd82 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -5,6 +5,7 @@ use url::Url; #[derive(Debug, Clone)] pub struct CliArgs { pub links: Vec, + pub verbose: bool, } impl CliArgs { @@ -20,6 +21,12 @@ impl CliArgs { .action(clap::ArgAction::Append) .required(true), ) + .arg( + Arg::new("verbose") + .long("verbose") + .help("Enable verbose output showing filtered HTML node tree") + .action(clap::ArgAction::SetTrue), + ) .get_matches(); let links: Vec = matches @@ -29,9 +36,11 @@ impl CliArgs { .collect(); let validated_links = Self::validate_and_deduplicate_links(links)?; + let verbose = matches.get_flag("verbose"); Ok(CliArgs { links: validated_links, + verbose, }) } diff --git a/src/html_parser.rs b/src/html_parser.rs index 30e5b2e..a577689 100644 --- a/src/html_parser.rs +++ b/src/html_parser.rs @@ -1,7 +1,9 @@ -use crate::utils::{is_numeric_id, trim_and_clean_text}; +use crate::storage::{DomainDuplicates, NodeSignature}; +use crate::utils::trim_and_clean_text; use scraper::{ElementRef, Html, Selector}; use serde::{Deserialize, Serialize}; use std::collections::HashSet; +use url::Url; #[derive(Debug, Clone, Serialize, Deserialize)] pub struct HtmlNode { @@ -93,9 +95,7 @@ impl HtmlParser { if let Some(child_element) = ElementRef::wrap(child) { let child_node = self.parse_element(child_element); - if !self.is_blank_node(&child_node) - && !self.is_duplicate_node(&child_node, &children) - { + if !self.is_blank_node(&child_node) { children.push(child_node); } } @@ -128,7 +128,7 @@ impl HtmlParser { .value() .attr("id") .map(|id| id.trim().to_string()) - .filter(|id| !id.is_empty() && !is_numeric_id(id)) + .filter(|id| !id.is_empty()) } fn extract_text_content(&self, element: ElementRef) -> String { @@ -139,13 +139,70 @@ impl HtmlParser { node.content.trim().is_empty() && node.children.is_empty() } - fn is_duplicate_node(&self, node: &HtmlNode, existing_children: &[HtmlNode]) -> bool { - 
existing_children.iter().any(|existing| { - existing.tag == node.tag - && existing.classes == node.classes - && existing.id == node.id - && existing.content == node.content - }) + pub fn filter_domain_duplicates( + node: &HtmlNode, + domain_duplicates: &DomainDuplicates, + ) -> HtmlNode { + let signature = NodeSignature::from_html_node(node); + + // Create the filtered node structure + let mut filtered_node = HtmlNode::new( + node.tag.clone(), + node.classes.clone(), + node.id.clone(), + if domain_duplicates.is_duplicate(&signature) { + "[FILTERED DUPLICATE]".to_string() + } else { + node.content.clone() + }, + ); + + // Always process children to maintain structure + for child in &node.children { + let filtered_child = Self::filter_domain_duplicates(child, domain_duplicates); + filtered_node.add_child(filtered_child); + } + + filtered_node + } + + pub fn extract_links(&self, html: &str, base_domain: &str) -> Vec { + let document = Html::parse_document(html); + let link_selector = Selector::parse("a[href]").unwrap(); + let mut links = HashSet::new(); + + for element in document.select(&link_selector) { + if let Some(href) = element.value().attr("href") { + if let Ok(url) = self.resolve_url(href, base_domain) { + if self.is_same_domain(&url, base_domain) { + links.insert(url); + } + } + } + } + + links.into_iter().collect() + } + + fn resolve_url(&self, href: &str, base_domain: &str) -> Result { + if href.starts_with("http://") || href.starts_with("https://") { + Ok(href.to_string()) + } else if href.starts_with("//") { // protocol-relative ("//host/path"): must be tested before the single-slash case, which it also matches + Ok(format!("https:{href}")) + } else if href.starts_with('/') { // root-relative path on the base domain + Ok(format!("https://{base_domain}{href}")) + } else { + Ok(format!("https://{base_domain}/{href}")) + } + } + + fn is_same_domain(&self, url: &str, base_domain: &str) -> bool { + if let Ok(parsed_url) = Url::parse(url) { + if let Some(host) = parsed_url.host_str() { + return host == base_domain || host.ends_with(&format!(".{base_domain}")); + } + } + false } } @@ -204,7 +261,7 @@ mod 
tests { } #[test] - fn test_html_parser_ignores_numeric_ids() { + fn test_html_parser_preserves_numeric_ids() { let parser = HtmlParser::new(); let html = r#"
Text
"#; let node = parser.parse(html); @@ -212,7 +269,7 @@ mod tests { let body = &node.children[0]; assert_eq!(body.children.len(), 1); let div_node = &body.children[0]; - assert_eq!(div_node.id, None); + assert_eq!(div_node.id, Some("123".to_string())); } #[test] @@ -245,4 +302,62 @@ mod tests { assert_eq!(body.children.len(), 1); assert_eq!(body.children[0].tag, "p"); } + + #[test] + fn test_extract_links() { + let parser = HtmlParser::new(); + let html = r#" + Link 1 + Link 2 + External Link + Protocol-relative + "#; + + let links = parser.extract_links(html, "example.com"); + + assert!(links.contains(&"https://example.com/page1".to_string())); + assert!(links.contains(&"https://example.com/page2".to_string())); + // Protocol-relative URLs are handled correctly + assert!(links.iter().any(|link| link.contains("page4"))); + assert!(!links.iter().any(|link| link.contains("other.com"))); + } + + #[test] + fn test_filter_domain_duplicates() { + use crate::storage::{DomainDuplicates, NodeSignature}; + + let parser = HtmlParser::new(); + let html = r#"
Main content
"#; + let node = parser.parse(html); + + let mut duplicates = DomainDuplicates::new(); + + // Find the nav element in the parsed tree and get its signature + let body = &node.children[0]; + let nav_node = &body.children[0]; // The nav element + let nav_signature = NodeSignature::from_html_node(nav_node); + duplicates.add_duplicate_node(nav_signature); + + let filtered = HtmlParser::filter_domain_duplicates(&node, &duplicates); + + // The structure should be preserved, but nav content should be marked as filtered + assert_eq!(filtered.tag, "html"); + let body = &filtered.children[0]; + assert_eq!(body.tag, "body"); + assert_eq!(body.children.len(), 2); // Both nav and div should remain + assert_eq!(body.children[0].tag, "nav"); + assert_eq!(body.children[0].content, "[FILTERED DUPLICATE]"); + assert_eq!(body.children[1].tag, "div"); + assert_eq!(body.children[1].content, "Main content"); + } + + #[test] + fn test_is_same_domain() { + let parser = HtmlParser::new(); + + assert!(parser.is_same_domain("https://example.com/page", "example.com")); + assert!(parser.is_same_domain("https://sub.example.com/page", "example.com")); + assert!(!parser.is_same_domain("https://other.com/page", "example.com")); + assert!(!parser.is_same_domain("https://notexample.com/page", "example.com")); + } } diff --git a/src/main.rs b/src/main.rs index 72f4f0a..120c5fc 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,4 +1,5 @@ use smart_crawler::{Browser, CliArgs, FetchStatus, HtmlParser, UrlStorage}; +use std::collections::{HashMap, HashSet}; use tracing::{debug, error, info}; #[tokio::main] @@ -45,42 +46,128 @@ async fn main() { let parser = HtmlParser::new(); + // Phase 1: Preparation stage - fetch additional URLs from same domains + info!("Starting preparation stage to collect URLs from same domains"); + + let mut domain_urls: HashMap> = HashMap::new(); + + // Group initial URLs by domain for url in &args.links { - info!("Processing URL: {}", url); + if let Some(domain) = 
smart_crawler::utils::extract_domain_from_url(url) { + domain_urls.entry(domain).or_default().insert(url.clone()); + } + } - if let Some(url_data) = storage.get_url_data_mut(url) { - url_data.update_status(FetchStatus::InProgress); + // Add root URLs for each domain if not already present + for (domain, urls) in &mut domain_urls { + let root_url = smart_crawler::utils::construct_root_url(domain); + if !urls.contains(&root_url) { + urls.insert(root_url.clone()); + storage.add_url(root_url); + info!( + "Added root URL for domain {}: {}", + domain, + smart_crawler::utils::construct_root_url(domain) + ); } + } - match browser.navigate_to(url).await { - Ok(()) => { - debug!("Successfully navigated to {}", url); + // For each domain, try to find additional URLs + for (domain, urls) in &mut domain_urls { + if urls.len() < 3 { + info!( + "Domain {} has only {} URL(s), searching for more...", + domain, + urls.len() + ); - match browser.get_html_source().await { + // Pick the first URL to extract links from + if let Some(first_url) = urls.iter().next() { + match process_url(&mut browser, &parser, &mut storage, first_url, true).await { Ok(html_source) => { - let title = browser.get_page_title().await.ok(); - let html_tree = parser.parse(&html_source); + let additional_urls = parser.extract_links(&html_source, domain); + let mut added_count = 0; - if let Some(url_data) = storage.get_url_data_mut(url) { - url_data.set_html_data(html_source, html_tree, title); - url_data.update_status(FetchStatus::Success); + for additional_url in additional_urls { + if urls.len() >= 3 { + break; + } + if urls.insert(additional_url.clone()) { + storage.add_url(additional_url); + added_count += 1; + } } - info!("Successfully processed {}", url); + info!( + "Found {} additional URLs for domain {}", + added_count, domain + ); } Err(e) => { - error!("Failed to get HTML source for {}: {}", url, e); - if let Some(url_data) = storage.get_url_data_mut(url) { - 
url_data.update_status(FetchStatus::Failed(e.to_string())); - } + error!("Failed to extract links from {}: {}", first_url, e); } } } - Err(e) => { - error!("Failed to navigate to {}: {}", url, e); - if let Some(url_data) = storage.get_url_data_mut(url) { - url_data.update_status(FetchStatus::Failed(e.to_string())); - } + } + } + + // Phase 2: Process all URLs (initial + discovered) with root URL prioritization + info!("Processing all URLs with root URL prioritization"); + + let mut all_urls: Vec = Vec::new(); + + // First, add all user-specified URLs + for url in &args.links { + all_urls.push(url.clone()); + } + + // Then, add root URLs for each domain (if not already in user-specified URLs) + for domain in domain_urls.keys() { + let root_url = smart_crawler::utils::construct_root_url(domain); + if !args.links.contains(&root_url) { + all_urls.push(root_url); + } + } + + // Finally, add all other discovered URLs + for urls in domain_urls.values() { + for url in urls { + if !args.links.contains(url) && !smart_crawler::utils::is_root_url(url) { + all_urls.push(url.clone()); + } + } + } + + for url in &all_urls { + if let Some(url_data) = storage.get_url_data(url) { + if matches!(url_data.status, FetchStatus::Success) { + continue; // Already processed + } + } + + match process_url(&mut browser, &parser, &mut storage, url, false).await { + Ok(_) => info!("Successfully processed {}", url), + Err(e) => error!("Failed to process {}: {}", url, e), + } + } + + // Phase 3: Analyze domain duplicates + info!("Analyzing domain-level duplicate nodes"); + + for domain in domain_urls.keys() { + storage.analyze_domain_duplicates(domain); + if let Some(duplicates) = storage.get_domain_duplicates(domain) { + let duplicate_count = duplicates.get_duplicate_count(); + if duplicate_count > 0 { + info!( + "Found {} duplicate node patterns for domain {}", + duplicate_count, domain + ); + } else { + info!( + "No duplicate patterns found for domain {} (likely insufficient pages)", + domain + 
); } } } @@ -98,9 +185,105 @@ async fn main() { println!("URL: {}", url_data.url); println!("Title: {title}"); println!("Domain: {}", url_data.domain); + + if args.verbose { + if let Some(html_tree) = &url_data.html_tree { + if let Some(domain_duplicates) = storage.get_domain_duplicates(&url_data.domain) + { + let filtered_tree = + HtmlParser::filter_domain_duplicates(html_tree, domain_duplicates); + println!("Filtered HTML Tree (showing complete structure with duplicate marking):"); + print_html_tree(&filtered_tree, 0); + } else { + println!("HTML Tree (no duplicates to filter):"); + print_html_tree(html_tree, 0); + } + } + } + println!("---"); } } - info!("SmartCrawler finished processing {} URLs", args.links.len()); + info!("SmartCrawler finished processing {} URLs", all_urls.len()); +} + +async fn process_url( + browser: &mut Browser, + parser: &HtmlParser, + storage: &mut UrlStorage, + url: &str, + return_html: bool, +) -> Result { + info!("Processing URL: {}", url); + + if let Some(url_data) = storage.get_url_data_mut(url) { + url_data.update_status(FetchStatus::InProgress); + } + + match browser.navigate_to(url).await { + Ok(()) => { + debug!("Successfully navigated to {}", url); + + match browser.get_html_source().await { + Ok(html_source) => { + let title = browser.get_page_title().await.ok(); + let html_tree = parser.parse(&html_source); + + if let Some(url_data) = storage.get_url_data_mut(url) { + url_data.set_html_data(html_source.clone(), html_tree, title); + url_data.update_status(FetchStatus::Success); + } + + if return_html { + Ok(html_source) + } else { + Ok(String::new()) + } + } + Err(e) => { + let error_msg = format!("Failed to get HTML source: {e}"); + if let Some(url_data) = storage.get_url_data_mut(url) { + url_data.update_status(FetchStatus::Failed(error_msg.clone())); + } + Err(error_msg) + } + } + } + Err(e) => { + let error_msg = format!("Failed to navigate: {e}"); + if let Some(url_data) = storage.get_url_data_mut(url) { + 
url_data.update_status(FetchStatus::Failed(error_msg.clone())); + } + Err(error_msg) + } + } +} + +fn print_html_tree(node: &smart_crawler::HtmlNode, indent: usize) { + let indent_str = " ".repeat(indent); + + // Build the element info string with tag, id, and classes + let mut element_info = node.tag.clone(); + if let Some(id) = &node.id { + element_info.push_str(&format!("#{id}")); + } + if !node.classes.is_empty() { + element_info.push_str(&format!("[{}]", node.classes.join(" "))); + } + + if !node.content.is_empty() { + println!( + "{}{}: {}", + indent_str, + element_info, + node.content.chars().take(100).collect::() + ); + } else { + println!("{indent_str}{element_info}"); + } + + for child in &node.children { + print_html_tree(child, indent + 1); + } } diff --git a/src/storage.rs b/src/storage.rs index 0eb5e4f..467c887 100644 --- a/src/storage.rs +++ b/src/storage.rs @@ -2,7 +2,9 @@ use crate::html_parser::HtmlNode; use crate::utils::extract_domain_from_url; use chrono::{DateTime, Utc}; use serde::{Deserialize, Serialize}; -use std::collections::HashMap; +use std::collections::hash_map::DefaultHasher; +use std::collections::{HashMap, HashSet}; +use std::hash::{Hash, Hasher}; #[derive(Debug, Clone, Serialize, Deserialize)] pub enum FetchStatus { @@ -62,12 +64,14 @@ impl UrlData { #[derive(Debug, Default)] pub struct UrlStorage { urls_by_domain: HashMap>, + domain_duplicates: HashMap, } impl UrlStorage { pub fn new() -> Self { UrlStorage { urls_by_domain: HashMap::new(), + domain_duplicates: HashMap::new(), } } @@ -111,6 +115,94 @@ impl UrlStorage { .filter(|url_data| matches!(url_data.status, FetchStatus::Success)) .collect() } + + pub fn analyze_domain_duplicates(&mut self, domain: &str) { + if let Some(domain_urls) = self.urls_by_domain.get(domain) { + let completed_urls: Vec<_> = domain_urls + .values() + .filter(|url_data| matches!(url_data.status, FetchStatus::Success)) + .collect(); + + if completed_urls.len() < 2 { + return; // Need at least 2 pages to 
find duplicates + } + + let mut node_occurrence_count: HashMap = HashMap::new(); + + // Count occurrences of each node signature across all pages + for url_data in &completed_urls { + if let Some(html_tree) = &url_data.html_tree { + Self::collect_node_signatures(html_tree, &mut node_occurrence_count); + } + } + + // Mark nodes that appear in 2 or more pages as duplicates + let domain_duplicates = self + .domain_duplicates + .entry(domain.to_string()) + .or_default(); + for (signature, count) in node_occurrence_count { + if count >= 2 { + domain_duplicates.add_duplicate_node(signature); + } + } + } + } + + fn collect_node_signatures(node: &HtmlNode, signatures: &mut HashMap) { + // Skip structural/container elements that naturally appear on every page + if !Self::is_structural_element(&node.tag) { + let signature = NodeSignature::from_html_node(node); + // Only count nodes with meaningful content or specific styling + if Self::is_meaningful_node(node) { + *signatures.entry(signature).or_insert(0) += 1; + } + } + + for child in &node.children { + Self::collect_node_signatures(child, signatures); + } + } + + fn is_structural_element(tag: &str) -> bool { + matches!( + tag, + "html" | "head" | "body" | "main" | "article" | "section" + ) + } + + fn is_meaningful_node(node: &HtmlNode) -> bool { + // Consider a node meaningful if it has: + // - Non-empty content (text content or children), OR + // - Specific CSS classes/IDs that indicate styling, OR + // - Is a semantic element that likely appears across multiple pages + (!node.content.trim().is_empty() || !node.children.is_empty()) + || !node.classes.is_empty() + || node.id.is_some() + || matches!( + node.tag.as_str(), + "nav" + | "header" + | "footer" + | "aside" + | "form" + | "button" + | "a" + | "ul" + | "ol" + | "menu" + ) + } + + pub fn get_domain_duplicates(&self, domain: &str) -> Option<&DomainDuplicates> { + self.domain_duplicates.get(domain) + } + + pub fn add_urls_from_same_domain(&mut self, urls: Vec) { + for 
url in urls { + self.add_url(url); + } + } } #[cfg(test)] @@ -164,4 +256,193 @@ mod tests { assert!(matches!(url_data.status, FetchStatus::InProgress)); assert!(url_data.updated_at > original_time); } + + #[test] + fn test_add_urls_from_same_domain() { + let mut storage = UrlStorage::new(); + let urls = vec![ + "https://example.com/page1".to_string(), + "https://example.com/page2".to_string(), + "https://example.com/page3".to_string(), + ]; + + storage.add_urls_from_same_domain(urls); + + let example_com_urls = storage.get_urls_by_domain("example.com"); + assert!(example_com_urls.is_some()); + assert_eq!(example_com_urls.unwrap().len(), 3); + } + + #[test] + fn test_analyze_domain_duplicates() { + use crate::html_parser::HtmlParser; + + let mut storage = UrlStorage::new(); + let parser = HtmlParser::new(); + + storage.add_url("https://example.com/page1".to_string()); + storage.add_url("https://example.com/page2".to_string()); + + // Create mock HTML trees with common elements + let html1 = r#"
Page 1 content
"#; + let html2 = r#"
Page 2 content
"#; + + let tree1 = parser.parse(html1); + let tree2 = parser.parse(html2); + + // Set the HTML data for both URLs + if let Some(url_data) = storage.get_url_data_mut("https://example.com/page1") { + url_data.set_html_data(html1.to_string(), tree1, Some("Page 1".to_string())); + url_data.update_status(FetchStatus::Success); + } + + if let Some(url_data) = storage.get_url_data_mut("https://example.com/page2") { + url_data.set_html_data(html2.to_string(), tree2, Some("Page 2".to_string())); + url_data.update_status(FetchStatus::Success); + } + + // Analyze domain duplicates + storage.analyze_domain_duplicates("example.com"); + + let duplicates = storage.get_domain_duplicates("example.com"); + assert!(duplicates.is_some()); + assert!(duplicates.unwrap().get_duplicate_count() > 0); + } + + #[test] + fn test_node_signature_creation() { + use crate::html_parser::HtmlNode; + + let node = HtmlNode::new( + "div".to_string(), + vec!["container".to_string(), "main".to_string()], + Some("content".to_string()), + "Test content".to_string(), + ); + + let signature = NodeSignature::from_html_node(&node); + assert_eq!(signature.tag, "div"); + assert_eq!(signature.classes, vec!["container", "main"]); + assert_eq!(signature.id, Some("content".to_string())); + assert_eq!(signature.content, "Test content"); + assert!(!signature.content_hash.is_empty()); + } + + #[test] + fn test_domain_duplicates_detection() { + let mut duplicates = DomainDuplicates::new(); + + let signature = NodeSignature { + tag: "nav".to_string(), + classes: vec!["navbar".to_string()], + id: None, + content: "Navigation".to_string(), + content_hash: "test_hash".to_string(), + }; + + assert!(!duplicates.is_duplicate(&signature)); + + duplicates.add_duplicate_node(signature.clone()); + assert!(duplicates.is_duplicate(&signature)); + assert_eq!(duplicates.get_duplicate_count(), 1); + } + + #[test] + fn test_content_hash_includes_children() { + use crate::html_parser::HtmlParser; + + let parser = HtmlParser::new(); + + 
// Two divs with same tag/class but different children + let html1 = r#"

Content 1

"#; + let html2 = r#"

Content 2

"#; + let html3 = r#"

Content 1

"#; // Same as html1 + + let node1 = parser.parse(html1); + let node2 = parser.parse(html2); + let node3 = parser.parse(html3); + + let sig1 = NodeSignature::from_html_node(&node1); + let sig2 = NodeSignature::from_html_node(&node2); + let sig3 = NodeSignature::from_html_node(&node3); + + // sig1 and sig2 should be different due to different child content + assert_ne!(sig1.content_hash, sig2.content_hash); + + // sig1 and sig3 should be identical + assert_eq!(sig1.content_hash, sig3.content_hash); + } +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Hash)] +pub struct NodeSignature { + pub tag: String, + pub classes: Vec, + pub id: Option, + pub content: String, + pub content_hash: String, // Hash of complete structure including children +} + +impl NodeSignature { + pub fn from_html_node(node: &HtmlNode) -> Self { + let content_hash = Self::compute_content_hash(node); + + NodeSignature { + tag: node.tag.clone(), + classes: node.classes.clone(), + id: node.id.clone(), + content: node.content.clone(), + content_hash, + } + } + + fn compute_content_hash(node: &HtmlNode) -> String { + let mut hasher = DefaultHasher::new(); + + // Hash the complete structure: tag, classes, id, content, and children structure + node.tag.hash(&mut hasher); + node.classes.hash(&mut hasher); + node.id.hash(&mut hasher); + node.content.hash(&mut hasher); + + // Recursively hash children structure + Self::hash_children(&node.children, &mut hasher); + + format!("{:x}", hasher.finish()) + } + + fn hash_children(children: &[HtmlNode], hasher: &mut DefaultHasher) { + for child in children { + child.tag.hash(hasher); + child.classes.hash(hasher); + child.id.hash(hasher); + child.content.hash(hasher); + Self::hash_children(&child.children, hasher); + } + } +} + +#[derive(Debug, Default)] +pub struct DomainDuplicates { + duplicate_nodes: HashSet, +} + +impl DomainDuplicates { + pub fn new() -> Self { + DomainDuplicates { + duplicate_nodes: HashSet::new(), + } + } + + pub fn 
add_duplicate_node(&mut self, signature: NodeSignature) { + self.duplicate_nodes.insert(signature); + } + + pub fn is_duplicate(&self, signature: &NodeSignature) -> bool { + self.duplicate_nodes.contains(signature) + } + + pub fn get_duplicate_count(&self) -> usize { + self.duplicate_nodes.len() + } } diff --git a/src/utils.rs b/src/utils.rs index 4ccf823..55e4ad6 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -19,8 +19,21 @@ pub fn extract_domain_from_url(url: &str) -> Option { .and_then(|parsed| parsed.host_str().map(|host| host.to_string())) } -pub fn is_numeric_id(id: &str) -> bool { - !id.is_empty() && id.chars().all(|c| c.is_ascii_digit()) +pub fn construct_root_url(domain: &str) -> String { + format!("https://{domain}") +} + +pub fn is_root_url(url: &str) -> bool { + if let Ok(parsed) = url::Url::parse(url) { + let path = parsed.path(); + let query = parsed.query(); + let fragment = parsed.fragment(); + + // Root URL has path "/" or empty, no query parameters, and no fragment + (path == "/" || path.is_empty()) && query.is_none() && fragment.is_none() + } else { + false + } } #[cfg(test)] @@ -52,11 +65,25 @@ mod tests { } #[test] - fn test_is_numeric_id() { - assert!(is_numeric_id("123")); - assert!(is_numeric_id("0")); - assert!(!is_numeric_id("abc")); - assert!(!is_numeric_id("12a")); - assert!(!is_numeric_id("")); + fn test_construct_root_url() { + assert_eq!(construct_root_url("example.com"), "https://example.com"); + assert_eq!( + construct_root_url("subdomain.example.com"), + "https://subdomain.example.com" + ); + } + + #[test] + fn test_is_root_url() { + assert!(is_root_url("https://example.com")); + assert!(is_root_url("https://example.com/")); + assert!(is_root_url("http://example.com")); + assert!(is_root_url("http://example.com/")); + + assert!(!is_root_url("https://example.com/path")); + assert!(!is_root_url("https://example.com/?query=value")); + assert!(!is_root_url("https://example.com/#fragment")); + 
assert!(!is_root_url("https://example.com/path?query=value")); + assert!(!is_root_url("invalid-url")); } }