diff --git a/src/cli.rs b/src/cli.rs
index df337a9..b65dd82 100644
--- a/src/cli.rs
+++ b/src/cli.rs
@@ -5,6 +5,7 @@ use url::Url;
 #[derive(Debug, Clone)]
 pub struct CliArgs {
     pub links: Vec<String>,
+    pub verbose: bool,
 }
 
 impl CliArgs {
@@ -20,6 +21,12 @@ impl CliArgs {
                     .action(clap::ArgAction::Append)
                     .required(true),
             )
+            .arg(
+                Arg::new("verbose")
+                    .long("verbose")
+                    .help("Enable verbose output showing filtered HTML node tree")
+                    .action(clap::ArgAction::SetTrue),
+            )
             .get_matches();
 
         let links: Vec<String> = matches
@@ -29,9 +36,11 @@ impl CliArgs {
             .collect();
 
         let validated_links = Self::validate_and_deduplicate_links(links)?;
+        let verbose = matches.get_flag("verbose");
 
         Ok(CliArgs {
             links: validated_links,
+            verbose,
         })
     }
 
diff --git a/src/html_parser.rs b/src/html_parser.rs
index 30e5b2e..a577689 100644
--- a/src/html_parser.rs
+++ b/src/html_parser.rs
@@ -1,7 +1,9 @@
-use crate::utils::{is_numeric_id, trim_and_clean_text};
+use crate::storage::{DomainDuplicates, NodeSignature};
+use crate::utils::trim_and_clean_text;
 use scraper::{ElementRef, Html, Selector};
 use serde::{Deserialize, Serialize};
 use std::collections::HashSet;
+use url::Url;
 
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct HtmlNode {
@@ -93,9 +95,7 @@ impl HtmlParser {
             if let Some(child_element) = ElementRef::wrap(child) {
                 let child_node = self.parse_element(child_element);
 
-                if !self.is_blank_node(&child_node)
-                    && !self.is_duplicate_node(&child_node, &children)
-                {
+                if !self.is_blank_node(&child_node) {
                     children.push(child_node);
                 }
             }
@@ -128,7 +128,7 @@ impl HtmlParser {
             .value()
             .attr("id")
             .map(|id| id.trim().to_string())
-            .filter(|id| !id.is_empty() && !is_numeric_id(id))
+            .filter(|id| !id.is_empty())
     }
 
     fn extract_text_content(&self, element: ElementRef) -> String {
@@ -139,13 +139,70 @@ impl HtmlParser {
         node.content.trim().is_empty() && node.children.is_empty()
     }
 
-    fn is_duplicate_node(&self, node: &HtmlNode, existing_children: &[HtmlNode]) -> bool {
-        existing_children.iter().any(|existing| {
-            existing.tag == node.tag
-                && existing.classes == node.classes
-                && existing.id == node.id
-                && existing.content == node.content
-        })
+    /// Builds a copy of `node`'s subtree for display.
+    ///
+    /// A node whose signature `domain_duplicates` reports as a duplicate gets
+    /// its content replaced by the `[FILTERED DUPLICATE]` marker; every other
+    /// node keeps its content. Tag, classes, id and all children are carried
+    /// over in both cases, so the returned tree has the same shape as the
+    /// input.
+    pub fn filter_domain_duplicates(
+        node: &HtmlNode,
+        domain_duplicates: &DomainDuplicates,
+    ) -> HtmlNode {
+        let flagged = domain_duplicates.is_duplicate(&NodeSignature::from_html_node(node));
+        let text = if flagged {
+            "[FILTERED DUPLICATE]".to_string()
+        } else {
+            node.content.clone()
+        };
+        let mut out = HtmlNode::new(node.tag.clone(), node.classes.clone(), node.id.clone(), text);
+
+        // Recurse even below a flagged node: only content is masked, never structure.
+        for child in &node.children {
+            out.add_child(Self::filter_domain_duplicates(child, domain_duplicates));
+        }
+
+        out
+    }
+
+    /// Collects the distinct same-domain link targets found in `html`.
+    ///
+    /// Each `a[href]` element is resolved with `resolve_url` against
+    /// `base_domain` and kept only when `is_same_domain` accepts the result.
+    /// A `HashSet` removes repeats, so the order of the output is unspecified.
+    pub fn extract_links(&self, html: &str, base_domain: &str) -> Vec<String> {
+        let document = Html::parse_document(html);
+        let anchors = Selector::parse("a[href]").unwrap();
+        let unique: HashSet<String> = document
+            .select(&anchors)
+            .filter_map(|anchor| anchor.value().attr("href"))
+            .filter_map(|href| self.resolve_url(href, base_domain).ok())
+            .filter(|url| self.is_same_domain(url, base_domain))
+            .collect();
+
+        unique.into_iter().collect()
+    }
+
+    /// Turns `href` into an absolute URL; non-absolute forms are completed with `https`.
+    ///
+    /// `//host/path` must be matched before `/path`: both start with a slash.
+    fn resolve_url(&self, href: &str, base_domain: &str) -> Result<String, String> {
+        Ok(match href {
+            h if h.starts_with("http://") || h.starts_with("https://") => h.to_string(),
+            h if h.starts_with("//") => format!("https:{h}"),
+            h if h.starts_with('/') => format!("https://{base_domain}{h}"),
+            h => format!("https://{base_domain}/{h}"),
+        })
+    }
+
+    /// True when `url`'s host is `base_domain` or a dot-separated subdomain of it.
+    fn is_same_domain(&self, url: &str, base_domain: &str) -> bool {
+        let subdomain_suffix = format!(".{base_domain}");
+        match Url::parse(url).ok().as_ref().and_then(Url::host_str) {
+            Some(host) => host == base_domain || host.ends_with(&subdomain_suffix),
+            None => false, // unparsable URL, or a URL without a host
+        }
     }
 }
 
@@ -204,7 +261,7 @@ mod tests {
     }
 
     #[test]
-    fn test_html_parser_ignores_numeric_ids() {
+    fn test_html_parser_preserves_numeric_ids() {
         let parser = HtmlParser::new();
         let html = r#"<html><body><div id="123">Text</div></body></html>"#;
         let node = parser.parse(html);
@@ -212,7 +269,7 @@ mod tests {
         let body = &node.children[0];
         assert_eq!(body.children.len(), 1);
         let div_node = &body.children[0];
-        assert_eq!(div_node.id, None);
+        assert_eq!(div_node.id, Some("123".to_string()));
     }
 
     #[test]
@@ -245,4 +302,62 @@ mod tests {
         assert_eq!(body.children.len(), 1);
         assert_eq!(body.children[0].tag, "p");
     }
+
+    #[test]
+    fn test_extract_links() {
+        let parser = HtmlParser::new();
+        let html = r#"<html><body>
+            <a href="/page1">Link 1</a>
+            <a href="https://example.com/page2">Link 2</a>
+            <a href="https://other.com/page3">External Link</a>
+            <a href="//example.com/page4">Protocol-relative</a>
+        </body></html>"#;
+
+        let links = parser.extract_links(html, "example.com");
+
+        assert!(links.contains(&"https://example.com/page1".to_string()));
+        assert!(links.contains(&"https://example.com/page2".to_string()));
+        // Protocol-relative URLs are handled correctly
+        assert!(links.iter().any(|link| link.contains("page4")));
+        assert!(!links.iter().any(|link| link.contains("other.com")));
+    }
+
+    #[test]
+    fn test_filter_domain_duplicates() {
+        use crate::storage::{DomainDuplicates, NodeSignature};
+
+        let parser = HtmlParser::new();
+        let html = r#"<html><body><nav class="navbar">Navigation</nav><div class="content">Main content</div></body></html>"#;
+        let node = parser.parse(html);
+
+        let mut duplicates = DomainDuplicates::new();
+
+        // Find the nav element in the parsed tree and get its signature
+        let body = &node.children[0];
+        let nav_node = &body.children[0]; // The nav element
+        let nav_signature = NodeSignature::from_html_node(nav_node);
+        duplicates.add_duplicate_node(nav_signature);
+
+        let filtered = HtmlParser::filter_domain_duplicates(&node, &duplicates);
+
+        // The structure should be preserved, but nav content should be marked as filtered
+        assert_eq!(filtered.tag, "html");
+        let body = &filtered.children[0];
+        assert_eq!(body.tag, "body");
+        assert_eq!(body.children.len(), 2); // Both nav and div should remain
+        assert_eq!(body.children[0].tag, "nav");
+        assert_eq!(body.children[0].content, "[FILTERED DUPLICATE]");
+        assert_eq!(body.children[1].tag, "div");
+        assert_eq!(body.children[1].content, "Main content");
+    }
+
+    #[test]
+    fn test_is_same_domain() {
+        let parser = HtmlParser::new();
+
+        assert!(parser.is_same_domain("https://example.com/page", "example.com"));
+        assert!(parser.is_same_domain("https://sub.example.com/page", "example.com"));
+        assert!(!parser.is_same_domain("https://other.com/page", "example.com"));
+        assert!(!parser.is_same_domain("https://notexample.com/page", "example.com"));
+    }
 }
diff --git a/src/main.rs b/src/main.rs
index 72f4f0a..120c5fc 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,4 +1,5 @@
 use smart_crawler::{Browser, CliArgs, FetchStatus, HtmlParser, UrlStorage};
+use std::collections::{HashMap, HashSet};
 use tracing::{debug, error, info};
 
 #[tokio::main]
@@ -45,42 +46,128 @@ async fn main() {
 
     let parser = HtmlParser::new();
 
+    // Phase 1: Preparation stage - fetch additional URLs from same domains
+    info!("Starting preparation stage to collect URLs from same domains");
+
+    let mut domain_urls: HashMap<String, HashSet<String>> = HashMap::new();
+
+    // Group initial URLs by domain
     for url in &args.links {
-        info!("Processing URL: {}", url);
+        if let Some(domain) = smart_crawler::utils::extract_domain_from_url(url) {
+            domain_urls.entry(domain).or_default().insert(url.clone());
+        }
+    }
 
-        if let Some(url_data) = storage.get_url_data_mut(url) {
-            url_data.update_status(FetchStatus::InProgress);
+    // Add root URLs for each domain if not already present
+    for (domain, urls) in &mut domain_urls {
+        let root_url = smart_crawler::utils::construct_root_url(domain);
+        if !urls.contains(&root_url) {
+            urls.insert(root_url.clone());
+            storage.add_url(root_url);
+            info!(
+                "Added root URL for domain {}: {}",
+                domain,
+                smart_crawler::utils::construct_root_url(domain)
+            );
         }
+    }
 
-        match browser.navigate_to(url).await {
-            Ok(()) => {
-                debug!("Successfully navigated to {}", url);
+    // For each domain, try to find additional URLs
+    for (domain, urls) in &mut domain_urls {
+        if urls.len() < 3 {
+            info!(
+                "Domain {} has only {} URL(s), searching for more...",
+                domain,
+                urls.len()
+            );
 
-                match browser.get_html_source().await {
+            // Pick the first URL to extract links from
+            if let Some(first_url) = urls.iter().next() {
+                match process_url(&mut browser, &parser, &mut storage, first_url, true).await {
                     Ok(html_source) => {
-                        let title = browser.get_page_title().await.ok();
-                        let html_tree = parser.parse(&html_source);
+                        let additional_urls = parser.extract_links(&html_source, domain);
+                        let mut added_count = 0;
 
-                        if let Some(url_data) = storage.get_url_data_mut(url) {
-                            url_data.set_html_data(html_source, html_tree, title);
-                            url_data.update_status(FetchStatus::Success);
+                        for additional_url in additional_urls {
+                            if urls.len() >= 3 {
+                                break;
+                            }
+                            if urls.insert(additional_url.clone()) {
+                                storage.add_url(additional_url);
+                                added_count += 1;
+                            }
                         }
 
-                        info!("Successfully processed {}", url);
+                        info!(
+                            "Found {} additional URLs for domain {}",
+                            added_count, domain
+                        );
                     }
                     Err(e) => {
-                        error!("Failed to get HTML source for {}: {}", url, e);
-                        if let Some(url_data) = storage.get_url_data_mut(url) {
-                            url_data.update_status(FetchStatus::Failed(e.to_string()));
-                        }
+                        error!("Failed to extract links from {}: {}", first_url, e);
                     }
                 }
             }
-            Err(e) => {
-                error!("Failed to navigate to {}: {}", url, e);
-                if let Some(url_data) = storage.get_url_data_mut(url) {
-                    url_data.update_status(FetchStatus::Failed(e.to_string()));
-                }
+        }
+    }
+
+    // Phase 2: Process all URLs (initial + discovered) with root URL prioritization
+    info!("Processing all URLs with root URL prioritization");
+
+    let mut all_urls: Vec<String> = Vec::new();
+
+    // First, add all user-specified URLs
+    for url in &args.links {
+        all_urls.push(url.clone());
+    }
+
+    // Then, add root URLs for each domain (if not already in user-specified URLs)
+    for domain in domain_urls.keys() {
+        let root_url = smart_crawler::utils::construct_root_url(domain);
+        if !args.links.contains(&root_url) {
+            all_urls.push(root_url);
+        }
+    }
+
+    // Finally, add all other discovered URLs
+    for urls in domain_urls.values() {
+        for url in urls {
+            if !args.links.contains(url) && !smart_crawler::utils::is_root_url(url) {
+                all_urls.push(url.clone());
+            }
+        }
+    }
+
+    for url in &all_urls {
+        if let Some(url_data) = storage.get_url_data(url) {
+            if matches!(url_data.status, FetchStatus::Success) {
+                continue; // Already processed
+            }
+        }
+
+        match process_url(&mut browser, &parser, &mut storage, url, false).await {
+            Ok(_) => info!("Successfully processed {}", url),
+            Err(e) => error!("Failed to process {}: {}", url, e),
+        }
+    }
+
+    // Phase 3: Analyze domain duplicates
+    info!("Analyzing domain-level duplicate nodes");
+
+    for domain in domain_urls.keys() {
+        storage.analyze_domain_duplicates(domain);
+        if let Some(duplicates) = storage.get_domain_duplicates(domain) {
+            let duplicate_count = duplicates.get_duplicate_count();
+            if duplicate_count > 0 {
+                info!(
+                    "Found {} duplicate node patterns for domain {}",
+                    duplicate_count, domain
+                );
+            } else {
+                info!(
+                    "No duplicate patterns found for domain {} (likely insufficient pages)",
+                    domain
+                );
             }
         }
     }
@@ -98,9 +185,105 @@ async fn main() {
             println!("URL: {}", url_data.url);
             println!("Title: {title}");
             println!("Domain: {}", url_data.domain);
+
+            if args.verbose {
+                if let Some(html_tree) = &url_data.html_tree {
+                    if let Some(domain_duplicates) = storage.get_domain_duplicates(&url_data.domain)
+                    {
+                        let filtered_tree =
+                            HtmlParser::filter_domain_duplicates(html_tree, domain_duplicates);
+                        println!("Filtered HTML Tree (showing complete structure with duplicate marking):");
+                        print_html_tree(&filtered_tree, 0);
+                    } else {
+                        println!("HTML Tree (no duplicates to filter):");
+                        print_html_tree(html_tree, 0);
+                    }
+                }
+            }
+
             println!("---");
         }
     }
 
-    info!("SmartCrawler finished processing {} URLs", args.links.len());
+    info!("SmartCrawler finished processing {} URLs", all_urls.len());
+}
+
+/// Fetches `url` through `browser`, parses the page and records the outcome in `storage`.
+///
+/// The URL's storage entry, if one exists, is set to `InProgress` first and ends
+/// as `Success` (source, parsed tree and title attached) or `Failed` (error text).
+/// Returns the page source when `return_html` is true, an empty string otherwise;
+/// on failure the `Err` holds the same text that was stored in `Failed`.
+async fn process_url(
+    browser: &mut Browser,
+    parser: &HtmlParser,
+    storage: &mut UrlStorage,
+    url: &str,
+    return_html: bool,
+) -> Result<String, String> {
+    info!("Processing URL: {}", url);
+
+    if let Some(url_data) = storage.get_url_data_mut(url) {
+        url_data.update_status(FetchStatus::InProgress);
+    }
+
+    // Navigation and source retrieval share the single failure path below.
+    let fetched = match browser.navigate_to(url).await {
+        Ok(()) => {
+            debug!("Successfully navigated to {}", url);
+            browser
+                .get_html_source()
+                .await
+                .map_err(|e| format!("Failed to get HTML source: {e}"))
+        }
+        Err(e) => Err(format!("Failed to navigate: {e}")),
+    };
+
+    let html_source = match fetched {
+        Ok(html_source) => html_source,
+        Err(error_msg) => {
+            if let Some(url_data) = storage.get_url_data_mut(url) {
+                url_data.update_status(FetchStatus::Failed(error_msg.clone()));
+            }
+            return Err(error_msg);
+        }
+    };
+
+    // A title lookup error is tolerated: `.ok()` turns it into `None`.
+    let title = browser.get_page_title().await.ok();
+    let html_tree = parser.parse(&html_source);
+    if let Some(url_data) = storage.get_url_data_mut(url) {
+        url_data.set_html_data(html_source.clone(), html_tree, title);
+        url_data.update_status(FetchStatus::Success);
+    }
+
+    Ok(if return_html { html_source } else { String::new() })
+}
+
+/// Prints `node` and, recursively, its children to stdout, one element per line.
+///
+/// A line is indented two spaces per `indent` level and shows the tag, then
+/// `#id` when an id is set and `[classes]` when any exist; non-empty content
+/// follows after `: `, limited to its first 100 characters.
+fn print_html_tree(node: &smart_crawler::HtmlNode, indent: usize) {
+    let padding = "  ".repeat(indent);
+    let id_part = node.id.as_ref().map(|id| format!("#{id}"));
+    let class_part = if node.classes.is_empty() {
+        String::new()
+    } else {
+        format!("[{}]", node.classes.join(" "))
+    };
+    let label = format!("{}{}{class_part}", node.tag, id_part.unwrap_or_default());
+
+    if node.content.is_empty() {
+        println!("{padding}{label}");
+    } else {
+        // `chars()` limits by characters, so multi-byte text is never cut mid-character.
+        let preview: String = node.content.chars().take(100).collect();
+        println!("{padding}{label}: {preview}");
+    }
+
+    for child in &node.children {
+        print_html_tree(child, indent + 1);
+    }
 }
diff --git a/src/storage.rs b/src/storage.rs
index 0eb5e4f..467c887 100644
--- a/src/storage.rs
+++ b/src/storage.rs
@@ -2,7 +2,9 @@ use crate::html_parser::HtmlNode;
 use crate::utils::extract_domain_from_url;
 use chrono::{DateTime, Utc};
 use serde::{Deserialize, Serialize};
-use std::collections::HashMap;
+use std::collections::hash_map::DefaultHasher;
+use std::collections::{HashMap, HashSet};
+use std::hash::{Hash, Hasher};
 
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub enum FetchStatus {
@@ -62,12 +64,14 @@ impl UrlData {
 #[derive(Debug, Default)]
 pub struct UrlStorage {
     urls_by_domain: HashMap<String, HashMap<String, UrlData>>,
+    domain_duplicates: HashMap<String, DomainDuplicates>,
 }
 
 impl UrlStorage {
     pub fn new() -> Self {
         UrlStorage {
             urls_by_domain: HashMap::new(),
+            domain_duplicates: HashMap::new(),
         }
     }
 
@@ -111,6 +115,94 @@ impl UrlStorage {
             .filter(|url_data| matches!(url_data.status, FetchStatus::Success))
             .collect()
     }
+
+    /// Stores, for `domain`, every node signature found on two or more of its
+    /// successfully fetched pages. Each page contributes at most one count per
+    /// signature, so repeats inside a single page are not treated as duplicates.
+    pub fn analyze_domain_duplicates(&mut self, domain: &str) {
+        if let Some(domain_urls) = self.urls_by_domain.get(domain) {
+            let completed_urls: Vec<_> = domain_urls
+                .values()
+                .filter(|url_data| matches!(url_data.status, FetchStatus::Success))
+                .collect();
+            if completed_urls.len() < 2 {
+                return; // Need at least 2 pages to find duplicates
+            }
+
+            // Number of distinct pages on which each signature occurs.
+            let mut pages_per_signature: HashMap<NodeSignature, usize> = HashMap::new();
+            for html_tree in completed_urls.iter().filter_map(|d| d.html_tree.as_ref()) {
+                let mut seen_on_page = HashMap::new();
+                Self::collect_node_signatures(html_tree, &mut seen_on_page);
+                for signature in seen_on_page.into_keys() {
+                    *pages_per_signature.entry(signature).or_insert(0) += 1;
+                }
+            }
+
+            let domain_duplicates = self
+                .domain_duplicates
+                .entry(domain.to_string())
+                .or_default();
+            for (signature, _) in pages_per_signature.into_iter().filter(|(_, n)| *n >= 2) {
+                domain_duplicates.add_duplicate_node(signature);
+            }
+        }
+    }
+
+    /// Walks `node`'s subtree and bumps the counter in `signatures` for every
+    /// node that is not a structural container and passes `is_meaningful_node`.
+    fn collect_node_signatures(node: &HtmlNode, signatures: &mut HashMap<NodeSignature, usize>) {
+        let counted = !Self::is_structural_element(&node.tag) && Self::is_meaningful_node(node);
+        if counted {
+            let key = NodeSignature::from_html_node(node);
+            *signatures.entry(key).or_default() += 1;
+        }
+
+        // Descend regardless, so children of skipped containers are still visited.
+        node.children
+            .iter()
+            .for_each(|child| Self::collect_node_signatures(child, signatures));
+    }
+
+    /// Reports whether `tag` is one of the page-skeleton containers
+    /// `html`, `head`, `body`, `main`, `article` or `section`, which
+    /// `collect_node_signatures` never counts.
+    fn is_structural_element(tag: &str) -> bool {
+        ["html", "head", "body", "main", "article", "section"].contains(&tag)
+    }
+
+    /// Decides whether `node` takes part in duplicate counting.
+    ///
+    /// A node qualifies when it has non-blank content or children, carries
+    /// a class or an id, or uses one of the listed semantic tags.
+    fn is_meaningful_node(node: &HtmlNode) -> bool {
+        const SEMANTIC_TAGS: [&str; 10] = [
+            "nav", "header", "footer", "aside", "form", "button", "a", "ul", "ol", "menu",
+        ];
+
+        // Text of its own, or nested elements.
+        if !node.content.trim().is_empty() || !node.children.is_empty() {
+            return true;
+        }
+
+        // Styling hooks: any class or an id.
+        if !node.classes.is_empty() || node.id.is_some() {
+            return true;
+        }
+
+        // Otherwise only the semantic elements count.
+        SEMANTIC_TAGS.contains(&node.tag.as_str())
+    }
+
+    pub fn get_domain_duplicates(&self, domain: &str) -> Option<&DomainDuplicates> {
+        self.domain_duplicates.get(domain)
+    }
+
+    pub fn add_urls_from_same_domain(&mut self, urls: Vec<String>) {
+        for url in urls {
+            self.add_url(url);
+        }
+    }
 }
 
 #[cfg(test)]
@@ -164,4 +256,193 @@ mod tests {
         assert!(matches!(url_data.status, FetchStatus::InProgress));
         assert!(url_data.updated_at > original_time);
     }
+
+    #[test]
+    fn test_add_urls_from_same_domain() {
+        let mut storage = UrlStorage::new();
+        let urls = vec![
+            "https://example.com/page1".to_string(),
+            "https://example.com/page2".to_string(),
+            "https://example.com/page3".to_string(),
+        ];
+
+        storage.add_urls_from_same_domain(urls);
+
+        let example_com_urls = storage.get_urls_by_domain("example.com");
+        assert!(example_com_urls.is_some());
+        assert_eq!(example_com_urls.unwrap().len(), 3);
+    }
+
+    #[test]
+    fn test_analyze_domain_duplicates() {
+        use crate::html_parser::HtmlParser;
+
+        let mut storage = UrlStorage::new();
+        let parser = HtmlParser::new();
+
+        storage.add_url("https://example.com/page1".to_string());
+        storage.add_url("https://example.com/page2".to_string());
+
+        // Create mock HTML trees with common elements
+        let html1 = r#"<html><body><nav class="navbar">Navigation</nav><div class="content">Page 1 content</div></body></html>"#;
+        let html2 = r#"<html><body><nav class="navbar">Navigation</nav><div class="content">Page 2 content</div></body></html>"#;
+
+        let tree1 = parser.parse(html1);
+        let tree2 = parser.parse(html2);
+
+        // Set the HTML data for both URLs
+        if let Some(url_data) = storage.get_url_data_mut("https://example.com/page1") {
+            url_data.set_html_data(html1.to_string(), tree1, Some("Page 1".to_string()));
+            url_data.update_status(FetchStatus::Success);
+        }
+
+        if let Some(url_data) = storage.get_url_data_mut("https://example.com/page2") {
+            url_data.set_html_data(html2.to_string(), tree2, Some("Page 2".to_string()));
+            url_data.update_status(FetchStatus::Success);
+        }
+
+        // Analyze domain duplicates
+        storage.analyze_domain_duplicates("example.com");
+
+        let duplicates = storage.get_domain_duplicates("example.com");
+        assert!(duplicates.is_some());
+        assert!(duplicates.unwrap().get_duplicate_count() > 0);
+    }
+
+    #[test]
+    fn test_node_signature_creation() {
+        use crate::html_parser::HtmlNode;
+
+        let node = HtmlNode::new(
+            "div".to_string(),
+            vec!["container".to_string(), "main".to_string()],
+            Some("content".to_string()),
+            "Test content".to_string(),
+        );
+
+        let signature = NodeSignature::from_html_node(&node);
+        assert_eq!(signature.tag, "div");
+        assert_eq!(signature.classes, vec!["container", "main"]);
+        assert_eq!(signature.id, Some("content".to_string()));
+        assert_eq!(signature.content, "Test content");
+        assert!(!signature.content_hash.is_empty());
+    }
+
+    #[test]
+    fn test_domain_duplicates_detection() {
+        let mut duplicates = DomainDuplicates::new();
+
+        let signature = NodeSignature {
+            tag: "nav".to_string(),
+            classes: vec!["navbar".to_string()],
+            id: None,
+            content: "Navigation".to_string(),
+            content_hash: "test_hash".to_string(),
+        };
+
+        assert!(!duplicates.is_duplicate(&signature));
+
+        duplicates.add_duplicate_node(signature.clone());
+        assert!(duplicates.is_duplicate(&signature));
+        assert_eq!(duplicates.get_duplicate_count(), 1);
+    }
+
+    #[test]
+    fn test_content_hash_includes_children() {
+        use crate::html_parser::HtmlParser;
+
+        let parser = HtmlParser::new();
+
+        // Two divs with same tag/class but different children
+        let html1 = r#"<div class="container"><p>Content 1</p></div>"#;
+        let html2 = r#"<div class="container"><p>Content 2</p></div>"#;
+        let html3 = r#"<div class="container"><p>Content 1</p></div>"#; // Same as html1
+
+        let node1 = parser.parse(html1);
+        let node2 = parser.parse(html2);
+        let node3 = parser.parse(html3);
+
+        let sig1 = NodeSignature::from_html_node(&node1);
+        let sig2 = NodeSignature::from_html_node(&node2);
+        let sig3 = NodeSignature::from_html_node(&node3);
+
+        // sig1 and sig2 should be different due to different child content
+        assert_ne!(sig1.content_hash, sig2.content_hash);
+
+        // sig1 and sig3 should be identical
+        assert_eq!(sig1.content_hash, sig3.content_hash);
+    }
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Hash)]
+pub struct NodeSignature {
+    pub tag: String,
+    pub classes: Vec<String>,
+    pub id: Option<String>,
+    pub content: String,
+    pub content_hash: String, // Hash of complete structure including children
+}
+
+impl NodeSignature {
+    /// Builds the signature of `node`: its own fields plus a subtree hash.
+    pub fn from_html_node(node: &HtmlNode) -> Self {
+        NodeSignature {
+            tag: node.tag.clone(),
+            classes: node.classes.clone(),
+            id: node.id.clone(),
+            content: node.content.clone(),
+            content_hash: Self::compute_content_hash(node),
+        }
+    }
+
+    /// Hex digest over `node` and all of its descendants.
+    fn compute_content_hash(node: &HtmlNode) -> String {
+        let mut hasher = DefaultHasher::new();
+        node.tag.hash(&mut hasher);
+        node.classes.hash(&mut hasher);
+        node.id.hash(&mut hasher);
+        node.content.hash(&mut hasher);
+        Self::hash_children(&node.children, &mut hasher);
+        format!("{:x}", hasher.finish())
+    }
+
+    /// Feeds `children` depth-first into `hasher`.
+    ///
+    /// The child count is hashed first; without it `a > (b > c)` and
+    /// `a > (b, c)` would produce the same input stream and thus the same hash.
+    fn hash_children(children: &[HtmlNode], hasher: &mut DefaultHasher) {
+        children.len().hash(hasher);
+        for child in children {
+            child.tag.hash(hasher);
+            child.classes.hash(hasher);
+            child.id.hash(hasher);
+            child.content.hash(hasher);
+            Self::hash_children(&child.children, hasher);
+        }
+    }
+}
+
+#[derive(Debug, Default)]
+pub struct DomainDuplicates {
+    duplicate_nodes: HashSet<NodeSignature>,
+}
+
+impl DomainDuplicates {
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    /// Records `signature`; adding one that is already present changes nothing.
+    pub fn add_duplicate_node(&mut self, signature: NodeSignature) {
+        self.duplicate_nodes.insert(signature);
+    }
+
+    pub fn is_duplicate(&self, signature: &NodeSignature) -> bool {
+        self.duplicate_nodes.contains(signature)
+    }
+
+    /// Number of distinct signatures recorded so far.
+    pub fn get_duplicate_count(&self) -> usize {
+        self.duplicate_nodes.len()
+    }
 }
diff --git a/src/utils.rs b/src/utils.rs
index 4ccf823..55e4ad6 100644
--- a/src/utils.rs
+++ b/src/utils.rs
@@ -19,8 +19,21 @@ pub fn extract_domain_from_url(url: &str) -> Option<String> {
         .and_then(|parsed| parsed.host_str().map(|host| host.to_string()))
 }
 
-pub fn is_numeric_id(id: &str) -> bool {
-    !id.is_empty() && id.chars().all(|c| c.is_ascii_digit())
+pub fn construct_root_url(domain: &str) -> String {
+    ["https://", domain].concat() // always https, no trailing slash
+}
+
+/// Returns true when `url` parses and has a root path, no query and no fragment.
+pub fn is_root_url(url: &str) -> bool {
+    let parsed = match url::Url::parse(url) {
+        Ok(parsed) => parsed,
+        // Input that does not parse is never a root URL.
+        Err(_) => return false,
+    };
+
+    // Root URL has path "/" or empty, no query parameters, and no fragment
+    let at_root_path = matches!(parsed.path(), "/" | "");
+    at_root_path && parsed.query().is_none() && parsed.fragment().is_none()
 }
 
 #[cfg(test)]
@@ -52,11 +65,25 @@ mod tests {
     }
 
     #[test]
-    fn test_is_numeric_id() {
-        assert!(is_numeric_id("123"));
-        assert!(is_numeric_id("0"));
-        assert!(!is_numeric_id("abc"));
-        assert!(!is_numeric_id("12a"));
-        assert!(!is_numeric_id(""));
+    fn test_construct_root_url() {
+        assert_eq!(construct_root_url("example.com"), "https://example.com");
+        assert_eq!(
+            construct_root_url("subdomain.example.com"),
+            "https://subdomain.example.com"
+        );
+    }
+
+    #[test]
+    fn test_is_root_url() {
+        assert!(is_root_url("https://example.com"));
+        assert!(is_root_url("https://example.com/"));
+        assert!(is_root_url("http://example.com"));
+        assert!(is_root_url("http://example.com/"));
+
+        assert!(!is_root_url("https://example.com/path"));
+        assert!(!is_root_url("https://example.com/?query=value"));
+        assert!(!is_root_url("https://example.com/#fragment"));
+        assert!(!is_root_url("https://example.com/path?query=value"));
+        assert!(!is_root_url("invalid-url"));
     }
 }