11 changes: 7 additions & 4 deletions DEVELOPMENT.md
@@ -570,6 +570,7 @@ front matter...
```
- **Purpose:** Tells search engines where to find the sitemap
- **Impact:** SEO - affects how search engines crawl the site
- **Note:** `/sitemap.xml` is generated automatically by the `jekyll-sitemap` plugin (configured in `_config.yml`). Do not maintain a manual `sitemap.xml` file.
- **Priority:** 🔴 CRITICAL

3. **`CNAME`** (line 19)
@@ -615,6 +616,7 @@ If you need to change the website URL:
#### Step 1: Update Critical Configuration
- [ ] Update `_config.yml` → `url:` field
- [ ] Verify `robots.txt` → `Sitemap:` line (generated from `{{ site.url }}{{ site.baseurl }}`)
- [ ] Verify `/sitemap.xml` is generated (jekyll-sitemap) and includes key pages (see the config sketch below)
- [ ] Update or remove `CNAME` file if using custom domain

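For reference, a minimal sketch of the `_config.yml` fields this checklist touches; the `url` value here is a placeholder, and the plugin list mirrors the one added in this PR:

```yaml
# _config.yml (sketch; the url value is a placeholder, not the real domain)
url: "https://example.github.io"
baseurl: ""                 # empty unless the site is served from a subpath

plugins:
  - jekyll-scholar
  - jekyll-sitemap          # emits /sitemap.xml on every build
```
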
#### Step 2: Test Locally
@@ -768,9 +770,10 @@ headline: "Text with [link](OPENINGS_LINK)"
**Security & Protection:**
- ✅ Enhanced 404 page with navigation buttons and Bootstrap icons
- ✅ Comprehensive `robots.txt` with crawler access control
- Allows: Googlebot, Bingbot, Slurp with 10-second crawl delay
- Blocks: MJ12bot, AhrefsBot, SemrushBot, DotBot, PetalBot, DataForSeoBot
- Restricts: `/images/`, `/assets/`, `/css/`, `/js/` directories
- Allows: major search engines (Googlebot, Bingbot, Slurp, DuckDuckBot, etc.)
- Blocks: known heavy scraper / SEO bots (AhrefsBot, SemrushBot, MJ12bot, DotBot, PetalBot, DataForSeoBot)
- Avoids `Crawl-delay` (Googlebot ignores it, and it can trigger Search Console warnings)
- Restricts: internal build artifacts only (`/_site/`, `/bin/`)
- ✅ Apache security configuration (`.htaccess`)
- Directory browsing disabled
- Security headers (X-Frame-Options, X-XSS-Protection, etc.)
@@ -968,7 +971,7 @@ headline: "Text with [link](OPENINGS_LINK)"
- Test theme compatibility

3. **Security Review:**
- Review `robots.txt` blocked crawlers
- Review `robots.txt` blocked scraper list and confirm the sitemap URL resolves correctly (see the sketch below)
- Update `.htaccess` security headers
- Check GitHub Pages security settings

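To confirm the sitemap URL, trace how the Liquid template in `robots.txt` resolves against `_config.yml`; a sketch with a placeholder domain:

```yaml
# _config.yml values (placeholder domain)
url: "https://example.github.io"
baseurl: ""
# robots.txt template: Sitemap: {{ site.url }}{{ site.baseurl }}/sitemap.xml
# rendered output:     Sitemap: https://example.github.io/sitemap.xml
```
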
2 changes: 2 additions & 0 deletions Gemfile
@@ -10,6 +10,7 @@
# Key Dependencies:
# - jekyll 4.4.1+: Static site generator
# - jekyll-scholar: BibTeX bibliography support
# - jekyll-sitemap: Automatic sitemap.xml generation for search engines
# - webrick 1.9+: Ruby web server for local development
#
# Installation:
@@ -29,5 +30,6 @@ gem "jekyll", "4.4.1"
# gem "github-pages", "~> 232", group: :jekyll_plugins

gem "jekyll-scholar", group: :jekyll_plugins
gem "jekyll-sitemap", group: :jekyll_plugins
gem "webrick", "~> 1.9"
gem "wdm", ">= 0.1.0" if Gem.win_platform?
2 changes: 1 addition & 1 deletion README.md
@@ -328,7 +328,7 @@ bundle install

## Security Features

- ✅ **Crawler Protection:** `robots.txt` controls search engine access
- ✅ **Crawler Protection:** `robots.txt` allows major search engines and blocks known heavy scraper bots
- ✅ **Custom 404 Page:** User-friendly error handling with navigation
- ✅ **DDoS Protection:** GitHub Pages + Cloudflare CDN
- ✅ **Security Headers:** Content security and XSS protection
4 changes: 4 additions & 0 deletions _config.yml
@@ -34,6 +34,10 @@ include:
- _pages
- robots.txt

plugins:
- jekyll-scholar
- jekyll-sitemap

sass:
sass_dir: _sass

1 change: 0 additions & 1 deletion _pages/aboutwebsite.md
@@ -2,7 +2,6 @@
title: "About the website"
layout: textlay
excerpt: "About the website."
sitemap: false
permalink: /aboutwebsite.html
---
<!--
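
Context for this and the eight page diffs that follow: `jekyll-sitemap` includes every rendered page unless its front matter opts out, so removing `sitemap: false` is what lets these pages into the generated sitemap. A minimal sketch of the opt-out, using a hypothetical page not in this repo:

```yaml
---
title: "Internal notes"   # hypothetical page, for illustration only
layout: textlay
sitemap: false            # jekyll-sitemap skips pages with this flag
permalink: /internal.html
---
```
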
1 change: 0 additions & 1 deletion _pages/allnews.md
@@ -2,7 +2,6 @@
title: "News"
layout: textlay
excerpt: "Atlas Analytics Lab at Concordia University."
sitemap: false
permalink: /allnews.html
---
<!--
1 change: 0 additions & 1 deletion _pages/contact.md
@@ -2,7 +2,6 @@
title: "Atlas Analytics Lab - Contact"
layout: textlay
excerpt: "Ways to reach the Atlas Analytics Lab."
sitemap: false
permalink: /contact/
---
<!--
1 change: 0 additions & 1 deletion _pages/funding.md
@@ -2,7 +2,6 @@
title: "Atlas Analytics Lab - Funding"
layout: textlay
excerpt: "Atlas Analytics Lab -- Funding."
sitemap: false
permalink: /funding/
---
<!--
1 change: 0 additions & 1 deletion _pages/gallery.md
@@ -2,7 +2,6 @@
title: "Lab Life"
layout: gallerylay
excerpt: "Atlas Analytics Lab at Concordia University."
sitemap: false
permalink: /gallery/
---
<!--
1 change: 0 additions & 1 deletion _pages/home.md
@@ -2,7 +2,6 @@
title: "Atlas Analytics Lab - Home"
layout: homelay
excerpt: "Atlas Analytics Lab at Concordia University."
sitemap: false
permalink: /
---
<!--
1 change: 0 additions & 1 deletion _pages/openings.md
@@ -2,7 +2,6 @@
title: "Atlas Analytics Lab – Vacancies"
layout: textlay
excerpt: "Openings"
sitemap: false
permalink: /openings
---
<!--
1 change: 0 additions & 1 deletion _pages/publications.md
@@ -2,7 +2,6 @@
title: "Atlas Analytics Lab - Publications"
layout: gridlay
excerpt: "Atlas Analytics Lab -- Publications."
sitemap: false
permalink: /publications/
---
<!--
1 change: 0 additions & 1 deletion _pages/team.md
@@ -2,7 +2,6 @@
title: "Atlas Analytics Lab - Team"
layout: team
excerpt: "Team members"
sitemap: false
permalink: /team/
---
<!--
Binary file modified images/favicon.ico
Binary file not shown.
93 changes: 35 additions & 58 deletions robots.txt
@@ -2,61 +2,48 @@
layout: null
permalink: /robots.txt
---
# robots.txt - Crawler Access Control
#
# Purpose:
# Controls which web crawlers can access the site and how frequently.
# Helps prevent excessive crawler traffic and bandwidth usage.
#
# Note:
# - Good crawlers (Google, Bing) respect these rules
# - Malicious crawlers may ignore this file
# - For GitHub Pages, this provides basic protection

# ============================================
# Atlas Analytics Lab – robots.txt
# Goals:
# - Maximize search engine indexing (all agents)
# - Omit Crawl-delay (Googlebot ignores it, and it can trigger Search Console warnings)
# - Block known heavy scraper bots
# - Keep CSS/JS/images crawlable
# ============================================

# --- Major search engines (explicitly allowed) ---
User-agent: Googlebot
Disallow: /images/
Disallow: /assets/
Disallow: /_site/
Disallow: /bin/
Disallow: /CNAME
Disallow: /README.md
Disallow: /DEVELOPMENT.md
Disallow: /.htaccess
Allow: /

User-agent: Bingbot
Crawl-delay: 10
Disallow: /images/
Disallow: /assets/
Disallow: /_site/
Disallow: /bin/
Disallow: /CNAME
Disallow: /README.md
Disallow: /DEVELOPMENT.md
Disallow: /.htaccess
Allow: /

User-agent: Slurp
Crawl-delay: 10
Disallow: /images/
Disallow: /assets/
Disallow: /_site/
Disallow: /bin/
Disallow: /CNAME
Disallow: /README.md
Disallow: /DEVELOPMENT.md
Disallow: /.htaccess
Allow: /

# Block aggressive/problematic crawlers
User-agent: MJ12bot
Disallow: /
User-agent: DuckDuckBot
Allow: /

User-agent: Baiduspider
Allow: /

User-agent: YandexBot
Allow: /

User-agent: SeznamBot
Allow: /

# --- Block known heavy scraper / SEO bots ---
User-agent: AhrefsBot
Disallow: /

User-agent: SemrushBot
Disallow: /

User-agent: MJ12bot
Disallow: /

User-agent: DotBot
Disallow: /

@@ -66,33 +53,23 @@ Disallow: /
User-agent: DataForSeoBot
Disallow: /

# Block AI training crawlers (optional - uncomment if you want to block)
# --- Optional: block AI training crawlers ---
# User-agent: GPTBot
# Disallow: /
#
# User-agent: CCBot
# Disallow: /
#
# User-agent: anthropic-ai
# Disallow: /
#
# User-agent: Claude-Web
# Disallow: /

# Default rule for unlisted crawlers
# --- Default rule: allow all other crawlers ---
User-agent: *
Crawl-delay: 10
Disallow: /images/
Disallow: /assets/
Allow: /

# Hide build artifacts if they ever appear (normally not deployed)
Disallow: /_site/
Disallow: /bin/
Disallow: /CNAME
Disallow: /README.md
Disallow: /DEVELOPMENT.md
Disallow: /.htaccess

# Allow access to main pages (everything else is allowed by default)
Allow: /

# Sitemap location (helps good crawlers index efficiently)
# Sitemap location
Sitemap: {{ site.url }}{{ site.baseurl }}/sitemap.xml
20 changes: 0 additions & 20 deletions sitemap.xml

This file was deleted.