diff --git a/.github/workflows/install-tester.yml b/.github/workflows/install-tester.yml index edf39cf..e1b4320 100644 --- a/.github/workflows/install-tester.yml +++ b/.github/workflows/install-tester.yml @@ -8,7 +8,7 @@ on: jobs: gha: - runs-on: "ubuntu-20.04" + runs-on: "ubuntu-latest" steps: - uses: untitaker/hyperlink@0.1.44 with: diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 9f34a48..0d5332e 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -1,4 +1,4 @@ -# This file was autogenerated by cargo-dist: https://opensource.axo.dev/cargo-dist/ +# This file was autogenerated by dist: https://axodotdev.github.io/cargo-dist # # Copyright 2022-2024, axodotdev # SPDX-License-Identifier: MIT or Apache-2.0 @@ -6,7 +6,7 @@ # CI that: # # * checks for a Git Tag that looks like a release -# * builds artifacts with cargo-dist (archives, installers, hashes) +# * builds artifacts with dist (archives, installers, hashes) # * uploads those artifacts to temporary workflow zip # * on success, uploads the artifacts to a GitHub Release # @@ -24,10 +24,10 @@ permissions: # must be a Cargo-style SemVer Version (must have at least major.minor.patch). # # If PACKAGE_NAME is specified, then the announcement will be for that -# package (erroring out if it doesn't have the given version or isn't cargo-dist-able). +# package (erroring out if it doesn't have the given version or isn't dist-able). # # If PACKAGE_NAME isn't specified, then the announcement will be for all -# (cargo-dist-able) packages in the workspace with that version (this mode is +# (dist-able) packages in the workspace with that version (this mode is # intended for workspaces with only one dist-able package, or with all dist-able # packages versioned/released in lockstep). 
# @@ -45,9 +45,9 @@ on: - '**[0-9]+.[0-9]+.[0-9]+*' jobs: - # Run 'cargo dist plan' (or host) to determine what tasks we need to do + # Run 'dist plan' (or host) to determine what tasks we need to do plan: - runs-on: "ubuntu-20.04" + runs-on: "ubuntu-22.04" outputs: val: ${{ steps.plan.outputs.manifest }} tag: ${{ !github.event.pull_request && github.ref_name || '' }} @@ -58,17 +58,18 @@ jobs: steps: - uses: actions/checkout@v4 with: + persist-credentials: false submodules: recursive - - name: Install cargo-dist + - name: Install dist # we specify bash to get pipefail; it guards against the `curl` command # failing. otherwise `sh` won't catch that `curl` returned non-0 shell: bash - run: "curl --proto '=https' --tlsv1.2 -LsSf https://github.com/axodotdev/cargo-dist/releases/download/v0.23.0/cargo-dist-installer.sh | sh" - - name: Cache cargo-dist + run: "curl --proto '=https' --tlsv1.2 -LsSf https://github.com/axodotdev/cargo-dist/releases/download/v0.30.2/cargo-dist-installer.sh | sh" + - name: Cache dist uses: actions/upload-artifact@v4 with: name: cargo-dist-cache - path: ~/.cargo/bin/cargo-dist + path: ~/.cargo/bin/dist # sure would be cool if github gave us proper conditionals... # so here's a doubly-nested ternary-via-truthiness to try to provide the best possible # functionality based on whether this is a pull_request, and whether it's from a fork. @@ -76,8 +77,8 @@ jobs: # but also really annoying to build CI around when it needs secrets to work right.) - id: plan run: | - cargo dist ${{ (!github.event.pull_request && format('host --steps=create --tag={0}', github.ref_name)) || 'plan' }} --output-format=json > plan-dist-manifest.json - echo "cargo dist ran successfully" + dist ${{ (!github.event.pull_request && format('host --steps=create --tag={0}', github.ref_name)) || 'plan' }} --output-format=json > plan-dist-manifest.json + echo "dist ran successfully" cat plan-dist-manifest.json echo "manifest=$(jq -c "." 
plan-dist-manifest.json)" >> "$GITHUB_OUTPUT" - name: "Upload dist-manifest.json" @@ -95,18 +96,19 @@ jobs: if: ${{ fromJson(needs.plan.outputs.val).ci.github.artifacts_matrix.include != null && (needs.plan.outputs.publishing == 'true' || fromJson(needs.plan.outputs.val).ci.github.pr_run_mode == 'upload') }} strategy: fail-fast: false - # Target platforms/runners are computed by cargo-dist in create-release. + # Target platforms/runners are computed by dist in create-release. # Each member of the matrix has the following arguments: # # - runner: the github runner - # - dist-args: cli flags to pass to cargo dist - # - install-dist: expression to run to install cargo-dist on the runner + # - dist-args: cli flags to pass to dist + # - install-dist: expression to run to install dist on the runner # # Typically there will be: # - 1 "global" task that builds universal installers # - N "local" tasks that build each platform's binaries and platform-specific installers matrix: ${{ fromJson(needs.plan.outputs.val).ci.github.artifacts_matrix }} runs-on: ${{ matrix.runner }} + container: ${{ matrix.container && matrix.container.image || null }} env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} BUILD_MANIFEST_NAME: target/distrib/${{ join(matrix.targets, '-') }}-dist-manifest.json @@ -116,9 +118,17 @@ jobs: git config --global core.longpaths true - uses: actions/checkout@v4 with: + persist-credentials: false submodules: recursive - - name: Install cargo-dist - run: ${{ matrix.install_dist }} + - name: Install Rust non-interactively if not already installed + if: ${{ matrix.container }} + run: | + if ! 
command -v cargo > /dev/null 2>&1; then + curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y + echo "$HOME/.cargo/bin" >> $GITHUB_PATH + fi + - name: Install dist + run: ${{ matrix.install_dist.run }} # Get the dist-manifest - name: Fetch local artifacts uses: actions/download-artifact@v4 @@ -132,8 +142,8 @@ jobs: - name: Build artifacts run: | # Actually do builds and make zips and whatnot - cargo dist build ${{ needs.plan.outputs.tag-flag }} --print=linkage --output-format=json ${{ matrix.dist_args }} > dist-manifest.json - echo "cargo dist ran successfully" + dist build ${{ needs.plan.outputs.tag-flag }} --print=linkage --output-format=json ${{ matrix.dist_args }} > dist-manifest.json + echo "dist ran successfully" - id: cargo-dist name: Post-build # We force bash here just because github makes it really hard to get values up @@ -143,7 +153,7 @@ jobs: run: | # Parse out what we just built and upload it to scratch storage echo "paths<> "$GITHUB_OUTPUT" - jq --raw-output ".upload_files[]" dist-manifest.json >> "$GITHUB_OUTPUT" + dist print-upload-files-from-manifest --manifest dist-manifest.json >> "$GITHUB_OUTPUT" echo "EOF" >> "$GITHUB_OUTPUT" cp dist-manifest.json "$BUILD_MANIFEST_NAME" @@ -160,20 +170,21 @@ jobs: needs: - plan - build-local-artifacts - runs-on: "ubuntu-20.04" + runs-on: "ubuntu-22.04" env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} BUILD_MANIFEST_NAME: target/distrib/global-dist-manifest.json steps: - uses: actions/checkout@v4 with: + persist-credentials: false submodules: recursive - - name: Install cached cargo-dist + - name: Install cached dist uses: actions/download-artifact@v4 with: name: cargo-dist-cache path: ~/.cargo/bin/ - - run: chmod +x ~/.cargo/bin/cargo-dist + - run: chmod +x ~/.cargo/bin/dist # Get all the local artifacts for the global tasks to use (for e.g. 
checksums) - name: Fetch local artifacts uses: actions/download-artifact@v4 @@ -184,8 +195,8 @@ jobs: - id: cargo-dist shell: bash run: | - cargo dist build ${{ needs.plan.outputs.tag-flag }} --output-format=json "--artifacts=global" > dist-manifest.json - echo "cargo dist ran successfully" + dist build ${{ needs.plan.outputs.tag-flag }} --output-format=json "--artifacts=global" > dist-manifest.json + echo "dist ran successfully" # Parse out what we just built and upload it to scratch storage echo "paths<> "$GITHUB_OUTPUT" @@ -206,23 +217,24 @@ jobs: - plan - build-local-artifacts - build-global-artifacts - # Only run if we're "publishing", and only if local and global didn't fail (skipped is fine) - if: ${{ always() && needs.plan.outputs.publishing == 'true' && (needs.build-global-artifacts.result == 'skipped' || needs.build-global-artifacts.result == 'success') && (needs.build-local-artifacts.result == 'skipped' || needs.build-local-artifacts.result == 'success') }} + # Only run if we're "publishing", and only if plan, local and global didn't fail (skipped is fine) + if: ${{ always() && needs.plan.result == 'success' && needs.plan.outputs.publishing == 'true' && (needs.build-global-artifacts.result == 'skipped' || needs.build-global-artifacts.result == 'success') && (needs.build-local-artifacts.result == 'skipped' || needs.build-local-artifacts.result == 'success') }} env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - runs-on: "ubuntu-20.04" + runs-on: "ubuntu-22.04" outputs: val: ${{ steps.host.outputs.manifest }} steps: - uses: actions/checkout@v4 with: + persist-credentials: false submodules: recursive - - name: Install cached cargo-dist + - name: Install cached dist uses: actions/download-artifact@v4 with: name: cargo-dist-cache path: ~/.cargo/bin/ - - run: chmod +x ~/.cargo/bin/cargo-dist + - run: chmod +x ~/.cargo/bin/dist # Fetch artifacts from scratch-storage - name: Fetch artifacts uses: actions/download-artifact@v4 @@ -233,7 +245,7 @@ jobs: - id: host shell: 
bash run: | - cargo dist host ${{ needs.plan.outputs.tag-flag }} --steps=upload --steps=release --output-format=json > dist-manifest.json + dist host ${{ needs.plan.outputs.tag-flag }} --steps=upload --steps=release --output-format=json > dist-manifest.json echo "artifacts uploaded and released successfully" cat dist-manifest.json echo "manifest=$(jq -c "." dist-manifest.json)" >> "$GITHUB_OUTPUT" @@ -270,7 +282,7 @@ jobs: needs: - plan - host - runs-on: "ubuntu-20.04" + runs-on: "ubuntu-22.04" env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} PLAN: ${{ needs.plan.outputs.val }} @@ -303,10 +315,11 @@ jobs: # still allowing individual publish jobs to skip themselves (for prereleases). # "host" however must run to completion, no skipping allowed! if: ${{ always() && needs.host.result == 'success' && (needs.publish-npm.result == 'skipped' || needs.publish-npm.result == 'success') }} - runs-on: "ubuntu-20.04" + runs-on: "ubuntu-22.04" env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} steps: - uses: actions/checkout@v4 with: + persist-credentials: false submodules: recursive diff --git a/README.md b/README.md index 7394c66..691bb5d 100644 --- a/README.md +++ b/README.md @@ -184,6 +184,41 @@ and `--github-actions` feature. fairly feature-rich, but was a non-starter due to performance. This applies to other countless link checkers we tried that are not mentioned here. +## Redirects + +Since 0.1.45 `hyperlink` supports reading configured redirects from a file. + +At the root of your site, make a file `_redirects`: + +``` +# lines starting with # are ignored +/old-url.html /new-url.html + +# on the next line, trailing data like the 301 status code is ignored +/old-url2.html /new-url2.html 301 + +# /old-url.html will become a valid link target +# hyperlink will validate that /new-url.html exists. 
+``` + +This format is supported by at least Netlify, [Codeberg +pages](https://codeberg.page) and [Grebedoc](https://grebedoc.dev). + +References for this format can be found at +[Codeberg](https://docs.codeberg.org/codeberg-pages/redirects/) and +[Netlify](https://docs.netlify.com/manage/routing/redirects/overview/). + +The major things missing from the implementation are: + +* `hyperlink` completely ignores any status codes or country code conditions. + The only thing it parses is `from to`, and the rest is ignored. + +* "Splat sources" (`/articles/*`) and "splat targets" (`/posts/:splat`) are + not supported. + +* Generally speaking, `hyperlink` does not support "pretty URLs", i.e. one + cannot request `/mypage` and expect `mypage.html` to be loaded. + ## Testimonials > We use Hyperlink to check for dead links on diff --git a/dist-workspace.toml b/dist-workspace.toml index 25455ce..c09736d 100644 --- a/dist-workspace.toml +++ b/dist-workspace.toml @@ -1,10 +1,10 @@ [workspace] members = ["cargo:."] -# Config for 'cargo dist' +# Config for 'dist' [dist] -# The preferred cargo-dist version to use in CI (Cargo.toml SemVer syntax) -cargo-dist-version = "0.23.0" +# The preferred dist version to use in CI (Cargo.toml SemVer syntax) +cargo-dist-version = "0.30.2" # CI backends to support ci = "github" # The installers to generate for each app diff --git a/src/html/mod.rs b/src/html/mod.rs index 37d78b5..6751d3f 100644 --- a/src/html/mod.rs +++ b/src/html/mod.rs @@ -3,7 +3,7 @@ mod parser; use std::borrow::Cow; use std::fmt; use std::fs; -use std::io::Read; +use std::io::{BufRead, BufReader, Read}; use std::path::{Path, PathBuf}; use std::str; use std::sync::Arc; @@ -308,6 +308,41 @@ impl Document { Href(href.into_bump_str()) } + pub fn extract_links<'b, 'l, P: ParagraphWalker, F>( + &self, + doc_buf: &'b mut DocumentBuffers, + check_anchors: bool, + mut callback: F, + ) -> Result + where + 'b: 'l, + F: FnMut(Link<'l, P::Paragraph>), + { + if self.href == "_redirects" 
{ + for link in self.parse_redirects::

(doc_buf, check_anchors)? { + callback(link); + } + return Ok(true); + } + + if self + .path + .extension() + .and_then(|extension| { + let ext = extension.to_str()?; + Some(ext == "html" || ext == "htm") + }) + .unwrap_or(false) + { + for link in self.links_from_html::

(doc_buf, check_anchors)? { + callback(link); + } + return Ok(true); + } + + Ok(false) + } + pub fn links<'b, 'l, P: ParagraphWalker>( &self, doc_buf: &'b mut DocumentBuffers, @@ -319,6 +354,62 @@ impl Document { self.links_from_read::<_, P>(doc_buf, fs::File::open(&*self.path)?, check_anchors) } + fn links_from_html<'b, 'l, P: ParagraphWalker>( + &self, + doc_buf: &'b mut DocumentBuffers, + check_anchors: bool, + ) -> Result>, Error> + where + 'b: 'l, + { + self.links_from_read::<_, P>(doc_buf, fs::File::open(&*self.path)?, check_anchors) + } + + fn parse_redirects<'b, 'l, P: ParagraphWalker>( + &self, + doc_buf: &'b mut DocumentBuffers, + check_anchors: bool, + ) -> Result>, Error> + where + 'b: 'l, + { + let mut link_buf = BumpVec::new_in(&doc_buf.arena); + let file = fs::File::open(&*self.path)?; + let reader = BufReader::new(file); + + for line in reader.lines() { + let line = line?; + + let trimmed = line.trim(); + if trimmed.is_empty() || trimmed.starts_with('#') { + continue; + } + + let parts: Vec<&str> = trimmed.split_whitespace().collect(); + if parts.len() >= 2 { + let source = parts[0]; + let target = parts[1]; + + let source_str = doc_buf.arena.alloc_str(source); + let target_str = doc_buf.arena.alloc_str(target); + + link_buf.push(Link::Defines(DefinedLink { + href: self.join(&doc_buf.arena, check_anchors, source_str), + })); + + if !is_external_link(target.as_bytes()) { + link_buf.push(Link::Uses(UsedLink { + href: self.join(&doc_buf.arena, check_anchors, target_str), + path: self.path.clone(), + paragraph: None, + })); + } + } + } + + Ok(link_buf.into_iter()) + } + fn links_from_read<'b, 'l, R: Read, P: ParagraphWalker>( &self, doc_buf: &'b mut DocumentBuffers, diff --git a/src/main.rs b/src/main.rs index 6727478..1051c63 100755 --- a/src/main.rs +++ b/src/main.rs @@ -468,26 +468,17 @@ fn extract_html_links, P: ParagraphWalker>( })); file_count += 1; - if !document - .path - .extension() - .and_then(|extension| 
Some(HTML_FILES.contains(&extension.to_str()?))) - .unwrap_or(false) - { - return Ok((doc_buf, collector, documents_count, file_count)); + let was_parsed = document + .extract_links::(&mut doc_buf, check_anchors, |link| { + collector.ingest(link); + }) + .with_context(|| format!("Failed to read file {}", document.path.display()))?; + + if was_parsed { + doc_buf.reset(); + documents_count += 1; } - for link in document - .links::

(&mut doc_buf, check_anchors) - .with_context(|| format!("Failed to read file {}", document.path.display()))? - { - collector.ingest(link); - } - - doc_buf.reset(); - - documents_count += 1; - Ok((doc_buf, collector, documents_count, file_count)) }, ) diff --git a/tests/cli_snapshots.rs b/tests/cli_snapshots.rs index 7d1dac6..eb03827 100644 --- a/tests/cli_snapshots.rs +++ b/tests/cli_snapshots.rs @@ -1,3 +1,4 @@ +use assert_fs::prelude::*; use insta_cmd::{assert_cmd_snapshot, get_cargo_bin}; use std::process::Command; @@ -98,3 +99,85 @@ fn test_version() { ----- stderr ----- "###); } + +#[test] +fn test_redirects() { + let site = assert_fs::TempDir::new().unwrap(); + + site.child("_redirects") + .write_str( + "# This is a comment\n\ + \n\ + /old-page /new-page.html 301\n\ + /external https://example.com/page\n\ + /broken /missing-page.html\n\ + /another /target.html", + ) + .unwrap(); + + site.child("new-page.html").touch().unwrap(); + site.child("target.html").touch().unwrap(); + + site.child("index.html") + .write_str("link") + .unwrap(); + + let mut settings = insta::Settings::clone_current(); + settings.add_filter(r"[/\\]", "/"); + let _guard = settings.bind_to_scope(); + + assert_cmd_snapshot!(cli().arg(".").current_dir(site.path()), @r###" + success: false + exit_code: 1 + ----- stdout ----- + Reading files + Checking 4 links from 4 files (4 documents) + ./_redirects + error: bad link /missing-page.html + + Found 1 bad links + + ----- stderr ----- + "###); + + site.close().unwrap(); +} + +#[test] +fn test_redirects_only_at_root() { + let site = assert_fs::TempDir::new().unwrap(); + + site.child("_redirects") + .write_str("/old-page /new-page.html") + .unwrap(); + + site.child("subdir/_redirects") + .write_str("/sub-old /sub-new.html") + .unwrap(); + + site.child("new-page.html").touch().unwrap(); + + site.child("index.html") + .write_str("link to oldlink to sub") + .unwrap(); + + let mut settings = insta::Settings::clone_current(); + 
settings.add_filter(r"[/\\]", "/"); + let _guard = settings.bind_to_scope(); + + assert_cmd_snapshot!(cli().arg(".").current_dir(site.path()), @r###" + success: false + exit_code: 1 + ----- stdout ----- + Reading files + Checking 3 links from 4 files (3 documents) + ./index.html + error: bad link /sub-old + + Found 1 bad links + + ----- stderr ----- + "###); + + site.close().unwrap(); +}