From 1eae85ee5ee4ab4829365b59d9de807a2ade9458 Mon Sep 17 00:00:00 2001
From: Garrick Aden-Buie
Date: Fri, 20 Feb 2026 10:46:51 -0500
Subject: [PATCH] feat: strip inline data URI images from LLM markdown output

Replace base64-encoded and other data URI images with text placeholders
(using alt text when available) during HTML-to-markdown conversion. This
prevents large base64 strings from wasting LLM context tokens.
---
Review notes (ignored by `git am`, not part of the commit message):
- The image selector now uses starts-with() instead of contains(), so only
  genuine data URIs are replaced; URLs that merely contain "data:" are kept.
- The two <img> fixtures in the tests were reconstructed from the test names
  and expected values. TODO: confirm them against the original commit.
- The author e-mail address is absent from the From: header in this copy.
- The blob ids on the `index` lines are those of the original patch.

 R/build-llm.R                   | 27 +++++++++++++++++++++++++++
 tests/testthat/test-build-llm.R | 16 ++++++++++++++++
 2 files changed, 43 insertions(+)

diff --git a/R/build-llm.R b/R/build-llm.R
index 15284e639..1b32ec657 100644
--- a/R/build-llm.R
+++ b/R/build-llm.R
@@ -70,6 +70,7 @@ convert_md <- function(src_path, dst_path, url = NULL) {
   simplify_popovers_to_footnotes(main_html)
   simplify_lifecycle_badges(main_html)
   simplify_dls(main_html)
+  simplify_inline_images(main_html)
   create_absolute_links(main_html, url)
 
   path <- file_temp()
@@ -182,6 +183,32 @@ simplify_lifecycle_badges <- function(html) {
   invisible()
 }
 
+# Replace inline `data:` URI images with a short text placeholder.
+#
+# Finds every <img> whose `src` starts with "data:" and swaps it, in place,
+# for a <span> containing "[Image: <alt>]" (or "[Image]" when the alt text is
+# missing or empty). `html` is modified by reference; returns NULL invisibly.
+simplify_inline_images <- function(html) {
+  # starts-with() rather than contains(): only genuine data URIs should match,
+  # not ordinary URLs that merely contain "data:" somewhere (e.g. in a query).
+  img_nodes <- xml2::xml_find_all(
+    html,
+    ".//img[starts-with(normalize-space(@src), 'data:')]"
+  )
+
+  purrr::walk(img_nodes, function(img) {
+    alt_text <- xml2::xml_attr(img, "alt")
+    replacement <- if (!is.na(alt_text) && nzchar(alt_text)) {
+      sprintf("[Image: %s]", alt_text)
+    } else {
+      "[Image]"
+    }
+    xml2::xml_replace(img, "span", replacement)
+  })
+
+  invisible()
+}
+
 create_absolute_links <- function(main_html, url = NULL) {
   a <- xml2::xml_find_all(main_html, ".//a")
   xml2::xml_attr(a, "class") <- NULL
diff --git a/tests/testthat/test-build-llm.R b/tests/testthat/test-build-llm.R
index 5c71f6474..9740ef224 100644
--- a/tests/testthat/test-build-llm.R
+++ b/tests/testthat/test-build-llm.R
@@ -31,6 +31,22 @@ test_that("replaces lifecycle badges with strong text", {
   )
 })
 
+test_that("replaces inline data URI images with text placeholders", {
+  html <- xml2::read_html(
+    '<img src="data:image/png;base64,iVBORw0KGgo=" alt="A plot">'
+  )
+  simplify_inline_images(html)
+  expect_equal(xpath_text(html, ".//span"), "[Image: A plot]")
+})
+
+test_that("replaces inline data URI images without alt text", {
+  html <- xml2::read_html(
+    '<img src="data:image/png;base64,iVBORw0KGgo=">'
+  )
+  simplify_inline_images(html)
+  expect_equal(xpath_text(html, ".//span"), "[Image]")
+})
+
 test_that("converts internal urls to absolute with .md ending", {
   html <- xml2::read_html(
     r"(