rtesseract is an R package providing a fast and flexible interface to the Tesseract OCR engine via Rcpp. It allows you to extract text from images and convert scanned PDFs into searchable PDFs.
- OCR on single or multiple images
- Convert scanned PDFs to searchable PDFs
- Control DPI and rendering options
- Cross-platform support (Windows, Linux, macOS)
- Direct integration with Tesseract C++ API
- Uses `pdftoppm` (Poppler) for PDF rasterization
- Tesseract OCR
- Leptonica
- Poppler (pdftoppm)
#' Run an external command and raise an error on a non-zero exit status.
#'
#' @param cmd Command to execute; handed to `system2()`.
#' @param args Character vector of arguments for `cmd`.
#' @param error_msg Optional error message used when the command fails. When
#'   `NULL`, the message is "Command failed: <cmd>".
#' @param echo If `TRUE`, the command line is shown with `message()` first.
#' @return `TRUE`, invisibly, when `system2()` reports exit status 0.
run_cmd <- function(cmd, args = character(), error_msg = NULL, echo = TRUE) {
  if (echo) {
    command_line <- paste(c(cmd, args), collapse = " ")
    message(">> ", command_line)
  }

  exit_code <- system2(cmd, args = args)
  if (identical(exit_code, 0L)) {
    return(invisible(TRUE))
  }

  failure <- if (is.null(error_msg)) paste("Command failed:", cmd) else error_msg
  stop(failure, call. = FALSE)
}
#' Check whether a command can be located with `Sys.which()`.
#'
#' @param cmd Name of the command to look up.
#' @return Logical: `TRUE` when `Sys.which()` returned a non-empty path.
check_cmd <- function(cmd) {
  resolved_path <- Sys.which(cmd)
  nzchar(resolved_path)
}
#' Export the Tesseract build locations as environment variables.
#'
#' Sets `TESSERACT_INCLUDE`, `TESSERACT_LIB` and (optionally) `TESSERACT_BIN`
#' for the current session and, when `scope = "renviron"`, also rewrites
#' `~/.Renviron` so that the values are available in future sessions.
#'
#' @param include_dir Directory containing the Tesseract headers.
#' @param lib_dir Directory containing the Tesseract libraries.
#' @param bin_dir Optional directory containing the Tesseract executable.
#' @param scope `"session"` (default) only calls `Sys.setenv()`;
#'   `"renviron"` additionally persists the values in `~/.Renviron`.
#' @return `TRUE`, invisibly.
write_tesseract_env <- function(include_dir,
                                lib_dir,
                                bin_dir = NULL,
                                scope = c("session", "renviron")) {
  scope <- match.arg(scope)

  Sys.setenv(
    TESSERACT_INCLUDE = include_dir,
    TESSERACT_LIB = lib_dir
  )
  if (!is.null(bin_dir)) {
    Sys.setenv(TESSERACT_BIN = bin_dir)
  }

  if (scope == "renviron") {
    renv <- path.expand("~/.Renviron")
    old <- if (file.exists(renv)) readLines(renv, warn = FALSE) else character()

    # Drop earlier definitions of the variables managed here so the file never
    # accumulates duplicate entries; every other line is kept untouched.
    old <- old[!grepl("^TESSERACT_(INCLUDE|LIB|BIN)=", old)]

    new_lines <- c(
      old,
      sprintf('TESSERACT_INCLUDE="%s"',
              normalizePath(include_dir, winslash = "/", mustWork = FALSE)),
      sprintf('TESSERACT_LIB="%s"',
              normalizePath(lib_dir, winslash = "/", mustWork = FALSE))
    )
    if (!is.null(bin_dir)) {
      new_lines <- c(
        new_lines,
        sprintf('TESSERACT_BIN="%s"',
                normalizePath(bin_dir, winslash = "/", mustWork = FALSE))
      )
    }

    writeLines(new_lines, renv)
    # The file has already been written at this point, so report it as done
    # (same wording as write_pdftoppm_env()).
    message("Added variables to ~/.Renviron")
  }

  message("TESSERACT_INCLUDE = ", Sys.getenv("TESSERACT_INCLUDE"))
  message("TESSERACT_LIB = ", Sys.getenv("TESSERACT_LIB"))
  message("TESSERACT_BIN = ", Sys.getenv("TESSERACT_BIN"))
  invisible(TRUE)
}
#' Install Tesseract on Debian/Ubuntu with apt-get and export its locations.
#'
#' Installs `tesseract-ocr`, `libtesseract-dev` and `libleptonica-dev`, then
#' probes a list of standard directories for the header, library and
#' executable and passes the result to `write_tesseract_env()`.
#'
#' @param use_sudo If `TRUE`, apt-get is run through `sudo`.
#' @param persist If `TRUE`, the variables are also written to `~/.Renviron`;
#'   otherwise they are set for the current session only.
#' @return Invisibly, a list with elements `include`, `lib` and `bin`.
install_tesseract_linux <- function(use_sudo = TRUE, persist = TRUE) {
  if (!check_cmd("apt-get")) {
    stop("This Linux function is intended for Debian/Ubuntu (apt-get).", call. = FALSE)
  }

  apt_pkgs <- c("tesseract-ocr", "libtesseract-dev", "libleptonica-dev")
  if (use_sudo) {
    run_cmd("sudo", c("apt-get", "update"), "apt-get update failed.")
    run_cmd("sudo", c("apt-get", "install", "-y", apt_pkgs),
            "APT installation of Tesseract failed.")
  } else {
    run_cmd("apt-get", c("update"), "apt-get update failed.")
    run_cmd("apt-get", c("install", "-y", apt_pkgs),
            "APT installation of Tesseract failed")
  }

  include_candidates <- c("/usr/include", "/usr/local/include")
  # The historical search order comes first; any other multiarch directory
  # (e.g. aarch64-linux-gnu) is only consulted when those yield nothing, so
  # results on x86_64 systems are unchanged.
  lib_candidates <- unique(c(
    "/usr/lib/x86_64-linux-gnu", "/usr/lib", "/usr/local/lib",
    Sys.glob("/usr/lib/*-linux-gnu*")
  ))
  bin_candidates <- c("/usr/bin", "/usr/local/bin")

  include_dir <- include_candidates[
    file.exists(file.path(include_candidates, "tesseract", "baseapi.h"))
  ][1]
  lib_dir <- lib_candidates[
    file.exists(file.path(lib_candidates, "libtesseract.so")) |
      file.exists(file.path(lib_candidates, "libtesseract.a")) |
      file.exists(file.path(lib_candidates, "libtesseract.dll.a"))
  ][1]
  bin_dir <- bin_candidates[
    file.exists(file.path(bin_candidates, "tesseract"))
  ][1]

  if (is.na(include_dir) || is.na(lib_dir)) {
    warning("Tesseract is installed, but the include/lib paths were not detected automatically")
    # Only replace the path that was not found; a successfully detected
    # directory must not be discarded in favour of the default.
    if (is.na(include_dir)) {
      include_dir <- "/usr/include"
    }
    if (is.na(lib_dir)) {
      lib_dir <- "/usr/lib"
    }
  }
  if (is.na(bin_dir)) {
    bin_dir <- "/usr/bin"
  }

  write_tesseract_env(
    include_dir = include_dir,
    lib_dir = lib_dir,
    bin_dir = bin_dir,
    scope = if (persist) "renviron" else "session"
  )
  invisible(list(include = include_dir, lib = lib_dir, bin = bin_dir))
}
#' Install Tesseract on macOS with Homebrew and export its locations.
#'
#' Runs `brew install tesseract`, queries `brew --prefix` for the Homebrew and
#' formula prefixes, probes their `include`, `lib` and `bin` directories and
#' passes the result to `write_tesseract_env()`.
#'
#' @param persist If `TRUE`, the variables are also written to `~/.Renviron`;
#'   otherwise they are set for the current session only.
#' @return Invisibly, a list with elements `prefix`, `include`, `lib`, `bin`.
install_tesseract_macos <- function(persist = TRUE) {
  if (!check_cmd("brew")) {
    stop("Homebrew is not available.", call. = FALSE)
  }
  run_cmd("brew", c("install", "tesseract"),
          "Homebrew installation of Tesseract failed.")

  brew_prefix <- trimws(system2("brew", "--prefix", stdout = TRUE))
  tess_prefix <- trimws(system2("brew", c("--prefix", "tesseract"), stdout = TRUE))

  # The formula-specific prefix is tried before the general Homebrew prefix.
  include_candidates <- c(
    file.path(tess_prefix, "include"),
    file.path(brew_prefix, "include")
  )
  lib_candidates <- c(
    file.path(tess_prefix, "lib"),
    file.path(brew_prefix, "lib")
  )
  bin_candidates <- c(
    file.path(tess_prefix, "bin"),
    file.path(brew_prefix, "bin")
  )

  include_dir <- include_candidates[
    file.exists(file.path(include_candidates, "tesseract", "baseapi.h"))
  ][1]
  lib_dir <- lib_candidates[
    file.exists(file.path(lib_candidates, "libtesseract.dylib")) |
      file.exists(file.path(lib_candidates, "libtesseract.a"))
  ][1]
  bin_dir <- bin_candidates[
    file.exists(file.path(bin_candidates, "tesseract"))
  ][1]

  if (is.na(include_dir) || is.na(lib_dir)) {
    warning("Tesseract is installed, but the include/lib paths were not detected automatically.")
    # Only replace the path that was not found; a successfully detected
    # directory must not be discarded in favour of the default.
    if (is.na(include_dir)) {
      include_dir <- file.path(brew_prefix, "include")
    }
    if (is.na(lib_dir)) {
      lib_dir <- file.path(brew_prefix, "lib")
    }
  }
  if (is.na(bin_dir)) {
    bin_dir <- file.path(brew_prefix, "bin")
  }

  write_tesseract_env(
    include_dir = include_dir,
    lib_dir = lib_dir,
    bin_dir = bin_dir,
    scope = if (persist) "renviron" else "session"
  )
  invisible(list(prefix = tess_prefix, include = include_dir, lib = lib_dir, bin = bin_dir))
}
#' Install Tesseract on Windows through the MSYS2 shell shipped with Rtools.
#'
#' Runs `pacman` inside `<rtools_root>/usr/bin/bash.exe` to install the
#' Tesseract package for the chosen MSYS2 repository, warns when the expected
#' header, library or executable is missing, and passes the
#' `<rtools_root>/<repo>/{include,lib,bin}` directories to
#' `write_tesseract_env()`.
#'
#' @param rtools_root Root directory of the Rtools installation.
#' @param repo MSYS2 repository: `"ucrt64"`, `"clang64"` or `"mingw64"`.
#' @param persist If `TRUE`, the variables are also written to `~/.Renviron`;
#'   otherwise they are set for the current session only.
#' @return Invisibly, a list with elements `prefix`, `include`, `lib`, `bin`
#'   and `packages`.
install_tesseract_windows_msys2 <- function(rtools_root = "C:/rtools45",
                                            repo = "ucrt64",
                                            persist = TRUE) {
  bash <- file.path(rtools_root, "usr", "bin", "bash.exe")
  if (!file.exists(bash)) {
    stop("bash.exe not found in Rtools/MSYS2 ", bash, call. = FALSE)
  }

  pkgs <- switch(
    repo,
    ucrt64 = c("mingw-w64-ucrt-x86_64-tesseract-ocr"),
    clang64 = c("mingw-w64-clang-x86_64-tesseract-ocr"),
    mingw64 = c("mingw-w64-x86_64-tesseract-ocr"),
    # Same wording as install_pdftoppm_windows_msys2().
    stop("repo must be 'ucrt64', 'clang64', or 'mingw64'", call. = FALSE)
  )

  msys_cmd <- sprintf("pacman -S --needed --noconfirm %s", paste(pkgs, collapse = " "))
  run_cmd(bash, c("-lc", shQuote(msys_cmd)),
          "MSYS2 installation of Tesseract failed.")

  # `repo` was validated by the switch() above, so the repository name is
  # also the name of its installation sub-directory.
  prefix <- file.path(rtools_root, repo)
  include_dir <- file.path(prefix, "include")
  lib_dir <- file.path(prefix, "lib")
  bin_dir <- file.path(prefix, "bin")

  hdr <- file.path(include_dir, "tesseract", "baseapi.h")
  if (!file.exists(hdr)) {
    warning("Tesseract header not found at the expected location: ", hdr)
  }

  libs_found <- any(file.exists(file.path(lib_dir, c(
    "libtesseract.dll.a", "libtesseract.a", "tesseract.lib"
  ))))
  if (!libs_found) {
    warning("Tesseract library not found at the expected location: ", lib_dir)
  }

  exe <- file.path(bin_dir, "tesseract.exe")
  if (!file.exists(exe)) {
    warning("Tesseract executable not found at the expected location : ", exe)
  }

  # The variables are written even when a check above warned.
  write_tesseract_env(
    include_dir = include_dir,
    lib_dir = lib_dir,
    bin_dir = bin_dir,
    scope = if (persist) "renviron" else "session"
  )
  invisible(list(
    prefix = prefix,
    include = include_dir,
    lib = lib_dir,
    bin = bin_dir,
    packages = pkgs
  ))
}
#' Install Tesseract using the installer that matches the current OS.
#'
#' @param ... Arguments forwarded unchanged to `install_tesseract_linux()`,
#'   `install_tesseract_macos()` or `install_tesseract_windows_msys2()`.
#' @return The value returned by the platform-specific installer.
install_tesseract <- function(...) {
  os_name <- Sys.info()[["sysname"]]

  # Reject anything other than the three supported system names up front.
  if (!isTRUE(os_name %in% c("Linux", "Darwin", "Windows"))) {
    stop("OS not supported: ", os_name, call. = FALSE)
  }

  switch(
    os_name,
    Linux = install_tesseract_linux(...),
    Darwin = install_tesseract_macos(...),
    Windows = install_tesseract_windows_msys2(...)
  )
}
# Example: Windows / Rtools45 / UCRT64.
# The arguments are forwarded to the platform-specific installer; `rtools_root`
# and `repo` are parameters of install_tesseract_windows_msys2() only.
install_tesseract(rtools_root = "C:/rtools45", repo = "ucrt64")
#' Download Tesseract language data files into a tessdata directory.
#'
#' For every language code, `<lang>.traineddata` is fetched from the
#' `tessdata_best` GitHub repository. Each file is downloaded to a temporary
#' file next to its destination and only copied into place after a successful
#' transfer, so a failed download never leaves a truncated `.traineddata`
#' behind that a later run would report as already installed.
#'
#' @param langs Character vector of language codes.
#' @param dest Destination directory; created recursively when missing.
#' @param overwrite If `FALSE`, files that already exist are skipped.
#' @return `dest`, invisibly. As a side effect `TESSDATA_PREFIX` is set to
#'   `dest` for the current session and a logical vector telling which
#'   language files exist is printed.
install_tesseract_languages <- function(
    langs = c("eng", "fra", "osd"),
    dest = "C:/rtools45/ucrt64/share/tessdata/",
    overwrite = FALSE
) {
  base_url <- "https://github.com/tesseract-ocr/tessdata_best/raw/main"

  # Create the destination directory if necessary.
  if (!dir.exists(dest)) {
    dir.create(dest, recursive = TRUE)
  }
  message("Installation in : ", dest)

  for (lang in langs) {
    file <- paste0(lang, ".traineddata")
    url <- paste0(base_url, "/", file)
    destfile <- file.path(dest, file)

    if (file.exists(destfile) && !overwrite) {
      message("✔ Already there : ", file)
      next
    }

    message("⬇ Download : ", file)
    # Stage the download so that `destfile` is only touched on success.
    partial <- tempfile(pattern = paste0(file, "."), tmpdir = dest, fileext = ".part")
    tryCatch({
      status <- download.file(url, partial, mode = "wb", quiet = TRUE)
      if (!identical(as.integer(status), 0L)) {
        stop("download.file() returned status ", status)
      }
      if (!file.copy(partial, destfile, overwrite = TRUE)) {
        stop("could not write ", destfile)
      }
      message("✔ OK : ", file)
    }, error = function(e) {
      # Best effort: report the failure and continue with the next language.
      message("✖ Error for ", file, " : ", conditionMessage(e))
    }, finally = {
      if (file.exists(partial)) {
        unlink(partial)
      }
    })
  }

  # Point Tesseract at the data directory for this session.
  Sys.setenv(TESSDATA_PREFIX = dest)
  message("\nTESSDATA_PREFIX defined at : ", dest)

  # Verification: one logical per requested language file.
  message("\nVérification :")
  print(file.exists(file.path(dest, paste0(langs, ".traineddata"))))
  invisible(dest)
}
# Download the default languages ("eng", "fra", "osd") into the default
# destination directory and set TESSDATA_PREFIX for this session.
install_tesseract_languages()
#' Execute an external command, stopping when it exits with a non-zero status.
#'
#' @param cmd Command to execute; handed to `system2()`.
#' @param args Character vector of arguments for `cmd`.
#' @param error_msg Optional error message used on failure. When `NULL`, the
#'   message is "Command failed: <cmd>".
#' @param echo If `TRUE`, the command line is shown with `message()` first.
#' @return `TRUE`, invisibly, when `system2()` reports exit status 0.
run_cmd <- function(cmd, args = character(), error_msg = NULL, echo = TRUE) {
  if (echo) {
    shown <- paste(c(cmd, args), collapse = " ")
    message(">> ", shown)
  }

  rc <- system2(cmd, args = args)
  succeeded <- identical(rc, 0L)
  if (!succeeded) {
    if (is.null(error_msg)) {
      stop(paste("Command failed:", cmd), call. = FALSE)
    }
    stop(error_msg, call. = FALSE)
  }

  invisible(TRUE)
}
#' Report whether `Sys.which()` can find a command.
#'
#' @param cmd Name of the command to look up.
#' @return Logical: `TRUE` when the lookup produced a non-empty path.
check_cmd <- function(cmd) {
  location <- Sys.which(cmd)
  nzchar(location)
}
#' Export the Poppler / pdftoppm locations as environment variables.
#'
#' Each non-`NULL` argument is set for the current session
#' (`POPPLER_INCLUDE`, `POPPLER_LIB`, `PDFTOPPM_BIN`). With
#' `scope = "renviron"`, existing lines for all three variables are removed
#' from `~/.Renviron` and the supplied values are appended.
#'
#' @param include_dir Optional directory containing the Poppler headers.
#' @param lib_dir Optional directory containing the Poppler libraries.
#' @param bin_dir Optional directory containing the `pdftoppm` executable.
#' @param scope `"session"` (default) or `"renviron"`.
#' @return `TRUE`, invisibly.
write_pdftoppm_env <- function(include_dir = NULL,
                               lib_dir = NULL,
                               bin_dir = NULL,
                               scope = c("session", "renviron")) {
  scope <- match.arg(scope)

  # Values keyed by environment variable, in the order they are written.
  requested <- list(
    POPPLER_INCLUDE = include_dir,
    POPPLER_LIB = lib_dir,
    PDFTOPPM_BIN = bin_dir
  )
  managed_vars <- names(requested)
  requested <- Filter(Negate(is.null), requested)

  for (var in names(requested)) {
    do.call(Sys.setenv, requested[var])
  }

  if (identical(scope, "renviron")) {
    renviron_path <- path.expand("~/.Renviron")
    kept <- character()
    if (file.exists(renviron_path)) {
      kept <- readLines(renviron_path, warn = FALSE)
    }

    # Remove previous definitions of every managed variable, supplied or not.
    for (pattern in c("^POPPLER_INCLUDE=", "^POPPLER_LIB=", "^PDFTOPPM_BIN=")) {
      kept <- kept[!grepl(pattern, kept)]
    }

    line_formats <- c(
      POPPLER_INCLUDE = 'POPPLER_INCLUDE="%s"',
      POPPLER_LIB = 'POPPLER_LIB="%s"',
      PDFTOPPM_BIN = 'PDFTOPPM_BIN="%s"'
    )
    added <- unlist(
      lapply(names(requested), function(var) {
        sprintf(
          line_formats[[var]],
          normalizePath(requested[[var]], winslash = "/", mustWork = FALSE)
        )
      }),
      use.names = FALSE
    )

    writeLines(c(kept, added), renviron_path)
    message("Added variables to ~/.Renviron")
  }

  # Report the session values of all three variables.
  for (var in managed_vars) {
    message(var, " = ", Sys.getenv(var))
  }
  invisible(TRUE)
}
#' Install Poppler / pdftoppm on Debian/Ubuntu and export its locations.
#'
#' Installs `poppler-utils` and `libpoppler-cpp-dev` with apt-get, probes a
#' list of standard directories for headers, libraries and the `pdftoppm`
#' executable, and passes the result to `write_pdftoppm_env()`.
#'
#' @param use_sudo If `TRUE`, apt-get is run through `sudo`.
#' @param persist If `TRUE`, the variables are also written to `~/.Renviron`;
#'   otherwise they are set for the current session only.
#' @return Invisibly, a list with elements `include`, `lib` and `bin`.
install_pdftoppm_linux <- function(use_sudo = TRUE, persist = TRUE) {
  if (!check_cmd("apt-get")) {
    stop("This Linux function is intended for Debian/Ubuntu (apt-get).", call. = FALSE)
  }

  apt_pkgs <- c("poppler-utils", "libpoppler-cpp-dev")
  if (use_sudo) {
    run_cmd("sudo", c("apt-get", "update"), "apt-get update failed.")
    run_cmd("sudo", c("apt-get", "install", "-y", apt_pkgs),
            "Installation APT of Poppler/pdftoppm failed.")
  } else {
    run_cmd("apt-get", c("update"), "apt-get update failed.")
    run_cmd("apt-get", c("install", "-y", apt_pkgs),
            "APT installation of Poppler/pdftoppm failed")
  }

  include_candidates <- c("/usr/include", "/usr/local/include")
  # The historical search order comes first; any other multiarch directory
  # (e.g. aarch64-linux-gnu) is only consulted when those yield nothing, so
  # results on x86_64 systems are unchanged.
  lib_candidates <- unique(c(
    "/usr/lib/x86_64-linux-gnu", "/usr/lib", "/usr/local/lib",
    Sys.glob("/usr/lib/*-linux-gnu*")
  ))
  bin_candidates <- c("/usr/bin", "/usr/local/bin")

  include_dir <- include_candidates[
    file.exists(file.path(include_candidates, "poppler", "cpp", "poppler-document.h")) |
      file.exists(file.path(include_candidates, "poppler", "poppler-config.h"))
  ][1]
  lib_dir <- lib_candidates[
    file.exists(file.path(lib_candidates, "libpoppler.so")) |
      file.exists(file.path(lib_candidates, "libpoppler-cpp.so")) |
      file.exists(file.path(lib_candidates, "libpoppler.a")) |
      file.exists(file.path(lib_candidates, "libpoppler-cpp.a"))
  ][1]
  bin_dir <- bin_candidates[
    file.exists(file.path(bin_candidates, "pdftoppm"))
  ][1]

  # Each location falls back to its default independently of the others.
  if (is.na(include_dir)) {
    warning("Poppler headers not found automatically")
    include_dir <- "/usr/include"
  }
  if (is.na(lib_dir)) {
    warning("Poppler libraries not found automatically")
    lib_dir <- "/usr/lib"
  }
  if (is.na(bin_dir)) {
    warning("pdftoppm executable not found automatically")
    bin_dir <- "/usr/bin"
  }

  write_pdftoppm_env(
    include_dir = include_dir,
    lib_dir = lib_dir,
    bin_dir = bin_dir,
    scope = if (persist) "renviron" else "session"
  )
  invisible(list(include = include_dir, lib = lib_dir, bin = bin_dir))
}
#' Install Poppler / pdftoppm on macOS with Homebrew and export its locations.
#'
#' Runs `brew install poppler`, queries `brew --prefix` for the Homebrew and
#' formula prefixes, probes their `include`, `lib` and `bin` directories and
#' passes the result to `write_pdftoppm_env()`.
#'
#' @param persist If `TRUE`, the variables are also written to `~/.Renviron`;
#'   otherwise they are set for the current session only.
#' @return Invisibly, a list with elements `prefix`, `include`, `lib`, `bin`.
install_pdftoppm_macos <- function(persist = TRUE) {
  if (!check_cmd("brew")) {
    stop("Homebrew is not available", call. = FALSE)
  }
  run_cmd("brew", c("install", "poppler"),
          "Homebrew installation of Poppler/pdftoppm failed")

  brew_prefix <- trimws(system2("brew", "--prefix", stdout = TRUE))
  poppler_prefix <- trimws(system2("brew", c("--prefix", "poppler"), stdout = TRUE))

  # First directory (or NA) that contains at least one of the given files.
  first_dir_with <- function(dirs, relative_files) {
    present <- rep(FALSE, length(dirs))
    for (rel in relative_files) {
      present <- present | file.exists(file.path(dirs, rel))
    }
    dirs[present][1]
  }

  # Formula prefix first, general Homebrew prefix second.
  candidates_for <- function(subdir) {
    c(file.path(poppler_prefix, subdir), file.path(brew_prefix, subdir))
  }

  include_dir <- first_dir_with(
    candidates_for("include"),
    c(
      file.path("poppler", "cpp", "poppler-document.h"),
      file.path("poppler", "poppler-config.h")
    )
  )
  lib_dir <- first_dir_with(
    candidates_for("lib"),
    c("libpoppler.dylib", "libpoppler-cpp.dylib", "libpoppler.a")
  )
  bin_dir <- first_dir_with(candidates_for("bin"), "pdftoppm")

  if (is.na(include_dir)) {
    warning("Poppler headers not found automatically")
    include_dir <- file.path(poppler_prefix, "include")
  }
  if (is.na(lib_dir)) {
    warning("Poppler libraries not found automatically")
    lib_dir <- file.path(poppler_prefix, "lib")
  }
  if (is.na(bin_dir)) {
    warning("pdftoppm executable not found automatically")
    bin_dir <- file.path(poppler_prefix, "bin")
  }

  env_scope <- if (persist) "renviron" else "session"
  write_pdftoppm_env(
    include_dir = include_dir,
    lib_dir = lib_dir,
    bin_dir = bin_dir,
    scope = env_scope
  )

  invisible(list(
    prefix = poppler_prefix,
    include = include_dir,
    lib = lib_dir,
    bin = bin_dir
  ))
}
#' Install Poppler / pdftoppm on Windows through the MSYS2 shell of Rtools.
#'
#' Runs `pacman` inside `<rtools_root>/usr/bin/bash.exe` to install the
#' Poppler package for the chosen MSYS2 repository, warns when the expected
#' headers, libraries or executable are missing, and passes the
#' `<rtools_root>/<repo>/{include,lib,bin}` directories to
#' `write_pdftoppm_env()`.
#'
#' @param rtools_root Root directory of the Rtools installation.
#' @param repo MSYS2 repository: `"ucrt64"`, `"clang64"` or `"mingw64"`.
#' @param persist If `TRUE`, the variables are also written to `~/.Renviron`;
#'   otherwise they are set for the current session only.
#' @return Invisibly, a list with elements `prefix`, `include`, `lib`, `bin`
#'   and `packages`.
install_pdftoppm_windows_msys2 <- function(rtools_root = "C:/rtools45",
                                           repo = "ucrt64",
                                           persist = TRUE) {
  bash_exe <- file.path(rtools_root, "usr", "bin", "bash.exe")
  if (!file.exists(bash_exe)) {
    stop("bash.exe not found in Rtools/MSYS2 ", bash_exe, call. = FALSE)
  }

  pacman_pkgs <- switch(
    repo,
    ucrt64 = c("mingw-w64-ucrt-x86_64-poppler"),
    clang64 = c("mingw-w64-clang-x86_64-poppler"),
    mingw64 = c("mingw-w64-x86_64-poppler"),
    stop("repo must be 'ucrt64', 'clang64', or 'mingw64'")
  )

  pacman_call <- sprintf(
    "pacman -S --needed --noconfirm %s",
    paste(pacman_pkgs, collapse = " ")
  )
  run_cmd(
    bash_exe,
    c("-lc", shQuote(pacman_call)),
    "MSYS2 installation of Poppler/pdftoppm failed."
  )

  # `repo` passed the switch() above, so it is one of the three known names,
  # each of which is also the name of its installation sub-directory.
  prefix <- file.path(rtools_root, repo)
  include_dir <- file.path(prefix, "include")
  lib_dir <- file.path(prefix, "lib")
  bin_dir <- file.path(prefix, "bin")

  header_paths <- c(
    file.path(include_dir, "poppler", "cpp", "poppler-document.h"),
    file.path(include_dir, "poppler", "poppler-config.h")
  )
  if (!any(file.exists(header_paths))) {
    warning("Poppler headers not found at the expected location : ", include_dir)
  }

  library_names <- c(
    "libpoppler.dll.a",
    "libpoppler-cpp.dll.a",
    "libpoppler.a",
    "poppler.lib"
  )
  if (!any(file.exists(file.path(lib_dir, library_names)))) {
    warning("Poppler libraries not found at the expected location : ", lib_dir)
  }

  pdftoppm_exe <- file.path(bin_dir, "pdftoppm.exe")
  if (!file.exists(pdftoppm_exe)) {
    warning("pdftoppm executable not found at the expected location : ", pdftoppm_exe)
  }

  # The variables are written even when a check above warned.
  env_scope <- if (persist) "renviron" else "session"
  write_pdftoppm_env(
    include_dir = include_dir,
    lib_dir = lib_dir,
    bin_dir = bin_dir,
    scope = env_scope
  )

  invisible(list(
    prefix = prefix,
    include = include_dir,
    lib = lib_dir,
    bin = bin_dir,
    packages = pacman_pkgs
  ))
}
#' Install Poppler / pdftoppm using the installer matching the current OS.
#'
#' @param ... Arguments forwarded unchanged to `install_pdftoppm_linux()`,
#'   `install_pdftoppm_macos()` or `install_pdftoppm_windows_msys2()`.
#' @return The value returned by the platform-specific installer.
install_pdftoppm <- function(...) {
  os_name <- Sys.info()[["sysname"]]

  # Reject anything other than the three supported system names up front.
  if (!isTRUE(os_name %in% c("Linux", "Darwin", "Windows"))) {
    stop("OS not supported: ", os_name, call. = FALSE)
  }

  switch(
    os_name,
    Linux = install_pdftoppm_linux(...),
    Darwin = install_pdftoppm_macos(...),
    Windows = install_pdftoppm_windows_msys2(...)
  )
}
# Example: Windows / Rtools45 / UCRT64.
# The arguments are forwarded to the platform-specific installer; `rtools_root`
# and `repo` are parameters of install_pdftoppm_windows_msys2() only.
install_pdftoppm(rtools_root = "C:/rtools45", repo = "ucrt64")
library(rtesseract)
#### Open an OCR engine with the French ("fra") language data
# NOTE(review): the handle is named `eng` but is opened with language = "fra";
# confirm which language is intended for this first part of the walkthrough.
eng <- tess_open(
language = "fra",
datapath = "C:/rtools45/ucrt64/share/tessdata/"
)
# Relative file names used below (e.g. "lnCHO.png") refer to this directory.
setwd("D:/Dropbox/test_Tesseract/")
#### Engine information
tess_info(eng)
#### OCR of a whole image file
txt <- tess_ocr_file(eng, "lnCHO.png")
cat(txt)
#### OCR of one zone of the image ####
# presumably (x, y, width, height) in pixels; TODO confirm against tess_ocr_file()
rect <- c(1, 20, 30, 15)
txt <- tess_ocr_file(eng, "lnCHO.png", rect)
cat(txt)
#### TSV output (first 500 characters shown) ####
tsv <- tess_tsv_file(eng, "lnCHO.png")
cat(substr(tsv, 1, 500))
#### hOCR (structured HTML) output (first 500 characters shown) ####
hocr <- tess_hocr_file(eng, "lnCHO.png")
cat(substr(hocr, 1, 500))
#### Bounding boxes at word level ####
words <- tess_boxes_file(eng, "lnCHO.png", level = "word")
print(words)
#### Lines ####
# NOTE(review): this repeats the word-level call above; a line-level value of
# `level` was presumably intended here. Confirm the values tess_boxes_file()
# accepts.
words <- tess_boxes_file(eng, "lnCHO.png", level = "word")
print(words)
#### OCR of multiple zones ####
# One rectangle per row (byrow = TRUE, 4 columns); presumably the same
# coordinate convention as `rect` above (TODO confirm).
rects <- matrix(
c(10, 20, 50, 50,
10, 80, 50, 60),
ncol = 4,
byrow = TRUE
)
res <- tess_ocr_rects_file(eng, "lnCHO.png", rects)
print(res)
#### Confidence
conf <- tess_confidence_file(eng, "lnCHO.png")
print(conf)
#### Engine settings: page segmentation mode and character whitelist
# NOTE(review): these are applied to `eng` right before the PDF export below;
# if settings persist on the handle, the digits-only whitelist would also
# affect that export. Confirm this is intended.
tess_set_page_seg_mode(eng, 6)
tess_set_variable(eng, "tessedit_char_whitelist", "0123456789")
#### Image(s) to searchable PDF
tess_searchable_pdf_from_images(
eng,
image_paths = c("lnCHO.png"),
output_pdf = "output.pdf",
dpi = 300
)
# Switch to a different working directory and open a new engine with the
# English ("eng") language data; `eng` is reassigned here.
setwd("D:/test_Tesseract/")
eng <- tess_open(
language = "eng",
datapath = "C:/rtools45/ucrt64/share/tessdata/"
)
#### Scanned PDF to searchable PDF
# Input and output are given as absolute Windows paths (escaped backslashes).
tess_searchable_pdf_from_scanned_pdf(
eng,
input_pdf = "D:\\test_Tesseract\\scan.pdf",
output_pdf = "D:\\test_Tesseract\\scan_ocr.pdf",
dpi = 300
)
MIT License
Emmanuel Hamel
Pull requests are welcome! Feel free to open issues for bugs or feature requests.