Skip to content

ManuHamel/rtesseract

Folders and files

NameName
Last commit message
Last commit date

Latest commit

 

History

5 Commits
 
 
 
 
 
 
 
 
 
 
 
 
 
 

Repository files navigation

rtesseract

rtesseract is an R package providing a fast and flexible interface to the Tesseract OCR engine via Rcpp. It allows you to extract text from images and convert scanned PDFs into searchable PDFs.


✨ Features

  • OCR on single or multiple images
  • Convert scanned PDFs to searchable PDFs
  • Control DPI and rendering options
  • Cross-platform support (Windows, Linux, macOS)
  • Direct integration with Tesseract C++ API
  • Uses pdftoppm (Poppler) for PDF rasterization

📦 Installation

1. Install system dependencies


📌 Dependencies

  • Tesseract OCR
  • Leptonica
  • Poppler (pdftoppm)

Installing Tesseract

# Run an external command through system2() and abort on failure.
#
# cmd:       program to run.
# args:      character vector of arguments handed to the program.
# error_msg: message raised when the command fails; when NULL a generic
#            "Command failed: <cmd>" message is used instead.
# echo:      when TRUE, the command line is echoed with message() first.
#
# Returns TRUE invisibly. An error is raised unless system2() returns
# exactly 0L.
run_cmd <- function(cmd, args = character(), error_msg = NULL, echo = TRUE) {
  if (echo) {
    command_line <- paste(c(cmd, args), collapse = " ")
    message(">> ", command_line)
  }

  exit_code <- system2(cmd, args = args)
  if (identical(exit_code, 0L)) {
    return(invisible(TRUE))
  }

  failure <- error_msg
  if (is.null(failure)) {
    failure <- paste("Command failed:", cmd)
  }
  stop(failure, call. = FALSE)
}

# Report whether an executable called `cmd` can be located with Sys.which().
# The result is nzchar() applied to the resolved path(s): TRUE where a
# non-empty path was found.
check_cmd <- function(cmd) {
  resolved_path <- Sys.which(cmd)
  nzchar(resolved_path)
}

# Expose the Tesseract include/lib/bin directories through environment
# variables and optionally persist them in the user's ~/.Renviron.
#
# include_dir: directory stored in TESSERACT_INCLUDE.
# lib_dir:     directory stored in TESSERACT_LIB.
# bin_dir:     optional directory stored in TESSERACT_BIN (skipped when NULL).
# scope:       "session" only sets the variables for the running R session;
#              "renviron" additionally rewrites ~/.Renviron.
#
# Returns TRUE invisibly.
write_tesseract_env <- function(include_dir,
                                lib_dir,
                                bin_dir = NULL,
                                scope = c("session", "renviron")) {
  scope <- match.arg(scope)

  Sys.setenv(
    TESSERACT_INCLUDE = include_dir,
    TESSERACT_LIB = lib_dir
  )

  if (!is.null(bin_dir)) {
    Sys.setenv(TESSERACT_BIN = bin_dir)
  }

  if (scope == "renviron") {
    renv <- path.expand("~/.Renviron")
    old <- if (file.exists(renv)) readLines(renv, warn = FALSE) else character()

    # Drop entries written by an earlier run so each variable appears once.
    old <- old[!grepl("^TESSERACT_(INCLUDE|LIB|BIN)=", old)]

    # Paths are stored normalized with forward slashes.
    new_lines <- c(
      old,
      sprintf('TESSERACT_INCLUDE="%s"',
              normalizePath(include_dir, winslash = "/", mustWork = FALSE)),
      sprintf('TESSERACT_LIB="%s"',
              normalizePath(lib_dir, winslash = "/", mustWork = FALSE))
    )

    if (!is.null(bin_dir)) {
      new_lines <- c(
        new_lines,
        sprintf('TESSERACT_BIN="%s"',
                normalizePath(bin_dir, winslash = "/", mustWork = FALSE))
      )
    }

    writeLines(new_lines, renv)
    # The file has already been rewritten at this point, so report it as done
    # (the previous wording suggested the user still had to add the lines).
    message("Added variables to ~/.Renviron")
  }

  message("TESSERACT_INCLUDE = ", Sys.getenv("TESSERACT_INCLUDE"))
  message("TESSERACT_LIB     = ", Sys.getenv("TESSERACT_LIB"))
  message("TESSERACT_BIN     = ", Sys.getenv("TESSERACT_BIN"))

  invisible(TRUE)
}

# Install Tesseract and its development files on Debian/Ubuntu via apt-get,
# then locate the include/lib/bin directories and export them with
# write_tesseract_env().
#
# use_sudo: when TRUE the apt-get commands are run through sudo.
# persist:  when TRUE the variables are also written to ~/.Renviron
#           (scope "renviron"); otherwise only the session is updated.
#
# Returns (invisibly) a list with the `include`, `lib` and `bin` directories.
# Stops when apt-get is not on the PATH or an apt-get command fails.
install_tesseract_linux <- function(use_sudo = TRUE, persist = TRUE) {
  if (!check_cmd("apt-get")) {
    stop("This Linux function is intended for Debian/Ubuntu (apt-get).", call. = FALSE)
  }

  apt_pkgs <- c("tesseract-ocr", "libtesseract-dev", "libleptonica-dev")

  if (use_sudo) {
    run_cmd("sudo", c("apt-get", "update"), "apt-get update failed.")
    run_cmd("sudo", c("apt-get", "install", "-y", apt_pkgs),
            "APT installation of Tesseract failed.")
  } else {
    run_cmd("apt-get", c("update"), "apt-get update failed.")
    run_cmd("apt-get", c("install", "-y", apt_pkgs),
            "APT installation of Tesseract failed")
  }

  # Debian/Ubuntu place libraries in a per-architecture multiarch directory
  # (/usr/lib/<triplet>). Discover the directories that exist instead of
  # assuming x86_64, and try the one matching R's own architecture first.
  multiarch <- Sys.glob("/usr/lib/*-linux-gnu*")
  native <- startsWith(basename(multiarch), R.version$arch)

  include_candidates <- c("/usr/include", "/usr/local/include")
  lib_candidates <- unique(c(
    multiarch[native],
    multiarch[!native],
    "/usr/lib",
    "/usr/local/lib"
  ))
  bin_candidates <- c("/usr/bin", "/usr/local/bin")

  # Each lookup yields the first matching candidate, or NA when none matches.
  include_dir <- include_candidates[
    file.exists(file.path(include_candidates, "tesseract", "baseapi.h"))
  ][1]

  lib_dir <- lib_candidates[
    file.exists(file.path(lib_candidates, "libtesseract.so")) |
      file.exists(file.path(lib_candidates, "libtesseract.a")) |
      file.exists(file.path(lib_candidates, "libtesseract.dll.a"))
  ][1]

  bin_dir <- bin_candidates[
    file.exists(file.path(bin_candidates, "tesseract"))
  ][1]

  if (is.na(include_dir) || is.na(lib_dir)) {
    warning("Tesseract is installed, but the include/lib paths were not detected automatically")
    # Fall back only for the directory that was not found, so a successfully
    # detected path is never replaced by a default.
    if (is.na(include_dir)) {
      include_dir <- "/usr/include"
    }
    if (is.na(lib_dir)) {
      lib_dir <- "/usr/lib"
    }
  }

  if (is.na(bin_dir)) {
    bin_dir <- "/usr/bin"
  }

  write_tesseract_env(
    include_dir = include_dir,
    lib_dir = lib_dir,
    bin_dir = bin_dir,
    scope = if (persist) "renviron" else "session"
  )

  invisible(list(include = include_dir, lib = lib_dir, bin = bin_dir))
}

# Install Tesseract on macOS with Homebrew, then locate the include/lib/bin
# directories and export them with write_tesseract_env().
#
# persist: when TRUE the variables are also written to ~/.Renviron
#          (scope "renviron"); otherwise only the session is updated.
#
# Returns (invisibly) a list with `prefix` (the Homebrew prefix reported for
# the tesseract formula), `include`, `lib` and `bin`.
# Stops when brew is not on the PATH or the installation fails.
install_tesseract_macos <- function(persist = TRUE) {
  if (!check_cmd("brew")) {
    stop("Homebrew is not available.", call. = FALSE)
  }

  run_cmd("brew", c("install", "tesseract"),
          "Homebrew installation of Tesseract failed.")

  brew_prefix <- trimws(system2("brew", "--prefix", stdout = TRUE))
  tess_prefix <- trimws(system2("brew", c("--prefix", "tesseract"), stdout = TRUE))

  # The formula-specific prefix is tried before the general Homebrew prefix.
  include_candidates <- c(
    file.path(tess_prefix, "include"),
    file.path(brew_prefix, "include")
  )

  lib_candidates <- c(
    file.path(tess_prefix, "lib"),
    file.path(brew_prefix, "lib")
  )

  bin_candidates <- c(
    file.path(tess_prefix, "bin"),
    file.path(brew_prefix, "bin")
  )

  # Each lookup yields the first matching candidate, or NA when none matches.
  include_dir <- include_candidates[
    file.exists(file.path(include_candidates, "tesseract", "baseapi.h"))
  ][1]

  lib_dir <- lib_candidates[
    file.exists(file.path(lib_candidates, "libtesseract.dylib")) |
      file.exists(file.path(lib_candidates, "libtesseract.a"))
  ][1]

  bin_dir <- bin_candidates[
    file.exists(file.path(bin_candidates, "tesseract"))
  ][1]

  if (is.na(include_dir) || is.na(lib_dir)) {
    warning("Tesseract is installed, but the include/lib paths were not detected automatically.")
    # Fall back only for the directory that was not found, so a successfully
    # detected path is never replaced by a default.
    if (is.na(include_dir)) {
      include_dir <- file.path(brew_prefix, "include")
    }
    if (is.na(lib_dir)) {
      lib_dir <- file.path(brew_prefix, "lib")
    }
  }

  if (is.na(bin_dir)) {
    bin_dir <- file.path(brew_prefix, "bin")
  }

  write_tesseract_env(
    include_dir = include_dir,
    lib_dir = lib_dir,
    bin_dir = bin_dir,
    scope = if (persist) "renviron" else "session"
  )

  invisible(list(prefix = tess_prefix, include = include_dir, lib = lib_dir, bin = bin_dir))
}

# Install Tesseract on Windows with pacman from the MSYS2 shell bundled with
# Rtools, then export the include/lib/bin directories of the selected
# toolchain with write_tesseract_env().
#
# rtools_root: root directory of the Rtools installation.
# repo:        toolchain to install into: "ucrt64", "clang64" or "mingw64".
# persist:     when TRUE the variables are also written to ~/.Renviron
#              (scope "renviron"); otherwise only the session is updated.
#
# Returns (invisibly) a list with `prefix`, `include`, `lib`, `bin` and the
# installed `packages`. Stops on an unknown `repo`, when bash.exe is missing
# under `rtools_root`, or when pacman fails. Missing headers, libraries or
# the executable after installation only produce warnings.
install_tesseract_windows_msys2 <- function(rtools_root = "C:/rtools45",
                                            repo = "ucrt64",
                                            persist = TRUE) {
  pkgs_by_repo <- c(
    ucrt64 = "mingw-w64-ucrt-x86_64-tesseract-ocr",
    clang64 = "mingw-w64-clang-x86_64-tesseract-ocr",
    mingw64 = "mingw-w64-x86_64-tesseract-ocr"
  )

  # Validate `repo` first so a typo is reported before anything else happens.
  repo_ok <- is.character(repo) && length(repo) == 1L &&
    repo %in% names(pkgs_by_repo)
  if (!repo_ok) {
    stop("repo must be 'ucrt64', 'clang64', or 'mingw64'", call. = FALSE)
  }
  pkgs <- pkgs_by_repo[[repo]]

  bash <- file.path(rtools_root, "usr", "bin", "bash.exe")
  if (!file.exists(bash)) {
    stop("bash.exe not found in Rtools/MSYS2 ", bash, call. = FALSE)
  }

  # pacman is run inside a login shell (-l); --needed skips packages that are
  # already up to date and --noconfirm avoids interactive prompts.
  msys_cmd <- sprintf("pacman -S --needed --noconfirm %s", paste(pkgs, collapse = " "))
  run_cmd(bash, c("-lc", shQuote(msys_cmd)),
          "MSYS2 installation of Tesseract failed.")

  # Each toolchain lives in a sub-directory of the Rtools root named after it.
  prefix <- file.path(rtools_root, repo)

  include_dir <- file.path(prefix, "include")
  lib_dir <- file.path(prefix, "lib")
  bin_dir <- file.path(prefix, "bin")

  hdr <- file.path(include_dir, "tesseract", "baseapi.h")
  if (!file.exists(hdr)) {
    warning("Tesseract header not found at the expected location: ", hdr)
  }

  libs_found <- any(file.exists(file.path(lib_dir, c(
    "libtesseract.dll.a", "libtesseract.a", "tesseract.lib"
  ))))
  if (!libs_found) {
    warning("Tesseract library not found at the expected location: ", lib_dir)
  }

  exe <- file.path(bin_dir, "tesseract.exe")
  if (!file.exists(exe)) {
    warning("Tesseract executable not found at the expected location : ", exe)
  }

  write_tesseract_env(
    include_dir = include_dir,
    lib_dir = lib_dir,
    bin_dir = bin_dir,
    scope = if (persist) "renviron" else "session"
  )

  invisible(list(
    prefix = prefix,
    include = include_dir,
    lib = lib_dir,
    bin = bin_dir,
    packages = pkgs
  ))
}

# Install Tesseract with the installer matching the current operating system.
#
# ...: forwarded unchanged to install_tesseract_linux(),
#      install_tesseract_macos() or install_tesseract_windows_msys2(),
#      depending on Sys.info()[["sysname"]].
#
# Returns whatever the selected installer returns; stops for any other OS.
install_tesseract <- function(...) {
  os_name <- Sys.info()[["sysname"]]

  if (identical(os_name, "Linux")) {
    install_tesseract_linux(...)
  } else if (identical(os_name, "Darwin")) {
    install_tesseract_macos(...)
  } else if (identical(os_name, "Windows")) {
    install_tesseract_windows_msys2(...)
  } else {
    stop("OS not supported: ", os_name, call. = FALSE)
  }
}

# Example call for Windows with Rtools45 (UCRT64 toolchain). install_tesseract()
# forwards its arguments unchanged to the platform-specific installer, and only
# the Windows installer defined above accepts `rtools_root` and `repo`.
install_tesseract(rtools_root = "C:/rtools45", repo = "ucrt64")

Download Tesseract training data

# Download Tesseract language files (<lang>.traineddata) from the
# tessdata_best repository into `dest` and point TESSDATA_PREFIX at it.
#
# langs:     language codes to fetch.
# dest:      target directory; created (recursively) when missing.
# overwrite: when FALSE, files already present in `dest` are left untouched.
#
# A failed download is reported with message() and does not abort the loop.
# Returns `dest` invisibly. Stops when `dest` cannot be created.
install_tesseract_languages <- function(
    langs = c("eng", "fra", "osd"),
    dest = "C:/rtools45/ucrt64/share/tessdata/",
    overwrite = FALSE
) {
  base_url <- "https://github.com/tesseract-ocr/tessdata_best/raw/main"

  # Create the directory if necessary and fail early when that is impossible.
  if (!dir.exists(dest)) {
    dir.create(dest, recursive = TRUE)
    if (!dir.exists(dest)) {
      stop("Could not create directory: ", dest, call. = FALSE)
    }
  }

  message("Installation in : ", dest)

  for (lang in langs) {
    file <- paste0(lang, ".traineddata")
    url <- paste0(base_url, "/", file)
    destfile <- file.path(dest, file)

    if (file.exists(destfile) && !overwrite) {
      message("✔ Already there : ", file)
      next
    }

    message("⬇ Download : ", file)

    # Download to a temporary file first: an interrupted or failed transfer
    # must not leave a partial <lang>.traineddata behind, because the next run
    # would report it as "Already there".
    tmp <- tempfile(fileext = ".traineddata")
    outcome <- tryCatch(
      download.file(url, tmp, mode = "wb", quiet = TRUE),
      error = function(e) e
    )

    failure <- NULL
    if (inherits(outcome, "error")) {
      failure <- conditionMessage(outcome)
    } else if (!identical(as.integer(outcome), 0L)) {
      # download.file() signals some failures through a non-zero status only.
      failure <- paste("download.file() returned status", outcome)
    } else if (!file.copy(tmp, destfile, overwrite = TRUE)) {
      failure <- paste("could not write", destfile)
    }
    unlink(tmp)

    if (is.null(failure)) {
      message("✔ OK : ", file)
    } else {
      message("✖ Error for ", file, " : ", failure)
    }
  }

  # Environment configuration (current session only).
  Sys.setenv(TESSDATA_PREFIX = dest)

  message("\nTESSDATA_PREFIX defined at : ", dest)

  # Verification: one logical per requested language, TRUE when its file exists.
  message("\nVérification :")
  print(file.exists(file.path(dest, paste0(langs, ".traineddata"))))

  invisible(dest)
}

# Fetch the default languages ("eng", "fra", "osd") into the default
# Rtools45/UCRT64 tessdata directory; pass `dest` to use another location.
install_tesseract_languages()

Installing Poppler

# Run an external command through system2() and abort on failure.
#
# cmd:       program to run.
# args:      character vector of arguments handed to the program.
# error_msg: message raised when the command fails; when NULL a generic
#            "Command failed: <cmd>" message is used instead.
# echo:      when TRUE, the command line is echoed with message() first.
#
# Returns TRUE invisibly. An error is raised unless system2() returns
# exactly 0L.
run_cmd <- function(cmd, args = character(), error_msg = NULL, echo = TRUE) {
  if (echo) {
    command_line <- paste(c(cmd, args), collapse = " ")
    message(">> ", command_line)
  }

  exit_code <- system2(cmd, args = args)
  if (identical(exit_code, 0L)) {
    return(invisible(TRUE))
  }

  failure <- error_msg
  if (is.null(failure)) {
    failure <- paste("Command failed:", cmd)
  }
  stop(failure, call. = FALSE)
}

# Report whether an executable called `cmd` can be located with Sys.which().
# The result is nzchar() applied to the resolved path(s): TRUE where a
# non-empty path was found.
check_cmd <- function(cmd) {
  resolved_path <- Sys.which(cmd)
  nzchar(resolved_path)
}

# Expose the Poppler/pdftoppm directories through environment variables and
# optionally persist them in the user's ~/.Renviron.
#
# include_dir: directory stored in POPPLER_INCLUDE (skipped when NULL).
# lib_dir:     directory stored in POPPLER_LIB (skipped when NULL).
# bin_dir:     directory stored in PDFTOPPM_BIN (skipped when NULL).
# scope:       "session" only sets the variables for the running R session;
#              "renviron" additionally rewrites ~/.Renviron.
#
# Returns TRUE invisibly.
write_pdftoppm_env <- function(include_dir = NULL,
                               lib_dir = NULL,
                               bin_dir = NULL,
                               scope = c("session", "renviron")) {
  scope <- match.arg(scope)

  # Keep only the directories that were actually supplied, in export order.
  supplied <- Filter(
    Negate(is.null),
    list(
      POPPLER_INCLUDE = include_dir,
      POPPLER_LIB = lib_dir,
      PDFTOPPM_BIN = bin_dir
    )
  )

  for (var_name in names(supplied)) {
    do.call(Sys.setenv, supplied[var_name])
  }

  if (scope == "renviron") {
    renviron_path <- path.expand("~/.Renviron")
    kept <- character()
    if (file.exists(renviron_path)) {
      kept <- readLines(renviron_path, warn = FALSE)
    }

    # Remove entries from an earlier run; all three are dropped even when the
    # matching directory is not supplied this time.
    stale_patterns <- c("^POPPLER_INCLUDE=", "^POPPLER_LIB=", "^PDFTOPPM_BIN=")
    for (pattern in stale_patterns) {
      kept <- kept[!grepl(pattern, kept)]
    }

    # One line per supplied directory, stored normalized with forward slashes.
    templates <- c(
      POPPLER_INCLUDE = 'POPPLER_INCLUDE="%s"',
      POPPLER_LIB = 'POPPLER_LIB="%s"',
      PDFTOPPM_BIN = 'PDFTOPPM_BIN="%s"'
    )
    entries <- lapply(names(supplied), function(var_name) {
      sprintf(
        templates[[var_name]],
        normalizePath(supplied[[var_name]], winslash = "/", mustWork = FALSE)
      )
    })

    writeLines(c(kept, unlist(entries, use.names = FALSE)), renviron_path)
    message("Added variables to ~/.Renviron")
  }

  message("POPPLER_INCLUDE = ", Sys.getenv("POPPLER_INCLUDE"))
  message("POPPLER_LIB     = ", Sys.getenv("POPPLER_LIB"))
  message("PDFTOPPM_BIN    = ", Sys.getenv("PDFTOPPM_BIN"))

  invisible(TRUE)
}

# Install Poppler (pdftoppm and the C++ development files) on Debian/Ubuntu
# via apt-get, then locate the include/lib/bin directories and export them
# with write_pdftoppm_env().
#
# use_sudo: when TRUE the apt-get commands are run through sudo.
# persist:  when TRUE the variables are also written to ~/.Renviron
#           (scope "renviron"); otherwise only the session is updated.
#
# Returns (invisibly) a list with the `include`, `lib` and `bin` directories.
# Stops when apt-get is not on the PATH or an apt-get command fails; a
# directory that cannot be detected produces a warning and a default path.
install_pdftoppm_linux <- function(use_sudo = TRUE, persist = TRUE) {
  if (!check_cmd("apt-get")) {
    stop("This Linux function is intended for Debian/Ubuntu (apt-get).", call. = FALSE)
  }

  apt_pkgs <- c("poppler-utils", "libpoppler-cpp-dev")

  if (use_sudo) {
    run_cmd("sudo", c("apt-get", "update"), "apt-get update failed.")
    run_cmd("sudo", c("apt-get", "install", "-y", apt_pkgs),
            "Installation APT of Poppler/pdftoppm failed.")
  } else {
    run_cmd("apt-get", c("update"), "apt-get update failed.")
    run_cmd("apt-get", c("install", "-y", apt_pkgs),
            "APT installation of Poppler/pdftoppm failed")
  }

  # Debian/Ubuntu place libraries in a per-architecture multiarch directory
  # (/usr/lib/<triplet>). Discover the directories that exist instead of
  # assuming x86_64, and try the one matching R's own architecture first.
  multiarch <- Sys.glob("/usr/lib/*-linux-gnu*")
  native <- startsWith(basename(multiarch), R.version$arch)

  include_candidates <- c("/usr/include", "/usr/local/include")
  lib_candidates <- unique(c(
    multiarch[native],
    multiarch[!native],
    "/usr/lib",
    "/usr/local/lib"
  ))
  bin_candidates <- c("/usr/bin", "/usr/local/bin")

  # Each lookup yields the first matching candidate, or NA when none matches.
  include_dir <- include_candidates[
    file.exists(file.path(include_candidates, "poppler", "cpp", "poppler-document.h")) |
      file.exists(file.path(include_candidates, "poppler", "poppler-config.h"))
  ][1]

  lib_dir <- lib_candidates[
    file.exists(file.path(lib_candidates, "libpoppler.so")) |
      file.exists(file.path(lib_candidates, "libpoppler-cpp.so")) |
      file.exists(file.path(lib_candidates, "libpoppler.a")) |
      file.exists(file.path(lib_candidates, "libpoppler-cpp.a"))
  ][1]

  bin_dir <- bin_candidates[
    file.exists(file.path(bin_candidates, "pdftoppm"))
  ][1]

  if (is.na(include_dir)) {
    warning("Poppler headers not found automatically")
    include_dir <- "/usr/include"
  }

  if (is.na(lib_dir)) {
    warning("Poppler libraries not found automatically")
    lib_dir <- "/usr/lib"
  }

  if (is.na(bin_dir)) {
    warning("pdftoppm executable not found automatically")
    bin_dir <- "/usr/bin"
  }

  write_pdftoppm_env(
    include_dir = include_dir,
    lib_dir = lib_dir,
    bin_dir = bin_dir,
    scope = if (persist) "renviron" else "session"
  )

  invisible(list(include = include_dir, lib = lib_dir, bin = bin_dir))
}

# Install Poppler on macOS with Homebrew, then locate the include/lib/bin
# directories and export them with write_pdftoppm_env().
#
# persist: when TRUE the variables are also written to ~/.Renviron
#          (scope "renviron"); otherwise only the session is updated.
#
# Returns (invisibly) a list with `prefix` (the Homebrew prefix reported for
# the poppler formula), `include`, `lib` and `bin`.
# Stops when brew is not on the PATH or the installation fails; a directory
# that cannot be detected produces a warning and falls back to the formula
# prefix.
install_pdftoppm_macos <- function(persist = TRUE) {
  brew_available <- check_cmd("brew")
  if (!brew_available) {
    stop("Homebrew is not available", call. = FALSE)
  }

  run_cmd(
    "brew",
    c("install", "poppler"),
    "Homebrew installation of Poppler/pdftoppm failed"
  )

  brew_prefix <- trimws(system2("brew", "--prefix", stdout = TRUE))
  poppler_prefix <- trimws(system2("brew", c("--prefix", "poppler"), stdout = TRUE))

  # The formula-specific prefix is searched before the general Homebrew one.
  search_roots <- c(poppler_prefix, brew_prefix)
  include_candidates <- file.path(search_roots, "include")
  lib_candidates <- file.path(search_roots, "lib")
  bin_candidates <- file.path(search_roots, "bin")

  # Headers: either the C++ wrapper header or the core config header counts.
  has_cpp_header <- file.exists(
    file.path(include_candidates, "poppler", "cpp", "poppler-document.h")
  )
  has_config_header <- file.exists(
    file.path(include_candidates, "poppler", "poppler-config.h")
  )
  include_dir <- include_candidates[has_cpp_header | has_config_header][1]

  # Libraries: any of the known shared/static library names counts.
  lib_names <- c("libpoppler.dylib", "libpoppler-cpp.dylib", "libpoppler.a")
  has_lib <- vapply(
    lib_candidates,
    function(dir) any(file.exists(file.path(dir, lib_names))),
    logical(1),
    USE.NAMES = FALSE
  )
  lib_dir <- lib_candidates[has_lib][1]

  has_exe <- file.exists(file.path(bin_candidates, "pdftoppm"))
  bin_dir <- bin_candidates[has_exe][1]

  # Indexing with [1] gives NA when nothing matched; fall back per directory.
  if (is.na(include_dir)) {
    warning("Poppler headers not found automatically")
    include_dir <- file.path(poppler_prefix, "include")
  }

  if (is.na(lib_dir)) {
    warning("Poppler libraries not found automatically")
    lib_dir <- file.path(poppler_prefix, "lib")
  }

  if (is.na(bin_dir)) {
    warning("pdftoppm executable not found automatically")
    bin_dir <- file.path(poppler_prefix, "bin")
  }

  env_scope <- if (persist) "renviron" else "session"
  write_pdftoppm_env(
    include_dir = include_dir,
    lib_dir = lib_dir,
    bin_dir = bin_dir,
    scope = env_scope
  )

  invisible(list(
    prefix = poppler_prefix,
    include = include_dir,
    lib = lib_dir,
    bin = bin_dir
  ))
}

# Install Poppler on Windows with pacman from the MSYS2 shell bundled with
# Rtools, then export the include/lib/bin directories of the selected
# toolchain with write_pdftoppm_env().
#
# rtools_root: root directory of the Rtools installation.
# repo:        toolchain to install into: "ucrt64", "clang64" or "mingw64".
# persist:     when TRUE the variables are also written to ~/.Renviron
#              (scope "renviron"); otherwise only the session is updated.
#
# Returns (invisibly) a list with `prefix`, `include`, `lib`, `bin` and the
# installed `packages`. Stops when bash.exe is missing under `rtools_root`,
# on an unknown `repo`, or when pacman fails. Missing headers, libraries or
# the executable after installation only produce warnings.
install_pdftoppm_windows_msys2 <- function(rtools_root = "C:/rtools45",
                                           repo = "ucrt64",
                                           persist = TRUE) {
  bash_exe <- file.path(rtools_root, "usr", "bin", "bash.exe")
  if (!file.exists(bash_exe)) {
    stop("bash.exe not found in Rtools/MSYS2 ", bash_exe, call. = FALSE)
  }

  pkgs <- switch(
    repo,
    ucrt64  = c("mingw-w64-ucrt-x86_64-poppler"),
    clang64 = c("mingw-w64-clang-x86_64-poppler"),
    mingw64 = c("mingw-w64-x86_64-poppler"),
    stop("repo must be 'ucrt64', 'clang64', or 'mingw64'")
  )

  # pacman is run inside a login shell (-l); --needed skips packages that are
  # already up to date and --noconfirm avoids interactive prompts.
  pkg_list <- paste(pkgs, collapse = " ")
  pacman_cmd <- sprintf("pacman -S --needed --noconfirm %s", pkg_list)
  run_cmd(
    bash_exe,
    c("-lc", shQuote(pacman_cmd)),
    "MSYS2 installation of Poppler/pdftoppm failed."
  )

  prefix <- switch(
    repo,
    ucrt64  = file.path(rtools_root, "ucrt64"),
    clang64 = file.path(rtools_root, "clang64"),
    mingw64 = file.path(rtools_root, "mingw64")
  )

  include_dir <- file.path(prefix, "include")
  lib_dir <- file.path(prefix, "lib")
  bin_dir <- file.path(prefix, "bin")

  # Post-install sanity checks: each one only warns, nothing is aborted.
  header_paths <- c(
    file.path(include_dir, "poppler", "cpp", "poppler-document.h"),
    file.path(include_dir, "poppler", "poppler-config.h")
  )
  if (!any(file.exists(header_paths))) {
    warning("Poppler headers not found at the expected location : ", include_dir)
  }

  lib_names <- c(
    "libpoppler.dll.a",
    "libpoppler-cpp.dll.a",
    "libpoppler.a",
    "poppler.lib"
  )
  if (!any(file.exists(file.path(lib_dir, lib_names)))) {
    warning("Poppler libraries not found at the expected location : ", lib_dir)
  }

  pdftoppm_exe <- file.path(bin_dir, "pdftoppm.exe")
  if (!file.exists(pdftoppm_exe)) {
    warning("pdftoppm executable not found at the expected location : ", pdftoppm_exe)
  }

  env_scope <- if (persist) "renviron" else "session"
  write_pdftoppm_env(
    include_dir = include_dir,
    lib_dir = lib_dir,
    bin_dir = bin_dir,
    scope = env_scope
  )

  invisible(list(
    prefix = prefix,
    include = include_dir,
    lib = lib_dir,
    bin = bin_dir,
    packages = pkgs
  ))
}

# Install Poppler/pdftoppm with the installer matching the current operating
# system.
#
# ...: forwarded unchanged to install_pdftoppm_linux(),
#      install_pdftoppm_macos() or install_pdftoppm_windows_msys2(),
#      depending on Sys.info()[["sysname"]].
#
# Returns whatever the selected installer returns; stops for any other OS.
install_pdftoppm <- function(...) {
  os_name <- Sys.info()[["sysname"]]

  if (identical(os_name, "Linux")) {
    install_pdftoppm_linux(...)
  } else if (identical(os_name, "Darwin")) {
    install_pdftoppm_macos(...)
  } else if (identical(os_name, "Windows")) {
    install_pdftoppm_windows_msys2(...)
  } else {
    stop("OS not supported: ", os_name, call. = FALSE)
  }
}

# Example for Windows / Rtools45 / UCRT64. install_pdftoppm() forwards its
# arguments unchanged to the platform-specific installer, and only the Windows
# installer defined above accepts `rtools_root` and `repo`.
install_pdftoppm(rtools_root = "C:/rtools45", repo = "ucrt64")

Example

# Walk-through of the rtesseract functions on a single image file.
library(rtesseract)

#### OCR english
# NOTE(review): the handle is named `eng` and this section is titled
# "OCR english", but the language requested here is "fra" — confirm which
# language is intended.
eng <- tess_open(
  language = "fra",
  datapath = "C:/rtools45/ucrt64/share/tessdata/"
)

# Machine-specific working directory; the relative file names used below
# ("lnCHO.png", "output.pdf") are resolved against it.
setwd("D:/Dropbox/test_Tesseract/")

#### Infos
tess_info(eng)

#### Image file
txt <- tess_ocr_file(eng, "lnCHO.png")
cat(txt)

#### Precise zone ####
# NOTE(review): the meaning of the four numbers is not visible here —
# presumably a rectangle such as (x, y, width, height); confirm against the
# package documentation.
rect <- c(1, 20, 30, 15)
txt <- tess_ocr_file(eng, "lnCHO.png", rect)
cat(txt)

#### TSV format ####
# Only the first 500 characters of the result are printed.
tsv <- tess_tsv_file(eng, "lnCHO.png")
cat(substr(tsv, 1, 500))

#### Structured HTML ####
# Only the first 500 characters of the result are printed.
hocr <- tess_hocr_file(eng, "lnCHO.png")
cat(substr(hocr, 1, 500))

#### Boxes ####
words <- tess_boxes_file(eng, "lnCHO.png", level = "word")
print(words)

#### Lines ####
# NOTE(review): this repeats the word-level call from the "Boxes" section
# (level = "word", result stored in `words`); presumably a line-level value
# was intended — check the levels accepted by tess_boxes_file().
words <- tess_boxes_file(eng, "lnCHO.png", level = "word")
print(words)

#### OCR multiple zones ####
# One zone per matrix row, four values per zone (filled row by row).
rects <- matrix(
  c(10, 20, 50, 50,
    10, 80, 50, 60),
  ncol = 4,
  byrow = TRUE
)

res <- tess_ocr_rects_file(eng, "lnCHO.png", rects)
print(res)


#### Confidence
conf <- tess_confidence_file(eng, "lnCHO.png")
print(conf)


#### Mode
# NOTE(review): 6 is presumably Tesseract's page segmentation mode 6 (single
# uniform block of text) — confirm how the package maps this value.
tess_set_page_seg_mode(eng, 6)
# Sets the engine variable "tessedit_char_whitelist" to the digits 0-9.
tess_set_variable(eng, "tessedit_char_whitelist", "0123456789")


#### Image to PDF searchable
tess_searchable_pdf_from_images(
  eng,
  image_paths = c("lnCHO.png"),
  output_pdf = "output.pdf",
  dpi = 300
)


# NOTE(review): machine-specific working directory, and a different one from
# the directory set earlier in this example — confirm which is intended.
setwd("D:/test_Tesseract/")

# Re-open the engine, this time with the "eng" language data.
eng <- tess_open(
  language = "eng",
  datapath = "C:/rtools45/ucrt64/share/tessdata/"
)

#### Scanned PDF to PDF searchable
# Input and output are given as absolute Windows paths, so this call does not
# depend on the working directory set above.
tess_searchable_pdf_from_scanned_pdf(
  eng,
  input_pdf = "D:\\test_Tesseract\\scan.pdf",
  output_pdf = "D:\\test_Tesseract\\scan_ocr.pdf",
  dpi = 300
)

📄 License

MIT License


👤 Author

Emmanuel Hamel


🤝 Contributing

Pull requests are welcome! Feel free to open issues for bugs or feature requests.

About

rtesseract is an R package providing a fast and flexible interface to the Tesseract OCR engine via Rcpp. It allows you to extract text from images and convert scanned PDFs into searchable PDFs.

Resources

Stars

Watchers

Forks

Releases

No releases published

Packages

 
 
 

Contributors