Skip to content

ManuHamel/rlibpostal

Folders and files

NameName
Last commit message
Last commit date

Latest commit

 

History

2 Commits
 
 
 
 
 
 
 
 
 
 
 
 
 
 

Repository files navigation

📦 rlibpostal

Rcpp bindings for libpostal — parsing and normalizing international postal addresses directly in R.

✨ Features

- 🌍 International address normalization (expand_address)
- 🧩 Address parsing into structured components (parse_address)
- ⚡ High performance via Rcpp + libpostal (C)
- 🔁 Persistent initialization (models are not reloaded on every call)

📚 About libpostal

rlibpostal is a thin R wrapper around the C library libpostal, which provides:

- Language-agnostic address normalization
- Machine-learning-based parsing
- Global address support

👉 https://github.com/openvenues/libpostal

Installation of libpostal

Here is an example of how you can install libpostal from R:

# Run an external program through system2() and fail loudly when it does not
# exit with status 0.
#
# cmd        Program to execute.
# args       Character vector of arguments, handed to system2() unchanged.
# error_msg  Message for the raised error; when NULL a generic
#            "Command failed: <cmd>" message is used instead.
# echo       When TRUE, the command line is reported via message() first.
#
# Returns TRUE invisibly on success; otherwise stops without call information.
run_cmd <- function(cmd, args = character(), error_msg = NULL, echo = TRUE) {
  if (echo) {
    command_line <- paste(c(cmd, args), collapse = " ")
    message(">> ", command_line)
  }

  exit_code <- system2(cmd, args = args)
  if (identical(exit_code, 0L)) {
    return(invisible(TRUE))
  }

  failure <- if (is.null(error_msg)) paste("Command failed:", cmd) else error_msg
  stop(failure, call. = FALSE)
}

# Report whether an executable named `cmd` can be located with Sys.which().
# The result is TRUE when Sys.which() yields a non-empty path for it.
check_cmd <- function(cmd) {
  resolved <- Sys.which(cmd)
  nzchar(resolved)
}

# Export the libpostal locations as environment variables and optionally
# persist them in an Renviron file.
#
# include_dir, lib_dir, bin_dir, data_dir
#                Directories to publish as LIBPOSTAL_INCLUDE, LIBPOSTAL_LIB,
#                LIBPOSTAL_BIN and LIBPOSTAL_DATA_DIR. A NULL argument leaves
#                the corresponding variable (and its Renviron entry) untouched.
# scope          "session" only calls Sys.setenv(); "renviron" additionally
#                rewrites the entries in `renviron_path`.
# renviron_path  File updated when scope = "renviron".
#
# Paths are normalised (forward slashes, `~` expanded) once, so the session
# value and the persisted value always agree.
#
# Returns TRUE invisibly. Stops if a supplied directory is not a single
# non-missing string.
write_libpostal_env <- function(include_dir = NULL,
                                lib_dir = NULL,
                                bin_dir = NULL,
                                data_dir = NULL,
                                scope = c("session", "renviron"),
                                renviron_path = "~/.Renviron") {
  scope <- match.arg(scope)

  supplied <- Filter(
    Negate(is.null),
    list(
      LIBPOSTAL_INCLUDE = include_dir,
      LIBPOSTAL_LIB = lib_dir,
      LIBPOSTAL_BIN = bin_dir,
      LIBPOSTAL_DATA_DIR = data_dir
    )
  )

  is_valid <- vapply(
    supplied,
    function(p) is.character(p) && length(p) == 1L && !is.na(p),
    logical(1)
  )
  if (!all(is_valid)) {
    stop(
      "Each libpostal directory must be a single non-missing string: ",
      paste(names(supplied)[!is_valid], collapse = ", "),
      call. = FALSE
    )
  }

  values <- vapply(
    supplied,
    function(p) normalizePath(p, winslash = "/", mustWork = FALSE),
    character(1)
  )

  # Sys.setenv() rejects an empty argument list, hence the guard.
  if (length(values) > 0L) {
    do.call(Sys.setenv, as.list(values))
  }

  if (scope == "renviron" && length(values) > 0L) {
    renv <- path.expand(renviron_path)
    kept <- if (file.exists(renv)) readLines(renv, warn = FALSE) else character()

    # Drop only the entries that are about to be rewritten; entries for
    # variables that were not supplied in this call must survive.
    for (var_name in names(values)) {
      kept <- kept[!startsWith(kept, paste0(var_name, "="))]
    }

    entries <- sprintf('%s="%s"', names(values), values)
    writeLines(c(kept, entries), renv)
    message("Variables added to ", renviron_path)
  }

  message("LIBPOSTAL_INCLUDE  = ", Sys.getenv("LIBPOSTAL_INCLUDE"))
  message("LIBPOSTAL_LIB      = ", Sys.getenv("LIBPOSTAL_LIB"))
  message("LIBPOSTAL_BIN      = ", Sys.getenv("LIBPOSTAL_BIN"))
  message("LIBPOSTAL_DATA_DIR = ", Sys.getenv("LIBPOSTAL_DATA_DIR"))

  invisible(TRUE)
}

# Locate the libpostal data directory below an installation prefix.
#
# prefix  Installation prefix to search.
#
# Candidates are tried in order: <prefix>/share/libpostal,
# <prefix>/share/libpostal_data, <prefix>/libpostal and <prefix> itself. A
# candidate is accepted when it is an existing directory that either holds one
# of the marker entries (address_parser.dat, parser, transliteration) or
# contains at least one file anywhere below it.
#
# Returns the first accepted candidate, or NA_character_ when none qualifies.
find_libpostal_data_dir <- function(prefix) {
  candidates <- c(
    file.path(prefix, "share", "libpostal"),
    file.path(prefix, "share", "libpostal_data"),
    file.path(prefix, "libpostal"),
    prefix
  )
  markers <- c("address_parser.dat", "parser", "transliteration")

  # Every candidate is judged on its own contents. Listing all candidates in a
  # single list.files() call would yield one shared answer for the whole
  # vector and let an empty directory win.
  for (candidate in candidates) {
    if (!dir.exists(candidate)) {
      next
    }
    has_marker <- any(file.exists(file.path(candidate, markers)))
    # The recursive listing is only reached when no marker is present.
    if (has_marker ||
        length(list.files(candidate, recursive = TRUE,
                          all.files = TRUE, no.. = TRUE)) > 0L) {
      return(candidate)
    }
  }

  NA_character_
}

# Build and install libpostal from source on Debian/Ubuntu, then publish the
# resulting locations through write_libpostal_env().
#
# use_sudo  Run apt-get, `make install` and ldconfig through sudo.
# persist   TRUE writes the variables to the Renviron file as well
#           (scope "renviron"); FALSE only sets them for the session.
# prefix    Installation prefix passed to configure.
# data_dir  Value passed to configure as --datadir.
# src_dir   Directory holding the libpostal checkout (cloned when it has no
#           .git directory).
# ref       Git ref checked out before building.
#
# Returns, invisibly, a list with elements prefix, include, lib, bin, data and
# source. Stops when apt-get is unavailable or any build step fails.
install_libpostal_linux <- function(use_sudo = TRUE,
                                    persist = TRUE,
                                    prefix = "/usr/local",
                                    data_dir = file.path(prefix, "share", "libpostal"),
                                    src_dir = tempfile("libpostal_src_"),
                                    ref = "master") {
  if (!check_cmd("apt-get")) {
    stop("This Linux function is intended for Debian/Ubuntu (apt-get).", call. = FALSE)
  }

  # Run the command line `argv` directly, or through sudo when requested.
  run_maybe_sudo <- function(argv, error_msg) {
    if (use_sudo) {
      run_cmd("sudo", argv, error_msg)
    } else {
      run_cmd(argv[1], argv[-1], error_msg)
    }
  }

  # First directory in `dirs` that contains any of `rel_paths`, else NA.
  first_dir_with <- function(dirs, rel_paths) {
    found <- vapply(
      dirs,
      function(d) any(file.exists(file.path(d, rel_paths))),
      logical(1)
    )
    unname(dirs[found])[1]
  }

  build_deps <- c("curl", "autoconf", "automake", "libtool", "pkg-config", "git", "make")

  run_maybe_sudo(c("apt-get", "update"), "apt-get update failed.")
  run_maybe_sudo(c("apt-get", "install", "-y", build_deps),
                 "APT installation of libpostal build dependencies failed.")

  dir.create(src_dir, recursive = TRUE, showWarnings = FALSE)
  src_dir <- normalizePath(src_dir, winslash = "/", mustWork = TRUE)
  # system2() pastes arguments into a shell command line without quoting them.
  src_arg <- shQuote(src_dir)

  if (!dir.exists(file.path(src_dir, ".git"))) {
    run_cmd("git", c("clone", "https://github.com/openvenues/libpostal.git", src_arg),
            "Git clone of libpostal failed.")
  }

  run_cmd("git", c("-C", src_arg, "fetch", "--all", "--tags"),
          "Git fetch for libpostal failed.")
  run_cmd("git", c("-C", src_arg, "checkout", shQuote(ref)),
          "Git checkout for libpostal failed.")

  # Resolve relative paths against the caller's working directory before it
  # is changed below.
  configure_args <- c(
    sprintf("--datadir=%s", shQuote(normalizePath(data_dir, winslash = "/", mustWork = FALSE))),
    sprintf("--prefix=%s", shQuote(normalizePath(prefix, winslash = "/", mustWork = FALSE)))
  )

  # configure writes its Makefiles into the directory it is invoked from, and
  # the later `make -C src_dir` expects them inside the checkout, so the
  # autotools steps must run with the checkout as working directory.
  old_wd <- setwd(src_dir)
  on.exit(setwd(old_wd), add = TRUE)

  run_cmd("bash", "./bootstrap.sh",
          "libpostal bootstrap failed.")
  run_cmd("./configure", configure_args,
          "libpostal configure failed.")

  setwd(old_wd)

  run_cmd("make", c("-C", src_arg),
          "libpostal compilation failed.")

  run_maybe_sudo(c("make", "-C", src_arg, "install"),
                 "libpostal installation failed.")
  if (check_cmd("ldconfig")) {
    run_maybe_sudo("ldconfig", "ldconfig failed.")
  }

  include_dir <- first_dir_with(
    c(file.path(prefix, "include"), "/usr/local/include", "/usr/include"),
    file.path("libpostal", "libpostal.h")
  )

  lib_dir <- first_dir_with(
    c(file.path(prefix, "lib"), "/usr/local/lib", "/usr/lib/x86_64-linux-gnu", "/usr/lib"),
    c("libpostal.so", "libpostal.a", "libpostal.so.1")
  )

  bin_dir <- first_dir_with(
    c(file.path(prefix, "bin"), "/usr/local/bin", "/usr/bin"),
    c("libpostal", "address_parser", "address_expand")
  )

  detected_data_dir <- find_libpostal_data_dir(prefix)
  if (is.na(detected_data_dir) && dir.exists(data_dir)) {
    detected_data_dir <- data_dir
  }

  # Fall back to the conventional locations under `prefix` when detection
  # fails, warning so the caller can verify them.
  if (is.na(include_dir)) {
    warning("libpostal headers were not found automatically.")
    include_dir <- file.path(prefix, "include")
  }

  if (is.na(lib_dir)) {
    warning("libpostal libraries were not found automatically.")
    lib_dir <- file.path(prefix, "lib")
  }

  if (is.na(bin_dir)) {
    warning("libpostal executables were not found automatically.")
    bin_dir <- file.path(prefix, "bin")
  }

  if (is.na(detected_data_dir)) {
    warning("libpostal data directory was not found automatically.")
    detected_data_dir <- data_dir
  }

  write_libpostal_env(
    include_dir = include_dir,
    lib_dir = lib_dir,
    bin_dir = bin_dir,
    data_dir = detected_data_dir,
    scope = if (persist) "renviron" else "session"
  )

  invisible(list(
    prefix = prefix,
    include = include_dir,
    lib = lib_dir,
    bin = bin_dir,
    data = detected_data_dir,
    source = src_dir
  ))
}

# Install libpostal on macOS, either from Homebrew or by building the sources,
# then publish the resulting locations through write_libpostal_env().
#
# persist          TRUE writes the variables to the Renviron file as well
#                  (scope "renviron"); FALSE only sets them for the session.
# use_brew_binary  TRUE runs `brew install libpostal`; otherwise libpostal is
#                  cloned and built from source.
# prefix           Installation prefix for the source build (default
#                  "/usr/local"). Not used by the Homebrew branch.
# data_dir         Data directory. For the Homebrew branch NULL/NA means
#                  "detect it"; for the source build it defaults to
#                  <prefix>/share/libpostal and is passed as --datadir.
# disable_sse2     Adds --disable-sse2 to the configure call (source build).
#
# Returns, invisibly, a list with elements prefix, include, lib, bin, data and
# method ("brew" or "source"); the source build also reports `source`.
# Stops when Homebrew is unavailable or any step fails.
install_libpostal_macos <- function(persist = TRUE,
                                    use_brew_binary = TRUE,
                                    prefix = NULL,
                                    data_dir = NULL,
                                    disable_sse2 = FALSE) {
  if (!check_cmd("brew")) {
    stop("Homebrew is not available.", call. = FALSE)
  }

  env_scope <- if (persist) "renviron" else "session"

  if (isTRUE(use_brew_binary)) {
    # First directory in `dirs` that contains any of `rel_paths`, else NA.
    first_dir_with <- function(dirs, rel_paths) {
      found <- vapply(
        dirs,
        function(d) any(file.exists(file.path(d, rel_paths))),
        logical(1)
      )
      unname(dirs[found])[1]
    }

    run_cmd("brew", c("install", "libpostal"),
            "Homebrew installation of libpostal failed.")

    brew_prefix <- trimws(system2("brew", "--prefix", stdout = TRUE))
    lp_prefix <- trimws(system2("brew", c("--prefix", "libpostal"), stdout = TRUE))

    include_dir <- first_dir_with(
      c(file.path(lp_prefix, "include"), file.path(brew_prefix, "include")),
      file.path("libpostal", "libpostal.h")
    )

    lib_dir <- first_dir_with(
      c(file.path(lp_prefix, "lib"), file.path(brew_prefix, "lib")),
      c("libpostal.dylib", "libpostal.a")
    )

    bin_dir <- first_dir_with(
      c(file.path(lp_prefix, "bin"), file.path(brew_prefix, "bin")),
      c("libpostal", "address_parser", "address_expand")
    )

    detected_data_dir <- find_libpostal_data_dir(lp_prefix)
    if (is.null(data_dir) || is.na(data_dir)) {
      data_dir <- detected_data_dir
    }

    # Fall back to the conventional locations under the formula prefix when
    # detection fails, warning so the caller can verify them.
    if (is.na(include_dir)) {
      warning("libpostal headers were not found automatically.")
      include_dir <- file.path(lp_prefix, "include")
    }

    if (is.na(lib_dir)) {
      warning("libpostal libraries were not found automatically.")
      lib_dir <- file.path(lp_prefix, "lib")
    }

    if (is.na(bin_dir)) {
      warning("libpostal executables were not found automatically.")
      bin_dir <- file.path(lp_prefix, "bin")
    }

    if (is.null(data_dir) || is.na(data_dir)) {
      warning("libpostal data directory was not found automatically.")
      data_dir <- file.path(lp_prefix, "share", "libpostal")
    }

    write_libpostal_env(
      include_dir = include_dir,
      lib_dir = lib_dir,
      bin_dir = bin_dir,
      data_dir = data_dir,
      scope = env_scope
    )

    return(invisible(list(
      prefix = lp_prefix,
      include = include_dir,
      lib = lib_dir,
      bin = bin_dir,
      data = data_dir,
      method = "brew"
    )))
  }

  if (is.null(prefix)) {
    prefix <- "/usr/local"
  }
  if (is.null(data_dir)) {
    data_dir <- file.path(prefix, "share", "libpostal")
  }

  run_cmd("brew", c("install", "curl", "autoconf", "automake", "libtool", "pkg-config", "git"),
          "Homebrew installation of libpostal build dependencies failed.")

  src_dir <- tempfile("libpostal_src_")
  # system2() pastes arguments into a shell command line without quoting them.
  run_cmd("git", c("clone", "https://github.com/openvenues/libpostal.git", shQuote(src_dir)),
          "Git clone of libpostal failed.")
  src_dir <- normalizePath(src_dir, winslash = "/", mustWork = TRUE)
  src_arg <- shQuote(src_dir)

  # Resolve relative paths against the caller's working directory before it
  # is changed below.
  cfg <- c(
    sprintf("--datadir=%s", shQuote(normalizePath(data_dir, winslash = "/", mustWork = FALSE))),
    sprintf("--prefix=%s", shQuote(normalizePath(prefix, winslash = "/", mustWork = FALSE)))
  )

  if (isTRUE(disable_sse2)) {
    cfg <- c(cfg, "--disable-sse2")
  }

  # configure writes its Makefiles into the directory it is invoked from, and
  # the later `make -C src_dir` expects them inside the checkout, so the
  # autotools steps must run with the checkout as working directory.
  old_wd <- setwd(src_dir)
  on.exit(setwd(old_wd), add = TRUE)

  run_cmd("bash", "./bootstrap.sh",
          "libpostal bootstrap failed.")
  run_cmd("./configure", cfg,
          "libpostal configure failed.")

  setwd(old_wd)

  run_cmd("make", c("-C", src_arg),
          "libpostal compilation failed.")
  run_cmd("make", c("-C", src_arg, "install"),
          "libpostal installation failed.")

  include_dir <- file.path(prefix, "include")
  lib_dir <- file.path(prefix, "lib")
  bin_dir <- file.path(prefix, "bin")

  write_libpostal_env(
    include_dir = include_dir,
    lib_dir = lib_dir,
    bin_dir = bin_dir,
    data_dir = data_dir,
    scope = env_scope
  )

  invisible(list(
    prefix = prefix,
    include = include_dir,
    lib = lib_dir,
    bin = bin_dir,
    data = data_dir,
    source = src_dir,
    method = "source"
  ))
}


# Default operator: yields `y` when `x` is NULL, zero-length, or a single NA;
# otherwise yields `x` unchanged.
#
# The NA test is restricted to length-one input because `||` requires scalar
# operands (a longer `is.na(x)` is an error in R >= 4.3); vectors with more
# than one element are always returned as they are.
`%||%` <- function(x, y) {
  if (is.null(x) || length(x) == 0L) {
    return(y)
  }
  if (length(x) == 1L && is.na(x)) {
    return(y)
  }
  x
}

# Convert a path to the form used inside an MSYS2 shell.
#
# The path is normalised with forward slashes; a leading drive specification
# such as "C:" is then rewritten to "/c" (drive letter lower-cased). Paths
# without a drive prefix are returned in their normalised form.
to_msys_path <- function(path) {
  normalized <- normalizePath(path, winslash = "/", mustWork = FALSE)

  has_drive <- grepl("^[A-Za-z]:", normalized)
  if (!has_drive) {
    return(normalized)
  }

  drive_letter <- tolower(substr(normalized, 1, 1))
  remainder <- substr(normalized, 3, nchar(normalized))
  paste0("/", drive_letter, remainder)
}

# Run shell code through an MSYS2 bash with the environment of one of the
# Rtools toolchains.
#
# bash         Path to the bash executable.
# cmd          Character vector of shell lines; written to a temporary script
#              after a `set -e` line so the first failing command aborts it.
# repo         Toolchain environment: "ucrt64", "clang64" or "mingw64".
# rtools_root  Root directory of the Rtools installation.
# error_msg    Message for the raised error; NULL uses "Bash command failed.".
# echo         When TRUE, the invocation is reported via message().
#
# MSYSTEM, CHERE_INVOKING and PATH are modified for the duration of the call
# and restored on exit, as is the temporary script removed.
#
# Returns TRUE invisibly; stops when the script exits with a non-zero status.
run_bash_msys2 <- function(bash, cmd, repo = "ucrt64", rtools_root = "C:/rtools45",
                           error_msg = NULL, echo = TRUE) {
  repo <- match.arg(repo, c("ucrt64", "clang64", "mingw64"))

  repo_bin <- file.path(rtools_root, repo, "bin")
  usr_bin <- file.path(rtools_root, "usr", "bin")

  old_msystem <- Sys.getenv("MSYSTEM", unset = NA)
  old_chere   <- Sys.getenv("CHERE_INVOKING", unset = NA)
  old_path    <- Sys.getenv("PATH", unset = NA)

  on.exit({
    if (is.na(old_msystem)) Sys.unsetenv("MSYSTEM") else Sys.setenv(MSYSTEM = old_msystem)
    if (is.na(old_chere))   Sys.unsetenv("CHERE_INVOKING") else Sys.setenv(CHERE_INVOKING = old_chere)
    if (is.na(old_path))    Sys.unsetenv("PATH") else Sys.setenv(PATH = old_path)
  }, add = TRUE)

  # An unset PATH contributes nothing instead of the literal string "NA".
  path_parts <- c(
    normalizePath(repo_bin, winslash = "\\", mustWork = FALSE),
    normalizePath(usr_bin, winslash = "\\", mustWork = FALSE),
    if (!is.na(old_path)) old_path
  )

  Sys.setenv(
    MSYSTEM = toupper(repo),
    CHERE_INVOKING = "1",
    PATH = paste(path_parts, collapse = ";")
  )

  script <- tempfile(fileext = ".sh")
  on.exit(unlink(script), add = TRUE)

  # A binary-mode connection keeps the line endings as plain "\n"; a text-mode
  # connection on Windows would emit "\r\n", which bash does not treat as a
  # line terminator.
  con <- file(script, open = "wb")
  tryCatch(
    writeLines(
      c("#!/usr/bin/env bash", "set -e", cmd),
      con,
      sep = "\n",
      useBytes = TRUE
    ),
    finally = close(con)
  )

  script_path <- normalizePath(script, winslash = "/", mustWork = FALSE)

  if (echo) {
    message(">> ", bash, " ", script_path)
  }

  # system2() does not quote arguments, so protect a temp path with spaces.
  status <- system2(bash, args = shQuote(script_path))

  if (!identical(status, 0L)) {
    stop(if (is.null(error_msg)) "Bash command failed." else error_msg, call. = FALSE)
  }

  invisible(TRUE)
}

# Build and install libpostal on Windows with the MSYS2 shell shipped in
# Rtools, then publish the resulting locations through write_libpostal_env().
#
# rtools_root  Root directory of the Rtools installation.
# repo         Toolchain environment: "ucrt64", "clang64" or "mingw64".
# persist      TRUE writes the variables to the Renviron file as well
#              (scope "renviron"); FALSE only sets them for the session.
# prefix       Installation prefix; defaults to <rtools_root>/<repo>.
# data_dir     Value passed to configure as --datadir; defaults to
#              <prefix>/share/libpostal.
# src_dir      Directory holding the libpostal checkout (cloned when it has
#              no .git directory).
# ref          Git ref checked out before building.
#
# Returns, invisibly, a list with elements prefix, include, lib, bin, data and
# source. Stops when bash.exe is missing or any step fails.
install_libpostal_windows_msys2 <- function(rtools_root = "C:/rtools45",
                                            repo = "ucrt64",
                                            persist = TRUE,
                                            prefix = NULL,
                                            data_dir = NULL,
                                            src_dir = "C:/libpostal-src",
                                            ref = "master") {
  repo <- match.arg(repo, c("ucrt64", "clang64", "mingw64"))

  bash <- file.path(rtools_root, "usr", "bin", "bash.exe")
  if (!file.exists(bash)) {
    stop("bash.exe was not found in Rtools/MSYS2: ", bash, call. = FALSE)
  }

  prefix <- prefix %||% file.path(rtools_root, repo)

  if (is.null(data_dir)) {
    data_dir <- file.path(prefix, "share", "libpostal")
  }

  # The generated text is executed by bash, so quote with POSIX-shell rules;
  # shQuote()'s default on Windows targets cmd.exe instead.
  sh_quote <- function(x) shQuote(x, type = "sh")

  src_q  <- sh_quote(to_msys_path(src_dir))
  pref_q <- sh_quote(to_msys_path(prefix))
  data_q <- sh_quote(to_msys_path(data_dir))

  # NOTE(review): the clang64 entry asks for a "-gcc" package and the compiler
  # check below calls gcc; confirm both are right for the clang64 environment.
  compiler_pkg <- switch(
    repo,
    ucrt64  = "mingw-w64-ucrt-x86_64-gcc",
    clang64 = "mingw-w64-clang-x86_64-gcc",
    mingw64 = "mingw-w64-x86_64-gcc"
  )
  pkgs <- c(
    "autoconf", "automake", "curl", "git", "make", "libtool", "pkgconf",
    compiler_pkg
  )

  run_bash_msys2(
    bash = bash,
    cmd = sprintf("pacman -S --needed --noconfirm %s", paste(pkgs, collapse = " ")),
    repo = repo,
    rtools_root = rtools_root,
    error_msg = "MSYS2 installation of libpostal build dependencies failed."
  )

  run_bash_msys2(
    bash = bash,
    cmd = "echo MSYSTEM=$MSYSTEM && which gcc && gcc --version",
    repo = repo,
    rtools_root = rtools_root,
    error_msg = "MSYS2 compiler test failed."
  )

  build_script <- paste(
    sprintf("mkdir -p %s", src_q),
    sprintf(
      "if [ ! -d %s/.git ]; then git clone https://github.com/openvenues/libpostal.git %s; fi",
      src_q, src_q
    ),
    sprintf("cd %s", src_q),
    "git fetch --all --tags",
    sprintf("git checkout %s", sh_quote(ref)),
    "cp -rf windows/* ./",
    "./bootstrap.sh",
    sprintf("./configure --datadir=%s --prefix=%s", data_q, pref_q),
    "make",
    "make install",
    sep = "\n"
  )

  run_bash_msys2(
    bash = bash,
    cmd = build_script,
    repo = repo,
    rtools_root = rtools_root,
    error_msg = "MSYS2 build/install of libpostal failed."
  )

  include_dir <- file.path(prefix, "include")
  lib_dir <- file.path(prefix, "lib")
  bin_dir <- file.path(prefix, "bin")

  write_libpostal_env(
    include_dir = include_dir,
    lib_dir = lib_dir,
    bin_dir = bin_dir,
    data_dir = data_dir,
    scope = if (persist) "renviron" else "session"
  )

  invisible(list(
    prefix = prefix,
    include = include_dir,
    lib = lib_dir,
    bin = bin_dir,
    data = data_dir,
    source = src_dir
  ))
}

# Example: Windows with Rtools45 and the UCRT64 toolchain.
# The installer defined in this script for Windows is
# install_libpostal_windows_msys2(); it takes `rtools_root` and `repo`.
install_libpostal_windows_msys2(rtools_root = "C:/rtools45", repo = "ucrt64")

Example

> library(rlibpostal)
> 
> libpostal_init()
[1] TRUE
> 
> libpostal_parse_address(
+   "123 rue Saint-Paul Ouest, Montréal, QC H2Y 1Z5"
+ )
         label                value
1 house_number                  123
2         road rue saint-paul ouest
3         city             montréal
4        state                   qc
5     postcode              h2y 1z5
> libpostal_expand_address(
+   "123 rue Saint-Paul Ouest, Montréal, QC H2Y 1Z5"
+ )
 [1] "123 rue saint-paul ouest montreal qc h2y 1z5"      
 [2] "123 rue saint-paul ouest montreal quebec h2y 1z5"  
 [3] "123 rue saint-paul ouest montreal qc h2y 1 z5"     
 [4] "123 rue saint-paul ouest montreal quebec h2y 1 z5" 
 [5] "123 rue saint-paul ouest montreal qc h 2y 1z5"     
 [6] "123 rue saint-paul ouest montreal quebec h 2y 1z5" 
 [7] "123 rue saint-paul ouest montreal qc h 2y 1 z5"    
 [8] "123 rue saint-paul ouest montreal quebec h 2y 1 z5"
 [9] "123 rue saint paul ouest montreal qc h2y 1z5"      
[10] "123 rue saint paul ouest montreal quebec h2y 1z5"  
[11] "123 rue saint paul ouest montreal qc h2y 1 z5"     
[12] "123 rue saint paul ouest montreal quebec h2y 1 z5" 
[13] "123 rue saint paul ouest montreal qc h 2y 1z5"     
[14] "123 rue saint paul ouest montreal quebec h 2y 1z5" 
[15] "123 rue saint paul ouest montreal qc h 2y 1 z5"    
[16] "123 rue saint paul ouest montreal quebec h 2y 1 z5"
> libpostal_teardown_all()
[1] TRUE

📄 License

- rlibpostal (this package): MIT
- libpostal: BSD-style license

🙌 Acknowledgments

- libpostal by OpenVenues
- The Rcpp ecosystem

About

No description, website, or topics provided.

Resources

Stars

Watchers

Forks

Releases

No releases published

Packages