Skip to content
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ export(extrai_NomeProprio)
export(ident_erros_munic_galileo)
export(nome_de_solteira)
export(remove_preposicao_nomes)
export(remove_pronome_tratamento)
import(RCurl)
import(data.table)
importFrom(dplyr,"%>%")
Expand Down
7 changes: 7 additions & 0 deletions R/data.R
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,10 @@
#' \item{Nome_Municipio}{City names}
#' }
"geocod_base"

#' Brazilian treatment pronouns
#'
#' A regex character with the most common Brazilian treatment pronouns used in Brazil
#'
#' @format Character
"lista"
70 changes: 65 additions & 5 deletions R/text_functions.R
Original file line number Diff line number Diff line change
Expand Up @@ -44,20 +44,22 @@ abrevia_nomes_meio_coluna<- function(nomes){
}


funcao_generica <- function(base, ..., suffixo, FUN){
funcao_generica <- function(base, ..., suffixo, FUN, spark_conn){
FUN <- match.fun(FUN)
if(is.character(base)){
if(!is.null(spark_conn)){ return(FUN(base,spark_conn)) }
return(FUN(base))
}
other_columns <- unlist(eval(substitute(alist(...))))
stopifnot(length(other_columns) > 0)
if(!is.data.table(base)){ setDT(base) }
new_columns <- sapply(other_columns, function(x) paste0(x, suffixo))
mapply( function(x, y){ set(base, j = x, value = FUN(base[[y]])) },
new_columns, other_columns)

if(!is.null(spark_conn)){
mapply( function(x, y){ set(base, j = x, value = FUN(base[[y]],spark_conn)) }, new_columns, other_columns)
} else {
mapply( function(x, y){ set(base, j = x, value = FUN(base[[y]])) }, new_columns, other_columns)
}
return(base)

}


Expand Down Expand Up @@ -85,3 +87,61 @@ abrevia_nome_meio <- function(base, ..., suffixo = "_abrev"){
return(funcao_generica(base, ..., suffixo = suffixo, FUN = abrevia_nomes_meio_coluna))

}

#' Remove commom treatment pronouns used in Brazil.
#'
#' \code{remove_pronome_tratamento} return names without treatment pronouns (Sra, Sr, Dr, etc).
#'
#'
#' @param base A data table, data frame or character vector.
#' @param suffixo Name of the new column to be created.
#' @param ... columns for apply the function
#' @param spark_conn A character with the spark's connection name. For NULL, it runs locally.
#'
#' @import data.table sparklyr dplyr
#' @importFrom stringr str_replace_all
#' @return the base param with a new column.
#'
#' @examples
#' remove_pronome_tratamento("Dr. Fulano")
#' remove_pronome_tratamento("Exmo. Sr. Cicrano de Tal")
#'
#' base <- data.frame(nome = c("Ph.D Pedro Anjos", "Prof Maria Gracas", "Pe. João"))
#' base <- remove_pronome_tratamento(base, "nome", suffixo = "_new_names")
#'
#' @export
remove_pronome_tratamento <- function(base, ..., suffixo = "_sem_pron", spark_conn = NULL){
if(is.null(spark_conn)){
return(funcao_generica(base, ..., suffixo = suffixo, FUN = remove_pronome_tratamento_coluna, spark_conn = spark_conn))
} else{
return(funcao_generica(base, ..., suffixo = suffixo, FUN = remove_pronome_tratamento_coluna_spark, spark_conn = spark_conn))
}

}


remove_pronome_tratamento_coluna <- function(nomes){
lista <- NULL
data("list_pronomes",envir = environment())
novos_nomes <- sapply(nomes, USE.NAMES = F, function(nome){
if(is.na(nome)){ return(nome) }
nome <- str_replace_all(nome, "\\s+"," ")
nome <- str_replace_all(toupper(nome),lista,"")
return(nome)
})
return(novos_nomes)
}


remove_pronome_tratamento_coluna_spark <- function(nomes,spark_conn){
lista_spark <- NULL
data("list_pronomes_spark",envir = environment())
nomes <- data.table(nome = nomes)
nomes_tbl <- dplyr::copy_to(spark_conn,nomes,"nomes",overwrite = TRUE)
if(!("nomes" %in% src_tbls(spark_conn))){ stop("Unable to copy 'base' to Spark") }
nomes_tbl <- nomes_tbl %>% dplyr::summarise(nome = toupper(nome)) %>% dplyr::mutate(regex = regexp_replace(nome,lista_spark,""))
novos_nomes <- nomes_tbl %>% dplyr::select(regex) %>% dplyr::collect() %>% as.data.table()
dplyr::db_drop_table(spark_conn,"nomes")
return(novos_nomes)
}

Binary file added data/list_pronomes.rda
Binary file not shown.
Binary file added data/list_pronomes_spark.rda
Binary file not shown.
14 changes: 14 additions & 0 deletions man/lista.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

29 changes: 29 additions & 0 deletions man/remove_pronome_tratamento.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

26 changes: 26 additions & 0 deletions tests/testthat/test_remove_pronome_tratamento.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
test_that("remove_pronome_tratamento", {
nomes <- c("Dr. Joao das Neves", "Exmo. Sr. Pedro dos Anjos", "Maria das Gracas")
nomes_sem_pronomes <- c("JOAO DAS NEVES", "PEDRO DOS ANJOS", "MARIA DAS GRACAS")
base <- data.table(nome = nomes)
base <- remove_pronome_tratamento(base, "nome")
expect_equal(c("nome", "nome_sem_pron"), names(base))
expect_equal(nomes_sem_pronomes, base$nome_sem_pron)
})


test_that("remove_pronome_tratamento data.frame e NA", {
nomes <- c("Dr. Joao das Neves", "Exmo. Sr. Pedro dos Anjos", "Maria das Gracas", NA)
nomes_sem_pronomes <- c("JOAO DAS NEVES", "PEDRO DOS ANJOS", "MARIA DAS GRACAS", NA)
base <- data.frame(nome = nomes)
base <- remove_pronome_tratamento(base, "nome")
expect_equal(c("nome", "nome_sem_pron"), names(base))
expect_equal(nomes_sem_pronomes, base$nome_sem_pron)
})

test_that("remove_pronome_tratamento de um vetor de caracteres", {
nomes <- c("Dr. Joao das Neves", "Exmo. Sr. Pedro dos Anjos", "Maria das Gracas", NA)
nomes_sem_pronomes <- c("JOAO DAS NEVES", "PEDRO DOS ANJOS", "MARIA DAS GRACAS", NA)
expect_equal(nomes_sem_pronomes, remove_pronome_tratamento(nomes))
})


11 changes: 11 additions & 0 deletions tests/testthat/test_text_functions.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,17 @@ test_that("remove_preposicao_nomes", {
expect_equal(nomes_sem_preposicao, base$nome_semD)
})


test_that("remove_preposicao_nomes", {
nomes <- c("João das Neves", "Pedro dos Anjos", "Maria das Gracas")
nomes_sem_preposicao <- c("João Neves", "Pedro Anjos", "Maria Gracas")
base <- data.frame(nome = nomes)
base <- remove_preposicao_nomes(base, "nome")
expect_equal(c("nome", "nome_semD"), names(base))
expect_equal(nomes_sem_preposicao, base$nome_semD)
})


test_that("remove_preposicao_nomes de um vetor de caracteres", {
nomes <- c("João das Neves", "Pedro dos Anjos", "Maria das Gracas")
nomes_sem_preposicao <- c("João Neves", "Pedro Anjos", "Maria Gracas")
Expand Down