From 2202c97553314542887318ee9746a4e4be2f64b0 Mon Sep 17 00:00:00 2001
From: "ANJOS, J. S." <0rakul0render@gmail.com>
Date: Mon, 8 Jun 2026 23:36:37 -0300
Subject: [PATCH 1/3] att
---
.gitignore | 1 +
python-package/MIGRATION_PLAN.md | 184 ++++++++
python-package/README.md | 229 +++++++++
python-package/placeholder.txt | 0
python-package/pyproject.toml | 35 ++
python-package/src/geocodebr/__init__.py | 27 ++
python-package/src/geocodebr/cache.py | 99 ++++
python-package/src/geocodebr/constants.py | 48 ++
python-package/src/geocodebr/db.py | 37 ++
.../src/geocodebr/download_cnefe.py | 69 +++
python-package/src/geocodebr/errors.py | 14 +
python-package/src/geocodebr/fields.py | 52 ++
python-package/src/geocodebr/geocode.py | 429 +++++++++++++++++
python-package/src/geocodebr/matching.py | 444 ++++++++++++++++++
python-package/src/geocodebr/messages.py | 28 ++
python-package/src/geocodebr/reverse.py | 213 +++++++++
python-package/src/geocodebr/string_dist.py | 53 +++
python-package/src/geocodebr/tables.py | 92 ++++
python-package/src/geocodebr/utils.py | 201 ++++++++
python-package/tests/conftest.py | 19 +
python-package/tests/test_busca_por_cep.py | 32 ++
python-package/tests/test_cache.py | 13 +
python-package/tests/test_fields.py | 22 +
python-package/tests/test_geocode.py | 55 +++
python-package/tests/test_geocode_reverso.py | 32 ++
25 files changed, 2428 insertions(+)
create mode 100644 python-package/MIGRATION_PLAN.md
create mode 100644 python-package/README.md
delete mode 100644 python-package/placeholder.txt
create mode 100644 python-package/pyproject.toml
create mode 100644 python-package/src/geocodebr/__init__.py
create mode 100644 python-package/src/geocodebr/cache.py
create mode 100644 python-package/src/geocodebr/constants.py
create mode 100644 python-package/src/geocodebr/db.py
create mode 100644 python-package/src/geocodebr/download_cnefe.py
create mode 100644 python-package/src/geocodebr/errors.py
create mode 100644 python-package/src/geocodebr/fields.py
create mode 100644 python-package/src/geocodebr/geocode.py
create mode 100644 python-package/src/geocodebr/matching.py
create mode 100644 python-package/src/geocodebr/messages.py
create mode 100644 python-package/src/geocodebr/reverse.py
create mode 100644 python-package/src/geocodebr/string_dist.py
create mode 100644 python-package/src/geocodebr/tables.py
create mode 100644 python-package/src/geocodebr/utils.py
create mode 100644 python-package/tests/conftest.py
create mode 100644 python-package/tests/test_busca_por_cep.py
create mode 100644 python-package/tests/test_cache.py
create mode 100644 python-package/tests/test_fields.py
create mode 100644 python-package/tests/test_geocode.py
create mode 100644 python-package/tests/test_geocode_reverso.py
diff --git a/.gitignore b/.gitignore
index f47593b..80314c3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -15,3 +15,4 @@ vignettes/*.html
docs
/data_prep/data/*
/data_prep/data_raw/*
+*.pyc
diff --git a/python-package/MIGRATION_PLAN.md b/python-package/MIGRATION_PLAN.md
new file mode 100644
index 0000000..1cf4f77
--- /dev/null
+++ b/python-package/MIGRATION_PLAN.md
@@ -0,0 +1,184 @@
+# Plano de migracao R -> Python
+
+Este documento resume a analise dos scripts R do pacote `geocodebr` e propoe
+uma versao Python preservando a dinamica de uso e os nomes das funcoes publicas.
+
+## API publica a manter
+
+As funcoes exportadas no `NAMESPACE` devem existir tambem no pacote Python:
+
+- `definir_campos(estado, municipio, logradouro=None, numero=None, cep=None, localidade=None)`
+- `geocode(enderecos, campos_endereco=..., resultado_completo=False, resolver_empates=True, resultado_sf=False, h3_res=None, padronizar_enderecos=True, verboso=True, cache=True, n_cores=None)`
+- `busca_por_cep(cep, h3_res=None, resultado_sf=False, verboso=True, cache=True)`
+- `geocode_reverso(pontos, dist_max=1000, verboso=True, cache=True, n_cores=None)`
+- `download_cnefe(tabela="todas", verboso=True, cache=True)`
+- `definir_pasta_cache(path, verboso=True)`
+- `listar_pasta_cache()`
+- `listar_dados_cache(print_tree=False)`
+- `deletar_pasta_cache()`
+
+Em Python, a recomendacao e manter esses nomes em portugues para reduzir a
+curva de aprendizado. Internamente, os modulos podem ser separados por dominio.
+
+## Estrutura sugerida
+
+```text
+python-package/
+ pyproject.toml
+ src/geocodebr/
+ __init__.py
+ cache.py
+ download_cnefe.py
+ fields.py
+ geocode.py
+ reverse.py
+ db.py
+ matching.py
+ tables.py
+ string_dist.py
+ utils.py
+ errors.py
+ messages.py
+ tests/
+ test_cache.py
+ test_fields.py
+ test_busca_por_cep.py
+ test_geocode.py
+ test_geocode_reverso.py
+```
+
+## Dependencias Python recomendadas
+
+- `duckdb`: motor SQL central e tambem a camada principal de manipulacao
+ tabular. A versao Python deve manter os dados em tabelas/views DuckDB sempre
+ que possivel, evitando transformar o fluxo interno em `pandas`.
+- `pyarrow`: leitura/escrita Parquet e interoperabilidade com DuckDB.
+- `requests` ou `httpx`: download dos Parquets do release CNEFE.
+- `platformdirs`: diretorio de cache/config persistente equivalente a `tools::R_user_dir`.
+- `tqdm`: barra de progresso equivalente a `cli`/progress bar.
+- `geopandas`, `shapely`, `pyproj`: saida espacial e `geocode_reverso`.
+- `h3`: criacao de colunas `h3_03`, `h3_04` etc.
+- `rapidfuzz` ou UDF DuckDB: apenas se `jaro_similarity` nao estiver disponivel
+ de forma consistente na instalacao DuckDB Python.
+- Uma camada propria de padronizacao de endereco ou dependencia Python
+ equivalente a `enderecobr`.
+- `pandas`: opcional apenas para aceitar/retornar dados no estilo familiar da
+ API Python. Internamente, nao deve ser o motor de processamento.
+
+## Pontos criticos de paridade
+
+1. `enderecobr`
+ - O R depende de `enderecobr::padronizar_enderecos`, `padronizar_ceps`,
+ `padronizar_municipios` e `correspondencia_campos`.
+ - A versao Python precisa gerar as mesmas colunas padronizadas:
+ `estado_padr`, `municipio_padr`, `logradouro_padr`, `numero_padr`,
+ `cep_padr`, `bairro_padr`.
+ - Este e o maior risco de divergencia entre R e Python.
+
+2. SQL DuckDB
+ - As funcoes de matching (`match_cases`, `match_weighted_cases`,
+ `match_cases_probabilistic`, `match_weighted_cases_probabilistic`) sao
+ quase totalmente SQL.
+ - A melhor estrategia e portar os templates SQL para Python com interpolacao
+ controlada, preservando nomes de tabelas temporarias e colunas.
+ - O DuckDB deve substituir tanto `data.table` quanto a maior parte do uso
+ potencial de `pandas`: padronizacao, filtros, joins, atualizacoes,
+ desempate, H3 e montagem do output devem preferir SQL/tabelas temporarias.
+
+3. Dados CNEFE
+ - `data_release` atual: `v0.4.1`.
+ - Fonte: `https://github.com/ipeaGIT/padronizacao_cnefe/releases/download/{data_release}/{arquivo}`.
+ - Os arquivos Parquet baixados sao a base compartilhada entre R e Python.
+
+4. Geometria
+ - `resultado_sf=TRUE` no R retorna `sf`.
+ - Em Python, o equivalente natural e `geopandas.GeoDataFrame` com CRS
+ `EPSG:4674`.
+ - `geocode_reverso` hoje usa DuckDB Spatial via `duckspatial`; em Python deve
+ usar `duckdb` com extensao `spatial` ou uma combinacao `geopandas`/`sjoin`.
+
+## Sequencia de implementacao sugerida
+
+1. Criar pacote Python basico com `pyproject.toml`, `src/geocodebr` e exports em
+ `__init__.py`.
+2. Portar cache:
+ - `definir_pasta_cache`
+ - `listar_pasta_cache`
+ - `listar_dados_cache`
+ - `deletar_pasta_cache`
+3. Portar `definir_campos` e validacoes de colunas.
+4. Portar `download_cnefe`.
+5. Portar constantes e utilitarios DuckDB-first:
+ - `data_release`
+ - `get_key_cols`
+ - `get_reference_table`
+ - listas de `match_type`
+ - `add_precision_col`
+ - `merge_results_to_input`
+6. Portar funcoes de matching SQL.
+7. Implementar/integrar padronizacao de enderecos em Python.
+8. Portar `busca_por_cep`.
+9. Portar `geocode`.
+10. Portar `geocode_reverso`. Primeira versao implementada com DuckDB Spatial,
+ aceitando `lon`/`lat`, `longitude`/`latitude`, `x`/`y` ou GeoDataFrame em
+ `EPSG:4674`, e retornando `pyarrow.Table`.
+11. Criar testes Python com `pytest`, usando os testes R como contrato de
+ comportamento.
+
+## Mapeamento dos arquivos R
+
+- `R/cache.R` -> `cache.py`
+- `R/download_cnefe.R` -> `download_cnefe.py`
+- `R/definir_campos.R` -> `fields.py`
+- `R/create_geocodebr_db.R` -> `db.py`
+- `R/geocode.R` -> `geocode.py`
+- `R/busca_por_cep.R` -> `geocode.py` ou `cep.py`
+- `R/geocode_reverso.R` -> `reverse.py`
+- `R/match_cases*.R`, `R/match_weighted_cases*.R` -> `matching.py`
+- `R/register_cnefe_tables.R` -> `tables.py`
+- `R/string_dist.R` -> `string_dist.py`
+- `R/trata_empates_geocode_duckdb.R` -> `matching.py` ou `ties.py`
+- `R/utils.R` -> `utils.py`
+- `R/error.R`, `R/message.R`, `R/progress_bar.R` -> `errors.py`, `messages.py`
+
+## Contratos de saida importantes
+
+`geocode` deve aceitar dados de entrada em formatos convenientes, mas o fluxo
+interno deve registrar a entrada diretamente no DuckDB. A saida padrao pode ser
+uma relacao DuckDB materializada sob demanda ou um `DataFrame` para ergonomia da
+API; a decisao deve ficar explicita na implementacao.
+
+`geocode` deve preservar as colunas originais e adicionar:
+
+- sempre: `lat`, `lon`, `precisao`, `tipo_resultado`, `desvio_metros`,
+ `endereco_encontrado`
+- se `resultado_completo=True`: `logradouro_encontrado`, `numero_encontrado`,
+ `cep_encontrado`, `localidade_encontrada`, `municipio_encontrado`,
+ `estado_encontrado`, `similaridade_logradouro`, `contagem_cnefe`, `empate`,
+ `cod_setor`
+- se `h3_res` for informado: `h3_03`, `h3_04`, etc.
+
+`busca_por_cep` deve retornar `cep`, `estado`, `municipio`, `logradouro`,
+`localidade`, `lon`, `lat` e H3 quando solicitado.
+
+`geocode_reverso` deve receber pontos em `EPSG:4674`, validar bounding box do
+Brasil e retornar o endereco mais proximo dentro de `dist_max`, com
+`distancia_metros`.
+
+## Observacoes de risco
+
+- O teste de paridade deve comparar resultados Python vs R em amostras pequenas,
+ incluindo casos deterministicos, probabilisticos, interpolacao por numero,
+ empates e CEP inexistente.
+- A implementacao deve evitar chamar `.df()`/`.fetchdf()` no meio do pipeline.
+ Essas chamadas devem ficar restritas a limites claros da API, por exemplo no
+ retorno final quando o usuario pedir um objeto Python em memoria.
+- A padronizacao inicial ja cobre normalizacao de acentos/caixa, UF por extenso
+ para sigla, CEP numerico, numero inteiro e tentativa de municipio por codigo
+ IBGE quando a tabela `municipio.parquet` trouxer coluna de codigo reconhecida.
+ Ainda precisa de validacao ampla contra `enderecobr`.
+- Ha possiveis bugs/quirks no R que talvez precisem ser replicados ou corrigidos
+ explicitamente. Exemplo: nos loops de H3, o nome da coluna usa `h3_res` em vez
+ do item `i`; funciona para vetor curto no teste, mas deve ser revisto ao portar.
+- A versao Python deve evitar montar SQL com valores vindos diretamente do usuario.
+ Os nomes de colunas podem ser validados contra `^[A-Za-z0-9_]+$`, como no R.
diff --git a/python-package/README.md b/python-package/README.md
new file mode 100644
index 0000000..54be0d8
--- /dev/null
+++ b/python-package/README.md
@@ -0,0 +1,229 @@
+# geocodebr Python: Geolocalizacao de Enderecos Brasileiros
+
+Versao Python experimental do `geocodebr`, usando DuckDB como motor tabular
+principal. A proposta e preservar a dinamica de uso do pacote R, incluindo nomes
+de funcoes em portugues, mas mantendo o processamento interno em SQL/DuckDB para
+boa performance e menor uso de memoria.
+
+O pacote geolocaliza enderecos brasileiros sem limite de numero de consultas,
+com base em dados abertos do CNEFE (Cadastro Nacional de Enderecos para Fins
+Estatisticos), publicado pelo IBGE.
+
+## Instalacao
+
+No momento, esta versao Python ainda esta em desenvolvimento dentro deste
+repositorio. Para instalar localmente:
+
+```bash
+cd python-package
+python -m pip install -e .
+```
+
+Dependencias principais:
+
+- `duckdb`: motor principal de dados e SQL.
+- `pyarrow`: formato padrao de retorno e interoperabilidade com Parquet.
+- `requests`: download dos dados CNEFE.
+- `h3`: criacao opcional de celulas H3.
+
+Para desenvolvimento e testes:
+
+```bash
+python -m pip install -e ".[test]"
+python -m pytest -q
+```
+
+## Utilizacao
+
+O pacote possui tres funcoes principais:
+
+1. `geocode()`
+2. `geocode_reverso()`
+3. `busca_por_cep()`
+
+As funcoes retornam, por padrao, um `pyarrow.Table`. Caso precise converter para
+`pandas`, use `.to_pandas()` no resultado final.
+
+## 1. Geolocalizacao: de enderecos para coordenadas
+
+Primeiro, indique quais colunas da sua tabela representam cada campo do
+endereco usando `definir_campos()`. Depois, chame `geocode()`.
+
+```python
+import pyarrow.csv as pv
+
+from geocodebr import definir_campos, geocode
+
+enderecos = pv.read_csv("../inst/extdata/small_sample.csv")
+
+campos = definir_campos(
+ logradouro="nm_logradouro",
+ numero="Numero",
+ cep="Cep",
+ localidade="Bairro",
+ municipio="nm_municipio",
+ estado="nm_uf",
+)
+
+resultado = geocode(
+ enderecos=enderecos,
+ campos_endereco=campos,
+ resultado_completo=False,
+ resolver_empates=True,
+ h3_res=[8, 10],
+ verboso=False,
+)
+
+print(resultado.schema.names)
+print(resultado.to_pandas().head())
+```
+
+Tambem e possivel passar um caminho para arquivo `.csv` ou `.parquet`:
+
+```python
+resultado = geocode(
+ enderecos="../inst/extdata/small_sample.csv",
+ campos_endereco=campos,
+ verboso=False,
+)
+```
+
+O resultado preserva as colunas originais e adiciona, entre outras:
+
+- `lat`
+- `lon`
+- `precisao`
+- `tipo_resultado`
+- `desvio_metros`
+- `endereco_encontrado`
+
+Com `resultado_completo=True`, tambem retorna campos encontrados no CNEFE, como
+`logradouro_encontrado`, `numero_encontrado`, `cep_encontrado`,
+`localidade_encontrada`, `municipio_encontrado`, `estado_encontrado`,
+`similaridade_logradouro`, `contagem_cnefe`, `empate` e `cod_setor`.
+
+## 2. Geolocalizacao reversa: de coordenadas para enderecos
+
+`geocode_reverso()` busca o endereco mais proximo de cada ponto dentro de uma
+distancia maxima em metros.
+
+A entrada pode ser:
+
+- tabela com colunas `lon` e `lat`
+- tabela com colunas `longitude` e `latitude`
+- tabela com colunas `x` e `y`
+- `GeoDataFrame` em `EPSG:4674`
+
+```python
+import pyarrow as pa
+
+from geocodebr import geocode_reverso
+
+pontos = pa.table(
+ {
+ "id": [1, 2],
+ "lon": [-47.9001, -43.2001],
+ "lat": [-15.8001, -22.9001],
+ }
+)
+
+enderecos_proximos = geocode_reverso(
+ pontos=pontos,
+ dist_max=1000,
+ verboso=False,
+)
+
+print(enderecos_proximos.to_pandas())
+```
+
+O resultado inclui os campos do endereco encontrado e a coluna
+`distancia_metros`.
+
+## 3. Busca por CEP
+
+`busca_por_cep()` retorna os enderecos associados a um ou mais CEPs.
+
+```python
+from geocodebr import busca_por_cep
+
+ceps = ["70390-025", "20071-001", "99999-999"]
+
+resultado_cep = busca_por_cep(
+ cep=ceps,
+ h3_res=10,
+ verboso=False,
+)
+
+print(resultado_cep.to_pandas())
+```
+
+O resultado inclui:
+
+- `cep`
+- `estado`
+- `municipio`
+- `logradouro`
+- `localidade`
+- `lon`
+- `lat`
+
+Se `h3_res` for informado, o pacote adiciona colunas como `h3_08` ou `h3_10`.
+
+## Cache dos dados CNEFE
+
+Na primeira execucao, o pacote baixa arquivos Parquet do release CNEFE usado
+pelo `geocodebr`. Esses arquivos ficam em cache local para acelerar chamadas
+futuras.
+
+```python
+from geocodebr import (
+ definir_pasta_cache,
+ listar_pasta_cache,
+ listar_dados_cache,
+ deletar_pasta_cache,
+ download_cnefe,
+)
+
+print(listar_pasta_cache())
+
+download_cnefe(tabela="municipio_logradouro_cep_localidade", verboso=True)
+
+arquivos = listar_dados_cache()
+print(arquivos)
+
+# definir uma pasta de cache especifica
+definir_pasta_cache("D:/dados/geocodebr-cache", verboso=True)
+
+# apagar cache configurado
+# deletar_pasta_cache()
+```
+
+## DuckDB-first
+
+Esta versao evita usar `pandas` no pipeline interno. O fluxo principal registra
+entradas no DuckDB, executa joins/filtros/matches em SQL e so materializa o
+resultado no final como `pyarrow.Table`.
+
+Isso facilita a paridade com o pacote R, que tambem usa DuckDB para o motor de
+geocodificacao, e ajuda em bases maiores.
+
+## Estado atual
+
+Esta versao Python ainda e experimental.
+
+Ja implementado:
+
+- `definir_campos()`
+- `download_cnefe()`
+- funcoes de cache
+- `busca_por_cep()`
+- `geocode()` com motor DuckDB
+- `geocode_reverso()` com DuckDB Spatial
+- testes unitarios com Parquets sinteticos
+
+Pontos que ainda precisam de validacao ampla:
+
+- paridade completa da padronizacao com o pacote R `enderecobr`
+- comparacao Python vs R em amostras reais maiores
+- retorno espacial equivalente a `sf`/`GeoDataFrame` quando `resultado_sf=True` (ainda nao implementado em `busca_por_cep`, que levanta `NotImplementedError`)
+
diff --git a/python-package/placeholder.txt b/python-package/placeholder.txt
deleted file mode 100644
index e69de29..0000000
diff --git a/python-package/pyproject.toml b/python-package/pyproject.toml
new file mode 100644
index 0000000..0322229
--- /dev/null
+++ b/python-package/pyproject.toml
@@ -0,0 +1,35 @@
+[build-system]
+requires = ["hatchling>=1.25"]
+build-backend = "hatchling.build"
+
+[project]
+name = "geocodebr"
+version = "0.0.1"
+description = "Geolocalizacao de enderecos brasileiros com DuckDB"
+readme = "README.md"
+requires-python = ">=3.10"
+license = { text = "MIT" }
+dependencies = [
+ "duckdb>=1.0.0",
+ "pyarrow>=15.0.0",
+ "requests>=2.31.0",
+ "platformdirs>=4.0.0",
+ "tqdm>=4.66.0",
+ "h3>=4.0.0",
+]
+
+[project.optional-dependencies]
+geo = [
+ "geopandas>=0.14.0",
+ "shapely>=2.0.0",
+ "pyproj>=3.6.0",
+]
+test = [
+ "pytest>=8.0.0",
+]
+
+[tool.hatch.build.targets.wheel]
+packages = ["src/geocodebr"]
+
+[tool.pytest.ini_options]
+pythonpath = ["src"]
diff --git a/python-package/src/geocodebr/__init__.py b/python-package/src/geocodebr/__init__.py
new file mode 100644
index 0000000..fa0dbf8
--- /dev/null
+++ b/python-package/src/geocodebr/__init__.py
@@ -0,0 +1,27 @@
+from .cache import (
+ definir_pasta_cache,
+ deletar_pasta_cache,
+ listar_dados_cache,
+ listar_pasta_cache,
+)
+from .download_cnefe import download_cnefe
+from .fields import definir_campos
+from .geocode import busca_por_cep, geocode
+
+try:
+ from .reverse import geocode_reverso
+except Exception: # pragma: no cover
+ geocode_reverso = None
+
+__all__ = [
+ "busca_por_cep",
+ "definir_campos",
+ "definir_pasta_cache",
+ "deletar_pasta_cache",
+ "download_cnefe",
+ "geocode",
+ "geocode_reverso",
+ "listar_dados_cache",
+ "listar_pasta_cache",
+]
+
diff --git a/python-package/src/geocodebr/cache.py b/python-package/src/geocodebr/cache.py
new file mode 100644
index 0000000..eda040d
--- /dev/null
+++ b/python-package/src/geocodebr/cache.py
@@ -0,0 +1,99 @@
+from __future__ import annotations
+
+import shutil
+from pathlib import Path
+
+try:
+ from platformdirs import user_cache_dir, user_config_dir
+except ModuleNotFoundError: # pragma: no cover
+ def user_cache_dir(appname: str) -> str:
+ return str(Path.home() / "AppData" / "Local" / appname / "Cache")
+
+ def user_config_dir(appname: str) -> str:
+ return str(Path.home() / "AppData" / "Roaming" / appname)
+
+from .messages import message_cache
+
+
+def listar_pasta_cache_padrao() -> str:
+ return str(Path(user_cache_dir("geocodebr")))
+
+
+def listar_arquivo_config() -> str:
+ return str(Path(user_config_dir("geocodebr")) / "cache_dir")
+
+
+def definir_pasta_cache(path: str | None, verboso: bool = True) -> str:
+ if path is not None and not isinstance(path, str):
+ raise TypeError("path deve ser uma string ou None.")
+ if not isinstance(verboso, bool):
+ raise TypeError("verboso deve ser True ou False.")
+
+ cache_dir = Path(listar_pasta_cache_padrao()) if path is None else Path(path)
+ cache_dir = cache_dir.expanduser()
+
+ config_file = Path(listar_arquivo_config())
+ config_file.parent.mkdir(parents=True, exist_ok=True)
+ config_file.write_text(str(cache_dir), encoding="utf-8")
+
+ if verboso:
+ print(f"Definido como pasta de cache {cache_dir}.")
+
+ return str(cache_dir)
+
+
+def listar_pasta_cache() -> str:
+ config_file = Path(listar_arquivo_config())
+ if config_file.exists():
+ value = config_file.read_text(encoding="utf-8").strip()
+ if value:
+ return str(Path(value).expanduser())
+ return listar_pasta_cache_padrao()
+
+
+def listar_dados_cache(print_tree: bool = False) -> list[str]:
+ if not isinstance(print_tree, bool):
+ raise TypeError("print_tree deve ser True ou False.")
+
+ cache_dir = Path(listar_pasta_cache())
+ if not cache_dir.exists():
+ message_cache(True)
+ return []
+
+ files = sorted(str(path) for path in cache_dir.rglob("*") if path.is_file())
+ if print_tree:
+ _print_tree(cache_dir)
+ return files
+
+
+def deletar_pasta_cache() -> str:
+ cache_dir = Path(listar_pasta_cache())
+ if cache_dir.exists():
+ shutil.rmtree(cache_dir)
+ print(f"Deletada a pasta de cache que se encontrava em {cache_dir}.")
+ return str(cache_dir)
+
+
+def apaga_data_release_antigo(data_release: str) -> str:
+ cache_dir = Path(listar_pasta_cache())
+ if not cache_dir.exists():
+ return str(cache_dir)
+
+ release_dirs = [
+ path
+ for path in cache_dir.iterdir()
+ if path.is_dir() and path.name.startswith("geocodebr_data_release_")
+ ]
+ expected = cache_dir / f"geocodebr_data_release_{data_release}"
+ stale_dirs = [path for path in release_dirs if path != expected]
+ for path in stale_dirs:
+ shutil.rmtree(path)
+ return str(cache_dir)
+
+
+def _print_tree(root: Path) -> None:
+ print(root)
+ for path in sorted(root.rglob("*")):
+ depth = len(path.relative_to(root).parts)
+ prefix = " " * depth
+ print(f"{prefix}{path.name}")
diff --git a/python-package/src/geocodebr/constants.py b/python-package/src/geocodebr/constants.py
new file mode 100644
index 0000000..a54806d
--- /dev/null
+++ b/python-package/src/geocodebr/constants.py
@@ -0,0 +1,48 @@
+DATA_RELEASE = "v0.4.1"
+
+ALL_CNEFE_FILES = [
+ "municipio_logradouro_numero_localidade.parquet",
+ "municipio_logradouro_numero_cep_localidade.parquet",
+ "municipio.parquet",
+ "municipio_cep.parquet",
+ "municipio_cep_localidade.parquet",
+ "municipio_localidade.parquet",
+ "municipio_logradouro_cep_localidade.parquet",
+ "municipio_logradouro_localidade.parquet",
+]
+
+ALL_POSSIBLE_MATCH_TYPES = [
+ "dn01",
+ "da01",
+ "pn01",
+ "pa01",
+ "dn02",
+ "da02",
+ "pn02",
+ "pa02",
+ "dn03",
+ "da03",
+ "pn03",
+ "pa03",
+ "dn04",
+ "da04",
+ "dl01",
+ "pl01",
+ "dl02",
+ "pl02",
+ "dl03",
+ "pl03",
+ "dl04",
+ "dc01",
+ "dc02",
+ "db01",
+ "dm01",
+]
+
+NUMBER_EXACT_TYPES = {"dn01", "dn02", "dn03", "dn04"}
+NUMBER_INTERPOLATION_TYPES = {"da01", "da02", "da03", "da04"}
+PROBABILISTIC_EXACT_TYPES = {"pn01", "pn02", "pn03", "pn04"}
+PROBABILISTIC_INTERPOLATION_TYPES = {"pa01", "pa02", "pa03", "pa04"}
+EXACT_TYPES_NO_NUMBER = {"dl01", "dl02", "dl03", "dl04", "dc01", "dc02", "db01", "dm01"}
+PROBABILISTIC_TYPES_NO_NUMBER = {"pl01", "pl02", "pl03", "pl04"}
+
diff --git a/python-package/src/geocodebr/db.py b/python-package/src/geocodebr/db.py
new file mode 100644
index 0000000..f627a87
--- /dev/null
+++ b/python-package/src/geocodebr/db.py
@@ -0,0 +1,37 @@
+from __future__ import annotations
+
+import tempfile
+from pathlib import Path
+
+import duckdb
+
+
+def create_geocodebr_db(
+ db_path: str = "tempdir",
+ n_cores: int | None = None,
+ load_spatial: bool = False,
+) -> duckdb.DuckDBPyConnection:
+ if n_cores is not None and (not isinstance(n_cores, int) or n_cores < 1):
+ raise ValueError("n_cores deve ser um inteiro positivo ou None.")
+
+ if db_path == "tempdir":
+ handle = tempfile.NamedTemporaryFile(prefix="geocodebr", suffix=".duckdb", delete=True)
+ db_file = handle.name
+ handle.close()
+ Path(db_file).unlink(missing_ok=True)
+ elif db_path == "memory":
+ db_file = ":memory:"
+ else:
+ db_file = db_path
+
+ con = duckdb.connect(db_file)
+ if n_cores is not None:
+ con.execute(f"SET threads = {n_cores}")
+ con.execute("SET enable_progress_bar = false")
+
+ if load_spatial:
+ con.execute("INSTALL spatial")
+ con.execute("LOAD spatial")
+
+ return con
+
diff --git a/python-package/src/geocodebr/download_cnefe.py b/python-package/src/geocodebr/download_cnefe.py
new file mode 100644
index 0000000..fdfd968
--- /dev/null
+++ b/python-package/src/geocodebr/download_cnefe.py
@@ -0,0 +1,69 @@
+from __future__ import annotations
+
+import tempfile
+from pathlib import Path
+
+import requests
+from tqdm import tqdm
+
+from .cache import apaga_data_release_antigo, listar_pasta_cache
+from .constants import ALL_CNEFE_FILES, DATA_RELEASE
+from .messages import message_baixando_cnefe, message_usando_cnefe_local
+
+
+def download_cnefe(tabela: str = "todas", verboso: bool = True, cache: bool = True) -> str:
+ if not isinstance(tabela, str):
+ raise TypeError("tabela deve ser uma string.")
+ if not isinstance(verboso, bool) or not isinstance(cache, bool):
+ raise TypeError("verboso e cache devem ser True ou False.")
+
+ files = _select_files(tabela)
+ urls = [
+ f"https://github.com/ipeaGIT/padronizacao_cnefe/releases/download/{DATA_RELEASE}/{file}"
+ for file in files
+ ]
+
+ if cache:
+ apaga_data_release_antigo(DATA_RELEASE)
+ cache_dir = Path(listar_pasta_cache())
+ else:
+ cache_dir = Path(tempfile.mkdtemp(prefix="geocodebr_temp"))
+
+ data_dir = cache_dir / f"geocodebr_data_release_{DATA_RELEASE}"
+ data_dir.mkdir(parents=True, exist_ok=True)
+
+ existing = {path.name for path in data_dir.iterdir() if path.is_file()}
+ to_download = [(url, data_dir / Path(url).name) for url in urls if Path(url).name not in existing]
+
+ if not to_download:
+ message_usando_cnefe_local(verboso)
+ return str(cache_dir)
+
+ message_baixando_cnefe(verboso)
+ for url, dest in tqdm(to_download, disable=not verboso):
+ _download_file(url, dest)
+
+ return str(cache_dir)
+
+
+def _select_files(tabela: str) -> list[str]:
+ if tabela == "todas":
+ return ALL_CNEFE_FILES.copy()
+
+ valid = {Path(file).stem: file for file in ALL_CNEFE_FILES}
+ if tabela not in valid:
+ options = ", ".join(sorted(valid))
+ raise ValueError(f"A tabela deve ser uma das seguintes opcoes: {options}.")
+ return [valid[tabela]]
+
+
+def _download_file(url: str, dest: Path) -> None:
+ tmp = dest.with_suffix(dest.suffix + ".part")
+ with requests.get(url, stream=True, timeout=120) as response:
+ response.raise_for_status()
+ with tmp.open("wb") as file:
+ for chunk in response.iter_content(chunk_size=1024 * 1024):
+ if chunk:
+ file.write(chunk)
+ tmp.replace(dest)
+
diff --git a/python-package/src/geocodebr/errors.py b/python-package/src/geocodebr/errors.py
new file mode 100644
index 0000000..69e7252
--- /dev/null
+++ b/python-package/src/geocodebr/errors.py
@@ -0,0 +1,14 @@
+class GeocodeBRError(Exception):
+ """Erro base do geocodebr Python."""
+
+
+class InputNaoPadronizadoError(GeocodeBRError):
+ """Entrada sem colunas padronizadas esperadas."""
+
+
+def error_input_nao_padronizado() -> None:
+ raise InputNaoPadronizadoError(
+ "Os dados de entrada nao estao padronizados. Use "
+ "padronizar_enderecos=True ou informe colunas *_padr equivalentes."
+ )
+
diff --git a/python-package/src/geocodebr/fields.py b/python-package/src/geocodebr/fields.py
new file mode 100644
index 0000000..fef0dc7
--- /dev/null
+++ b/python-package/src/geocodebr/fields.py
@@ -0,0 +1,52 @@
+from __future__ import annotations
+
+
+ADDRESS_FIELDS = ("logradouro", "numero", "cep", "localidade", "municipio", "estado")
+
+
+def definir_campos(
+ estado: str,
+ municipio: str,
+ logradouro: str | None = None,
+ numero: str | None = None,
+ cep: str | None = None,
+ localidade: str | None = None,
+) -> dict[str, str | None]:
+ values = {
+ "logradouro": logradouro,
+ "numero": numero,
+ "cep": cep,
+ "localidade": localidade,
+ "municipio": municipio,
+ "estado": estado,
+ }
+ for name, value in values.items():
+ if value is not None and not isinstance(value, str):
+ raise TypeError(f"{name} deve ser uma string ou None.")
+ if all(value is None for value in values.values()):
+ raise ValueError("Pelo menos um campo nao pode ser nulo.")
+ return values
+
+
+def assert_and_assign_address_fields(
+ address_fields: dict[str, str | None],
+ addresses_columns: list[str],
+) -> dict[str, str | None]:
+ if not isinstance(address_fields, dict):
+ raise TypeError("campos_endereco deve ser um dict.")
+
+ unknown = set(address_fields) - set(ADDRESS_FIELDS)
+ if unknown:
+ raise ValueError(f"Campos desconhecidos: {sorted(unknown)}.")
+
+ missing_columns = [
+ column
+ for column in address_fields.values()
+ if column is not None and column not in addresses_columns
+ ]
+ if missing_columns:
+ raise ValueError(f"Colunas ausentes em enderecos: {missing_columns}.")
+
+ complete = {field: address_fields.get(field) for field in ADDRESS_FIELDS}
+ return complete
+
diff --git a/python-package/src/geocodebr/geocode.py b/python-package/src/geocodebr/geocode.py
new file mode 100644
index 0000000..78dca9b
--- /dev/null
+++ b/python-package/src/geocodebr/geocode.py
@@ -0,0 +1,429 @@
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Any
+
+import duckdb
+import pyarrow as pa
+
+from .cache import listar_pasta_cache
+from .constants import ALL_POSSIBLE_MATCH_TYPES, DATA_RELEASE
+from .db import create_geocodebr_db
+from .download_cnefe import download_cnefe
+from .errors import error_input_nao_padronizado
+from .fields import assert_and_assign_address_fields, definir_campos
+from .matching import (
+ create_output_db,
+ select_match_function,
+ trata_empates_geocode_duckdb,
+)
+from .messages import (
+ message_looking_for_matches,
+ message_preparando_output,
+ message_standardizing_addresses,
+)
+from .utils import (
+ add_precision_col,
+ check_clean_colnames,
+ cria_col_logradouro_confusao,
+ find_cached_parquet,
+ get_key_cols,
+ merge_results_to_input,
+ quote_ident,
+ sql_string,
+)
+
+
+def busca_por_cep(
+    cep: str | list[str] | tuple[str, ...],
+    h3_res: int | list[int] | tuple[int, ...] | None = None,
+    resultado_sf: bool = False,
+    verboso: bool = True,
+    cache: bool = True,
+) -> pa.Table:
+    """Look up CEPs in the cached ``municipio_logradouro_cep_localidade`` table.
+
+    Args:
+        cep: one CEP or a sequence of CEPs, as strings.
+        h3_res: optional H3 resolution(s); one ``h3_XX`` column is added per
+            resolution.
+        resultado_sf: not supported yet; ``True`` raises.
+        verboso: forwarded to ``download_cnefe``.
+        cache: forwarded to ``download_cnefe``.
+
+    Returns:
+        An Arrow table with ``cep, estado, municipio, logradouro, localidade,
+        lon, lat`` (plus H3 columns). CEPs that were requested but not found
+        are appended as rows where only ``cep`` is filled.
+
+    Raises:
+        NotImplementedError: if ``resultado_sf`` is true.
+        ValueError: if no valid CEP was given or none was found.
+    """
+    if resultado_sf:
+        raise NotImplementedError("resultado_sf=True sera implementado com geopandas na proxima etapa.")
+    _assert_bool(verboso, "verboso")
+    _assert_bool(cache, "cache")
+    h3_values = _normalize_h3_res(h3_res)
+    ceps = _normalize_ceps(cep)
+    if not ceps:
+        raise ValueError("Nenhum CEP valido foi informado.")
+
+    download_cnefe("municipio_logradouro_cep_localidade", verboso=verboso, cache=cache)
+    con = create_geocodebr_db()
+    try:
+        path_to_parquet = (
+            Path(listar_pasta_cache())
+            / f"geocodebr_data_release_{DATA_RELEASE}"
+            / "municipio_logradouro_cep_localidade.parquet"
+        ).as_posix()
+        # The path is embedded in a SQL string literal: double any single
+        # quote so a cache directory such as "o'brien" cannot break the query.
+        path_sql = path_to_parquet.replace("'", "''")
+        requested = set(ceps)
+        unique_ceps = ", ".join(sql_string(value) for value in sorted(requested))
+        con.execute(
+            f"""
+            CREATE OR REPLACE TEMP TABLE output_df AS
+            SELECT cep, estado, municipio, logradouro, localidade, lon, lat
+            FROM read_parquet('{path_sql}') m
+            WHERE m.cep IN ({unique_ceps})
+            """
+        )
+        found = {row[0] for row in con.execute("SELECT DISTINCT cep FROM output_df").fetchall()}
+        missing = sorted(requested - found)
+        if len(missing) == len(requested):
+            raise ValueError("Nenhum CEP foi encontrado.")
+        if missing:
+            # Keep not-found CEPs in the output as rows with only cep set.
+            values = ", ".join(f"({sql_string(value)})" for value in missing)
+            con.execute(f"INSERT INTO output_df (cep) VALUES {values}")
+        _add_h3_columns(con, "output_df", h3_values)
+        return con.execute("SELECT * FROM output_df").to_arrow_table()
+    finally:
+        con.close()
+
+
+def geocode(
+    enderecos: Any,
+    campos_endereco: dict[str, str | None] | None = None,
+    resultado_completo: bool = False,
+    resolver_empates: bool = True,
+    resultado_sf: bool = False,
+    h3_res: int | list[int] | tuple[int, ...] | None = None,
+    padronizar_enderecos: bool = True,
+    verboso: bool = True,
+    cache: bool = True,
+    n_cores: int | None = None,
+) -> pa.Table:
+    """Geocode a table of addresses and return the input joined with results.
+
+    Args:
+        enderecos: input addresses; either a path to a .parquet/.csv/.txt
+            file or an object that the DuckDB connection can register.
+        campos_endereco: mapping from address field to input column name.
+            When ``None`` it defaults to columns named ``estado`` and
+            ``municipio``.
+        resultado_completo: forwarded to the matching/output helpers.
+        resolver_empates: forwarded to ``trata_empates_geocode_duckdb``.
+        resultado_sf: not supported yet; ``True`` raises.
+        h3_res: optional H3 resolution(s) to add as ``h3_XX`` columns.
+        padronizar_enderecos: when true the input is standardized here;
+            otherwise the input must already contain ``*_padr`` columns.
+        verboso: enables progress messages.
+        cache: forwarded to ``download_cnefe``.
+        n_cores: forwarded to ``create_geocodebr_db``.
+
+    Returns:
+        An Arrow table read from the final ``geocodebr_result`` temp table.
+
+    Raises:
+        NotImplementedError: if ``resultado_sf`` is true.
+        TypeError: if any boolean flag is not a ``bool``.
+    """
+    if resultado_sf:
+        raise NotImplementedError("resultado_sf=True sera implementado com geopandas na proxima etapa.")
+    for name, value in {
+        "resultado_completo": resultado_completo,
+        "resolver_empates": resolver_empates,
+        "padronizar_enderecos": padronizar_enderecos,
+        "verboso": verboso,
+        "cache": cache,
+    }.items():
+        _assert_bool(value, name)
+    h3_values = _normalize_h3_res(h3_res)
+    if campos_endereco is None:
+        campos_endereco = definir_campos(estado="estado", municipio="municipio")
+
+    download_cnefe("todas", verboso=verboso, cache=cache)
+    con = create_geocodebr_db(n_cores=n_cores)
+    try:
+        _register_input(con, enderecos)
+        input_columns = _table_columns(con, "enderecos_input")
+        check_clean_colnames(input_columns)
+        campos_endereco = assert_and_assign_address_fields(campos_endereco, input_columns)
+
+        # Tag every input row with a temporary id used to join results back.
+        con.execute(
+            """
+            CREATE OR REPLACE TEMP TABLE input_db AS
+            SELECT *, ROW_NUMBER() OVER ()::INTEGER AS tempidgeocodebr
+            FROM enderecos_input
+            """
+        )
+        original_columns = [col for col in input_columns] + ["tempidgeocodebr"]
+
+        if padronizar_enderecos:
+            message_standardizing_addresses(verboso)
+            _create_standardized_input(con, campos_endereco)
+        else:
+            _create_standardized_input_from_padr(con)
+
+        _assert_standardized_columns(con)
+        # Working columns added to the standardized table before matching.
+        con.execute("ALTER TABLE input_padrao_db ADD COLUMN temp_lograd_determ TEXT")
+        con.execute("ALTER TABLE input_padrao_db ADD COLUMN similaridade_logradouro DOUBLE")
+        cria_col_logradouro_confusao(con)
+        create_output_db(con, resultado_completo)
+
+        # NOTE(review): the guard is redundant; the message helper already
+        # receives verboso.
+        if verboso:
+            message_looking_for_matches(verboso)
+
+        n_rows = con.execute("SELECT COUNT(*) FROM input_padrao_db").fetchone()[0]
+        matched_rows = 0
+        input_padrao_columns = set(_table_columns(con, "input_padrao_db"))
+        # Try each match type in the order given by ALL_POSSIBLE_MATCH_TYPES,
+        # skipping types whose key columns are not in the standardized table.
+        for match_type in ALL_POSSIBLE_MATCH_TYPES:
+            key_cols = get_key_cols(match_type)
+            if all(col in input_padrao_columns for col in key_cols):
+                match_fun = select_match_function(match_type)
+                affected = match_fun(
+                    con,
+                    match_type=match_type,
+                    key_cols=key_cols,
+                    resultado_completo=resultado_completo,
+                )
+                matched_rows += affected
+                # Stop once the running total equals the initial row count.
+                if matched_rows == n_rows:
+                    break
+
+        message_preparando_output(verboso)
+        empates_resolvidos = trata_empates_geocode_duckdb(
+            con, resultado_completo, resolver_empates, verboso
+        )
+        # The tie handler only creates output_db2 when it found ties.
+        output_table_to_use = "output_db" if empates_resolvidos == 0 else "output_db2"
+        add_precision_col(con, output_table_to_use)
+        merge_results_to_input(
+            con,
+            x="input_db",
+            y=output_table_to_use,
+            select_columns=original_columns,
+            resultado_completo=resultado_completo,
+        )
+        _add_h3_columns(con, "geocodebr_result", h3_values)
+        # Order by the temporary id and drop it from the final table.
+        con.execute(
+            """
+            CREATE OR REPLACE TEMP TABLE geocodebr_result AS
+            SELECT * EXCLUDE (tempidgeocodebr)
+            FROM geocodebr_result
+            ORDER BY tempidgeocodebr
+            """
+        )
+        return con.execute("SELECT * FROM geocodebr_result").to_arrow_table()
+    finally:
+        con.close()
+
+
+def _register_input(con: duckdb.DuckDBPyConnection, enderecos: Any) -> None:
+    """Materialize the user input as the temp table ``enderecos_input``.
+
+    Args:
+        con: open DuckDB connection.
+        enderecos: a ``str``/``Path`` pointing to a .parquet, .csv or .txt
+            file, or any object accepted by ``con.register``.
+
+    Raises:
+        ValueError: if a path is given with an unsupported extension.
+    """
+    if isinstance(enderecos, (str, Path)):
+        path = Path(enderecos)
+        readers = {
+            ".parquet": "read_parquet",
+            ".csv": "read_csv_auto",
+            ".txt": "read_csv_auto",
+        }
+        reader = readers.get(path.suffix.lower())
+        if reader is None:
+            raise ValueError("Arquivos suportados: .parquet, .csv, .txt.")
+        # The path is embedded in a SQL string literal, so single quotes
+        # must be doubled or a file such as "o'brien.csv" breaks the query.
+        path_sql = path.as_posix().replace("'", "''")
+        con.execute(f"CREATE OR REPLACE TEMP TABLE enderecos_input AS SELECT * FROM {reader}('{path_sql}')")
+        return
+
+    con.register("enderecos_input_view", enderecos)
+    try:
+        con.execute("CREATE OR REPLACE TEMP TABLE enderecos_input AS SELECT * FROM enderecos_input_view")
+    finally:
+        # Always drop the helper view, even when the copy fails.
+        con.unregister("enderecos_input_view")
+
+
+def _create_standardized_input(
+    con: duckdb.DuckDBPyConnection,
+    campos_endereco: dict[str, str | None],
+) -> None:
+    """Create ``input_padrao_db`` with standardized address columns.
+
+    For each address field a SQL expression is built from the mapped input
+    column: ``numero`` keeps only digits and is cast to INTEGER, ``cep``
+    keeps only digits, every other field goes through ``_geocodebr_norm``.
+    Unmapped fields become NULL. State names and municipality codes are
+    resolved afterwards.
+    """
+
+    def _expression(campo: str, coluna: str | None) -> str:
+        if coluna is None:
+            return "NULL"
+        as_text = f"CAST({quote_ident(coluna)} AS VARCHAR)"
+        if campo == "numero":
+            return f"TRY_CAST(NULLIF(REGEXP_REPLACE({as_text}, '[^0-9]', '', 'g'), '') AS INTEGER)"
+        if campo == "cep":
+            return f"NULLIF(REGEXP_REPLACE({as_text}, '[^0-9]', '', 'g'), '')"
+        return f"NULLIF(_geocodebr_norm({as_text}), '')"
+
+    projections = []
+    for campo in ("estado", "municipio", "logradouro", "numero", "cep", "localidade"):
+        # localidade is first emitted under the alias "bairro" and renamed
+        # after the table is created.
+        alias = campo if campo != "localidade" else "bairro"
+        projections.append(f"{_expression(campo, campos_endereco.get(campo))} AS {alias}")
+    select_list = ", ".join(projections)
+
+    _install_normalize_function(con)
+    con.execute(
+        f"""
+        CREATE OR REPLACE TEMP TABLE input_padrao_db AS
+        SELECT {select_list}, tempidgeocodebr
+        FROM input_db
+        """
+    )
+    con.execute("ALTER TABLE input_padrao_db RENAME bairro TO localidade")
+    _resolve_estado_names(con)
+    _resolve_municipio_codes(con)
+
+
+def _create_standardized_input_from_padr(con: duckdb.DuckDBPyConnection) -> None:
+    """Create ``input_padrao_db`` from pre-standardized ``*_padr`` columns.
+
+    Calls ``error_input_nao_padronizado`` when any of the six expected
+    ``*_padr`` columns is missing from ``input_db``.
+    """
+    available = _table_columns(con, "input_db")
+    renames = (
+        ("estado_padr", "estado"),
+        ("municipio_padr", "municipio"),
+        ("logradouro_padr", "logradouro"),
+        ("numero_padr", "numero"),
+        ("cep_padr", "cep"),
+        ("bairro_padr", "localidade"),
+    )
+    if any(origem not in available for origem, _ in renames):
+        error_input_nao_padronizado()
+    projection = ", ".join(
+        [*(f"{origem} AS {destino}" for origem, destino in renames), "tempidgeocodebr"]
+    )
+    con.execute(f"CREATE OR REPLACE TEMP TABLE input_padrao_db AS SELECT {projection} FROM input_db")
+
+
+def _assert_standardized_columns(con: duckdb.DuckDBPyConnection) -> None:
+    """Check that ``input_padrao_db`` has all six standardized columns.
+
+    Calls ``error_input_nao_padronizado`` when any of them is missing.
+    """
+    present = set(_table_columns(con, "input_padrao_db"))
+    required = ("estado", "municipio", "logradouro", "numero", "cep", "localidade")
+    if any(coluna not in present for coluna in required):
+        error_input_nao_padronizado()
+
+
+def _install_normalize_function(con: duckdb.DuckDBPyConnection) -> None:
+    """Register the ``_geocodebr_norm`` and ``_geocodebr_uf`` SQL functions.
+
+    ``_geocodebr_norm`` strips accents, upper-cases, replaces every
+    non-alphanumeric character with a space and collapses whitespace.
+    ``_geocodebr_uf`` applies the same normalization and then maps a full
+    state name to its two-letter code; two-character values and unknown
+    names are returned as normalized.
+    """
+    import unicodedata
+
+    estados = {
+        "ACRE": "AC",
+        "ALAGOAS": "AL",
+        "AMAPA": "AP",
+        "AMAZONAS": "AM",
+        "BAHIA": "BA",
+        "CEARA": "CE",
+        "DISTRITO FEDERAL": "DF",
+        "ESPIRITO SANTO": "ES",
+        "GOIAS": "GO",
+        "MARANHAO": "MA",
+        "MATO GROSSO": "MT",
+        "MATO GROSSO DO SUL": "MS",
+        "MINAS GERAIS": "MG",
+        "PARA": "PA",
+        "PARAIBA": "PB",
+        "PARANA": "PR",
+        "PERNAMBUCO": "PE",
+        "PIAUI": "PI",
+        "RIO DE JANEIRO": "RJ",
+        "RIO GRANDE DO NORTE": "RN",
+        "RIO GRANDE DO SUL": "RS",
+        "RONDONIA": "RO",
+        "RORAIMA": "RR",
+        "SANTA CATARINA": "SC",
+        "SAO PAULO": "SP",
+        "SERGIPE": "SE",
+        "TOCANTINS": "TO",
+    }
+
+    def normalize(value: str | None) -> str | None:
+        if value is None:
+            return None
+        decomposed = unicodedata.normalize("NFKD", str(value))
+        # Dropping combining marks after NFKD removes the accents.
+        unaccented = "".join(c for c in decomposed if not unicodedata.combining(c)).upper()
+        spaced = "".join(c if c.isalnum() else " " for c in unaccented)
+        return " ".join(spaced.split())
+
+    def normalize_uf(value: str | None) -> str | None:
+        cleaned = normalize(value)
+        if cleaned is None or len(cleaned) == 2:
+            return cleaned
+        return estados.get(cleaned, cleaned)
+
+    for sql_name, python_fn in (
+        ("_geocodebr_norm", normalize),
+        ("_geocodebr_uf", normalize_uf),
+    ):
+        try:
+            con.create_function(sql_name, python_fn, ["VARCHAR"], "VARCHAR")
+        except duckdb.InvalidInputException:
+            # Registration failures of this type are deliberately ignored.
+            pass
+
+
+def _resolve_estado_names(con: duckdb.DuckDBPyConnection) -> None:
+    """Rewrite non-null ``estado`` values of ``input_padrao_db`` via ``_geocodebr_uf``."""
+    update_sql = """
+        UPDATE input_padrao_db
+        SET estado = _geocodebr_uf(estado)
+        WHERE estado IS NOT NULL
+        """
+    con.execute(update_sql)
+
+
+def _resolve_municipio_codes(con: duckdb.DuckDBPyConnection) -> None:
+    """Replace 7-digit municipality codes in ``input_padrao_db`` with names.
+
+    Uses the cached ``municipio`` parquet as the reference table. The
+    function returns without changes when that file is not cached, or when
+    it has no ``municipio`` column or none of the known code columns.
+    """
+    from .cache import listar_dados_cache
+
+    try:
+        path = find_cached_parquet(listar_dados_cache(), "municipio")
+    except FileNotFoundError:
+        # No cached reference file: leave municipio values untouched.
+        return
+
+    # LIMIT 0 creates an empty table only to inspect the parquet schema.
+    # NOTE(review): path is interpolated into the SQL literal unescaped.
+    con.execute(
+        f"""
+        CREATE OR REPLACE TEMP TABLE _geocodebr_municipio_ref AS
+        SELECT * FROM read_parquet('{path}') LIMIT 0
+        """
+    )
+    cols = set(_table_columns(con, "_geocodebr_municipio_ref"))
+    # First candidate code column present in the file, if any.
+    code_col = next(
+        (col for col in ["cod_muni", "code_muni", "cod_municipio", "codigo_municipio"] if col in cols),
+        None,
+    )
+    if code_col is None or "municipio" not in cols:
+        return
+
+    # Rebuild the reference table as distinct (code as text, name) pairs.
+    con.execute(
+        f"""
+        CREATE OR REPLACE TEMP TABLE _geocodebr_municipio_ref AS
+        SELECT DISTINCT CAST({quote_ident(code_col)} AS VARCHAR) AS municipio_codigo,
+               municipio AS municipio_nome
+        FROM read_parquet('{path}')
+        WHERE {quote_ident(code_col)} IS NOT NULL
+        """
+    )
+    # Only values made of exactly seven digits are treated as codes.
+    con.execute(
+        """
+        UPDATE input_padrao_db
+        SET municipio = ref.municipio_nome
+        FROM _geocodebr_municipio_ref ref
+        WHERE REGEXP_MATCHES(input_padrao_db.municipio, '^[0-9]{7}$')
+          AND input_padrao_db.municipio = ref.municipio_codigo
+        """
+    )
+
+
+def _add_h3_columns(
+    con: duckdb.DuckDBPyConnection,
+    table_name: str,
+    h3_values: list[int],
+) -> None:
+    """Add one ``h3_XX`` text column to ``table_name`` per requested resolution.
+
+    Does nothing (and does not import ``h3``) when ``h3_values`` is empty.
+    Each column is filled from the table's ``lat``/``lon`` columns for rows
+    where ``lat`` is not NULL.
+    """
+    if not h3_values:
+        return
+    import h3
+
+    def h3_cell(lat: float | None, lon: float | None, res: int) -> str | None:
+        if lon is None or lat is None:
+            return None
+        # Use latlng_to_cell when the h3 module has it, else geo_to_h3.
+        converter = h3.latlng_to_cell if hasattr(h3, "latlng_to_cell") else h3.geo_to_h3
+        return converter(lat, lon, res)
+
+    try:
+        con.create_function("_geocodebr_h3", h3_cell, ["DOUBLE", "DOUBLE", "INTEGER"], "VARCHAR")
+    except duckdb.InvalidInputException:
+        pass
+
+    target = quote_ident(table_name)
+    for resolution in h3_values:
+        column = quote_ident(f"h3_{resolution:02d}")
+        con.execute(f"ALTER TABLE {target} ADD COLUMN {column} TEXT")
+        con.execute(
+            f"""
+            UPDATE {target}
+            SET {column} = _geocodebr_h3(lat, lon, {resolution})
+            WHERE lat IS NOT NULL
+            """
+        )
+
+
+def _normalize_ceps(cep: str | list[str] | tuple[str, ...]) -> list[str]:
+    """Return the sorted, de-duplicated 8-digit CEPs found in ``cep``.
+
+    Every input string is reduced to its ASCII digits; values that do not
+    end up with exactly eight digits are silently dropped.
+
+    Raises:
+        TypeError: if any element is not a string.
+    """
+    values = [cep] if isinstance(cep, str) else list(cep)
+    unique: set[str] = set()
+    for value in values:
+        if not isinstance(value, str):
+            raise TypeError("cep deve ser string ou sequencia de strings.")
+        # str.isdigit() also accepts characters such as superscripts and
+        # non-Latin numerals, which are not valid CEP digits; keep 0-9 only.
+        digits = "".join(ch for ch in value if "0" <= ch <= "9")
+        if len(digits) == 8:
+            unique.add(digits)
+    return sorted(unique)
+
+
+def _normalize_h3_res(h3_res: int | list[int] | tuple[int, ...] | None) -> list[int]:
+    """Validate H3 resolutions and return them as a list without repeats.
+
+    Args:
+        h3_res: ``None``, a single integer, or a sequence of integers.
+
+    Returns:
+        An empty list for ``None``; otherwise the resolutions in first-seen
+        order with duplicates removed.
+
+    Raises:
+        ValueError: if any value is not an integer between 0 and 15.
+            Booleans are rejected even though ``bool`` subclasses ``int``.
+    """
+    if h3_res is None:
+        return []
+    values = [h3_res] if isinstance(h3_res, int) else list(h3_res)
+    for value in values:
+        if isinstance(value, bool) or not isinstance(value, int) or not 0 <= value <= 15:
+            raise ValueError("h3_res deve conter inteiros entre 0 e 15.")
+    # A repeated resolution would later add the same h3_XX column twice.
+    return list(dict.fromkeys(values))
+
+
+def _assert_bool(value: bool, name: str) -> None:
+    """Raise ``TypeError`` naming ``name`` unless ``value`` is a ``bool``."""
+    if isinstance(value, bool):
+        return
+    raise TypeError(f"{name} deve ser True ou False.")
+
+
+def _table_columns(con: duckdb.DuckDBPyConnection, table_name: str) -> list[str]:
+    """Return the second field of every ``PRAGMA table_info`` row for ``table_name``."""
+    info_rows = con.execute(f"PRAGMA table_info('{table_name}')").fetchall()
+    return [name for _cid, name, *_rest in info_rows]
diff --git a/python-package/src/geocodebr/matching.py b/python-package/src/geocodebr/matching.py
new file mode 100644
index 0000000..5791259
--- /dev/null
+++ b/python-package/src/geocodebr/matching.py
@@ -0,0 +1,444 @@
+from __future__ import annotations
+
+import duckdb
+
+from .constants import (
+ EXACT_TYPES_NO_NUMBER,
+ NUMBER_EXACT_TYPES,
+ NUMBER_INTERPOLATION_TYPES,
+ PROBABILISTIC_EXACT_TYPES,
+ PROBABILISTIC_INTERPOLATION_TYPES,
+ PROBABILISTIC_TYPES_NO_NUMBER,
+)
+from .string_dist import calculate_string_dist
+from .tables import register_cnefe_table, register_unique_logradouros_table
+from .utils import get_key_cols, get_reference_table, quote_ident, update_input_db
+
+
+def create_output_db(con: duckdb.DuckDBPyConnection, resultado_completo: bool) -> None:
+    """Create (or replace) the empty temp table ``output_db``.
+
+    The table always has the base result columns; the ``*_encontrado``
+    detail columns, ``empate`` and ``cod_setor`` are added only when
+    ``resultado_completo`` is true.
+    """
+    base_columns = (
+        "tempidgeocodebr INTEGER",
+        "lat DOUBLE",
+        "lon DOUBLE",
+        "endereco_encontrado TEXT",
+        "logradouro_encontrado TEXT",
+        "tipo_resultado TEXT",
+        "contagem_cnefe INTEGER",
+        "desvio_metros INTEGER",
+        "log_causa_confusao BOOLEAN",
+        "similaridade_logradouro DOUBLE",
+    )
+    detail_columns = (
+        "numero_encontrado INTEGER",
+        "localidade_encontrada TEXT",
+        "cep_encontrado TEXT",
+        "municipio_encontrado TEXT",
+        "estado_encontrado TEXT",
+        "empate BOOLEAN",
+        "cod_setor TEXT",
+    )
+    schema = base_columns + detail_columns if resultado_completo else base_columns
+    con.execute(f"CREATE OR REPLACE TEMP TABLE output_db ({', '.join(schema)})")
+
+
+def match_cases(
+    con: duckdb.DuckDBPyConnection,
+    x: str = "input_padrao_db",
+    output_tb: str = "output_db",
+    key_cols: list[str] | None = None,
+    match_type: str = "",
+    resultado_completo: bool = False,
+) -> int:
+    """Insert exact matches on the key columns of ``match_type`` into the output.
+
+    Joins table ``x`` with the reference table for ``match_type`` on
+    equality of every key column, considering only rows of ``x`` where all
+    key columns are non-null.
+
+    Returns:
+        Whatever ``update_input_db`` returns for ``x`` and ``output_tb``.
+    """
+    y = get_reference_table(match_type)
+    # NOTE(review): the key_cols argument is ignored; it is always
+    # recomputed from match_type.
+    key_cols = get_key_cols(match_type)
+    register_cnefe_table(con, match_type)
+
+    join_condition = " AND ".join(f"{y}.{col} = {x}.{col}" for col in key_cols)
+    cols_not_null = " AND ".join(f"{x}.{col} IS NOT NULL" for col in key_cols)
+    colunas_encontradas, additional_cols = _complete_columns(y, key_cols, resultado_completo)
+
+    # NOTE(review): the INSERT target is the literal output_db, while
+    # update_input_db below receives output_tb.
+    con.execute(
+        f"""
+        INSERT INTO output_db (
+            tempidgeocodebr, lat, lon, endereco_encontrado, tipo_resultado,
+            desvio_metros, log_causa_confusao, contagem_cnefe {colunas_encontradas}
+        )
+        SELECT {x}.tempidgeocodebr,
+               {y}.lat,
+               {y}.lon,
+               {y}.endereco_completo AS endereco_encontrado,
+               '{match_type}' AS tipo_resultado,
+               {y}.desvio_metros,
+               {x}.log_causa_confusao,
+               {y}.n_casos AS contagem_cnefe {additional_cols}
+        FROM {x}
+        INNER JOIN {y}
+            ON {join_condition}
+        WHERE {cols_not_null}
+        """
+    )
+    return update_input_db(con, update_tb=x, reference_tb=output_tb)
+
+
+def match_weighted_cases(
+    con: duckdb.DuckDBPyConnection,
+    x: str = "input_padrao_db",
+    output_tb: str = "output_db",
+    key_cols: list[str] | None = None,
+    match_type: str = "",
+    resultado_completo: bool = False,
+) -> int:
+    """Insert matches whose coordinates are interpolated across house numbers.
+
+    Rows of ``x`` are joined with the reference table on every key column
+    except ``numero``. The output coordinates are the average of the
+    reference coordinates weighted by ``1 / ABS(numero - numero_cnefe)``.
+
+    Returns:
+        Whatever ``update_input_db`` returns for ``x`` and ``output_tb``.
+    """
+    y = get_reference_table(match_type)
+    # NOTE(review): the key_cols argument is ignored; keys come from match_type.
+    original_key_cols = get_key_cols(match_type)
+    register_cnefe_table(con, match_type)
+
+    # The non-null filter covers all keys (numero included), but the join
+    # itself leaves numero out.
+    cols_not_null = " AND ".join(f"{x}.{col} IS NOT NULL" for col in original_key_cols)
+    key_cols = [col for col in original_key_cols if col != "numero"]
+    join_condition = " AND ".join(f"{y}.{col} = {x}.{col}" for col in key_cols)
+    colunas_encontradas, additional_first, additional_second = _complete_weighted_columns(y, key_cols, resultado_completo)
+
+    # The REGEXP_REPLACE swaps the reference number in endereco_completo for
+    # the input number followed by "(aprox)".
+    # NOTE(review): the weight divides by ABS(numero - numero_cnefe), which
+    # is zero when both numbers are equal; confirm such rows cannot reach
+    # this step.
+    con.execute(
+        f"""
+        WITH temp_db AS (
+            SELECT {x}.tempidgeocodebr,
+                   {x}.numero,
+                   {y}.numero AS numero_cnefe,
+                   {y}.lat, {y}.lon,
+                   REGEXP_REPLACE({y}.endereco_completo, ', \\d+ -', CONCAT(', ', {x}.numero, ' (aprox) -')) AS endereco_encontrado,
+                   {y}.desvio_metros,
+                   {x}.log_causa_confusao,
+                   {y}.n_casos AS contagem_cnefe {additional_first}
+            FROM {x}
+            INNER JOIN {y}
+                ON {join_condition}
+            WHERE {cols_not_null}
+        )
+        INSERT INTO output_db (
+            tempidgeocodebr, lat, lon, endereco_encontrado, tipo_resultado,
+            desvio_metros, log_causa_confusao, contagem_cnefe {colunas_encontradas}
+        )
+        SELECT tempidgeocodebr,
+               SUM((1 / ABS(numero - numero_cnefe) * lat)) / SUM(1 / ABS(numero - numero_cnefe)) AS lat,
+               SUM((1 / ABS(numero - numero_cnefe) * lon)) / SUM(1 / ABS(numero - numero_cnefe)) AS lon,
+               FIRST(endereco_encontrado) AS endereco_encontrado,
+               '{match_type}' AS tipo_resultado,
+               AVG(desvio_metros) AS desvio_metros,
+               FIRST(log_causa_confusao) AS log_causa_confusao,
+               FIRST(contagem_cnefe) AS contagem_cnefe {additional_second}
+        FROM temp_db
+        GROUP BY tempidgeocodebr, endereco_encontrado
+        """
+    )
+    return update_input_db(con, update_tb=x, reference_tb=output_tb)
+
+
+def match_cases_probabilistic(
+    con: duckdb.DuckDBPyConnection,
+    x: str = "input_padrao_db",
+    output_tb: str = "output_db",
+    key_cols: list[str] | None = None,
+    match_type: str = "",
+    resultado_completo: bool = False,
+) -> int:
+    """Insert matches that join on ``temp_lograd_determ`` instead of ``logradouro``.
+
+    ``calculate_string_dist`` is run first against the unique-street table
+    for ``match_type``; the join then compares the reference ``logradouro``
+    with the input column ``temp_lograd_determ``.
+
+    Returns:
+        Whatever ``update_input_db`` returns for ``x`` and ``output_tb``.
+    """
+    y = get_reference_table(match_type)
+    # NOTE(review): the key_cols argument is ignored; keys come from match_type.
+    key_cols = get_key_cols(match_type)
+    register_cnefe_table(con, match_type)
+    unique_logradouros_tbl = register_unique_logradouros_table(con, match_type)
+    calculate_string_dist(con, match_type, unique_logradouros_tbl)
+
+    join_condition = " AND ".join(f"{y}.{col} = {x}.{col}" for col in key_cols)
+    # NOTE(review): this replacement matches the literal table name
+    # input_padrao_db, so it only takes effect when x keeps its default.
+    join_condition = join_condition.replace("input_padrao_db.logradouro", "input_padrao_db.temp_lograd_determ")
+    cols_not_null = " AND ".join(f"{x}.{col} IS NOT NULL" for col in key_cols)
+    cols_not_null = cols_not_null.replace(".logradouro", ".temp_lograd_determ")
+    colunas_encontradas, additional_cols = _complete_columns(y, key_cols, resultado_completo, probabilistic=True)
+
+    con.execute(
+        f"""
+        INSERT INTO output_db (
+            tempidgeocodebr, lat, lon, endereco_encontrado, tipo_resultado,
+            desvio_metros, log_causa_confusao, contagem_cnefe {colunas_encontradas}
+        )
+        SELECT {x}.tempidgeocodebr,
+               {y}.lat,
+               {y}.lon,
+               {y}.endereco_completo AS endereco_encontrado,
+               '{match_type}' AS tipo_resultado,
+               {y}.desvio_metros,
+               {x}.log_causa_confusao,
+               {y}.n_casos AS contagem_cnefe {additional_cols}
+        FROM {x}
+        INNER JOIN {y}
+            ON {join_condition}
+        WHERE {cols_not_null}
+        """
+    )
+    return update_input_db(con, update_tb=x, reference_tb=output_tb)
+
+
+def match_weighted_cases_probabilistic(
+    con: duckdb.DuckDBPyConnection,
+    x: str = "input_padrao_db",
+    output_tb: str = "output_db",
+    key_cols: list[str] | None = None,
+    match_type: str = "",
+    resultado_completo: bool = False,
+) -> int:
+    """Insert number-interpolated matches joined on ``temp_lograd_determ``.
+
+    Combines the two variants above: ``calculate_string_dist`` is run first,
+    the join uses ``temp_lograd_determ`` in place of ``logradouro`` and
+    leaves ``numero`` out, and the coordinates are averaged with weight
+    ``1 / ABS(numero - numero_cnefe)``. ``similaridade_logradouro`` is
+    always written to the output.
+
+    Returns:
+        Whatever ``update_input_db`` returns for ``x`` and ``output_tb``.
+    """
+    y = get_reference_table(match_type)
+    # NOTE(review): the key_cols argument is ignored; keys come from match_type.
+    original_key_cols = get_key_cols(match_type)
+    register_cnefe_table(con, match_type)
+    unique_logradouros_tbl = register_unique_logradouros_table(con, match_type)
+    calculate_string_dist(con, match_type, unique_logradouros_tbl)
+
+    cols_not_null = " AND ".join(f"{x}.{col} IS NOT NULL" for col in original_key_cols)
+    key_cols = [col for col in original_key_cols if col != "numero"]
+    join_condition = " AND ".join(f"{y}.{col} = {x}.{col}" for col in key_cols)
+    # NOTE(review): this replacement matches the literal table name
+    # input_padrao_db, so it only takes effect when x keeps its default.
+    join_condition = join_condition.replace("input_padrao_db.logradouro", "input_padrao_db.temp_lograd_determ")
+    cols_not_null_match = cols_not_null.replace(".logradouro", ".temp_lograd_determ")
+    colunas_encontradas, additional_first, additional_second = _complete_weighted_columns(y, key_cols, resultado_completo)
+
+    # NOTE(review): the weight divides by ABS(numero - numero_cnefe), which
+    # is zero when both numbers are equal; confirm such rows cannot reach
+    # this step.
+    con.execute(
+        f"""
+        WITH temp_db AS (
+            SELECT {x}.tempidgeocodebr,
+                   {x}.numero,
+                   {y}.numero AS numero_cnefe,
+                   {y}.lat, {y}.lon,
+                   REGEXP_REPLACE({y}.endereco_completo, ', \\d+ -', CONCAT(', ', {x}.numero, ' (aprox) -')) AS endereco_encontrado,
+                   {x}.similaridade_logradouro,
+                   {y}.desvio_metros,
+                   {x}.log_causa_confusao,
+                   {y}.n_casos AS contagem_cnefe {additional_first}
+            FROM {x}
+            INNER JOIN {y}
+                ON {join_condition}
+            WHERE {cols_not_null_match}
+        )
+        INSERT INTO output_db (
+            tempidgeocodebr, lat, lon, endereco_encontrado, tipo_resultado,
+            desvio_metros, log_causa_confusao, similaridade_logradouro, contagem_cnefe {colunas_encontradas}
+        )
+        SELECT tempidgeocodebr,
+               SUM((1 / ABS(numero - numero_cnefe) * lat)) / SUM(1 / ABS(numero - numero_cnefe)) AS lat,
+               SUM((1 / ABS(numero - numero_cnefe) * lon)) / SUM(1 / ABS(numero - numero_cnefe)) AS lon,
+               FIRST(endereco_encontrado) AS endereco_encontrado,
+               '{match_type}' AS tipo_resultado,
+               AVG(desvio_metros) AS desvio_metros,
+               FIRST(log_causa_confusao) AS log_causa_confusao,
+               FIRST(similaridade_logradouro) AS similaridade_logradouro,
+               FIRST(contagem_cnefe) AS contagem_cnefe {additional_second}
+        FROM temp_db
+        GROUP BY tempidgeocodebr, endereco_encontrado
+        """
+    )
+    return update_input_db(con, update_tb=x, reference_tb=output_tb)
+
+
+def select_match_function(match_type: str):
+    """Return the matching function responsible for ``match_type``.
+
+    Raises:
+        ValueError: if ``match_type`` belongs to none of the known groups.
+    """
+    # Checked in order; the first group containing match_type wins.
+    dispatch = (
+        ((NUMBER_EXACT_TYPES, EXACT_TYPES_NO_NUMBER), match_cases),
+        ((NUMBER_INTERPOLATION_TYPES,), match_weighted_cases),
+        ((PROBABILISTIC_EXACT_TYPES, PROBABILISTIC_TYPES_NO_NUMBER), match_cases_probabilistic),
+        ((PROBABILISTIC_INTERPOLATION_TYPES,), match_weighted_cases_probabilistic),
+    )
+    for groups, handler in dispatch:
+        if any(match_type in group for group in groups):
+            return handler
+    raise ValueError(f"match_type sem funcao: {match_type}")
+
+
+def trata_empates_geocode_duckdb(
+    con: duckdb.DuckDBPyConnection,
+    resultado_completo: bool,
+    resolver_empates: bool,
+    verboso: bool,
+) -> int:
+    """Detect and optionally resolve ties (several output rows per input id).
+
+    A tie is a ``tempidgeocodebr`` with more than one row in ``output_db``.
+    When ties exist the result is written to the temp table ``output_db2``;
+    when there are none, nothing is created.
+
+    Args:
+        con: open DuckDB connection holding ``output_db``.
+        resultado_completo: when true, the ``*_encontrado`` detail columns,
+            ``contagem_cnefe``, ``empate`` and ``cod_setor`` are carried
+            into ``output_db2``.
+        resolver_empates: when false, ties are only flagged in an ``empate``
+            column and every row is kept.
+        verboso: when true, prints a summary after resolving ties.
+
+    Returns:
+        The number of tied ids found (0 when there are none).
+    """
+    n_casos_empate = con.execute(
+        """
+        SELECT COUNT(*) AS n_casos_empate
+        FROM (
+            SELECT tempidgeocodebr
+            FROM output_db
+            GROUP BY tempidgeocodebr
+            HAVING COUNT(*) > 1
+        ) AS repeated
+        """
+    ).fetchone()[0]
+
+    if n_casos_empate == 0:
+        return 0
+
+    if not resolver_empates:
+        # Keep all rows and only flag the ids that appear more than once.
+        # NOTE(review): with resultado_completo, output_db already has an
+        # empate column, so SELECT * plus this alias yields the name twice;
+        # confirm how DuckDB names the result.
+        con.execute(
+            """
+            CREATE OR REPLACE TEMP TABLE output_db2 AS
+            SELECT *,
+                   (COUNT(*) OVER (PARTITION BY tempidgeocodebr) > 1) AS empate
+            FROM output_db
+            """
+        )
+        return n_casos_empate
+
+    # Haversine distance macro with a sphere radius of 6378137
+    # (presumably metres, given the dist_geocodebr_metros alias below).
+    con.execute(
+        """
+        CREATE MACRO IF NOT EXISTS haversine(lat1, lon1, lat2, lon2) AS (
+            6378137 * 2 * ASIN(
+                SQRT(
+                    POWER(SIN(RADIANS(lat2 - lat1) / 2), 2) +
+                    COS(RADIANS(lat1)) * COS(RADIANS(lat2)) *
+                    POWER(SIN(RADIANS(lon2 - lon1) / 2), 2)
+                )
+            )
+        )
+        """
+    )
+
+    # Extra column lists spliced into the CTEs (cols_encontradas) and into
+    # the final SELECTs (additional_cols_final); empty unless
+    # resultado_completo is set.
+    additional_cols_final = ""
+    cols_encontradas = ""
+    if resultado_completo:
+        additional_cols_final = """
+            , logradouro_encontrado, numero_encontrado, cep_encontrado,
+            localidade_encontrada, municipio_encontrado, estado_encontrado,
+            similaridade_logradouro, contagem_cnefe, empate, cod_setor
+        """
+        cols_encontradas = """
+            , logradouro_encontrado, numero_encontrado, cep_encontrado,
+            localidade_encontrada, municipio_encontrado, estado_encontrado,
+            similaridade_logradouro, cod_setor
+        """
+
+    # Pipeline of CTEs:
+    #   base                - flags tied ids and ranks rows within each id by
+    #                         contagem_cnefe DESC, desvio_metros,
+    #                         endereco_encontrado.
+    #   distd               - for tied rows, distance to the next-ranked row
+    #                         of the same id (NULL for the last row).
+    #   filtered            - keeps untied rows, the last row of each tie and
+    #                         tied rows more than 300 away from the next one;
+    #                         recomputes the tie flag and the max distance.
+    #   df_sem_empate       - ids left with a single row after filtering.
+    #   df_empates_perdidos - remaining ties that match the distance /
+    #                         confusion / street-name conditions; keeps the
+    #                         row with the highest contagem_cnefe.
+    #   empates_restantes   - tied ids in neither of the two sets above.
+    #   empates_wavg /
+    #   df_empates_salve    - those ids collapsed to one row whose lat/lon is
+    #                         the contagem_cnefe-weighted average.
+    con.execute(
+        f"""
+        CREATE OR REPLACE TEMP TABLE output_db2 AS
+        WITH
+        base AS (
+            SELECT *,
+                   (COUNT(*) OVER (PARTITION BY tempidgeocodebr) > 1) AS empate_inicial,
+                   ROW_NUMBER() OVER (
+                       PARTITION BY tempidgeocodebr
+                       ORDER BY contagem_cnefe DESC, desvio_metros, endereco_encontrado
+                   ) AS id
+            FROM output_db
+        ),
+        distd AS (
+            SELECT b.*,
+                   CASE WHEN empate_inicial THEN
+                       haversine(
+                           lat, lon,
+                           LEAD(lat) OVER (PARTITION BY tempidgeocodebr ORDER BY id),
+                           LEAD(lon) OVER (PARTITION BY tempidgeocodebr ORDER BY id)
+                       )
+                   END AS dist_geocodebr_metros
+            FROM base b
+        ),
+        filtered AS (
+            SELECT d.*,
+                   (COUNT(*) OVER (PARTITION BY tempidgeocodebr) > 1) AS empate,
+                   MAX(dist_geocodebr_metros) OVER (PARTITION BY tempidgeocodebr) AS max_dist
+            FROM distd d
+            WHERE (empate_inicial IS FALSE)
+               OR (empate_inicial AND dist_geocodebr_metros IS NULL)
+               OR (empate_inicial AND dist_geocodebr_metros > 300)
+        ),
+        df_sem_empate AS (
+            SELECT tempidgeocodebr, lat, lon, endereco_encontrado, tipo_resultado,
+                   contagem_cnefe, desvio_metros, empate {cols_encontradas}
+            FROM filtered
+            WHERE empate = FALSE
+        ),
+        df_empates_perdidos AS (
+            SELECT tempidgeocodebr, lat, lon, endereco_encontrado, tipo_resultado,
+                   contagem_cnefe, desvio_metros, TRUE AS empate {cols_encontradas}
+            FROM filtered
+            WHERE empate = TRUE
+              AND (
+                  max_dist > 1000
+                  OR log_causa_confusao
+                  OR REGEXP_MATCHES(endereco_encontrado,
+                      '(RUA (QUATRO|QUATORZE|QUINZE|DEZESSEIS|DEZESSETE|DEZOITO|DEZENOVE|VINTE|TRINTA|QUARENTA|CINQUENTA|SESSENTA|SETENTA|OITENTA|NOVENTA))'
+                  )
+              )
+              AND NOT REGEXP_MATCHES(logradouro_encontrado, '\\bDE (JANEIRO|FEVEREIRO|MARCO|ABRIL|MAIO|JUNHO|JULHO|AGOSTO|SETEMBRO|OUTUBRO|NOVEMBRO|DEZEMBRO)\\b')
+            QUALIFY ROW_NUMBER() OVER (PARTITION BY tempidgeocodebr ORDER BY contagem_cnefe DESC) = 1
+        ),
+        empates_restantes AS (
+            SELECT f.*
+            FROM filtered f
+            WHERE f.empate = TRUE
+              AND NOT EXISTS (SELECT 1 FROM df_sem_empate s WHERE s.tempidgeocodebr = f.tempidgeocodebr)
+              AND NOT EXISTS (SELECT 1 FROM df_empates_perdidos p WHERE p.tempidgeocodebr = f.tempidgeocodebr)
+        ),
+        empates_wavg AS (
+            SELECT e.*,
+                   (SUM(lat * contagem_cnefe) OVER (PARTITION BY tempidgeocodebr)
+                    / NULLIF(SUM(contagem_cnefe) OVER (PARTITION BY tempidgeocodebr), 0)) AS lat_wavg,
+                   (SUM(lon * contagem_cnefe) OVER (PARTITION BY tempidgeocodebr)
+                    / NULLIF(SUM(contagem_cnefe) OVER (PARTITION BY tempidgeocodebr), 0)) AS lon_wavg
+            FROM empates_restantes e
+        ),
+        df_empates_salve AS (
+            SELECT tempidgeocodebr, lat_wavg AS lat, lon_wavg AS lon,
+                   endereco_encontrado, tipo_resultado, contagem_cnefe,
+                   desvio_metros, TRUE AS empate {cols_encontradas}
+            FROM empates_wavg
+            QUALIFY ROW_NUMBER() OVER (PARTITION BY tempidgeocodebr ORDER BY contagem_cnefe DESC) = 1
+        )
+        SELECT tempidgeocodebr, lat, lon, tipo_resultado, desvio_metros,
+               endereco_encontrado {additional_cols_final}
+        FROM df_sem_empate
+        UNION ALL
+        SELECT tempidgeocodebr, lat, lon, tipo_resultado, desvio_metros,
+               endereco_encontrado {additional_cols_final}
+        FROM df_empates_perdidos
+        UNION ALL
+        SELECT tempidgeocodebr, lat, lon, tipo_resultado, desvio_metros,
+               endereco_encontrado {additional_cols_final}
+        FROM df_empates_salve
+        """
+    )
+
+    if verboso:
+        plural = "caso" if n_casos_empate == 1 else "casos"
+        print(f"Foram encontrados e resolvidos {n_casos_empate} {plural} de empate.")
+    return n_casos_empate
+
+
+def _complete_columns(
+    y: str,
+    key_cols: list[str],
+    resultado_completo: bool,
+    probabilistic: bool = False,
+) -> tuple[str, str]:
+    """Build the extra INSERT column list and the matching SELECT expressions.
+
+    Returns two empty strings unless ``resultado_completo`` is true.
+    Otherwise both strings start with ``", "`` and cover one
+    ``*_encontrado`` column per key column, optionally
+    ``similaridade_logradouro`` (when ``probabilistic``), and ``cod_setor``.
+    """
+    if not resultado_completo:
+        return "", ""
+
+    aliases = [_found_col_name(col) for col in key_cols]
+    insert_names = list(aliases)
+    select_exprs = [f"{y}.{col} AS {alias}" for col, alias in zip(key_cols, aliases)]
+    if probabilistic:
+        insert_names.append("similaridade_logradouro")
+        select_exprs.append("input_padrao_db.similaridade_logradouro AS similaridade_logradouro")
+    insert_names.append("cod_setor")
+    select_exprs.append(f"{y}.cod_setor AS cod_setor")
+    return f", {', '.join(insert_names)}", f", {', '.join(select_exprs)}"
+
+
+def _complete_weighted_columns(
+    y: str,
+    key_cols: list[str],
+    resultado_completo: bool,
+) -> tuple[str, str, str]:
+    """Build the extra column fragments used by the weighted match queries.
+
+    Returns three empty strings unless ``resultado_completo`` is true.
+    Otherwise returns, each prefixed with ``", "``: the INSERT column names,
+    the expressions selecting them from table ``y``, and the ``FIRST(...)``
+    aggregates over those aliases. ``cod_setor`` is always appended.
+    """
+    if not resultado_completo:
+        return "", "", ""
+
+    aliases = [_found_col_name(col) for col in key_cols]
+    insert_names = [*aliases, "cod_setor"]
+    source_exprs = [f"{y}.{col} AS {alias}" for col, alias in zip(key_cols, aliases)]
+    source_exprs.append(f"{y}.cod_setor AS cod_setor")
+    aggregate_exprs = [f"FIRST({alias}) AS {alias}" for alias in aliases]
+    aggregate_exprs.append("FIRST(cod_setor) AS cod_setor")
+    return (
+        f", {', '.join(insert_names)}",
+        f", {', '.join(source_exprs)}",
+        f", {', '.join(aggregate_exprs)}",
+    )
+
+
+def _found_col_name(col: str) -> str:
+    """Return the output column name for a key column.
+
+    ``localidade`` takes the feminine suffix ``_encontrada``; every other
+    column gets ``_encontrado``.
+    """
+    suffix = "_encontrada" if col == "localidade" else "_encontrado"
+    return f"{col}{suffix}"
diff --git a/python-package/src/geocodebr/messages.py b/python-package/src/geocodebr/messages.py
new file mode 100644
index 0000000..03aa234
--- /dev/null
+++ b/python-package/src/geocodebr/messages.py
@@ -0,0 +1,28 @@
+def inform(message: str, verboso: bool = True) -> None:
+    """Print ``message`` unless ``verboso`` is falsy."""
+    if not verboso:
+        return
+    print(message)
+
+
+def message_standardizing_addresses(verboso: bool = True) -> None:
+    """Announce that the input addresses are being standardized."""
+    inform("Padronizando enderecos de entrada", verboso=verboso)
+
+
+def message_baixando_cnefe(verboso: bool = True) -> None:
+    """Announce that CNEFE data is being downloaded."""
+    inform("Baixando dados do CNEFE", verboso=verboso)
+
+
+def message_usando_cnefe_local(verboso: bool = True) -> None:
+    """Announce that locally stored CNEFE data is being used."""
+    inform("Utilizando dados do CNEFE armazenados localmente", verboso=verboso)
+
+
+def message_looking_for_matches(verboso: bool = True) -> None:
+    """Announce that addresses are being geolocated."""
+    inform("Geolocalizando enderecos", verboso=verboso)
+
+
+def message_preparando_output(verboso: bool = True) -> None:
+    """Announce that the results are being prepared."""
+    inform("Preparando resultados", verboso=verboso)
+
+
+def message_cache(verboso: bool = True) -> None:
+    """Announce that there is no data in the local cache."""
+    inform("Nenhum dado em cache local", verboso=verboso)
+
diff --git a/python-package/src/geocodebr/reverse.py b/python-package/src/geocodebr/reverse.py
new file mode 100644
index 0000000..3cf0848
--- /dev/null
+++ b/python-package/src/geocodebr/reverse.py
@@ -0,0 +1,213 @@
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Any
+
+import duckdb
+import pyarrow as pa
+
+from .cache import listar_pasta_cache
+from .constants import DATA_RELEASE
+from .db import create_geocodebr_db
+from .download_cnefe import download_cnefe
+from .geocode import _register_input, _table_columns
+from .utils import check_clean_colnames, quote_ident
+
+def geocode_reverso(
+    pontos: Any,
+    dist_max: int = 1000,
+    verboso: bool = True,
+    cache: bool = True,
+    n_cores: int | None = None,
+) -> pa.Table:
+    """Return the nearest CNEFE address within dist_max metres of each input point.
+
+    Points with no address in range are omitted; ValueError is raised if none match.
+    """
+    if not isinstance(dist_max, (int, float)) or dist_max < 500 or dist_max > 100000:
+        raise ValueError("dist_max deve estar entre 500 e 100000 metros.")
+    if not isinstance(verboso, bool) or not isinstance(cache, bool):
+        raise TypeError("verboso e cache devem ser True ou False.")
+
+    download_cnefe("municipio_logradouro_numero_cep_localidade", verboso=verboso, cache=cache)
+    con = create_geocodebr_db(n_cores=n_cores, load_spatial=True)
+    try:
+        _register_points_input(con, pontos)
+        input_columns = _table_columns(con, "pontos_input")
+        check_clean_colnames(input_columns)
+        lon_col, lat_col = _detect_coordinate_columns(input_columns)
+
+        con.execute(
+            f"""
+            CREATE OR REPLACE TEMP TABLE pontos_db AS
+            SELECT *,
+                ROW_NUMBER() OVER ()::INTEGER AS tempidgeocodebr,
+                CAST({quote_ident(lon_col)} AS DOUBLE) AS _geocodebr_lon,
+                CAST({quote_ident(lat_col)} AS DOUBLE) AS _geocodebr_lat
+            FROM pontos_input
+            """
+        )
+        _validate_points_bbox(con)
+
+        bbox = con.execute(
+            """
+            SELECT
+                MIN(_geocodebr_lon), MIN(_geocodebr_lat),
+                MAX(_geocodebr_lon), MAX(_geocodebr_lat)
+            FROM pontos_db
+            """
+        ).fetchone()
+        margin = float(dist_max) / 111_320 + 0.05  # metres -> rough degrees, plus a fixed buffer
+        xmin, ymin, xmax, ymax = (
+            bbox[0] - margin,
+            bbox[1] - margin,
+            bbox[2] + margin,
+            bbox[3] + margin,
+        )
+
+        path_to_parquet = (
+            Path(listar_pasta_cache())
+            / f"geocodebr_data_release_{DATA_RELEASE}"
+            / "municipio_logradouro_numero_cep_localidade.parquet"
+        ).as_posix().replace("'", "''")  # escape quotes: the path goes inside a SQL string literal
+
+        con.execute(
+            f"""
+            CREATE OR REPLACE TEMP TABLE cnefe_tb AS
+            SELECT
+                estado, municipio, logradouro, numero, cep, localidade,
+                lon AS cnefe_lon,
+                lat AS cnefe_lat,
+                ST_Transform(
+                    ST_Point(CAST(lon AS DOUBLE), CAST(lat AS DOUBLE)),
+                    'EPSG:4674',
+                    'EPSG:31983',
+                    always_xy := true
+                ) AS cnefe_geom_utm
+            FROM read_parquet('{path_to_parquet}')
+            WHERE lon BETWEEN {xmin} AND {xmax}
+              AND lat BETWEEN {ymin} AND {ymax}
+            """
+        )
+        con.execute(
+            """
+            CREATE OR REPLACE TEMP TABLE pontos_utm AS
+            SELECT *,
+                ST_Transform(
+                    ST_Point(_geocodebr_lon, _geocodebr_lat),
+                    'EPSG:4674',
+                    'EPSG:31983',
+                    always_xy := true
+                ) AS ponto_geom_utm
+            FROM pontos_db
+            """
+        )
+
+        original_columns = [
+            col
+            for col in input_columns
+            if col not in {"_geocodebr_lon", "_geocodebr_lat", "tempidgeocodebr"}
+        ]
+        select_original = ", ".join(f"p.{quote_ident(col)}" for col in original_columns)
+        address_select = _address_select_clause(set(original_columns))
+        leading_comma = ", " if select_original else ""
+
+        con.execute(
+            f"""
+            CREATE OR REPLACE TEMP TABLE geocodebr_reverse_result AS
+            WITH ranked AS (
+                SELECT
+                    {select_original}{leading_comma}
+                    {address_select},
+                    c.cnefe_lon AS lon_encontrado,
+                    c.cnefe_lat AS lat_encontrado,
+                    ST_Distance(p.ponto_geom_utm, c.cnefe_geom_utm) AS distancia_metros,
+                    ROW_NUMBER() OVER (
+                        PARTITION BY p.tempidgeocodebr
+                        ORDER BY ST_Distance(p.ponto_geom_utm, c.cnefe_geom_utm)
+                    ) AS rn,
+                    p.tempidgeocodebr
+                FROM pontos_utm p
+                JOIN cnefe_tb c
+                  ON ST_DWithin(p.ponto_geom_utm, c.cnefe_geom_utm, {float(dist_max)})
+            )
+            SELECT * EXCLUDE (rn, tempidgeocodebr)
+            FROM ranked
+            WHERE rn = 1
+            ORDER BY tempidgeocodebr
+            """
+        )
+        n_rows = con.execute("SELECT COUNT(*) FROM geocodebr_reverse_result").fetchone()[0]
+        if n_rows == 0:
+            raise ValueError("Nenhum endereco proximo foi encontrado.")
+        return con.execute("SELECT * FROM geocodebr_reverse_result").to_arrow_table()
+    finally:
+        con.close()
+
+
+def _register_points_input(con: duckdb.DuckDBPyConnection, pontos: Any) -> None:
+    """Load pontos into the temp table pontos_input (GeoDataFrames get lon/lat columns)."""
+    if not _looks_like_geodataframe(pontos):
+        _register_input(con, pontos)
+        con.execute("CREATE OR REPLACE TEMP TABLE pontos_input AS SELECT * FROM enderecos_input")
+        return
+
+    if pontos.crs is None or pontos.crs.to_epsg() != 4674:
+        raise ValueError("Dados de input precisam estar em SIRGAS 2000, EPSG 4674.")
+    geometry = pontos.geometry
+    frame = pontos.drop(columns=[geometry.name]).copy()
+    frame["_geocodebr_lon"] = geometry.x
+    frame["_geocodebr_lat"] = geometry.y
+    con.register("pontos_input_view", frame)
+    con.execute("CREATE OR REPLACE TEMP TABLE pontos_input AS SELECT * FROM pontos_input_view")
+    con.unregister("pontos_input_view")
+
+
+def _looks_like_geodataframe(value: Any) -> bool:  # duck-typed: needs both .geometry and .crs
+    return all(hasattr(value, attr) for attr in ("geometry", "crs"))
+
+
+def _detect_coordinate_columns(columns: list[str]) -> tuple[str, str]:
+    """Return the first (lon, lat) name pair whose two columns are both in *columns*."""
+    available = set(columns)
+    # Checked in this order, so lon/lat wins over longitude/latitude, and so on.
+    pairs = (
+        ("lon", "lat"), ("longitude", "latitude"),
+        ("x", "y"), ("_geocodebr_lon", "_geocodebr_lat"),
+    )
+    match = next((pair for pair in pairs if available.issuperset(pair)), None)
+    if match is None:
+        raise ValueError("pontos deve ter colunas lon/lat, longitude/latitude, x/y ou ser um GeoDataFrame.")
+    return match
+
+
+def _validate_points_bbox(con: duckdb.DuckDBPyConnection) -> None:
+    """Raise ValueError unless all coordinates in pontos_db fall inside Brazil's bounding box.
+
+    MIN/MAX come back as NULL (None) for an empty table or all-NULL coordinates; that
+    case is rejected explicitly instead of crashing on a None comparison.
+    """
+    xmin, ymin, xmax, ymax = con.execute(
+        """
+        SELECT
+            MIN(_geocodebr_lon), MIN(_geocodebr_lat),
+            MAX(_geocodebr_lon), MAX(_geocodebr_lat)
+        FROM pontos_db
+        """
+    ).fetchone()
+    # No usable coordinate at all: nothing to validate or geocode.
+    if None in (xmin, ymin, xmax, ymax):
+        raise ValueError("pontos nao contem coordenadas validas.")
+
+    # Brazil bounding box as (xmin, ymin, xmax, ymax) in lon/lat.
+    br_xmin, br_ymin, br_xmax, br_ymax = -73.99044997, -33.75208127, -28.83594354, 5.27184108
+    if xmin < br_xmin or xmax > br_xmax or ymin < br_ymin or ymax > br_ymax:
+        raise ValueError("Coordenadas de input localizadas fora do bounding box do Brasil.")
+
+
+def _address_select_clause(original_columns: set[str]) -> str:
+    """Alias the CNEFE address columns, switching to the "found" name on a clash with the input."""
+    renamed = {"localidade": "localidade_encontrada"}  # same spelling the rest of the package uses
+    cols = ["estado", "municipio", "logradouro", "numero", "cep", "localidade"]
+    names = [renamed.get(col, f"{col}_encontrado") if col in original_columns else col for col in cols]
+    return ", ".join(f"c.{col} AS {name}" for col, name in zip(cols, names))
diff --git a/python-package/src/geocodebr/string_dist.py b/python-package/src/geocodebr/string_dist.py
new file mode 100644
index 0000000..9b85b50
--- /dev/null
+++ b/python-package/src/geocodebr/string_dist.py
@@ -0,0 +1,53 @@
+from __future__ import annotations
+
+import duckdb
+
+from .utils import get_key_cols, get_prob_match_cutoff, quote_ident
+
+
+def calculate_string_dist(
+    con: duckdb.DuckDBPyConnection,
+    match_type: str,
+    unique_logradouros_tbl: str,
+) -> None:
+    """Set temp_lograd_determ / similaridade_logradouro to the top Jaro match above the cutoff."""
+    key_cols = get_key_cols(match_type)
+    tbl = quote_ident(unique_logradouros_tbl)
+    cutoff = get_prob_match_cutoff(match_type)
+    not_null = " AND ".join(f"input_padrao_db.{col} IS NOT NULL" for col in key_cols)
+    join_on = " AND ".join(
+        f"{tbl}.{col} = input_padrao_db.{col}"
+        for col in key_cols if col not in {"numero", "logradouro"}
+    )
+    con.execute(
+        f"""
+        WITH to_compute AS (
+            SELECT
+                input_padrao_db.tempidgeocodebr,
+                input_padrao_db.logradouro AS logradouro_input,
+                {tbl}.logradouro AS logradouro_cnefe
+            FROM input_padrao_db
+            JOIN {tbl}
+              ON {join_on}
+            WHERE input_padrao_db.similaridade_logradouro IS NULL
+              AND input_padrao_db.log_causa_confusao = FALSE
+              AND {not_null}
+        ),
+        computed AS (
+            SELECT
+                tempidgeocodebr,
+                logradouro_cnefe,
+                CAST(jaro_similarity(logradouro_input, logradouro_cnefe) AS NUMERIC(5,3)) AS similarity,
+                RANK() OVER (PARTITION BY tempidgeocodebr ORDER BY similarity DESC, logradouro_cnefe) AS rank
+            FROM to_compute
+            WHERE similarity > {cutoff}
+        )
+        UPDATE input_padrao_db
+        SET temp_lograd_determ = computed.logradouro_cnefe,
+            similaridade_logradouro = similarity
+        FROM computed
+        WHERE input_padrao_db.tempidgeocodebr = computed.tempidgeocodebr
+          AND computed.rank = 1
+        """
+    )
+
diff --git a/python-package/src/geocodebr/tables.py b/python-package/src/geocodebr/tables.py
new file mode 100644
index 0000000..9f73b46
--- /dev/null
+++ b/python-package/src/geocodebr/tables.py
@@ -0,0 +1,92 @@
+from __future__ import annotations
+
+import duckdb
+
+from .cache import listar_dados_cache
+from .utils import find_cached_parquet, get_key_cols, get_reference_table, quote_ident
+
+
+def register_cnefe_table(con: duckdb.DuckDBPyConnection, match_type: str) -> bool:
+    """Create the CNEFE temp table for match_type from the cached parquet if absent; return True."""
+    cnefe_table_name = get_reference_table(match_type)
+    exists = con.execute(
+        "SELECT COUNT(*) FROM information_schema.tables WHERE table_name = ?", [cnefe_table_name]
+    ).fetchone()[0]
+    if exists:
+        return True
+    # Double any single quote: the path is embedded in a SQL string literal below.
+    path_to_parquet = find_cached_parquet(listar_dados_cache(), cnefe_table_name).replace("'", "''")
+    con.execute(
+        f"""
+        CREATE TEMP TABLE IF NOT EXISTS {quote_ident(cnefe_table_name)} AS
+        WITH unique_munis AS (
+            SELECT DISTINCT municipio FROM input_padrao_db
+        ),
+        unique_states AS (
+            SELECT DISTINCT estado FROM input_padrao_db
+        )
+        SELECT *
+        FROM read_parquet('{path_to_parquet}') m
+        WHERE m.estado IN (SELECT estado FROM unique_states)
+          AND m.municipio IN (SELECT municipio FROM unique_munis)
+        """
+    )
+    return True
+
+
+def register_unique_logradouros_table(con: duckdb.DuckDBPyConnection, match_type: str) -> str:
+    """Create, if absent, the temp table of unique street keys for match_type; return its name."""
+    key_cols = get_key_cols(match_type)
+    cnefe_table_name = (
+        "municipio_logradouro_localidade"
+        if match_type in {"pn03", "pa03", "pl03"}
+        else "municipio_logradouro_cep_localidade"
+    )
+    table_name = f"unique_logr_{cnefe_table_name}"
+    exists = con.execute(
+        "SELECT COUNT(*) FROM information_schema.tables WHERE table_name = ?", [table_name]
+    ).fetchone()[0]
+    if exists:
+        return table_name
+
+    select_cols = [col for col in key_cols if col != "numero"]
+    distinct = ""
+    if not (cnefe_table_name == "municipio_logradouro_localidade" or {"localidade", "cep"} <= set(select_cols)):
+        distinct = "DISTINCT"
+    select_cols_sql = ", ".join(quote_ident(col) for col in select_cols)
+
+    base_exists = con.execute(
+        "SELECT COUNT(*) FROM information_schema.tables WHERE table_name = ?",
+        [cnefe_table_name],
+    ).fetchone()[0]
+    if base_exists:
+        con.execute(
+            f"""
+            CREATE TEMP TABLE IF NOT EXISTS {quote_ident(table_name)} AS
+            WITH unique_munis AS (
+                SELECT DISTINCT municipio FROM input_padrao_db
+            ),
+            unique_states AS (
+                SELECT DISTINCT estado FROM input_padrao_db
+            )
+            SELECT {distinct} {select_cols_sql}
+            FROM {quote_ident(cnefe_table_name)}
+            WHERE estado IN (SELECT estado FROM unique_states)
+              AND municipio IN (SELECT municipio FROM unique_munis)
+            """
+        )
+    else:
+        path_to_parquet = find_cached_parquet(listar_dados_cache(), cnefe_table_name).replace("'", "''")  # SQL-escape
+        con.execute(
+            f"""
+            CREATE TEMP TABLE IF NOT EXISTS {quote_ident(table_name)} AS
+            WITH unique_munis AS (
+                SELECT DISTINCT municipio FROM input_padrao_db
+            )
+            SELECT {distinct} {select_cols_sql}
+            FROM read_parquet('{path_to_parquet}') m
+            WHERE m.municipio IN (SELECT municipio FROM unique_munis)
+            """
+        )
+    return table_name
+
diff --git a/python-package/src/geocodebr/utils.py b/python-package/src/geocodebr/utils.py
new file mode 100644
index 0000000..f6417c3
--- /dev/null
+++ b/python-package/src/geocodebr/utils.py
@@ -0,0 +1,201 @@
+from __future__ import annotations
+
+import re
+from pathlib import Path
+
+import duckdb
+
+from .constants import DATA_RELEASE
+
+
+def quote_ident(name: str) -> str:  # validates only: a plain identifier is returned unchanged, unquoted
+    if not re.fullmatch(r"[A-Za-z0-9_]+", name):  # fullmatch: "$" alone also accepts a trailing newline
+        raise ValueError(f"Nome SQL invalido: {name}")
+    return name
+
+
+def sql_string(value: str) -> str:  # single-quoted SQL literal with embedded quotes doubled
+    return "'{}'".format(value.replace("'", "''"))
+
+
+def check_clean_colnames(columns: list[str]) -> None:
+    """Raise ValueError listing every column name that is not purely letters, digits or "_"."""
+    if bad_cols := [col for col in columns if not re.fullmatch(r"[A-Za-z0-9_]+", col)]:
+        raise ValueError(
+            "Column names must use only letters, numbers, and underscores. "
+            f"Please rename: {bad_cols}"
+        )
+
+
+def get_key_cols(match_type: str) -> list[str]:
+    """Return the address columns used as the match key for *match_type*.
+
+    Raises ValueError for an unknown match type.
+    """
+    groups = [
+        # matches that include the house number
+        ({"dn01", "da01", "pn01", "pa01"}, ["logradouro", "numero", "cep", "localidade"]),
+        ({"dn02", "da02", "pn02", "pa02"}, ["logradouro", "numero", "cep"]),
+        ({"dn03", "da03", "pn03", "pa03"}, ["logradouro", "numero", "localidade"]),
+        ({"dn04", "da04", "pn04", "pa04"}, ["logradouro", "numero"]),
+        # matches on the street without the number
+        ({"dl01", "pl01"}, ["logradouro", "cep", "localidade"]),
+        ({"dl02", "pl02"}, ["logradouro", "cep"]),
+        ({"dl03", "pl03"}, ["logradouro", "localidade"]),
+        ({"dl04", "pl04"}, ["logradouro"]),
+        # matches without the street
+        ({"dc01"}, ["cep", "localidade"]),
+        ({"dc02"}, ["cep"]),
+        ({"db01"}, ["localidade"]),
+        ({"dm01"}, []),
+    ]
+    for codes, extra in groups:
+        if match_type in codes:
+            return ["estado", "municipio", *extra]
+    raise ValueError(f"match_type desconhecido: {match_type}")
+
+
+def get_reference_table(match_type: str) -> str:
+    """Return the name of the CNEFE table searched for *match_type*."""
+    # These match types read a fixed table instead of one named after their key columns.
+    overrides = {
+        "municipio_logradouro_numero_cep_localidade": {"dn02", "pn02", "da02", "pa02", "dn03", "pn03"},
+        "municipio_logradouro_numero_localidade": {"da03", "pa03", "dn04", "da04"},
+        "municipio_logradouro_cep_localidade": {"dl02", "pl02", "dl03", "pl03"},
+        "municipio_logradouro_localidade": {"dl04"},
+    }
+    key_cols = get_key_cols(match_type)  # also rejects unknown match types
+    for table_name, codes in overrides.items():
+        if match_type in codes:
+            return table_name
+    return "_".join(key_cols).replace("estado_municipio", "municipio")
+
+
+def get_prob_match_cutoff(match_type: str) -> float:  # 0.85 for pn01/pa01/pl01, 0.9 for anything else
+    return {"pn01": 0.85, "pa01": 0.85, "pl01": 0.85}.get(match_type, 0.9)
+
+
+def find_cached_parquet(cache_files: list[str], table_name: str) -> str:
+    """Return the first cached file named <table_name>.parquet whose path contains DATA_RELEASE.
+
+    Backslashes in the result become "/"; FileNotFoundError is raised when nothing matches.
+    """
+    suffix = f"{table_name}.parquet"
+    for file in cache_files:
+        if Path(file).name == suffix and DATA_RELEASE in str(file):
+            # str(): entries may be Path objects, which have no str-style .replace().
+            return str(file).replace("\\", "/")
+
+    raise FileNotFoundError(f"Arquivo {suffix} nao encontrado no cache. Execute download_cnefe().")
+
+
+def update_input_db(
+    con: duckdb.DuckDBPyConnection,
+    update_tb: str = "input_padrao_db",
+    reference_tb: str = "output_db",
+) -> int:
+    """Delete from update_tb the rows whose tempidgeocodebr is in reference_tb; return the count removed."""
+    target = quote_ident(update_tb)
+    count_sql = f"SELECT COUNT(*) FROM {target}"
+    n_before = con.execute(count_sql).fetchone()[0]
+    con.execute(
+        f"""
+        DELETE FROM {target}
+        WHERE tempidgeocodebr IN (SELECT tempidgeocodebr FROM {quote_ident(reference_tb)})
+        """
+    )
+    return n_before - con.execute(count_sql).fetchone()[0]
+
+
+def add_precision_col(con: duckdb.DuckDBPyConnection, update_tb: str) -> None:  # adds TEXT column precisao
+    update_tb = quote_ident(update_tb)  # validated once, reused in both statements
+    con.execute(f"ALTER TABLE {update_tb} ADD COLUMN precisao TEXT")
+    con.execute(  # precisao is derived from tipo_resultado; unlisted codes stay NULL
+        f"""
+        UPDATE {update_tb}
+        SET precisao = CASE
+            WHEN tipo_resultado IN ('dn01', 'dn02', 'dn03', 'dn04',
+                                    'pn01', 'pn02', 'pn03', 'pn04') THEN 'numero'
+            WHEN tipo_resultado IN ('da01', 'da02', 'da03', 'da04',
+                                    'pa01', 'pa02', 'pa03', 'pa04') THEN 'numero_aproximado'
+            WHEN tipo_resultado IN ('dl01', 'dl02', 'dl03', 'dl04',
+                                    'pl01', 'pl02', 'pl03', 'pl04') THEN 'logradouro'
+            WHEN tipo_resultado IN ('dc01', 'dc02') THEN 'cep'
+            WHEN tipo_resultado = 'db01' THEN 'localidade'
+            WHEN tipo_resultado = 'dm01' THEN 'municipio'
+            ELSE NULL
+        END
+        """
+    )
+
+
+def merge_results_to_input(
+    con: duckdb.DuckDBPyConnection,
+    x: str,
+    y: str,
+    select_columns: list[str],
+    resultado_completo: bool,
+) -> None:
+    """Left-join result table y onto input table x and store it as temp table geocodebr_result.
+
+    With resultado_completo, the extra "found" columns of y are also selected and NULL
+    similaridade_logradouro values in y are first replaced by 1.
+    """
+    found_cols = ["lat", "lon", "precisao", "tipo_resultado", "desvio_metros", "endereco_encontrado"]
+    if resultado_completo:
+        found_cols.extend(
+            [
+                "logradouro_encontrado",
+                "numero_encontrado",
+                "cep_encontrado",
+                "localidade_encontrada",
+                "municipio_encontrado",
+                "estado_encontrado",
+                "similaridade_logradouro",
+                "contagem_cnefe",
+                "empate",
+                "cod_setor",
+            ]
+        )
+        # Rows with no computed similarity are given a similarity of 1.
+        con.execute(
+            f"""
+            UPDATE {quote_ident(y)}
+            SET similaridade_logradouro = COALESCE(similaridade_logradouro, 1)
+            """
+        )
+
+    # Every identifier is validated by quote_ident before being placed in the SQL text.
+    select_x = ", ".join(f"{quote_ident(x)}.{quote_ident(col)}" for col in select_columns)
+    select_y = ", ".join(f"{quote_ident(y)}.{quote_ident(col)}" for col in found_cols)
+    con.execute(
+        f"""
+        CREATE OR REPLACE TEMP TABLE geocodebr_result AS
+        SELECT {select_x}, {select_y}
+        FROM {quote_ident(x)}
+        LEFT JOIN {quote_ident(y)}
+          ON {quote_ident(x)}.tempidgeocodebr = {quote_ident(y)}.tempidgeocodebr
+        ORDER BY {quote_ident(x)}.tempidgeocodebr
+        """
+    )
+
+
+def cria_col_logradouro_confusao(con: duckdb.DuckDBPyConnection) -> None:
+    """Add log_causa_confusao to input_padrao_db; flag code-like and "RUA <number word>" streets."""
+    con.execute("ALTER TABLE input_padrao_db ADD COLUMN log_causa_confusao BOOLEAN DEFAULT false")
+    # "QUATRO" was missing from this sequence, so "RUA QUATRO" was never flagged.
+    numeros = ["UM", "DOIS", "TRES", "QUATRO", "CINCO", "SEIS", "SETE", "OITO", "NOVE", "DEZ", "ONZE", "DOZE", "TREZE"]
+    ruas_num_ext = "|".join("RUA " + value for value in numeros)
+    con.execute(
+        rf"""
+        UPDATE input_padrao_db
+        SET log_causa_confusao = true
+        WHERE
+            (
+                REGEXP_MATCHES(logradouro, '^(RUA|TRAVESSA|RAMAL|BECO|BLOCO|AVENIDA|RODOVIA|ESTRADA)\s+([A-Z]{{1,2}}-?|[0-9]{{1,3}}|[A-Z]{{1,2}}-?[0-9]{{1,3}}|[A-Z]{{1,2}}\s+[0-9]{{1,3}}|[0-9]{{1,3}}-?[A-Z]{{1,2}})(\s+KM( \d+)?)?$')
+                OR REGEXP_MATCHES(logradouro, '({ruas_num_ext})$')
+            )
+            AND NOT REGEXP_MATCHES(logradouro, '\bDE (JANEIRO|FEVEREIRO|MARCO|ABRIL|MAIO|JUNHO|JULHO|AGOSTO|SETEMBRO|OUTUBRO|NOVEMBRO|DEZEMBRO)\b')
+        """
+    )
+
diff --git a/python-package/tests/conftest.py b/python-package/tests/conftest.py
new file mode 100644
index 0000000..0e0bb38
--- /dev/null
+++ b/python-package/tests/conftest.py
@@ -0,0 +1,19 @@
+from pathlib import Path
+
+import pytest
+
+from geocodebr.cache import listar_arquivo_config
+
+
+@pytest.fixture(autouse=True)
+def restore_cache_config():  # snapshot the cache config file, then restore it once the test is done
+    config_file = Path(listar_arquivo_config())
+    existed = config_file.exists()
+    content = config_file.read_text(encoding="utf-8") if existed else None
+    yield
+    if existed:  # put the previous content back
+        config_file.parent.mkdir(parents=True, exist_ok=True)
+        config_file.write_text(content, encoding="utf-8")
+    elif config_file.exists():  # no file beforehand: remove the one that appeared meanwhile
+        config_file.unlink()
+
diff --git a/python-package/tests/test_busca_por_cep.py b/python-package/tests/test_busca_por_cep.py
new file mode 100644
index 0000000..7958bb0
--- /dev/null
+++ b/python-package/tests/test_busca_por_cep.py
@@ -0,0 +1,32 @@
+from pathlib import Path
+
+import pyarrow as pa
+import pyarrow.parquet as pq
+
+from geocodebr import busca_por_cep, definir_pasta_cache
+from geocodebr.constants import DATA_RELEASE
+
+
+def test_busca_por_cep_duckdb_flow(tmp_path):  # CEP lookup against a two-row parquet fixture
+    definir_pasta_cache(str(tmp_path), verboso=False)
+    data_dir = tmp_path / f"geocodebr_data_release_{DATA_RELEASE}"
+    data_dir.mkdir()
+    table = pa.table(
+        {
+            "cep": ["70390025", "20071001"],
+            "estado": ["DF", "RJ"],
+            "municipio": ["BRASILIA", "RIO DE JANEIRO"],
+            "logradouro": ["AVENIDA TESTE", "RUA TESTE"],
+            "localidade": ["CENTRO", "CENTRO"],
+            "lon": [-47.9, -43.2],
+            "lat": [-15.8, -22.9],
+        }
+    )
+    pq.write_table(table, data_dir / "municipio_logradouro_cep_localidade.parquet")
+
+    out = busca_por_cep(["70390-025", "99999-999"], h3_res=3, verboso=False)  # 2nd CEP is not in the fixture
+
+    assert out.num_rows == 2
+    assert "h3_03" in out.schema.names
+    assert out.column("cep").to_pylist() == ["70390025", "99999999"]
+
diff --git a/python-package/tests/test_cache.py b/python-package/tests/test_cache.py
new file mode 100644
index 0000000..a810f03
--- /dev/null
+++ b/python-package/tests/test_cache.py
@@ -0,0 +1,13 @@
+from pathlib import Path
+
+from geocodebr import definir_pasta_cache, listar_dados_cache, listar_pasta_cache
+
+
+def test_cache_roundtrip(tmp_path):  # the cache dir set is the one reported, and its files are listed
+    assert definir_pasta_cache(str(tmp_path), verboso=False) == str(tmp_path)
+    assert listar_pasta_cache() == str(tmp_path)
+
+    (tmp_path / "a.parquet").write_text("", encoding="utf-8")
+    (tmp_path / "b.parquet").write_text("", encoding="utf-8")
+    assert [Path(path).name for path in listar_dados_cache()] == ["a.parquet", "b.parquet"]
+
diff --git a/python-package/tests/test_fields.py b/python-package/tests/test_fields.py
new file mode 100644
index 0000000..4c940d5
--- /dev/null
+++ b/python-package/tests/test_fields.py
@@ -0,0 +1,22 @@
+import pytest
+
+from geocodebr import definir_campos
+
+
+def test_definir_campos_preserves_public_names():  # checks key order and the value stored for "estado"
+    campos = definir_campos(
+        estado="uf",
+        municipio="cidade",
+        logradouro="rua",
+        numero="num",
+        cep="cep",
+        localidade="bairro",
+    )
+    assert list(campos) == ["logradouro", "numero", "cep", "localidade", "municipio", "estado"]
+    assert campos["estado"] == "uf"
+
+
+def test_definir_campos_rejects_non_string():  # a non-string field name must raise TypeError
+    with pytest.raises(TypeError):
+        definir_campos(estado="uf", municipio=1)
+
diff --git a/python-package/tests/test_geocode.py b/python-package/tests/test_geocode.py
new file mode 100644
index 0000000..d1e59d3
--- /dev/null
+++ b/python-package/tests/test_geocode.py
@@ -0,0 +1,55 @@
+import pyarrow as pa
+import pyarrow.parquet as pq
+
+from geocodebr import definir_campos, definir_pasta_cache, geocode
+from geocodebr.constants import ALL_CNEFE_FILES, DATA_RELEASE
+
+
+def test_geocode_exact_number_match_with_duckdb(tmp_path):  # one address, expected to match as dn01
+    definir_pasta_cache(str(tmp_path), verboso=False)
+    data_dir = tmp_path / f"geocodebr_data_release_{DATA_RELEASE}"
+    data_dir.mkdir()
+    cnefe = pa.table(
+        {
+            "estado": ["DF"],
+            "municipio": ["BRASILIA"],
+            "logradouro": ["AVENIDA TESTE"],
+            "numero": [100],
+            "cep": ["70000000"],
+            "localidade": ["CENTRO"],
+            "lon": [-47.9],
+            "lat": [-15.8],
+            "endereco_completo": ["AVENIDA TESTE, 100 - CENTRO, BRASILIA - DF"],
+            "desvio_metros": [10],
+            "n_casos": [1],
+            "cod_setor": ["001"],
+        }
+    )
+    for file in ALL_CNEFE_FILES:  # the same one-row table is written under every CNEFE file name
+        pq.write_table(cnefe, data_dir / file)
+
+    enderecos = pa.table(
+        {
+            "uf": ["Distrito Federal"],
+            "cidade": ["Brasilia"],
+            "rua": ["Avenida Teste"],
+            "num": ["100"],
+            "cep_in": ["70000-000"],
+            "bairro": ["Centro"],
+        }
+    )
+    campos = definir_campos(
+        estado="uf",
+        municipio="cidade",
+        logradouro="rua",
+        numero="num",
+        cep="cep_in",
+        localidade="bairro",
+    )
+
+    out = geocode(enderecos, campos, resultado_completo=True, h3_res=3, verboso=False)
+
+    assert out.num_rows == 1
+    assert out.column("tipo_resultado").to_pylist() == ["dn01"]
+    assert out.column("precisao").to_pylist() == ["numero"]
+    assert "h3_03" in out.schema.names
diff --git a/python-package/tests/test_geocode_reverso.py b/python-package/tests/test_geocode_reverso.py
new file mode 100644
index 0000000..db85329
--- /dev/null
+++ b/python-package/tests/test_geocode_reverso.py
@@ -0,0 +1,32 @@
+import pyarrow as pa
+import pyarrow.parquet as pq
+
+from geocodebr import definir_pasta_cache, geocode_reverso
+from geocodebr.constants import DATA_RELEASE
+
+
+def test_geocode_reverso_with_duckdb_spatial(tmp_path):  # the nearer of two fixture addresses must win
+    definir_pasta_cache(str(tmp_path), verboso=False)
+    data_dir = tmp_path / f"geocodebr_data_release_{DATA_RELEASE}"
+    data_dir.mkdir()
+    cnefe = pa.table(
+        {
+            "estado": ["DF", "DF"],
+            "municipio": ["BRASILIA", "BRASILIA"],
+            "logradouro": ["AVENIDA PROXIMA", "AVENIDA DISTANTE"],
+            "numero": [100, 200],
+            "cep": ["70000000", "70000001"],
+            "localidade": ["CENTRO", "CENTRO"],
+            "lon": [-47.9000, -48.5000],
+            "lat": [-15.8000, -16.3000],
+        }
+    )
+    pq.write_table(cnefe, data_dir / "municipio_logradouro_numero_cep_localidade.parquet")
+    pontos = pa.table({"id": [1], "lon": [-47.9001], "lat": [-15.8001]})  # next to the first fixture row
+
+    out = geocode_reverso(pontos, dist_max=1000, verboso=False)
+
+    assert out.num_rows == 1
+    assert out.column("logradouro").to_pylist() == ["AVENIDA PROXIMA"]
+    assert out.column("distancia_metros").to_pylist()[0] < 100
+
From 43ebfcebcddbd520d36cfa521bc5429dfa8b25e1 Mon Sep 17 00:00:00 2001
From: "ANJOS, J. S." <0rakul0render@gmail.com>
Date: Tue, 9 Jun 2026 08:54:29 -0300
Subject: [PATCH 2/3] testes passados.
---
.gitignore | 2 +
.idea/.gitignore | 10 +
.idea/copilot.data.migration.ask2agent.xml | 6 +
.idea/geocodebr.iml | 25 +
.idea/inspectionProfiles/Project_Default.xml | 88 ++
.../inspectionProfiles/profiles_settings.xml | 6 +
.idea/misc.xml | 7 +
.idea/modules.xml | 8 +
.idea/vcs.xml | 6 +
.../pyproject.toml => pyproject.toml | 18 +-
python-package/MIGRATION_PLAN.md | 184 ---
python-package/README.md | 18 +
python-package/exemple/busca_por_cep.py | 23 +
python-package/exemple/enderecos.csv | 6 +
python-package/exemple/geocode_enderecos.py | 47 +
python-package/exemple/geocode_reverso.py | 26 +
.../{src => }/geocodebr/__init__.py | 0
python-package/{src => }/geocodebr/cache.py | 0
.../{src => }/geocodebr/constants.py | 0
python-package/{src => }/geocodebr/db.py | 0
.../{src => }/geocodebr/download_cnefe.py | 0
python-package/{src => }/geocodebr/errors.py | 0
python-package/{src => }/geocodebr/fields.py | 0
python-package/{src => }/geocodebr/geocode.py | 10 +-
.../{src => }/geocodebr/matching.py | 2 +-
.../{src => }/geocodebr/messages.py | 0
python-package/{src => }/geocodebr/reverse.py | 0
.../{src => }/geocodebr/string_dist.py | 0
python-package/{src => }/geocodebr/tables.py | 0
python-package/{src => }/geocodebr/utils.py | 0
python-package/tests/test_busca_por_cep.py | 2 -
uv.lock | 1087 +++++++++++++++++
32 files changed, 1385 insertions(+), 196 deletions(-)
create mode 100644 .idea/.gitignore
create mode 100644 .idea/copilot.data.migration.ask2agent.xml
create mode 100644 .idea/geocodebr.iml
create mode 100644 .idea/inspectionProfiles/Project_Default.xml
create mode 100644 .idea/inspectionProfiles/profiles_settings.xml
create mode 100644 .idea/misc.xml
create mode 100644 .idea/modules.xml
create mode 100644 .idea/vcs.xml
rename python-package/pyproject.toml => pyproject.toml (74%)
delete mode 100644 python-package/MIGRATION_PLAN.md
create mode 100644 python-package/exemple/busca_por_cep.py
create mode 100644 python-package/exemple/enderecos.csv
create mode 100644 python-package/exemple/geocode_enderecos.py
create mode 100644 python-package/exemple/geocode_reverso.py
rename python-package/{src => }/geocodebr/__init__.py (100%)
rename python-package/{src => }/geocodebr/cache.py (100%)
rename python-package/{src => }/geocodebr/constants.py (100%)
rename python-package/{src => }/geocodebr/db.py (100%)
rename python-package/{src => }/geocodebr/download_cnefe.py (100%)
rename python-package/{src => }/geocodebr/errors.py (100%)
rename python-package/{src => }/geocodebr/fields.py (100%)
rename python-package/{src => }/geocodebr/geocode.py (97%)
rename python-package/{src => }/geocodebr/matching.py (99%)
rename python-package/{src => }/geocodebr/messages.py (100%)
rename python-package/{src => }/geocodebr/reverse.py (100%)
rename python-package/{src => }/geocodebr/string_dist.py (100%)
rename python-package/{src => }/geocodebr/tables.py (100%)
rename python-package/{src => }/geocodebr/utils.py (100%)
create mode 100644 uv.lock
diff --git a/.gitignore b/.gitignore
index 80314c3..4ba4c79 100644
--- a/.gitignore
+++ b/.gitignore
@@ -16,3 +16,5 @@ docs
/data_prep/data/*
/data_prep/data_raw/*
*.pyc
+__pycache__/
+.venv/
diff --git a/.idea/.gitignore b/.idea/.gitignore
new file mode 100644
index 0000000..0a8642f
--- /dev/null
+++ b/.idea/.gitignore
@@ -0,0 +1,10 @@
+# Default ignored files
+/shelf/
+/workspace.xml
+# Editor-based HTTP Client requests
+/httpRequests/
+# Datasource local storage ignored files
+/dataSources/
+/dataSources.local.xml
+# Zeppelin ignored files
+/ZeppelinRemoteNotebooks/
diff --git a/.idea/copilot.data.migration.ask2agent.xml b/.idea/copilot.data.migration.ask2agent.xml
new file mode 100644
index 0000000..1f2ea11
--- /dev/null
+++ b/.idea/copilot.data.migration.ask2agent.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/geocodebr.iml b/.idea/geocodebr.iml
new file mode 100644
index 0000000..819d626
--- /dev/null
+++ b/.idea/geocodebr.iml
@@ -0,0 +1,25 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml
new file mode 100644
index 0000000..d2ddf35
--- /dev/null
+++ b/.idea/inspectionProfiles/Project_Default.xml
@@ -0,0 +1,88 @@
+
+
+