From 1f2dc95d2d46f1366fc96759806b24585c7dbab7 Mon Sep 17 00:00:00 2001 From: root Date: Fri, 13 Mar 2026 16:12:04 -0400 Subject: [PATCH] Update README for v2 with new API, hex logo, and migration guide Co-Authored-By: Claude Opus 4.6 --- README.Rmd | 134 ++++++++++++++++--------- README.md | 290 +++++++++++++++++++++++++---------------------------- 2 files changed, 223 insertions(+), 201 deletions(-) diff --git a/README.Rmd b/README.Rmd index b38125d..b756eca 100644 --- a/README.Rmd +++ b/README.Rmd @@ -2,10 +2,7 @@ output: github_document --- - -[![R-CMD-check](https://github.com/UrbanInstitute/nccsdata/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/UrbanInstitute/nccsdata/actions/workflows/R-CMD-check.yaml) -[![test-coverage](https://github.com/UrbanInstitute/nccsdata/actions/workflows/test-coverage.yaml/badge.svg)](https://github.com/UrbanInstitute/nccsdata/actions/workflows/test-coverage.yaml) - + ```{r, include = FALSE} knitr::opts_chunk$set( @@ -16,74 +13,117 @@ knitr::opts_chunk$set( ) ``` -# nccsdata +# nccsdata nccsdata hex logo + + +[![R-CMD-check](https://github.com/UrbanInstitute/nccsdata/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/UrbanInstitute/nccsdata/actions/workflows/R-CMD-check.yaml) +[![test-coverage](https://github.com/UrbanInstitute/nccsdata/actions/workflows/test-coverage.yaml/badge.svg)](https://github.com/UrbanInstitute/nccsdata/actions/workflows/test-coverage.yaml) + -## Overview +nccsdata provides tools to download, filter, and analyze nonprofit organization data from the [National Center for Charitable Statistics](https://nccs.urban.org/) (NCCS). It reads IRS Business Master File (BMF) data stored as parquet files in a public S3 bucket, with support for predicate-pushdown filtering by state, county, NTEE subsector, and exempt organization type. -nccsdata provides tools to read, filter and append metadata to publicly available NCCS Core and BMF data sets. 
+> **Note:** This is version 2.0.0, a ground-up rewrite of the package. The v1 API (`get_data()`, `preview_sample()`, `parse_ntee()`) has been replaced. See the [migration section](#migrating-from-v1) below. ## Installation -You can install the development version of nccsdata from [GitHub](https://github.com/) with: +Install the development version from GitHub: -``` {r, message = FALSE, eval = FALSE} -install.packages("devtools") +```{r, eval = FALSE} +# install.packages("devtools") devtools::install_github("UrbanInstitute/nccsdata") -library(nccsdata) ``` ## Usage -### Data Pulls +### Reading BMF data -The [`nccsdata`](https://urbaninstitute.github.io/nccsdata/) package can be used to download legacy core data from 1989 to 2019 for charities, nonprofits, or private foundations that file their respective required IRS forms such as Form 990, 990EZs, or both. +`nccs_read()` downloads BMF data from S3 with optional filters. Filtering happens at the Arrow level via predicate pushdown, so only matching rows are read into memory. -This data can be filtered based on [NTEE](https://github.com/Nonprofit-Open-Data-Collective/mission-taxonomies/blob/main/NTEE-disaggregated/README.md) codes and geography. 
+```{r, eval = FALSE} +library(nccsdata) -```{r example, message=FALSE} -core_2005_nonprofit_pz <- nccsdata::get_data(dsname = "core", - time = "2005", - scope.orgtype = "NONPROFIT", - scope.formtype = "PZ") +# All Pennsylvania nonprofits (default columns) +pa <- nccs_read(state = "PA") +# Arts nonprofits in New York +ny_arts <- nccs_read(state = "NY", ntee_subsector = "ART") -tibble::as_tibble(core_2005_nonprofit_pz) +# Select specific columns +pa_slim <- nccs_read( + state = "PA", + columns = c("ein", "org_name_display", "geo_county", "income_amount") +) + +# Lazy query for custom dplyr pipelines +query <- nccs_read(state = "PA", collect = FALSE) +result <- query |> + dplyr::filter(geo_county == "Lackawanna County") |> + dplyr::collect() ``` -``` {r message = FALSE, warning = FALSE} -core_2005_artnonprofits_newyork <- nccsdata::get_data(dsname = "core", - time = "2016", - scope.orgtype = "NONPROFIT", - scope.formtype = "PZ", - ntee = "ART", - geo.state = "NY") -tibble::as_tibble(core_2005_artnonprofits_newyork) +### Summarizing data + +`nccs_summary()` produces grouped count summaries from a collected data frame. + +```{r, eval = FALSE} +pa <- nccs_read(state = "PA") + +# Total count +nccs_summary(pa) + +# Count by county +nccs_summary(pa, group_by = "geo_county") + +# Count by county and subsector, export to CSV +nccs_summary(pa, group_by = c("geo_county", "nteev2_subsector"), + output_csv = "pa_counts.csv") ``` - * Full [`get_data()`](https://urbaninstitute.github.io/nccsdata/articles/data_pull.html) vignette +### Discovering valid filter values + +`nccs_catalog()` lists valid values for `nccs_read()` filters without any network calls. + +```{r, eval = FALSE} +nccs_catalog("state") +nccs_catalog("ntee_subsector") +nccs_catalog("exempt_org_type") +``` -### Summarising Data +### Browsing the data dictionary -After processing the desired data, [`nccsdata`](https://urbaninstitute.github.io/nccsdata/) can also be used to -generate summary tables. 
+`nccs_dictionary()` returns a tibble describing all 97 BMF columns, with optional pattern filtering. -```{r message = FALSE, warning = FALSE} -nccsdata::preview_sample(data = core_2005_artnonprofits_newyork, - group_by = c("NTEECC", "STATE"), - var = c("TOTREV"), - stats = c("count", "mean", "max")) +```{r, eval = FALSE} +# All columns +nccs_dictionary() + +# Find geocoding-related columns +nccs_dictionary("geo") + +# Find NTEE-related columns +nccs_dictionary("ntee") ``` - * Full [`preview_sample()`](https://urbaninstitute.github.io/nccsdata/articles/summary_stats.html) vignette. - -### NTEE Codes - - [`nccsdata`](https://urbaninstitute.github.io/nccsdata/) also offers several - supplementary functions for documenting and retrieving [NTEE](https://github.com/Nonprofit-Open-Data-Collective/mission-taxonomies/blob/main/NTEE-disaggregated/README.md) codes. - - * Full [`ntee_preview()` and `parse_ntee()`](https://urbaninstitute.github.io/nccsdata/articles/ntee.html) vignette. +## Migrating from v1 + +| v1 function | v2 replacement | +|---|---| +| `get_data()` | `nccs_read()` | +| `preview_sample()` | `nccs_summary()` | +| `ntee_preview()` / `parse_ntee()` | `nccs_catalog("ntee_subsector")` | + +Key changes: + +- Data source moved from legacy Core/BMF CSVs to geocoded BMF parquet files on S3. +- Filtering now uses Arrow predicate pushdown instead of downloading full files. +- Dependencies reduced from 12 packages to 3 (`arrow`, `dplyr`, `utils`). + +## Documentation +Full documentation is available at <https://urbaninstitute.github.io/nccsdata/>. -## Getting Help +## Getting help -Raise an issue on the [issues](https://github.com/UrbanInstitute/nccsdata/issues) page or contact Thiyaghessan at `tpoongundranar@urban.org`. 
+- Browse the [getting started vignette](https://urbaninstitute.github.io/nccsdata/articles/getting-started.html) +- Open an issue on [GitHub](https://github.com/UrbanInstitute/nccsdata/issues) +- Contact the maintainer at `tpoongundranar@urban.org` diff --git a/README.md b/README.md index 87d295c..dd08236 100644 --- a/README.md +++ b/README.md @@ -1,154 +1,136 @@ - - - -[![R-CMD-check](https://github.com/UrbanInstitute/nccsdata/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/UrbanInstitute/nccsdata/actions/workflows/R-CMD-check.yaml) -[![test-coverage](https://github.com/UrbanInstitute/nccsdata/actions/workflows/test-coverage.yaml/badge.svg)](https://github.com/UrbanInstitute/nccsdata/actions/workflows/test-coverage.yaml) - - -# nccsdata - -## Overview - -nccsdata provides tools to read, filter and append metadata to publicly -available NCCS Core and BMF data sets. - -## Installation - -You can install the development version of nccsdata from -[GitHub](https://github.com/) with: - -``` r -install.packages("devtools") -devtools::install_github("UrbanInstitute/nccsdata") -library(nccsdata) -``` - -## Usage - -### Data Pulls - -The [`nccsdata`](https://urbaninstitute.github.io/nccsdata/) package can -be used to download legacy core data from 1989 to 2019 for charities, -nonprofits, or private foundations that file their respective required -IRS forms such as Form 990, 990EZs, or both. - -This data can be filtered based on -[NTEE](https://github.com/Nonprofit-Open-Data-Collective/mission-taxonomies/blob/main/NTEE-disaggregated/README.md) -codes and geography. - -``` r -core_2005_nonprofit_pz <- nccsdata::get_data(dsname = "core", - time = "2005", - scope.orgtype = "NONPROFIT", - scope.formtype = "PZ") -#> Requested files have a total size of 82.6 MB. Proceed -#> with download? 
Enter Y/N (Yes/no/cancel) - - -tibble::as_tibble(core_2005_nonprofit_pz) -#> # A tibble: 157,211 × 150 -#> NTEECC new.code type.org broad.category major.group univ hosp two.digit -#> -#> 1 J40 RG-HMS-J40 RG HMS J FALSE FALSE 40 -#> 2 W30 RG-PSB-W30 RG PSB W FALSE FALSE 30 -#> 3 W30 RG-PSB-W30 RG PSB W FALSE FALSE 30 -#> 4 W30 RG-PSB-W30 RG PSB W FALSE FALSE 30 -#> 5 W30 RG-PSB-W30 RG PSB W FALSE FALSE 30 -#> 6 Y42 RG-MMB-Y42 RG MMB Y FALSE FALSE 42 -#> 7 S41 RG-PSB-S41 RG PSB S FALSE FALSE 41 -#> 8 N60 RG-HMS-N60 RG HMS N FALSE FALSE 60 -#> 9 S41 RG-PSB-S41 RG PSB S FALSE FALSE 41 -#> 10 S41 RG-PSB-S41 RG PSB S FALSE FALSE 41 -#> # ℹ 157,201 more rows -#> # ℹ 142 more variables: further.category , division.subdivision , -#> # broad.category.description , major.group.description , -#> # code.name , division.subdivision.description , keywords , -#> # further.category.desciption , ntee2.code , EIN , -#> # TAXPER , STYEAR , CONT , DUES , SECUR , -#> # SALESEXP , INVINC , SOLICIT , GOODS , GRPROF , … -``` - -``` r -core_2005_artnonprofits_newyork <- nccsdata::get_data(dsname = "core", - time = "2016", - scope.orgtype = "NONPROFIT", - scope.formtype = "PZ", - ntee = "ART", - geo.state = "NY") -#> Requested files have a total size of 113.6 MB. Proceed -#> with download? 
Enter Y/N (Yes/no/cancel) -tibble::as_tibble(core_2005_artnonprofits_newyork) -#> # A tibble: 346 × 168 -#> NTEECC new.code type.org broad.category major.group univ hosp two.digit -#> -#> 1 A01 AA-ART-A00 AA ART A FALSE FALSE 1 -#> 2 A01 AA-ART-A00 AA ART A FALSE FALSE 1 -#> 3 A03 PA-ART-A00 PA ART A FALSE FALSE 3 -#> 4 A03 PA-ART-A00 PA ART A FALSE FALSE 3 -#> 5 A03 PA-ART-A00 PA ART A FALSE FALSE 3 -#> 6 A03 PA-ART-A00 PA ART A FALSE FALSE 3 -#> 7 A03 PA-ART-A00 PA ART A FALSE FALSE 3 -#> 8 A03 PA-ART-A00 PA ART A FALSE FALSE 3 -#> 9 A03 PA-ART-A00 PA ART A FALSE FALSE 3 -#> 10 A03 PA-ART-A00 PA ART A FALSE FALSE 3 -#> # ℹ 336 more rows -#> # ℹ 160 more variables: further.category , division.subdivision , -#> # broad.category.description , major.group.description , -#> # code.name , division.subdivision.description , keywords , -#> # further.category.desciption , ntee2.code , EIN , -#> # ACCPER , ACTIV1 , ACTIV2 , ACTIV3 , ADDRESS , -#> # AFCD , ASS_BOY , ASS_EOY , BOND_BOY , BOND_EOY , … -``` - -- Full - [`get_data()`](https://urbaninstitute.github.io/nccsdata/articles/data_pull.html) - vignette - -### Summarising Data - -After processing the desired data, -[`nccsdata`](https://urbaninstitute.github.io/nccsdata/) can also be -used to generate summary tables. - -``` r -nccsdata::preview_sample(data = core_2005_artnonprofits_newyork, - group_by = c("NTEECC", "STATE"), - var = c("TOTREV"), - stats = c("count", "mean", "max")) -#> # A tibble: 29 × 5 -#> # Groups: NTEECC [29] -#> NTEECC STATE count mean max -#> -#> 1 A01 NY 2 77734 151889 -#> 2 A03 NY 14 924422 9222403 -#> 3 A11 NY 2 762752 1485739 -#> 4 A19 NY 1 50300 50300 -#> 5 A20 NY 5 236863 711793 -#> 6 A23 NY 112 64597 758835 -#> 7 A30 NY 26 810942 4974965 -#> 8 A31 NY 3 1389737 2142396 -#> 9 A32 NY 7 759395 3154923 -#> 10 A33 NY 15 329638 828684 -#> # ℹ 19 more rows -``` - -- Full - [`preview_sample()`](https://urbaninstitute.github.io/nccsdata/articles/summary_stats.html) - vignette. 
- -### NTEE Codes - -[`nccsdata`](https://urbaninstitute.github.io/nccsdata/) also offers -several supplementary functions for documenting and retrieving -[NTEE](https://github.com/Nonprofit-Open-Data-Collective/mission-taxonomies/blob/main/NTEE-disaggregated/README.md) -codes. - -- Full [`ntee_preview()` and - `parse_ntee()`](https://urbaninstitute.github.io/nccsdata/articles/ntee.html) - vignette. - -## Getting Help - -Raise an issue on the -[issues](https://github.com/UrbanInstitute/nccsdata/issues) page or -contact Thiyaghessan at `tpoongundranar@urban.org`. + + + +# nccsdata nccsdata hex logo + + + +[![R-CMD-check](https://github.com/UrbanInstitute/nccsdata/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/UrbanInstitute/nccsdata/actions/workflows/R-CMD-check.yaml) +[![test-coverage](https://github.com/UrbanInstitute/nccsdata/actions/workflows/test-coverage.yaml/badge.svg)](https://github.com/UrbanInstitute/nccsdata/actions/workflows/test-coverage.yaml) + + +nccsdata provides tools to download, filter, and analyze nonprofit +organization data from the [National Center for Charitable +Statistics](https://nccs.urban.org/) (NCCS). It reads IRS Business +Master File (BMF) data stored as parquet files in a public S3 bucket, +with support for predicate-pushdown filtering by state, county, NTEE +subsector, and exempt organization type. + +> **Note:** This is version 2.0.0, a ground-up rewrite of the package. +> The v1 API (`get_data()`, `preview_sample()`, `parse_ntee()`) has been +> replaced. See the [migration section](#migrating-from-v1) below. + +## Installation + +Install the development version from GitHub: + +``` r +# install.packages("devtools") +devtools::install_github("UrbanInstitute/nccsdata") +``` + +## Usage + +### Reading BMF data + +`nccs_read()` downloads BMF data from S3 with optional filters. +Filtering happens at the Arrow level via predicate pushdown, so only +matching rows are read into memory. 
+ +``` r +library(nccsdata) + +# All Pennsylvania nonprofits (default columns) +pa <- nccs_read(state = "PA") + +# Arts nonprofits in New York +ny_arts <- nccs_read(state = "NY", ntee_subsector = "ART") + +# Select specific columns +pa_slim <- nccs_read( + state = "PA", + columns = c("ein", "org_name_display", "geo_county", "income_amount") +) + +# Lazy query for custom dplyr pipelines +query <- nccs_read(state = "PA", collect = FALSE) +result <- query |> + dplyr::filter(geo_county == "Lackawanna County") |> + dplyr::collect() +``` + +### Summarizing data + +`nccs_summary()` produces grouped count summaries from a collected data +frame. + +``` r +pa <- nccs_read(state = "PA") + +# Total count +nccs_summary(pa) + +# Count by county +nccs_summary(pa, group_by = "geo_county") + +# Count by county and subsector, export to CSV +nccs_summary(pa, group_by = c("geo_county", "nteev2_subsector"), + output_csv = "pa_counts.csv") +``` + +### Discovering valid filter values + +`nccs_catalog()` lists valid values for `nccs_read()` filters without +any network calls. + +``` r +nccs_catalog("state") +nccs_catalog("ntee_subsector") +nccs_catalog("exempt_org_type") +``` + +### Browsing the data dictionary + +`nccs_dictionary()` returns a tibble describing all 97 BMF columns, with +optional pattern filtering. + +``` r +# All columns +nccs_dictionary() + +# Find geocoding-related columns +nccs_dictionary("geo") + +# Find NTEE-related columns +nccs_dictionary("ntee") +``` + +## Migrating from v1 + +| v1 function | v2 replacement | +|-----------------------------------|----------------------------------| +| `get_data()` | `nccs_read()` | +| `preview_sample()` | `nccs_summary()` | +| `ntee_preview()` / `parse_ntee()` | `nccs_catalog("ntee_subsector")` | + +Key changes: + +- Data source moved from legacy Core/BMF CSVs to geocoded BMF parquet + files on S3. +- Filtering now uses Arrow predicate pushdown instead of downloading + full files. 
+- Dependencies reduced from 12 packages to 3 (`arrow`, `dplyr`, + `utils`). + +## Documentation + +Full documentation is available at +<https://urbaninstitute.github.io/nccsdata/>. + +## Getting help + +- Browse the [getting started + vignette](https://urbaninstitute.github.io/nccsdata/articles/getting-started.html) +- Open an issue on + [GitHub](https://github.com/UrbanInstitute/nccsdata/issues) +- Contact the maintainer at `tpoongundranar@urban.org`