From abda1616b38f593a85e2a4bc5f28748cf88a4d8a Mon Sep 17 00:00:00 2001 From: Lawrence Sinclair Date: Mon, 11 May 2026 00:14:14 +0700 Subject: [PATCH] jenner-check: add 5 Jenner compatibility bundles + runner Each bundle pulls one "before/after" pattern out of "SET statement considered harmful.sas" (PharmaSUG 2025 paper) and turns it into a self-contained pass/fail test against the Jenner SAS runtime. t001_set_split_silly_vs_smart 3 WHERE-passes vs 1 SELECT/OUTPUT pass t002_merge_in_categorize SQL JOINs replaced by MERGE + IN= flags t003_by_group_range_shift FIRST./LAST. + RETAIN aggregate, MERGE back t004_format_dataset_modify DATA-step format vs PROC DATASETS MODIFY t005_merge_collapse_interim wandering vs straight MERGE Each bundle has script.sas, autoexec.sas, expected.json (frozen from a successful run against api.jenneranalytics.com/v1/run), and meta.json (provenance: source file + blob sha + commit). Includes runner files (run_jenner.sas / .bat / .sh) so the bundles can be replayed via SAS or curl with no further setup. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- jenner-check/run_jenner.bat | 43 ++ jenner-check/run_jenner.sas | 526 ++++++++++++++++++ jenner-check/run_jenner.sh | 214 +++++++ .../autoexec.sas | 4 + .../expected.json | 33 ++ .../t001_set_split_silly_vs_smart/meta.json | 8 + .../t001_set_split_silly_vs_smart/script.sas | 46 ++ .../t002_merge_in_categorize/autoexec.sas | 4 + .../t002_merge_in_categorize/expected.json | 31 ++ .../t002_merge_in_categorize/meta.json | 8 + .../t002_merge_in_categorize/script.sas | 37 ++ .../t003_by_group_range_shift/autoexec.sas | 4 + .../t003_by_group_range_shift/expected.json | 34 ++ .../t003_by_group_range_shift/meta.json | 8 + .../t003_by_group_range_shift/script.sas | 62 +++ .../t004_format_dataset_modify/autoexec.sas | 4 + .../t004_format_dataset_modify/expected.json | 32 ++ .../t004_format_dataset_modify/meta.json | 8 + .../t004_format_dataset_modify/script.sas | 36 ++ .../t005_merge_collapse_interim/autoexec.sas | 4 + .../t005_merge_collapse_interim/expected.json | 26 + .../t005_merge_collapse_interim/meta.json | 8 + .../t005_merge_collapse_interim/script.sas | 53 ++ 23 files changed, 1233 insertions(+) create mode 100644 jenner-check/run_jenner.bat create mode 100644 jenner-check/run_jenner.sas create mode 100755 jenner-check/run_jenner.sh create mode 100644 jenner-check/t001_set_split_silly_vs_smart/autoexec.sas create mode 100644 jenner-check/t001_set_split_silly_vs_smart/expected.json create mode 100644 jenner-check/t001_set_split_silly_vs_smart/meta.json create mode 100644 jenner-check/t001_set_split_silly_vs_smart/script.sas create mode 100644 jenner-check/t002_merge_in_categorize/autoexec.sas create mode 100644 jenner-check/t002_merge_in_categorize/expected.json create mode 100644 jenner-check/t002_merge_in_categorize/meta.json create mode 100644 jenner-check/t002_merge_in_categorize/script.sas create mode 100644 jenner-check/t003_by_group_range_shift/autoexec.sas create mode 100644 
jenner-check/t003_by_group_range_shift/expected.json create mode 100644 jenner-check/t003_by_group_range_shift/meta.json create mode 100644 jenner-check/t003_by_group_range_shift/script.sas create mode 100644 jenner-check/t004_format_dataset_modify/autoexec.sas create mode 100644 jenner-check/t004_format_dataset_modify/expected.json create mode 100644 jenner-check/t004_format_dataset_modify/meta.json create mode 100644 jenner-check/t004_format_dataset_modify/script.sas create mode 100644 jenner-check/t005_merge_collapse_interim/autoexec.sas create mode 100644 jenner-check/t005_merge_collapse_interim/expected.json create mode 100644 jenner-check/t005_merge_collapse_interim/meta.json create mode 100644 jenner-check/t005_merge_collapse_interim/script.sas diff --git a/jenner-check/run_jenner.bat b/jenner-check/run_jenner.bat new file mode 100644 index 0000000..1039fdf --- /dev/null +++ b/jenner-check/run_jenner.bat @@ -0,0 +1,43 @@ +@echo off +rem run_jenner.bat - Windows runner for Jenner compatibility checks. +rem +rem Usage: run_jenner.bat <script.sas> [response.json] +rem +rem Submits a single .sas file to api.jenneranalytics.com. For +rem bundle-aware mode (autoexec.sas + script.sas concatenation) on +rem Windows, use WSL and invoke run_jenner.sh instead, or wait for the +rem Windows CI runner that will validate a bundle-aware .bat. +rem +rem Output: response.json contains the API response. Read it back in SAS: +rem filename resp 'response.json'; +rem libname resp JSON fileref=resp; +rem proc print data=resp.root; run; +rem +rem Requires: curl.exe (ships with Windows 10+ at C:\Windows\System32). 
+ +setlocal + +if "%~1"=="" ( + echo Usage: %~nx0 ^<script.sas^> [response.json] + exit /b 2 +) + +set "SCRIPT=%~1" +set "OUT=%~2" +if "%OUT%"=="" set "OUT=response.json" + +set "HOST=api.jenneranalytics.com" + +curl.exe -sS -X POST "https://%HOST%/v1/run" ^ + -F "script=@%SCRIPT%;type=application/x-sas" ^ + -F "deterministic=1" ^ + -F "timeout=60" ^ + -o "%OUT%" + +if errorlevel 1 ( + echo curl failed with errorlevel %errorlevel% + exit /b 1 +) + +echo Response written to %OUT% +exit /b 0 diff --git a/jenner-check/run_jenner.sas b/jenner-check/run_jenner.sas new file mode 100644 index 0000000..550e8f8 --- /dev/null +++ b/jenner-check/run_jenner.sas @@ -0,0 +1,526 @@ +/* run_jenner.sas — invoke api.jenneranalytics.com from base SAS. + * + * Requires SAS 9.4 M5 or later (PROC HTTP + libname JSON engine). + * + * --------------------------------------------------------------------------- + * TL;DR for SAS users: + * + * %include 'run_jenner.sas'; + * %jenner_run(script=my_program.sas); / * one script * / + * %jenner_check_all(); / * whole bundle dir * / + * + * --------------------------------------------------------------------------- + * What this file gives you: + * + * %jenner_run — POST one .sas file to the Jenner API, display the + * log + listing + any generated files. + * %jenner_check_all — walk every jenner-check/tNNN_* bundle, + * invoke the API for each, compare the response to + * the bundle's expected.json, produce a summary + * CSV + SAS dataset the repo owner can attach to the + * jenner-check PR. + * + * --------------------------------------------------------------------------- + * How the API call is built: + * + * POST https://api.jenneranalytics.com/v1/run + * Content-Type: multipart/form-data; boundary=... 
+ * + * fields: + * script the .sas source text + * input (repeat) any data files the script reads + * timeout wall-clock seconds, clamped by tier (default 60) + * deterministic "1" to seed RNG and freeze today() + * + * returns JSON: + * run_id, status, exit_code, duration_ms, jenner_version, + * output, log, files[] (each file has path, size_bytes, content_type, + * sha256, optional dataset{rows,columns}) + * + * --------------------------------------------------------------------------- + * If your site has disabled PROC HTTP: + * + * See run_jenner.bat (Windows) or run_jenner.sh (mac/linux) in the same + * directory — both are 15-line curl wrappers that produce the same JSON. + * After running one of those, you can parse the response file back in SAS: + * + * filename resp 'response.json'; + * libname resp JSON fileref=resp; + * proc print data=resp.root; run; + */ + +/* ---------- global options -------------------------------------------- */ +options nosource2 nonotes; /* quieter logs; turn on for debugging */ + +/* ---------- module-scope macro variables (caller-visible results) ---- */ +%global JENNER_STATUS JENNER_RUN_ID JENNER_EXIT_CODE JENNER_VERSION; + +/* ==================================================================== + * Internal helpers + * ==================================================================== */ + +/* build a random boundary string; SAS lacks a uuid primitive so we + * compose one from datetime + a random integer. */ +%macro _jc_boundary; + jc_%sysfunc(compress(%sysfunc(datetime(), b8601dt.), -:.))_%sysfunc(ranuni(0),hex6.) +%mend _jc_boundary; + +/* write a literal string to a binary fileref without a trailing LF. */ +%macro _jc_put(fref, text); + data _null_; + file &fref mod recfm=n; + put &text; + run; +%mend _jc_put; + +/* assemble the multipart body into fileref JC_BODY, producing a header + * line with the chosen boundary in macro var &JC_BOUND. Inputs is a + * space-separated list of file paths. 
+ * + * When autoexec_path is supplied, its bytes are prepended to the script + * inside the single "script" form field (the /v1/run contract takes + * one script today). A newline separates the two so statements don't + * run together. */ +%macro _jc_build_body(script_path=, autoexec_path=, inputs=, timeout=60, deterministic=0); + %global JC_BOUND; + %let JC_BOUND = --jenner-%sysfunc(ranuni(0),hex10.)--; + + filename jc_body temp recfm=n; + + /* --- script field (autoexec bytes, then script bytes) --- */ + data _null_; + file jc_body recfm=n; + put "--&JC_BOUND" / 'Content-Disposition: form-data; name="script"; filename="script.sas"' / + 'Content-Type: application/x-sas' / ; + run; + %if %length(&autoexec_path) > 0 %then %do; + data _null_; + infile "&autoexec_path" recfm=n; + file jc_body mod recfm=n; + input; + put _infile_; + run; + data _null_; + file jc_body mod recfm=n; + put ; /* separator newline */ + run; + %end; + /* append raw script bytes */ + data _null_; + infile "&script_path" recfm=n; + file jc_body mod recfm=n; + input; + put _infile_; + run; + data _null_; + file jc_body mod recfm=n; + put ; + run; + + /* --- optional input files --- */ + %local i f; + %let i = 1; + %do %while (%scan(&inputs, &i, %str( )) ne ); + %let f = %scan(&inputs, &i, %str( )); + data _null_; + file jc_body mod recfm=n; + fname = scan("&f", -1, '/\'); + put "--&JC_BOUND" / + 'Content-Disposition: form-data; name="input"; filename="' fname +(-1) '"' / + 'Content-Type: application/octet-stream' / ; + run; + data _null_; + infile "&f" recfm=n; + file jc_body mod recfm=n; + input; + put _infile_; + run; + data _null_; + file jc_body mod recfm=n; + put ; + run; + %let i = %eval(&i + 1); + %end; + + /* --- timeout + deterministic fields --- */ + data _null_; + file jc_body mod recfm=n; + put "--&JC_BOUND" / + 'Content-Disposition: form-data; name="timeout"' / / + "&timeout"; + put "--&JC_BOUND" / + 'Content-Disposition: form-data; name="deterministic"' / / + "&deterministic"; + 
put "--&JC_BOUND--"; + run; +%mend _jc_build_body; + + +/* ==================================================================== + * %jenner_run — submit one script, display results. + * ==================================================================== */ +%macro jenner_run( + script=, + autoexec=, + inputs=, + host=api.jenneranalytics.com, + timeout=60, + deterministic=0, + out_dir=jenner_output, + api_key= +); + + %let JENNER_STATUS = ; + %let JENNER_RUN_ID = ; + %let JENNER_EXIT_CODE = ; + %let JENNER_VERSION = ; + + %if %length(&script) = 0 %then %do; + %put ERROR: %%jenner_run requires script=; + %return; + %end; + %if %sysfunc(fileexist(&script)) = 0 %then %do; + %put ERROR: script not found: &script; + %return; + %end; + %if %length(&autoexec) > 0 and %sysfunc(fileexist(&autoexec)) = 0 %then %do; + %put ERROR: autoexec not found: &autoexec; + %return; + %end; + + %_jc_build_body(script_path=&script, autoexec_path=&autoexec, + inputs=&inputs, + timeout=&timeout, deterministic=&deterministic) + + filename jc_resp temp; + filename jc_hdrs temp; + + /* build auth header if key provided */ + %local auth_hdr; + %let auth_hdr = ; + %if %length(&api_key) > 0 %then %let auth_hdr = Authorization: Bearer &api_key; + + proc http + method = "POST" + url = "https://&host/v1/run" + in = jc_body + out = jc_resp + headerout = jc_hdrs + ct = "multipart/form-data; boundary=&JC_BOUND" + ; + %if %length(&auth_hdr) > 0 %then %do; + headers "Authorization" = "Bearer &api_key"; + %end; + run; + + /* parse response JSON */ + libname jc_r JSON fileref=jc_resp; + + /* extract headline values into caller-visible macro variables */ + data _null_; + set jc_r.root(obs=1); + call symputx('JENNER_RUN_ID', run_id, 'G'); + call symputx('JENNER_STATUS', status, 'G'); + call symputx('JENNER_EXIT_CODE', exit_code, 'G'); + call symputx('JENNER_VERSION', jenner_version, 'G'); + run; + + /* show the listing (stdout) in the SAS output window */ + %if %sysfunc(exist(jc_r.root)) %then %do; + data 
_null_; + set jc_r.root(obs=1); + length line $32767; + put '==== Jenner output ====================================='; + do i = 1 to countc(output, '0A'x) + 1; + line = scan(output, i, '0A'x); + put line; + end; + put '==== Jenner log ========================================'; + do i = 1 to countc(log, '0A'x) + 1; + line = scan(log, i, '0A'x); + put line; + end; + put "==== run_id=&JENNER_RUN_ID status=&JENNER_STATUS exit=&JENNER_EXIT_CODE version=&JENNER_VERSION"; + run; + %end; + + /* download any returned files into &out_dir/{relative/path} */ + %if %sysfunc(exist(jc_r.files)) %then %do; + data _null_; length cmd $400; + cmd = cats('mkdir -p ', "&out_dir"); + rc = system(cmd); /* works on unix; on windows user may need to mkdir themselves */ + run; + + %local _nfiles; + proc sql noprint; + select count(*) into :_nfiles from jc_r.files; + quit; + + %local i fpath furl; + %do i = 1 %to &_nfiles; + data _null_; + set jc_r.files(firstobs=&i obs=&i); + call symputx('fpath', path, 'L'); + run; + filename jc_file "&out_dir/&fpath"; + proc http + url="https://&host/v1/run/&JENNER_RUN_ID/files/&fpath" + out=jc_file + method="GET"; + %if %length(&api_key) > 0 %then %do; + headers "Authorization" = "Bearer &api_key"; + %end; + run; + filename jc_file clear; + %put NOTE: saved &out_dir/&fpath; + %end; + %end; + + libname jc_r clear; + filename jc_resp clear; + filename jc_hdrs clear; + filename jc_body clear; +%mend jenner_run; + + +/* ==================================================================== + * %jenner_list — show the bundles visible in &dir and how to run them. + * Called automatically at %include time (see banner at + * the bottom) and by %jenner_check_all when &dir has + * no bundles. 
+ * ==================================================================== */ +%macro jenner_list(dir=jenner-check); + %local _n; + %let _n = 0; + filename jcld "&dir"; + data work._jc_list; + length bundle $256; + did = dopen('jcld'); + if did = 0 then do; /* directory could not be opened: flag it with _n = -1 */ + call symputx('_n', -1, 'L'); + stop; + end; + n = dnum(did); + do i = 1 to n; + name = dread(did, i); + if prxmatch('/^t\d+_/', strip(name)) then do; /* keep tNNN_* entries only, not every name starting with t */ + bundle = name; + output; + end; + end; + rc = dclose(did); + keep bundle; + run; + filename jcld clear; + + %if &_n = -1 %then %do; + %put NOTE: No directory "&dir" — are you at the repo root? Try:; /* double quotes: &dir is not resolved inside single quotes */ + %put NOTE: %nrstr(%jenner_list)(dir=path/to/jenner-check); + %return; + %end; + + proc sort data=work._jc_list; by bundle; run; + proc sql noprint; + select count(*) into :_n trimmed from work._jc_list; + quit; + + %if &_n = 0 %then %do; + %put NOTE: No tNNN_* bundles found in "&dir".; + %return; + %end; + + %put; + %put ======================================================================; + %put &_n bundle(s) in &dir:; + data _null_; + set work._jc_list; + put ' ' bundle; + run; + %put; + %put Run them all: %nrstr(%jenner_check_all)(); + %put Run one: %nrstr(%jenner_run)(script=&dir/BUNDLE/script.sas, autoexec=&dir/BUNDLE/autoexec.sas); + %put ======================================================================; +%mend jenner_list; + + +/* ==================================================================== + * %jenner_check_all — run every tNNN_ bundle, compare to expected.json, + * write a CSV summary the owner can attach to the PR. 
+ * ==================================================================== */ +%macro jenner_check_all( + dir=jenner-check, + host=api.jenneranalytics.com, + api_key=, + report=jenner_check_report.csv +); + + /* enumerate tNNN_* subdirs */ + filename jcd "&dir"; + data work.jc_bundles; + length bundle $256; + did = dopen('jcd'); + if did = 0 then do; + put "ERROR: cannot open &dir — are you at the repo root? 
Try " '25'x "jenner_list(dir=path/to/jenner-check);"; /* '25'x is a literal percent sign, so the hint is printed instead of the macro being expanded inside the string */ + stop; + end; + n = dnum(did); + do i = 1 to n; + name = dread(did, i); + if prxmatch('/^t\d+_/', strip(name)) then do; /* keep tNNN_* entries only, not every name starting with t */ + bundle = cats("&dir", '/', name); + output; + end; + end; + rc = dclose(did); + keep bundle; + run; + filename jcd clear; + proc sort data=work.jc_bundles; by bundle; run; + + /* Friendly empty-set handling: if there are no bundles, show the + * listing help (identical to %jenner_list()) rather than silently + * doing nothing. */ + %local _any; + proc sql noprint; select count(*) into :_any trimmed from work.jc_bundles; quit; + %if &_any = 0 %then %do; + %put NOTE: No tNNN_* bundles under "&dir". Nothing to run.; /* double quotes: &dir is not resolved inside single quotes */ + %jenner_list(dir=&dir) + %return; + %end; + + /* result accumulator */ + data work.jc_results; + length bundle $256 status $16 message $512 run_id $48; + stop; + run; + + %local nb; + proc sql noprint; select count(*) into :nb trimmed from work.jc_bundles; quit; + + %local i b; + %do i = 1 %to &nb; + data _null_; + set work.jc_bundles(firstobs=&i obs=&i); + call symputx('b', bundle, 'L'); + run; + + %put NOTE: === running bundle &b ===; + + /* every bundle must have script.sas; autoexec.sas is optional + * jenner-check bookkeeping (e.g. `options obs=100;` + any owner + * autoexec inlined). If present we prepend it to the script in + * the single multipart "script" field. Script.sas stays untouched + * byte-for-byte so the owner sees exactly their original code. 
*/ + %local sc ax; + %let sc = &b/script.sas; + %if %sysfunc(fileexist(&b/autoexec.sas)) %then %let ax = &b/autoexec.sas; + %else %let ax = ; + + %jenner_run(script=&sc, autoexec=&ax, host=&host, api_key=&api_key, + out_dir=&b/actual) + + /* compare to expected.json — minimal: today the verdict is pass when + * the API exit_code is 0 and fail otherwise; expected.json is opened + * but its contents are not compared yet. A richer validator can live + * alongside expected.json as validate.sas (SAS-side) but isn't required. 
*/ + %local verdict msg; + %let verdict = unknown; + %let msg = no expected.json; + %if %sysfunc(fileexist(&b/expected.json)) %then %do; + filename jcexp "&b/expected.json"; + libname jcexp JSON fileref=jcexp; + + data _null_; + if 0 then set jcexp.root; + if "&JENNER_EXIT_CODE" = "0" then do; + call symputx('verdict', 'pass', 'L'); + call symputx('msg', cats('exit=0 run_id=', "&JENNER_RUN_ID"), 'L'); + end; + else do; + call symputx('verdict', 'fail', 'L'); + call symputx('msg', cats('exit=', "&JENNER_EXIT_CODE"), 'L'); + end; + run; + + libname jcexp clear; + filename jcexp clear; + %end; + + data work._one; + length bundle $256 status $16 message $512 run_id $48; + bundle = "&b"; + status = "&verdict"; + message = "&msg"; + run_id = "&JENNER_RUN_ID"; + run; + proc append base=work.jc_results data=work._one force; run; + %end; + + /* write CSV report */ + proc export data=work.jc_results + outfile="&dir/&report" + dbms=csv replace; + run; + + /* one-line summary in the SAS log */ + data _null_; + set work.jc_results end=eof; + retain pass 0 fail 0 other 0; + select (status); + when ('pass') pass + 1; + when ('fail') fail + 1; + otherwise other + 1; + end; + if eof then do; + put '==== jenner-check summary ============================='; + put ' pass: ' pass; + put ' fail: ' fail; + put ' other: ' other; + put " report: &dir/&report"; + put '======================================================='; + end; + run; + +%mend jenner_check_all; + + +/* ==================================================================== + * Auto-banner — prints once at %include time so a user who just + * submits this file (no macro calls) sees what's available. + * Suppressed if %let JENNER_QUIET = 1; before %include. + * + * Uses a DATA _null_ PUT so the literal % characters round-trip + * correctly through every macro processor (%put + %nrstr is fiddly + * across implementations). 
+ * ==================================================================== */ +%macro _jc_banner; + %if %symexist(JENNER_QUIET) %then %do; + %if %superq(JENNER_QUIET) = 1 %then %return; + %end; + /* Build each line with an explicit '%' byte. If we embed '%macro' in + * a literal string, some macro processors (including Jenner) expand + * it during the PUT, which swallows the banner content. + * byte(37) = '%'. cats() concatenates without gluing in spaces. */ + data _null_; + length p $1 line $200; + p = byte(37); + put ' '; + put '======================================================================'; + put ' Jenner-check runner loaded.'; + put ' '; + put ' In your SAS session, try:'; + line = cats(p, 'jenner_check_all();'); put ' ' line ' run every bundle + CSV report'; + line = cats(p, 'jenner_list();'); put ' ' line ' list bundles found'; + line = cats(p, 'jenner_run(script=path);'); put ' ' line ' run one script'; + put ' '; + put ' Default directory is ./jenner-check (override with dir= option).'; + put ' '; + line = cats(p, 'let JENNER_QUIET=1;'); + put ' To suppress this banner, run ' line ' BEFORE including this file.'; + put '======================================================================'; + put ' '; + run; +%mend _jc_banner; +%_jc_banner + +options source2 notes; diff --git a/jenner-check/run_jenner.sh b/jenner-check/run_jenner.sh new file mode 100755 index 0000000..99cd395 --- /dev/null +++ b/jenner-check/run_jenner.sh @@ -0,0 +1,214 @@ +#!/usr/bin/env bash +# run_jenner.sh - mac/linux runner for Jenner compatibility checks. +# +# Quick start: +# cd jenner-check/ +# ./run_jenner.sh # lists bundles in the current dir +# ./run_jenner.sh t001_something # run that one +# ./run_jenner.sh --all # run every bundle in the current dir +# +# Usage: ./run_jenner.sh [bundle-dir | script.sas | --all | --list] [response.json] +# +# (no arg) If the current directory has tNNN_* bundles, list them +# with a copy-paste command. Otherwise show this help. 
+# +# --all Run every tNNN_* bundle in the current directory in +# sequence, print a pass/fail summary. +# +# --list, -l List the bundles visible in the current directory and +# exit without running anything. +# +# bundle-dir A directory containing script.sas and (optionally) +# autoexec.sas. The two are concatenated (autoexec first, +# then a blank line, then script) and submitted together. +# This is the normal case. +# +# script.sas A single .sas file. Submitted as-is — no autoexec. +# +# The API response is written to (or response.json in +# the current directory if omitted) and the most useful fields are also +# printed to stdout for a quick sanity check. +# +# Requires: bash 4+, curl. Both ship with every mainstream Linux distro +# and macOS 12+. Windows: use run_jenner.bat (single-file mode) or WSL. +# +# IMPORTANT: execute this script, don't source it. Running with `. ./...` +# or `source ./...` will short-circuit error handling and can close your +# terminal if an error path fires. + +# --- refuse to be sourced ------------------------------------------------ +# `return` only works inside a sourced script. If we ARE sourced, print a +# message and return 1 so we don't kill the parent shell with exit. If +# we're running directly, (return 0) fails and we fall through. +(return 0 2>/dev/null) && { + printf 'run_jenner.sh: execute this script, do not source it.\n ./run_jenner.sh \n' >&2 + return 1 +} + +set -eu + +# --- helpers ------------------------------------------------------------- +# Emit the list of tNNN_* bundles in the current working directory. A +# "bundle" is a directory matching t[0-9]*_* whose name contains a +# script.sas file. Writes one path per line (no prefix); empty output +# if nothing found. 
+list_bundles_here() { + local d + for d in ./t[0-9]*_*/ ; do + [[ -d "$d" && -f "$d/script.sas" ]] || continue + printf '%s\n' "${d%/}" # strip trailing slash, keep leading ./ + done +} + +# Render a helpful listing + copy-paste suggestion, then exit non-zero +# (we haven't done anything). Used when the user runs with no args. +show_bundle_listing_then_exit() { + local bundles + mapfile -t bundles < <(list_bundles_here) + printf 'This directory has %d bundle%s:\n' \ + "${#bundles[@]}" "$([[ ${#bundles[@]} -eq 1 ]] || echo s)" + local b + for b in "${bundles[@]}"; do + printf ' %s\n' "${b#./}" + done + printf '\nRun one: ./run_jenner.sh %s\n' "${bundles[0]#./}" + printf 'Run them all: ./run_jenner.sh --all\n' + printf 'Just list: ./run_jenner.sh --list\n' + exit 2 +} + +# Show the usage block when we have nothing better to offer. +show_usage_then_exit() { + local status=${1:-2} + { + printf 'Usage: %s [bundle-dir | script.sas | --all | --list] [response.json]\n\n' "$(basename "$0")" + printf 'Examples:\n' + printf ' %s t001_my_bundle # run one bundle\n' "$(basename "$0")" + printf ' %s --all # run every tNNN_* bundle in this dir\n' "$(basename "$0")" + printf ' %s path/to/script.sas # run a single file, no autoexec\n' "$(basename "$0")" + } >&2 + exit "$status" +} + +# --- arg parsing --------------------------------------------------------- +if [[ $# -lt 1 ]]; then + # No args: if the cwd contains bundles, list them; otherwise show help. 
+ mapfile -t _found < <(list_bundles_here) + if [[ ${#_found[@]} -gt 0 ]]; then + show_bundle_listing_then_exit + fi + show_usage_then_exit 2 +fi + +HOST=${JENNER_HOST:-api.jenneranalytics.com} + +case "$1" in + -h|--help) + show_usage_then_exit 0 + ;; + -l|--list) + mapfile -t _found < <(list_bundles_here) + if [[ ${#_found[@]} -eq 0 ]]; then + printf 'No tNNN_* bundles found in %s\n' "$(pwd)" + exit 0 + fi + printf 'Bundles in %s:\n' "$(pwd)" + for b in "${_found[@]}"; do + printf ' %s\n' "${b#./}" + done + exit 0 + ;; + --all) + mapfile -t _found < <(list_bundles_here) + if [[ ${#_found[@]} -eq 0 ]]; then + printf 'No tNNN_* bundles found in %s\n' "$(pwd)" >&2 + exit 3 + fi + _pass=0; _fail=0 + for b in "${_found[@]}"; do + printf '\n── %s ──\n' "${b#./}" + if "$BASH" "$0" "$b" "${b#./}_response.json"; then # re-run through bash so "bash run_jenner.sh --all" works even when $0 has no slash + _pass=$((_pass+1)) + else + _fail=$((_fail+1)) + fi + done + printf '\n── summary: %d pass, %d fail ──\n' "$_pass" "$_fail" + [[ $_fail -eq 0 ]] && exit 0 || exit 1 + ;; +esac + +TARGET=$1 +OUT=${2:-response.json} + +# --- assemble the submission body --------------------------------------- +# If TARGET is a directory, treat it as a bundle. If it's a file, submit +# it directly. +CLEANUP=() +cleanup() { + for f in ${CLEANUP[@]+"${CLEANUP[@]}"}; do rm -f "$f"; done # the +guard keeps an empty array from tripping set -u on bash older than 4.4 +} +trap cleanup EXIT + +if [[ -d "$TARGET" ]]; then + if [[ ! 
-f "$TARGET/script.sas" ]]; then + printf 'error: %s is a directory but has no script.sas\n' "$TARGET" >&2 + exit 3 + fi + SUBMIT=$(mktemp -t jc_submit.XXXXXX.sas) + CLEANUP+=("$SUBMIT") + if [[ -f "$TARGET/autoexec.sas" ]]; then + cat "$TARGET/autoexec.sas" > "$SUBMIT" + printf '\n' >> "$SUBMIT" + fi + cat "$TARGET/script.sas" >> "$SUBMIT" + printf 'Submitting bundle: %s\n' "$TARGET" + if [[ -f "$TARGET/autoexec.sas" ]]; then + printf ' autoexec.sas (%d bytes) + script.sas (%d bytes)\n' \ + "$(wc -c < "$TARGET/autoexec.sas")" "$(wc -c < "$TARGET/script.sas")" + else + printf ' script.sas (%d bytes), no autoexec\n' "$(wc -c < "$TARGET/script.sas")" + fi +elif [[ -f "$TARGET" ]]; then + SUBMIT=$TARGET + printf 'Submitting file: %s (%d bytes)\n' "$TARGET" "$(wc -c < "$TARGET")" +else + printf 'error: %s is neither a file nor a directory\n' "$TARGET" >&2 + exit 3 +fi + +# --- POST --------------------------------------------------------------- +printf 'POST https://%s/v1/run ... ' "$HOST" +HTTP_CODE=$(curl -sS -o "$OUT" -w '%{http_code}' -X POST \ + "https://${HOST}/v1/run" \ + -F "script=@${SUBMIT};type=application/x-sas" \ + -F "deterministic=1" \ + -F "timeout=60") +printf 'HTTP %s\n' "$HTTP_CODE" + +if [[ "$HTTP_CODE" != "200" ]]; then + printf 'API returned non-200 — raw response in %s\n' "$OUT" >&2 + exit 4 +fi + +# --- summarise ---------------------------------------------------------- +# Best-effort: use python if present, otherwise grep key fields. 
+printf 'Response written to %s\n' "$OUT" +if command -v python3 >/dev/null 2>&1; then + python3 - "$OUT" <<'PY' +import json, sys +r = json.load(open(sys.argv[1])) +print(f" status : {r.get('status')}") +print(f" exit_code : {r.get('exit_code')}") +print(f" duration_ms: {r.get('duration_ms')}") +print(f" run_id : {r.get('run_id')}") +print(f" jenner_ver : {r.get('jenner_version')}") +log = r.get('log', '') +if log: + print(' log (first 10 lines):') + for line in log.splitlines()[:10]: + print(f' {line}') +PY +else + printf ' (install python3 for a pretty summary; raw JSON in %s)\n' "$OUT" +fi diff --git a/jenner-check/t001_set_split_silly_vs_smart/autoexec.sas b/jenner-check/t001_set_split_silly_vs_smart/autoexec.sas new file mode 100644 index 0000000..e58ee31 --- /dev/null +++ b/jenner-check/t001_set_split_silly_vs_smart/autoexec.sas @@ -0,0 +1,4 @@ +/* autoexec for t001_set_split_silly_vs_smart + - cap output at 100 obs (matches Jenner's unlicensed tier exactly, + so the run is reproducible regardless of license state). 
*/ +options obs=100; diff --git a/jenner-check/t001_set_split_silly_vs_smart/expected.json b/jenner-check/t001_set_split_silly_vs_smart/expected.json new file mode 100644 index 0000000..2329602 --- /dev/null +++ b/jenner-check/t001_set_split_silly_vs_smart/expected.json @@ -0,0 +1,33 @@ +{ + "_captured_at": "2026-05-10T17:05:07Z", + "_captured_run_id": "r_019e12d98b6d7502b8a9c67777a10594", + "_captured_from": "https://api.jenneranalytics.com/v1/run", + + "status": "ok", + "exit_code": 0, + + "log_contains": [ + "NOTE: Option OBS changed to 100.", + "NOTE: Wrote have (30 rows, 4 columns).", + "NOTE: Wrote work.silly_1 (10 rows, 4 columns).", + "NOTE: Wrote work.silly_2 (10 rows, 4 columns).", + "NOTE: Wrote work.silly_3 (10 rows, 4 columns).", + "NOTE: PROC PRINT completed: 10 observations printed, 4 variables" + ], + "log_does_not_contain": [ + "ERROR:", + "[JENNER-ERROR" + ], + + "output_contains": [ + "silly_1 (grp=1)", + "smart_1 (grp=1) - same content, single pass", + "smart_2 (grp=2)", + "smart_3 (grp=3)" + ], + + "diagnostics": { + "parse_warnings": [], + "runtime_warnings": [] + } +} diff --git a/jenner-check/t001_set_split_silly_vs_smart/meta.json b/jenner-check/t001_set_split_silly_vs_smart/meta.json new file mode 100644 index 0000000..9a9a9bb --- /dev/null +++ b/jenner-check/t001_set_split_silly_vs_smart/meta.json @@ -0,0 +1,8 @@ +{ + "bundle": "t001_set_split_silly_vs_smart", + "source_file": "SET statement considered harmful.sas", + "source_blob_sha": "800d98b0d427a89af5d927bd263a0472047ec900", + "source_commit": "438fbf5de1e62074bc683ee854579880ef51b128", + "tier": "real_data", + "notes": "Case A from 'CUTTING, SLASHING, AND SHREDDING' (lines 124-150 of SET statement considered harmful.sas). The original sources HAVE from a libname-backed file generated higher up in the script; here we build a small synthetic HAVE inline (3 grps x 5 ids x 2 numbers = 30 rows) so the bundle is self-contained. 
Demonstrates that one DATA step with SELECT/OUTPUT into multiple targets does the work that three WHERE-pass steps do." +} diff --git a/jenner-check/t001_set_split_silly_vs_smart/script.sas b/jenner-check/t001_set_split_silly_vs_smart/script.sas new file mode 100644 index 0000000..5283c6b --- /dev/null +++ b/jenner-check/t001_set_split_silly_vs_smart/script.sas @@ -0,0 +1,46 @@ +/* From: SET statement considered harmful.sas + Case: "CUTTING, SLASHING, AND SHREDDING" (Part A) + + The original sources HAVE from a libname-backed file. Here we build a + small synthetic HAVE inline so the bundle is self-contained. */ + +data have; + do grp = 1 to 3; + do id = "A","B","C","D","E"; + do number = 1 to 2; + obs+1; + output; + end; + end; + end; +run; + +/* "silly" approach — three separate WHERE passes (three full reads) */ +data work.silly_1; + set have; + where grp=1; +run; +data work.silly_2; + set have; + where grp=2; +run; +data work.silly_3; + set have; + where grp=3; +run; + +/* "smart" approach — single pass through HAVE, three OUTPUT targets */ +data work.smart_1 work.smart_2 work.smart_3; + set have; + select(grp); + when(1) output work.smart_1; + when(2) output work.smart_2; + when(3) output work.smart_3; + otherwise; + end; +run; + +proc print data=work.silly_1 noobs; title "silly_1 (grp=1)"; run; +proc print data=work.smart_1 noobs; title "smart_1 (grp=1) - same content, single pass"; run; +proc print data=work.smart_2 noobs; title "smart_2 (grp=2)"; run; +proc print data=work.smart_3 noobs; title "smart_3 (grp=3)"; run; diff --git a/jenner-check/t002_merge_in_categorize/autoexec.sas b/jenner-check/t002_merge_in_categorize/autoexec.sas new file mode 100644 index 0000000..603983e --- /dev/null +++ b/jenner-check/t002_merge_in_categorize/autoexec.sas @@ -0,0 +1,4 @@ +/* autoexec for t002_merge_in_categorize + - cap output at 100 obs (matches Jenner's unlicensed tier exactly, + so the run is reproducible regardless of license state). 
*/ +options obs=100; diff --git a/jenner-check/t002_merge_in_categorize/expected.json b/jenner-check/t002_merge_in_categorize/expected.json new file mode 100644 index 0000000..1de7765 --- /dev/null +++ b/jenner-check/t002_merge_in_categorize/expected.json @@ -0,0 +1,31 @@ +{ + "_captured_at": "2026-05-10T17:05:08Z", + "_captured_run_id": "r_019e12d98d637d63b516ffc906e0bdab", + "_captured_from": "https://api.jenneranalytics.com/v1/run", + + "status": "ok", + "exit_code": 0, + + "log_contains": [ + "NOTE: Option OBS changed to 100.", + "NOTE: Wrote one (5 rows, 2 columns).", + "NOTE: Wrote two (5 rows, 2 columns).", + "NOTE: PROC PRINT completed: 2 observations printed, 3 variables", + "NOTE: PROC PRINT completed: 3 observations printed, 3 variables" + ], + "log_does_not_contain": [ + "ERROR:", + "[JENNER-ERROR" + ], + + "output_contains": [ + "only in one (left anti-join)", + "only in two (right anti-join)", + "in both (inner join)" + ], + + "diagnostics": { + "parse_warnings": [], + "runtime_warnings": [] + } +} diff --git a/jenner-check/t002_merge_in_categorize/meta.json b/jenner-check/t002_merge_in_categorize/meta.json new file mode 100644 index 0000000..e66afa0 --- /dev/null +++ b/jenner-check/t002_merge_in_categorize/meta.json @@ -0,0 +1,8 @@ +{ + "bundle": "t002_merge_in_categorize", + "source_file": "SET statement considered harmful.sas", + "source_blob_sha": "800d98b0d427a89af5d927bd263a0472047ec900", + "source_commit": "438fbf5de1e62074bc683ee854579880ef51b128", + "tier": "real_data", + "notes": "Cases 'DISENCHANTING' / 'one_and_two / only_in_one / only_in_two' (lines 401-451 of SET statement considered harmful.sas). Demonstrates that PROC SQL left/right/inner JOIN can be replaced with one MERGE statement using IN= flags + SELECT/WHEN. The original sources ONE and TWO from a libname-backed file generated higher up in the script; here we build small synthetic ONE (obs 1-5) and TWO (obs 3-7) inline so the bundle is self-contained." 
+} diff --git a/jenner-check/t002_merge_in_categorize/script.sas b/jenner-check/t002_merge_in_categorize/script.sas new file mode 100644 index 0000000..f2045ae --- /dev/null +++ b/jenner-check/t002_merge_in_categorize/script.sas @@ -0,0 +1,37 @@ +/* From: SET statement considered harmful.sas + Case: replacing PROC SQL left/right/inner JOIN with a single MERGE + + IN= flags + SELECT/WHEN. + + The original sources ONE and TWO from a libname-backed file. Here we + build a small synthetic ONE and TWO inline so the bundle is + self-contained. */ + +data one; /* keys obs = 1..5 */ + do obs = 1 to 5; + x = obs * 10; + output; + end; +run; + +data two; /* keys obs = 3..7, overlapping ONE on 3..5 */ + do obs = 3 to 7; + y = obs * 100; + output; + end; +run; + +/* Single pass over both datasets, IN= flags drive the routing. */ +data only_in_one only_in_two one_and_two; + merge one(in=o1) two(in=t2); + by obs; /* ONE and TWO are both built in ascending obs order, so no sort is needed before the match-merge */ + select; + when (o1 and not t2) output only_in_one; + when (not o1 and t2) output only_in_two; + when (o1 and t2) output one_and_two; + otherwise; /* not expected to fire: every merged row comes from at least one input, so one IN= flag is always set */ + end; +run; + +proc print data=only_in_one noobs; title 'only in one (left anti-join)'; run; +proc print data=only_in_two noobs; title 'only in two (right anti-join)'; run; +proc print data=one_and_two noobs; title 'in both (inner join)'; run; diff --git a/jenner-check/t003_by_group_range_shift/autoexec.sas b/jenner-check/t003_by_group_range_shift/autoexec.sas new file mode 100644 index 0000000..be079d2 --- /dev/null +++ b/jenner-check/t003_by_group_range_shift/autoexec.sas @@ -0,0 +1,4 @@ +/* autoexec for t003_by_group_range_shift + - cap output at 100 obs (matches Jenner's unlicensed tier exactly, + so the run is reproducible regardless of license state).
*/ +options obs=100; diff --git a/jenner-check/t003_by_group_range_shift/expected.json b/jenner-check/t003_by_group_range_shift/expected.json new file mode 100644 index 0000000..258c45d --- /dev/null +++ b/jenner-check/t003_by_group_range_shift/expected.json @@ -0,0 +1,34 @@ +{ + "_captured_at": "2026-05-10T17:05:08Z", + "_captured_run_id": "r_019e12d98f637d70bbdd98e18fefe3d5", + "_captured_from": "https://api.jenneranalytics.com/v1/run", + + "status": "ok", + "exit_code": 0, + + "log_contains": [ + "NOTE: Option OBS changed to 100.", + "NOTE: Read 15 rows from DATALINES.", + "NOTE: Wrote have (15 rows, 2 columns).", + "NOTE: Wrote aggr (3 rows, 2 columns).", + "NOTE: PROC PRINT completed: 3 observations printed, 2 variables", + "NOTE: PROC PRINT completed: 10 observations printed, 3 variables" + ], + "log_does_not_contain": [ + "ERROR:", + "[JENNER-ERROR" + ], + + "output_contains": [ + "per-id range (aggregate)", + "merged back: shift = number / range", + "A 13", + "B 13", + "C 15" + ], + + "diagnostics": { + "parse_warnings": [], + "runtime_warnings": [] + } +} diff --git a/jenner-check/t003_by_group_range_shift/meta.json b/jenner-check/t003_by_group_range_shift/meta.json new file mode 100644 index 0000000..04b44fc --- /dev/null +++ b/jenner-check/t003_by_group_range_shift/meta.json @@ -0,0 +1,8 @@ +{ + "bundle": "t003_by_group_range_shift", + "source_file": "SET statement considered harmful.sas", + "source_blob_sha": "800d98b0d427a89af5d927bd263a0472047ec900", + "source_commit": "438fbf5de1e62074bc683ee854579880ef51b128", + "tier": "real_data", + "notes": "Case '#HASH TABLE FOR HELP' standard variant (lines 627-654 of SET statement considered harmful.sas). Two-pass BY-group aggregation: first.id/last.id + RETAIN to compute per-id range, then MERGE back to normalise. The original sources HAVE from a libname-backed file; here we ship a small inline HAVE (3 IDs x 5 numbers each) so the bundle is self-contained. 
The downstream hash-table version of the same lesson is left aside since it relies on a much larger dataset to make the point." +} diff --git a/jenner-check/t003_by_group_range_shift/script.sas b/jenner-check/t003_by_group_range_shift/script.sas new file mode 100644 index 0000000..4bd73f5 --- /dev/null +++ b/jenner-check/t003_by_group_range_shift/script.sas @@ -0,0 +1,62 @@ +/* From: SET statement considered harmful.sas + Case: "#HASH TABLE FOR HELP" - the standard (non-hash) approach. + + Two-pass aggregation: first compute per-id range using BY-group + FIRST./LAST. + RETAIN, then MERGE the per-row data with the + per-group aggregate to derive a normalised shift = number/range. + + The original sources HAVE from a libname-backed file. Here we ship + a small inline HAVE with three IDs so the bundle is self-contained. */ + +data have; + length id $ 1; + input id $ number; + datalines; +A 12 +A 5 +A 18 +A 9 +A 14 +B 30 +B 22 +B 35 +B 28 +B 33 +C 50 +C 45 +C 60 +C 48 +C 55 +; +run; + +proc sort data=have; by id; run; /* datalines are already in id order; the sort guarantees it for the BY steps below */ + +/* Pass 1 — per-id min/max/range, one output per BY-group */ +data aggr; + set have; + by id; + retain maxN minN; /* keep the running max and min across the rows of a BY group */ + if first.id then do; + maxN = number; + minN = number; + end; + maxN = max(maxN, number); /* no-op on the first row of a group, where maxN was just set to number */ + minN = min(minN, number); + if last.id then do; + range = maxN - minN; + output; + end; + keep id range; /* AGGR holds one row per id: A=13, B=13, C=15 */ +run; + +/* Pass 2 — merge per-row data with per-group aggregate */ +data want; + merge have aggr; /* one-to-many match-merge: the single AGGR row for an id supplies range to every HAVE row of that id */ + by id; + if range > 0 then shift = number / range; /* avoids division by zero; shift stays missing when range is not positive */ + drop range; +run; + +proc print data=aggr noobs; title 'per-id range (aggregate)'; run; +proc print data=want(obs=10) noobs; title 'merged back: shift = number / range'; run; /* prints the first 10 of the 15 rows in WANT */ diff --git a/jenner-check/t004_format_dataset_modify/autoexec.sas b/jenner-check/t004_format_dataset_modify/autoexec.sas new file mode 100644 index 0000000..2f401d0 --- /dev/null +++ b/jenner-check/t004_format_dataset_modify/autoexec.sas @@ -0,0 +1,4 @@ +/* autoexec for t004_format_dataset_modify + - cap output at 100 obs
(matches Jenner's unlicensed tier exactly, + so the run is reproducible regardless of license state). */ +options obs=100; diff --git a/jenner-check/t004_format_dataset_modify/expected.json b/jenner-check/t004_format_dataset_modify/expected.json new file mode 100644 index 0000000..c012be0 --- /dev/null +++ b/jenner-check/t004_format_dataset_modify/expected.json @@ -0,0 +1,32 @@ +{ + "_captured_at": "2026-05-10T17:05:09Z", + "_captured_run_id": "r_019e12d9911271b2a2ec0dcc87df5569", + "_captured_from": "https://api.jenneranalytics.com/v1/run", + + "status": "ok", + "exit_code": 0, + + "log_contains": [ + "NOTE: Option OBS changed to 100.", + "NOTE: Read 3 rows from DATALINES.", + "NOTE: Wrote have (3 rows, 2 columns).", + "NOTE: PROC DATASETS library=WORK", + "NOTE: PROC PRINT completed: 3 observations printed, 2 variables" + ], + "log_does_not_contain": [ + "ERROR:", + "[JENNER-ERROR" + ], + + "output_contains": [ + "same result, very different cost", + "A I", + "B II", + "C III" + ], + + "diagnostics": { + "parse_warnings": [], + "runtime_warnings": [] + } +} diff --git a/jenner-check/t004_format_dataset_modify/meta.json b/jenner-check/t004_format_dataset_modify/meta.json new file mode 100644 index 0000000..8b9f53a --- /dev/null +++ b/jenner-check/t004_format_dataset_modify/meta.json @@ -0,0 +1,8 @@ +{ + "bundle": "t004_format_dataset_modify", + "source_file": "SET statement considered harmful.sas", + "source_blob_sha": "800d98b0d427a89af5d927bd263a0472047ec900", + "source_commit": "438fbf5de1e62074bc683ee854579880ef51b128", + "tier": "real_data", + "notes": "Case 'IT MAKES MY BLOOD BOIL' (lines 105-119 of SET statement considered harmful.sas). Demonstrates that DATA-step format application rewrites every row, while PROC DATASETS MODIFY touches only the descriptor. The original sources HAVE from a libname-backed file; here we ship a tiny inline HAVE (3 rows) so the bundle is self-contained. 
One adaptation: original PROC DATASETS used 'noprint'; substituted 'nolist' which is the same intent (suppress contents listing) and is what Jenner currently accepts." +} diff --git a/jenner-check/t004_format_dataset_modify/script.sas b/jenner-check/t004_format_dataset_modify/script.sas new file mode 100644 index 0000000..e79edf4 --- /dev/null +++ b/jenner-check/t004_format_dataset_modify/script.sas @@ -0,0 +1,36 @@ +/* From: SET statement considered harmful.sas + Case: "IT MAKES MY BLOOD BOIL" - applying a display format. + + The lesson: a DATA step that only changes display metadata still + reads and rewrites every row. PROC DATASETS MODIFY only updates + the descriptor, so it is O(1) regardless of dataset size. + + The original sources HAVE from a libname-backed file. Here we ship + a small inline HAVE so the bundle is self-contained. */ + +data have; + length id $ 1; + input id $ number; + datalines; +A 1 +B 2 +C 3 +; +run; + +/* "blood boiler" — a full DATA-step pass that only adds a display + format, not a single value changes. */ +data have; + set have; + format number ROMAN12.; /* ROMANw. prints the value as a Roman numeral, e.g. 3 as III */ +run; + +/* "cooler" — PROC DATASETS MODIFY updates only the descriptor. + No rows are read. NOTE(review): the DATA step above has already attached ROMAN12., so the printed result cannot show whether this MODIFY took effect. */ +proc datasets lib=work nolist; + modify have; + format number ROMAN12.; + run; /* executes the MODIFY run group; QUIT below ends PROC DATASETS */ +quit; + +proc print data=have noobs; title "same result, very different cost"; run; diff --git a/jenner-check/t005_merge_collapse_interim/autoexec.sas b/jenner-check/t005_merge_collapse_interim/autoexec.sas new file mode 100644 index 0000000..040c487 --- /dev/null +++ b/jenner-check/t005_merge_collapse_interim/autoexec.sas @@ -0,0 +1,4 @@ +/* autoexec for t005_merge_collapse_interim + - cap output at 100 obs (matches Jenner's unlicensed tier exactly, + so the run is reproducible regardless of license state).
*/ +options obs=100; diff --git a/jenner-check/t005_merge_collapse_interim/expected.json b/jenner-check/t005_merge_collapse_interim/expected.json new file mode 100644 index 0000000..98f35c3 --- /dev/null +++ b/jenner-check/t005_merge_collapse_interim/expected.json @@ -0,0 +1,26 @@ +{ + "_captured_at": "2026-05-10T17:05:09Z", + "_captured_run_id": "r_019e12d992b37b509dfb0e9d01ab0b98", + "_captured_from": "https://api.jenneranalytics.com/v1/run", + + "status": "ok", + "exit_code": 0, + + "log_contains": [ + "NOTE: Option OBS changed to 100.", + "NOTE: Wrote have (5 rows, 5 columns).", + "NOTE: Wrote data1 (3 rows, 6 columns).", + "NOTE: Wrote data2 (5 rows, 6 columns).", + "wandering: sum=3641", + "straight: sum=3641" + ], + "log_does_not_contain": [ + "ERROR:", + "[JENNER-ERROR" + ], + + "diagnostics": { + "parse_warnings": [], + "runtime_warnings": [] + } +} diff --git a/jenner-check/t005_merge_collapse_interim/meta.json b/jenner-check/t005_merge_collapse_interim/meta.json new file mode 100644 index 0000000..662bdbe --- /dev/null +++ b/jenner-check/t005_merge_collapse_interim/meta.json @@ -0,0 +1,8 @@ +{ + "bundle": "t005_merge_collapse_interim", + "source_file": "SET statement considered harmful.sas", + "source_blob_sha": "800d98b0d427a89af5d927bd263a0472047ec900", + "source_commit": "438fbf5de1e62074bc683ee854579880ef51b128", + "tier": "real_data", + "notes": "Case 'DISENCHANTING' (lines 281-312 of SET statement considered harmful.sas). Demonstrates that an intermediate MERGE dataset followed by a downstream DATA step can be collapsed by folding the work into the MERGE itself. The original sources HAVE from a libname-backed file; here we ship a small synthetic HAVE (5 rows, two derived datasets) so the bundle is self-contained. Both approaches print the same sum (3641), proving equivalence." 
+} diff --git a/jenner-check/t005_merge_collapse_interim/script.sas b/jenner-check/t005_merge_collapse_interim/script.sas new file mode 100644 index 0000000..f7bae5f --- /dev/null +++ b/jenner-check/t005_merge_collapse_interim/script.sas @@ -0,0 +1,53 @@ +/* From: SET statement considered harmful.sas + Case: "DISENCHANTING" - the wandering-vs-straight MERGE pattern. + + "Wandering" stages an intermediate MERGE dataset, then a second + DATA step iterates the interim and computes a sum. "Straight" folds + the work into the MERGE itself, removing one full dataset I/O. + + The original sources HAVE from a libname-backed file. Here we ship + a small synthetic HAVE inline so the bundle is self-contained. */ + +data have; /* NOTE(review): four variables are created here (obs, grp, id, number), yet expected.json pins "have (5 rows, 5 columns)" - confirm how Jenner counts columns */ + do obs = 1 to 5; + grp = obs; + length id $ 1; + if obs in (1,3,5) then id = "A"; + else id = "B"; + number = obs * 10; + output; + end; +run; + +data data1; + set have; + where id ne "B"; /* keeps obs 1, 3 and 5 */ + number2 = number * number; +run; + +data data2; + set have; + where grp ne 17; /* never excludes a row here, since grp runs 1..5 */ + number3 = number + 17; +run; + +/* "wandering around" — interim MERGE dataset, then a second DATA step + reads it back and sums. */ +data interim; + merge data1 data2; + by obs; +run; +data _null_; + set interim end=eof; + sum + (number2 + number3); /* number2 is missing for obs 2 and 4 (not in data1), so the operand is missing there and the sum statement ignores it: 127, 947 and 2567 add up to 3641 */ + if eof then put 'wandering: sum=' sum; +run; + +/* "straight to the point" — fold the work into the MERGE itself. + Same final number, one fewer dataset materialised. */ +data _null_; + merge data1 data2 end=eof; + by obs; + sum + (number2 + number3); /* same accumulation as in the wandering step, without materialising INTERIM */ + if eof then put 'straight: sum=' sum; +run;