From 1daad6f519e75ef70319a136c8295b9806b04179 Mon Sep 17 00:00:00 2001 From: Raymond Yee Date: Wed, 14 Jan 2026 12:35:06 -0800 Subject: [PATCH 1/5] Add minimal example datasets for PQG education Hand-crafted small examples to help understand the iSamples PQG format: - JSON: 1-sample and 3-sample examples (validated against schema) - CSV: Flattened entity files (samples, events, locations, sites, agents, edges) - Parquet: Same data in all 3 formats: - Export (3 rows, nested structs) - Narrow (21 rows, explicit edge rows) - Wide (10 rows, p__* columns) Includes README with: - Entity relationship diagram - Example queries for each format - Format comparison table Idea from meeting with Stephen Richard - small examples make format differences much easier to understand. Co-Authored-By: Claude Opus 4.5 --- examples/minimal/README.md | 195 ++++++++++++++++++ examples/minimal/csv/agents.csv | 4 + examples/minimal/csv/edges.csv | 14 ++ examples/minimal/csv/events.csv | 4 + examples/minimal/csv/locations.csv | 3 + examples/minimal/csv/samples.csv | 4 + examples/minimal/csv/sites.csv | 2 + examples/minimal/json/1_sample.json | 86 ++++++++ examples/minimal/json/3_samples.json | 146 +++++++++++++ .../minimal/parquet/minimal_export.parquet | Bin 0 -> 1673 bytes .../minimal/parquet/minimal_narrow.parquet | Bin 0 -> 4831 bytes examples/minimal/parquet/minimal_wide.parquet | Bin 0 -> 4951 bytes 12 files changed, 458 insertions(+) create mode 100644 examples/minimal/README.md create mode 100644 examples/minimal/csv/agents.csv create mode 100644 examples/minimal/csv/edges.csv create mode 100644 examples/minimal/csv/events.csv create mode 100644 examples/minimal/csv/locations.csv create mode 100644 examples/minimal/csv/samples.csv create mode 100644 examples/minimal/csv/sites.csv create mode 100644 examples/minimal/json/1_sample.json create mode 100644 examples/minimal/json/3_samples.json create mode 100644 examples/minimal/parquet/minimal_export.parquet create mode 100644 
examples/minimal/parquet/minimal_narrow.parquet create mode 100644 examples/minimal/parquet/minimal_wide.parquet diff --git a/examples/minimal/README.md b/examples/minimal/README.md new file mode 100644 index 0000000..7a5943b --- /dev/null +++ b/examples/minimal/README.md @@ -0,0 +1,195 @@ +# Minimal PQG Example Data + +This directory contains small, hand-crafted example datasets to help understand the iSamples PQG format. The same data is represented in JSON, CSV, and all three parquet formats (export, narrow, wide). + +## Dataset Overview + +**Domain**: Geological rock samples from Mount Rainier volcanic monitoring project + +**Entities**: +- 3 MaterialSampleRecords (samples) +- 3 SamplingEvents (collection/preparation events) +- 2 GeospatialCoordLocations (coordinates) +- 1 SamplingSite (Mount Rainier Summit Area) +- 1 Agent (Jane Smith, collector) + +**Relationships demonstrated**: +- Sample → produced_by → SamplingEvent (how samples are created) +- Sample → derivedFrom → Sample (parent/child relationship) +- SamplingEvent → sample_location → GeospatialCoordLocation +- SamplingEvent → sampling_site → SamplingSite +- SamplingSite → site_location → GeospatialCoordLocation + +## File Structure + +``` +minimal/ +├── json/ +│ ├── 1_sample.json # Single sample (simplest case) +│ └── 3_samples.json # Three related samples +├── csv/ +│ ├── samples.csv # MaterialSampleRecords +│ ├── events.csv # SamplingEvents +│ ├── locations.csv # GeospatialCoordLocations +│ ├── sites.csv # SamplingSites +│ ├── agents.csv # Agents +│ └── edges.csv # Relationships (for narrow format) +└── parquet/ + ├── minimal_export.parquet # Export format (3 rows, nested) + ├── minimal_narrow.parquet # Narrow format (21 rows, with edges) + └── minimal_wide.parquet # Wide format (10 rows, p__* columns) +``` + +## The Three Parquet Formats + +### Export Format (`minimal_export.parquet`) +- **3 rows** - one per sample +- Sample-centric with nested structs for related entities +- Best for: Simple 
queries on sample properties +- Coordinates pre-extracted to `sample_location_latitude/longitude` + +### Narrow Format (`minimal_narrow.parquet`) +- **21 rows** - 10 entities + 11 edge rows +- Graph-normalized with explicit `_edge_` rows +- Columns `s` (subject), `p` (predicate), `o` (object array) +- Best for: Graph traversal, flexible relationship queries + +### Wide Format (`minimal_wide.parquet`) +- **10 rows** - one per entity (no edge rows) +- Relationships stored as `p__*` columns with row_id arrays +- Best for: Fast entity queries, smaller file size, analytical queries + +## Example Queries + +### Query 1: Find all samples (works in all formats) + +**Export format:** +```sql +SELECT sample_identifier, label +FROM read_parquet('parquet/minimal_export.parquet') +``` + +**Wide format:** +```sql +SELECT pid, label +FROM read_parquet('parquet/minimal_wide.parquet') +WHERE otype = 'MaterialSampleRecord' +``` + +**Narrow format:** +```sql +SELECT pid, label +FROM read_parquet('parquet/minimal_narrow.parquet') +WHERE otype = 'MaterialSampleRecord' +``` + +### Query 2: Find samples with their locations + +**Wide format (uses p__* columns):** +```sql +SELECT + s.pid as sample, + s.label, + loc.latitude, + loc.longitude +FROM read_parquet('parquet/minimal_wide.parquet') s +JOIN read_parquet('parquet/minimal_wide.parquet') e + ON e.otype = 'SamplingEvent' + AND list_contains(s.p__produced_by, e.row_id) +JOIN read_parquet('parquet/minimal_wide.parquet') loc + ON loc.otype = 'GeospatialCoordLocation' + AND list_contains(e.p__sample_location, loc.row_id) +WHERE s.otype = 'MaterialSampleRecord' +``` + +**Narrow format (uses edge rows):** +```sql +SELECT + s.pid as sample, + s.label, + loc.latitude, + loc.longitude +FROM read_parquet('parquet/minimal_narrow.parquet') s +JOIN read_parquet('parquet/minimal_narrow.parquet') e1 + ON e1.otype = '_edge_' + AND e1.s = s.row_id + AND e1.p = 'produced_by' +JOIN read_parquet('parquet/minimal_narrow.parquet') ev + ON ev.otype = 
'SamplingEvent' + AND list_contains(e1.o, ev.row_id) +JOIN read_parquet('parquet/minimal_narrow.parquet') e2 + ON e2.otype = '_edge_' + AND e2.s = ev.row_id + AND e2.p = 'sample_location' +JOIN read_parquet('parquet/minimal_narrow.parquet') loc + ON loc.otype = 'GeospatialCoordLocation' + AND list_contains(e2.o, loc.row_id) +WHERE s.otype = 'MaterialSampleRecord' +``` + +### Query 3: Count entities by type + +```sql +SELECT otype, COUNT(*) as count +FROM read_parquet('parquet/minimal_wide.parquet') +GROUP BY otype +ORDER BY count DESC +``` + +Expected output: +``` +MaterialSampleRecord 3 +SamplingEvent 3 +GeospatialCoordLocation 2 +SamplingSite 1 +Agent 1 +``` + +## JSON Schema Validation + +The JSON files validate against the iSamples Core 1.0 schema: + +```python +import json +from jsonschema import validate + +# Load schema (from isamplesorg-metadata repo) +with open('path/to/iSamplesSchemaCore1.0.json') as f: + schema = json.load(f) + +# Load and validate +with open('json/1_sample.json') as f: + sample = json.load(f) + +validate(instance=sample, schema=schema) # Raises if invalid +``` + +## Entity Relationship Diagram + +``` +MaterialSampleRecord ──produced_by──► SamplingEvent ──sample_location──► GeospatialCoordLocation + │ │ + │ └──sampling_site──► SamplingSite ──site_location──► GeospatialCoordLocation + │ + ├──registrant──► Agent + │ + └──derivedFrom──► MaterialSampleRecord (parent sample) +``` + +## Size Comparison + +| Format | Rows | File Size | Notes | +|--------|------|-----------|-------| +| Export | 3 | 1.7 KB | Nested structs, sample-centric | +| Narrow | 21 | 4.8 KB | Explicit edge rows | +| Wide | 10 | 5.0 KB | p__* columns | + +In production datasets: +- Wide is typically 60-70% smaller than narrow +- Export is smallest but less flexible for complex queries + +## See Also + +- [PQG Specification](../../docs/PQG_SPECIFICATION.md) - Full format specification +- [Edge Types](../../pqg/edge_types.py) - All 14 iSamples edge types +- [Schema 
Definitions](../../pqg/schemas/) - Python schema validators diff --git a/examples/minimal/csv/agents.csv b/examples/minimal/csv/agents.csv new file mode 100644 index 0000000..0e3c700 --- /dev/null +++ b/examples/minimal/csv/agents.csv @@ -0,0 +1,4 @@ +agent_id,name,role,affiliation,contact_information +agent:jsmith,Jane Smith,collector,University of Washington,jsmith@uw.edu +agent:labtech,Lab Technician,preparer,University of Washington, +agent:curator,Collections Manager,curator,Burke Museum, diff --git a/examples/minimal/csv/edges.csv b/examples/minimal/csv/edges.csv new file mode 100644 index 0000000..1090eab --- /dev/null +++ b/examples/minimal/csv/edges.csv @@ -0,0 +1,14 @@ +subject_id,predicate,object_id,description +ark:/99999/example001,produced_by,event:example001,Sample was produced by this sampling event +ark:/99999/example002,produced_by,event:example002,Sample was produced by this sampling event +ark:/99999/example003,produced_by,event:example003,Sample was produced by this sampling event +ark:/99999/example002,derivedFrom,ark:/99999/example001,Thin section derived from parent rock sample +ark:/99999/example003,relatedTo,ark:/99999/example001,Sibling sample from same site +event:example001,sample_location,loc:rainier001,Event occurred at this location +event:example003,sample_location,loc:rainier002,Event occurred at this location +event:example001,sampling_site,site:rainier001,Event occurred at this site +event:example003,sampling_site,site:rainier001,Event occurred at this site +site:rainier001,site_location,loc:rainier001,Site is at this location +ark:/99999/example001,registrant,agent:jsmith,Sample registered by this agent +ark:/99999/example002,registrant,agent:jsmith,Sample registered by this agent +ark:/99999/example003,registrant,agent:jsmith,Sample registered by this agent diff --git a/examples/minimal/csv/events.csv b/examples/minimal/csv/events.csv new file mode 100644 index 0000000..f9d8e04 --- /dev/null +++ 
b/examples/minimal/csv/events.csv @@ -0,0 +1,4 @@ +event_id,label,description,result_time,project,feature_of_interest,site_id,location_id,collector_id +event:example001,Mount Rainier Field Collection 2024-06-10,Field collection during summer geology survey,2024-06-10,Cascade Volcanic Monitoring Project,Recent lava flow on Mount Rainier,site:rainier001,loc:rainier001,agent:jsmith +event:example002,Lab Preparation 2024-07-01,Thin section preparation in petrology lab,2024-07-01,,,,,agent:labtech +event:example003,Mount Rainier Field Collection 2024-06-10 (Site B),Field collection 10m from first sample,2024-06-10,Cascade Volcanic Monitoring Project,Recent lava flow on Mount Rainier,site:rainier001,loc:rainier002,agent:jsmith diff --git a/examples/minimal/csv/locations.csv b/examples/minimal/csv/locations.csv new file mode 100644 index 0000000..d63ebb2 --- /dev/null +++ b/examples/minimal/csv/locations.csv @@ -0,0 +1,3 @@ +location_id,latitude,longitude,elevation,obfuscated +loc:rainier001,46.8523,-121.7603,4392 m above mean sea level,false +loc:rainier002,46.8524,-121.7601,4390 m above mean sea level,false diff --git a/examples/minimal/csv/samples.csv b/examples/minimal/csv/samples.csv new file mode 100644 index 0000000..b73ebd3 --- /dev/null +++ b/examples/minimal/csv/samples.csv @@ -0,0 +1,4 @@ +sample_id,label,description,last_modified_time,event_id,material_category,sample_object_type,registrant_id +ark:/99999/example001,Rock Sample MR-001 (Parent),"Basalt collected during 2024 field survey. 
Fresh, unweathered sample from recent lava flow.",2024-06-15T10:30:00Z,event:example001,rock,physicalspecimen,agent:jsmith +ark:/99999/example002,Rock Sample MR-001-A (Child - Thin Section),Thin section prepared from parent sample MR-001 for petrographic analysis.,2024-07-01T14:00:00Z,event:example002,rock,thinsection,agent:jsmith +ark:/99999/example003,Rock Sample MR-002,"Second basalt sample from same site, collected 10m away from MR-001.",2024-06-15T11:00:00Z,event:example003,rock,physicalspecimen,agent:jsmith diff --git a/examples/minimal/csv/sites.csv b/examples/minimal/csv/sites.csv new file mode 100644 index 0000000..d2573e4 --- /dev/null +++ b/examples/minimal/csv/sites.csv @@ -0,0 +1,2 @@ +site_id,label,description,place_name +site:rainier001,Mount Rainier Summit Area,Collection site near the summit crater rim,"Mount Rainier, Pierce County, Washington, USA" diff --git a/examples/minimal/json/1_sample.json b/examples/minimal/json/1_sample.json new file mode 100644 index 0000000..ac915fd --- /dev/null +++ b/examples/minimal/json/1_sample.json @@ -0,0 +1,86 @@ +{ + "sample_identifier": "ark:/99999/example001", + "label": "Rock Sample from Mount Rainier", + "description": "Basalt collected during 2024 field survey. 
Fresh, unweathered sample from recent lava flow.", + "last_modified_time": "2024-06-15T10:30:00Z", + "produced_by": { + "label": "Mount Rainier Field Collection 2024-06-10", + "description": "Field collection during summer geology survey", + "result_time": "2024-06-10", + "project": "Cascade Volcanic Monitoring Project", + "has_feature_of_interest": "Recent lava flow on Mount Rainier", + "sampling_site": { + "label": "Mount Rainier Summit Area", + "description": "Collection site near the summit crater rim", + "place_name": ["Mount Rainier", "Pierce County", "Washington", "USA"], + "sample_location": { + "latitude": 46.8523, + "longitude": -121.7603, + "elevation": "4392 m above mean sea level", + "obfuscated": false + } + }, + "responsibility": [ + { + "name": "Jane Smith", + "role": "collector", + "affiliation": "University of Washington", + "contact_information": "jsmith@uw.edu" + } + ] + }, + "has_material_category": [ + { + "identifier": "https://w3id.org/isample/vocabulary/material/1.0/rock", + "label": "Rock", + "scheme_name": "iSamples Material Type" + } + ], + "has_context_category": [ + { + "identifier": "https://w3id.org/isample/vocabulary/sampledfeature/1.0/earthinterior", + "label": "Earth interior", + "scheme_name": "iSamples Sampled Feature Type" + } + ], + "has_sample_object_type": [ + { + "identifier": "https://w3id.org/isample/vocabulary/specimentype/1.0/physicalspecimen", + "label": "Physical specimen", + "scheme_name": "iSamples Specimen Type" + } + ], + "keywords": [ + { + "keyword": "basalt", + "scheme_name": "Free text" + }, + { + "keyword": "volcanic rock", + "scheme_name": "Free text" + }, + { + "keyword": "Cascade Range", + "scheme_name": "Geographic" + } + ], + "registrant": { + "name": "Jane Smith", + "affiliation": "University of Washington", + "contact_information": "jsmith@uw.edu", + "role": "registrant" + }, + "curation": { + "label": "UW Geology Sample Collection", + "description": "Stored in climate-controlled facility", 
"curation_location": "University of Washington, Burke Museum, Room 142, Drawer B-15", + "access_constraints": ["By appointment only", "Research use only"], + "responsibility": [ + { + "name": "Collections Manager", + "role": "curator", + "affiliation": "Burke Museum" + } + ] + } +} diff --git a/examples/minimal/json/3_samples.json b/examples/minimal/json/3_samples.json new file mode 100644 index 0000000..02e47fe --- /dev/null +++ b/examples/minimal/json/3_samples.json @@ -0,0 +1,146 @@ +[ + { + "sample_identifier": "ark:/99999/example001", + "label": "Rock Sample MR-001 (Parent)", + "description": "Basalt collected during 2024 field survey. Fresh, unweathered sample from recent lava flow. This is the original field sample.", + "last_modified_time": "2024-06-15T10:30:00Z", + "produced_by": { + "label": "Mount Rainier Field Collection 2024-06-10", + "identifier": "event:example001", + "result_time": "2024-06-10", + "project": "Cascade Volcanic Monitoring Project", + "has_feature_of_interest": "Recent lava flow on Mount Rainier", + "sampling_site": { + "identifier": "site:rainier001", + "label": "Mount Rainier Summit Area", + "place_name": ["Mount Rainier", "Pierce County", "Washington", "USA"], + "sample_location": { + "latitude": 46.8523, + "longitude": -121.7603, + "elevation": "4392 m above mean sea level" + } + }, + "responsibility": [ + { + "name": "Jane Smith", + "role": "collector", + "affiliation": "University of Washington" + } + ] + }, + "has_material_category": [ + { + "identifier": "https://w3id.org/isample/vocabulary/material/1.0/rock", + "label": "Rock", + "scheme_name": "iSamples Material Type" + } + ], + "has_sample_object_type": [ + { + "identifier": "https://w3id.org/isample/vocabulary/specimentype/1.0/physicalspecimen", + "label": "Physical specimen" + } + ], + "registrant": { + "name": "Jane Smith", + "affiliation": "University of Washington" + } + }, + { + "sample_identifier": "ark:/99999/example002", + "label": "Rock Sample MR-001-A (Child - 
Thin Section)", + "description": "Thin section prepared from parent sample MR-001 for petrographic analysis.", + "last_modified_time": "2024-07-01T14:00:00Z", + "produced_by": { + "label": "Lab Preparation 2024-07-01", + "identifier": "event:example002", + "result_time": "2024-07-01", + "description": "Thin section preparation in petrology lab", + "responsibility": [ + { + "name": "Lab Technician", + "role": "preparer", + "affiliation": "University of Washington" + } + ] + }, + "has_material_category": [ + { + "identifier": "https://w3id.org/isample/vocabulary/material/1.0/rock", + "label": "Rock" + } + ], + "has_sample_object_type": [ + { + "identifier": "https://w3id.org/isample/vocabulary/specimentype/1.0/thinsection", + "label": "Thin section" + } + ], + "related_resource": [ + { + "label": "Parent sample", + "relationship": "derivedFrom", + "target": "ark:/99999/example001", + "description": "This thin section was prepared from the parent rock sample" + } + ], + "registrant": { + "name": "Jane Smith", + "affiliation": "University of Washington" + } + }, + { + "sample_identifier": "ark:/99999/example003", + "label": "Rock Sample MR-002", + "description": "Second basalt sample from same site, collected 10m away from MR-001.", + "last_modified_time": "2024-06-15T11:00:00Z", + "produced_by": { + "label": "Mount Rainier Field Collection 2024-06-10 (Site B)", + "identifier": "event:example003", + "result_time": "2024-06-10", + "project": "Cascade Volcanic Monitoring Project", + "has_feature_of_interest": "Recent lava flow on Mount Rainier", + "sampling_site": { + "identifier": "site:rainier001", + "label": "Mount Rainier Summit Area", + "place_name": ["Mount Rainier", "Pierce County", "Washington", "USA"], + "sample_location": { + "latitude": 46.8524, + "longitude": -121.7601, + "elevation": "4390 m above mean sea level" + } + }, + "responsibility": [ + { + "name": "Jane Smith", + "role": "collector", + "affiliation": "University of Washington" + } + ] + }, + 
"has_material_category": [ + { + "identifier": "https://w3id.org/isample/vocabulary/material/1.0/rock", + "label": "Rock" + } + ], + "has_sample_object_type": [ + { + "identifier": "https://w3id.org/isample/vocabulary/specimentype/1.0/physicalspecimen", + "label": "Physical specimen" + } + ], + "related_resource": [ + { + "label": "Sibling sample", + "relationship": "relatedTo", + "target": "ark:/99999/example001", + "description": "Collected from same site as MR-001" + } + ], + "registrant": { + "name": "Jane Smith", + "affiliation": "University of Washington" + } + } +] diff --git a/examples/minimal/parquet/minimal_export.parquet b/examples/minimal/parquet/minimal_export.parquet new file mode 100644 index 0000000000000000000000000000000000000000..d06011ed3d21eee72d6c0017bec63b4037df2eeb GIT binary patch literal 1673 zcmcIl&rj1(9Di-bhJYjxUeg8=p9$0sw$W}wVcEeUTnsD|7GsEU>DmXY>(-_17ECyJ z@nFJ*=gD>`XUhpv|qvr zN|1(48=yre`?gghUbF5J#}}KJ2QWyn>N+W_Bju2zGQy46JVYsR43{QA*gFOv_36sH zfvOou^{U6^yK!ZusK}X=oRQ@{fhYIkvQkvi89BqFbP!b_az0T>bI;3p|(=j#m(WQ*UZjyDh4d1y?}zqh^dyT+3Ahz)X{9C;cZ%HY}!;Ew^XxqFHbMI zMvZntQ;yZNHCk%#S38M?f_%H6X=$oQRx;t^Hg&o%hDY&4?5X#LsbUW-41ulsK@UCY zqbC8)g@rlcNJMjZLcmD4U|N{v!rWJeI}}H=JPuC#8vJ_4J$3Q0!XfEe_^Bxn)RPjd zqhJ0P!<{kWOD4-t4>=~u&kl|CTm%}fn=&Aye*=9npIGiAE1t1gK6+)l@QYXXAxQo| z!~+w%;1v4vJ7c(0M*JDd@-u-=yRP(&*ctuXyde4-=m90ehu#;`;PT>0ILk-c_U<;I z*w9#)lJ(E=rEc+=kGL)IJ1~d5%YYAg*8;~%eZ2v;%k9*q`w#&-@7CZ$-u>ax^?E)X zcRI#}#Zm4&f+T4J?v#yn(xSElUM4LikxrzDRBn1#y{_om>gxX5iX26+LI@S$Pv{Sq C0-dD* literal 0 HcmV?d00001 diff --git a/examples/minimal/parquet/minimal_narrow.parquet b/examples/minimal/parquet/minimal_narrow.parquet new file mode 100644 index 0000000000000000000000000000000000000000..64d12bb8a3e323e955ecdbbe8a4a0c76069d7f61 GIT binary patch literal 4831 zcmcIoUuYc18K2o(NvFR)$)5F|eX_^4ygtdca*`!S7NVSFyETq%RLM<(F%he~m2~#r zt=K!MF_dymi);K4{1i;+OHFA*D5ed02!*CljOjz^Ln+0W5K1wA2&plB=tD4Wf8Xrw zX?3<-HBiCq&dz-Eo8R|m=KFLrle0sDiI2JXgdY(D=r)UH#xC}0jIjaE@#Jk!UTeo& 
zpc!a8(e|SCqYa=PMsv^>(U#Db(F$mn(XODmXhpOVnumrxe$^!IYmx2lYnOukx-jp0 zxpChvlnRcY$qY^PFgd&p!{tK7={lViZwDLh?HKH1Y^2v@qxd|I z_A1&K+KFVx$WUfoeu%8~R2NU8*7{Vd2K31^-*UY2id`w#?n%${^V41qA71J5Zk8o_ zb40pOTAG78oxO>~Y0~9dH=Aj~IcA&p&Hp z@3t|a3q$Oh78<#xJ>7!m={G0O%}k#L$sS>lh4sLpt+M9O0kZ#HUHmg@|Jzz>9~MtO zjWM6I3hczrd8?(0oDKXS=T?g_sGRg2yML;eeelhK!y`vC$BzzW zc46_fy&z|NXT|pIN^(!))uWlAL)Tto_tT`coa%3b%vrnS$T@P;ZhXypxhqoH#*yb{ zkHRl9J!AVYNB;qKelJTOos{Vd%Y~AhbDDk;@9$w}55b#I@9&y;s7;GF+2Ggw$@>(N z5n>cJvgFvltSmdS9O#h)FUOaDp-9qGcG-3-G8bs$#>;|1u%#9@m(&e)OK8(4qhVlaZW6hQq@Be%x2$y2^$ z*Eq#Z6z8{B{_$`6`(5q*zu{_KqSsCMIg&DVIWKJ;>I-D7H3*9D5G(yqK+>mqWJ> z0P>_=&e?fKzV5j>yHv=@b6%-X@s$5(eD5;+4D=ZxBYe{V-?RefI+-&iWDT}!gt)B- zH&$N|eUa4zlJ@OjlRQ|j(R!t+I>ss2#PoMcg*C@Vgj|*0qI|mc}UmG>9kI{%V!BkeZ%g%^K5sBx66VGTDdKr(}b8mXI~#_X9kz zRU8~**n^;1x)FVfMp-tS!ILXy*?5*{Xgbl-8qgtCm4uIoVSQLgead!BO04FM;yNT~ zB914EXR#s>WpNIGMlzVw6-~gQA15C4!e3S5(TGknYL+I+s#)R;VKn)?xDwGZ$8?(J z)XUO?RcY~y(4k!bnFu5$Nn<(28^mS23X2nFJXX~;Cdqz!wu*hyAZH3K`BaQDvlB6T zCBHCV&MiAdTl6uZ9n{Qb-}~--A%Ac`6CL=Bt)LG^)Nk@CS67_yiM;B)(qnD8G!jE=)ak1bQHh}29=hg*!T)SMEFM9a`Qg6NvOy=O_&=PzF zc%vBcQ2aErQQg#C9BBdjlVA&Y7Lke}ZVV(-$)`OQOTpBFB}1DE%@{BqaN~1s#V1Ui z3R6=jKN&0?`ReY*5H0Rdi^&6OHi|pz&IMEYO9Z$jL%|Lwaa?e zmNNUdTydELTQB8|{YL!%1Wk7Q%bw7VdQ!kC^+Tuh`=w^wGagoS zAg8}*oRB}-yAgT)#73Y$NLXKN19Vu@nenj53!tALC+M3pghsT3^!jmE(AQ(eZ@Bfa z+xjYzGIzhE(o>U}P0B#Sf^J`XHPL+BK(jq^2PvX=HRI>py0P8*xF=<{1=R}`D`B7h zO)>gFW0N`BR*of?EP38#w`hkt;(W_l;@&|7Ggj=5*)^#Ignw!WUSYpPo;ck;7;*}794{*hH%QZWb|JkTR_To@ zPh&?q@`c6G+n%qeb??QDFS%9SJz;iIa;ulUW_zTV2GC1p>%fZ88240u4)b!b3pZ|X z>&q1A(a%Xi_8p7#6^3yyV+>ev<8d!br%$0sIyEKLzef1cVQpw|Wbl|wFRT{uYear1 VpBo!nJTaQ-XK&!&o*DeK^&dEk=W_r6 literal 0 HcmV?d00001 diff --git a/examples/minimal/parquet/minimal_wide.parquet b/examples/minimal/parquet/minimal_wide.parquet new file mode 100644 index 0000000000000000000000000000000000000000..3f7e38af7b39b2f0ba97639576098aff9e2e123f GIT binary patch literal 4951 zcmcIoU1%KF6~1?NR@${z@=DozJu9!R>mQ9*j-{+*DOJQitYo_lt|U}RbxQ~lqur4_ 
z_Gm`TuGAPx*|oO+{o+($Ii6z& zx!mYX9}#1Rp}Xu<>|=v3;vDw2?Nz4iSFMdr*UshUQ-q9N&JyE1^03xv-ndrYa4J_B zO(uiuQ_sBsET2n%=an?bVR0TPyd7cp!xSFSs-kcNFurB`@(@PejFgU#q4PZp(tco1~M0DA(7wsb8{Kg+G=>a>F}X<^<06Btim8Dqt0v z^S8W;SX4_VmbNxv4a9U{TSGI)$nG~C+bxL;zU$gWG)9c&#>R(p&kv90`ayZtS``a{ zy=etjC4D^kjp5wrnVoNwPqS#Gm>D_*GM6pS7E8FYN1?Uo7q1Js6tpQ=S`4ls`Ryn zrE*tgTmy!`>p9zY01mn#{5A1Cs|JK4!O-Msygb1uxu_$04^t6e;NYKa_29DGuuMq-$@Ov!@`Yq@>0Kr1Wh*l+UOQ}MV zL{POrs?~OMlov1>gObDwr=^O(I>lL9&|y?Zj_?!k#3W%=RI#H~9A%|E4kCLX0i`TN zySNn5Q9wt0inRh3q>9S>p~IpnNHh?B0t--w{uXhGi$euDuEYesKo&^0iA#k9DO3sG zo$z~Ip(izxZ4*bX$~^)_ffYV-Rk1Bv(2=2%2PG7iwvwuP3&OPYlYt3hbeFb@*GsD_ z<>FO)!(v&8!9mUF2>c(cIHkeUaJ)h@vI%uyg!QOjxv^>2Thy!9qOP@SyY*IA$u1WI zXA>`5?9)Iq=0YM-*(%wChj2%qdl^e$CI{oL@2xjSd*mo)L~7J_?QNNc44zapx&phr zm+m!?x}jC%UWA=Tt|V_Y|#VXZ3y(aR=KjW;g=k^GfQ>g3C&1u zuB_l?w*twTREGqngG}g#?mFcPgE&CzI)UN_T%SM?box+7-lJbfOU_iwujG^mn5hS5 z0=vPDKzjAY0W`WnBe2(jdtiCM|ErDRsdOg&BxRBe)6#@N!p5hNF-=$_OZ^Y^e zt^wOx{W!_))7#q%qX?FeU>I2v>3)RoXy*Hnz8=Q44kf`x!osX_N~N`c`qct06R2>N zQ+yKPUsCgzk-U+RnOiN1HKHpURP;hiX_dKlsFM9lZ% zd1L6nxu9|O;7KM=WdA?F4a2lFgktkJkyhQ&c$J#}OqdT68AHb$)X*^-w8%{D^a%7ma`wd0M0N`Fe9qh@jHe0fFVyALndD~n8410N$Q88oZvIc-=pTw zd(GWs#yE9=to1-bJPDEhKSM(!9v-VNr!qpG;L?J76#e_NKJ!s8EKzemgzLNy_&j9{ z(ceG57d?C>$Vq-j`5kKBd&Yd!mobc|WzkO`kp%t=JtX(Wz+Q6lZj!9-QvM_JZpwW8 zOvZ>elJQVgPP^|20)HL`h7jH7o|DsjLNN%xNac-GGXiB!F4OROuS(AAsXOnAa!iH-jNRy(dHUmQ~(+-M6EyanF3S-~2ocH1Uf=l}NMMLYjY(H2I?} zPnj|Pu%BeJ7a@1NI3wh@Cj6g Date: Fri, 30 Jan 2026 10:08:12 -0800 Subject: [PATCH 2/5] Add task specs for optimization experiments (#17, #18) Creates self-contained task specifications for: - H3 geospatial optimization (experiments/h3_optimization/) - Facet metadata optimization (experiments/facet_optimization/) These are formatted as Claude Code Web-ready prompts with: - Pinned data URLs - Exact column names - Step-by-step tasks - Expected output formats Co-Authored-By: Claude Opus 4.5 --- experiments/facet_optimization/TASK_SPEC.md | 334 ++++++++++++++++++++ 
experiments/h3_optimization/TASK_SPEC.md | 216 +++++++++++++ 2 files changed, 550 insertions(+) create mode 100644 experiments/facet_optimization/TASK_SPEC.md create mode 100644 experiments/h3_optimization/TASK_SPEC.md diff --git a/experiments/facet_optimization/TASK_SPEC.md b/experiments/facet_optimization/TASK_SPEC.md new file mode 100644 index 0000000..2e1183f --- /dev/null +++ b/experiments/facet_optimization/TASK_SPEC.md @@ -0,0 +1,334 @@ +# Facet Metadata Optimization Task + +**Issue:** https://github.com/isamplesorg/pqg/issues/18 +**Goal:** Generate pre-computed facet summary tables for instant dashboard queries + +## Data Source + +```python +PARQUET_URL = "https://pub-a18234d962364c22a50c787b7ca09fa5.r2.dev/isamples_202601_wide.parquet" +``` + +- ~280 MB, ~20M rows +- Contains samples from OPENCONTEXT, SESAR, GEOME, SMITHSONIAN + +## Schema (Relevant Columns) + +| Column | Type | Description | +|--------|------|-------------| +| `row_id` | INTEGER | Unique identifier | +| `otype` | VARCHAR | Entity type - `'MaterialSampleRecord'` for samples | +| `n` | VARCHAR | Source: OPENCONTEXT, SESAR, GEOME, SMITHSONIAN | +| `label` | VARCHAR | Human-readable name | +| `p__has_material_category` | INTEGER[] | Array of row_ids pointing to IdentifiedConcept | +| `p__has_context_category` | INTEGER[] | Array of row_ids pointing to IdentifiedConcept | +| `p__has_sample_object_type` | INTEGER[] | Array of row_ids pointing to IdentifiedConcept | + +For IdentifiedConcept rows (otype = 'IdentifiedConcept'): +| Column | Type | Description | +|--------|------|-------------| +| `row_id` | INTEGER | Unique identifier (referenced by p__* arrays) | +| `label` | VARCHAR | Concept label (e.g., "Rock", "Earth interior") | +| `scheme_name` | VARCHAR | Vocabulary name | + +## Task 1: Baseline Benchmark + +Measure current facet query performance. 
+ +```python +import duckdb +import time + +con = duckdb.connect() +PARQUET_URL = "https://pub-a18234d962364c22a50c787b7ca09fa5.r2.dev/isamples_202601_wide.parquet" + +# Query 1: Source facet counts +SOURCE_FACET = f""" +SELECT n as source, COUNT(*) as count +FROM read_parquet('{PARQUET_URL}') +WHERE otype = 'MaterialSampleRecord' +GROUP BY n +ORDER BY count DESC +""" + +# Query 2: Material category facet (requires join) +MATERIAL_FACET = f""" +WITH samples AS ( + SELECT row_id, UNNEST(p__has_material_category) as material_id + FROM read_parquet('{PARQUET_URL}') + WHERE otype = 'MaterialSampleRecord' + AND p__has_material_category IS NOT NULL +), +concepts AS ( + SELECT row_id, label + FROM read_parquet('{PARQUET_URL}') + WHERE otype = 'IdentifiedConcept' +) +SELECT c.label as material, COUNT(*) as count +FROM samples s +JOIN concepts c ON c.row_id = s.material_id +GROUP BY c.label +ORDER BY count DESC +LIMIT 50 +""" + +# Query 3: Entity type counts (quick sanity check) +OTYPE_COUNTS = f""" +SELECT otype, COUNT(*) as count +FROM read_parquet('{PARQUET_URL}') +GROUP BY otype +ORDER BY count DESC +""" +``` + +**Measure:** Execute each query 3 times, report median time in milliseconds. + +## Task 2: Generate Source Facet Summary + +Simple aggregation - should be tiny file. + +```python +OUTPUT_PATH = "/tmp/facet_source_counts.parquet" + +query = f""" +COPY ( + SELECT + 'source' as facet_type, + n as facet_value, + COUNT(*) as count + FROM read_parquet('{PARQUET_URL}') + WHERE otype = 'MaterialSampleRecord' + GROUP BY n + ORDER BY count DESC +) TO '{OUTPUT_PATH}' (FORMAT PARQUET); +""" +con.execute(query) +``` + +## Task 3: Generate Material Category Facet Summary + +Requires joining through the relationship arrays. 
+ +```python +OUTPUT_PATH = "/tmp/facet_material_counts.parquet" + +query = f""" +COPY ( + WITH samples AS ( + SELECT UNNEST(p__has_material_category) as material_id + FROM read_parquet('{PARQUET_URL}') + WHERE otype = 'MaterialSampleRecord' + AND p__has_material_category IS NOT NULL + ), + concepts AS ( + SELECT row_id, label, scheme_name + FROM read_parquet('{PARQUET_URL}') + WHERE otype = 'IdentifiedConcept' + ) + SELECT + 'material' as facet_type, + c.label as facet_value, + c.scheme_name as scheme, + COUNT(*) as count + FROM samples s + JOIN concepts c ON c.row_id = s.material_id + GROUP BY c.label, c.scheme_name + ORDER BY count DESC +) TO '{OUTPUT_PATH}' (FORMAT PARQUET); +""" +con.execute(query) +``` + +## Task 4: Generate Context Category Facet Summary + +```python +OUTPUT_PATH = "/tmp/facet_context_counts.parquet" + +query = f""" +COPY ( + WITH samples AS ( + SELECT UNNEST(p__has_context_category) as context_id + FROM read_parquet('{PARQUET_URL}') + WHERE otype = 'MaterialSampleRecord' + AND p__has_context_category IS NOT NULL + ), + concepts AS ( + SELECT row_id, label, scheme_name + FROM read_parquet('{PARQUET_URL}') + WHERE otype = 'IdentifiedConcept' + ) + SELECT + 'context' as facet_type, + c.label as facet_value, + c.scheme_name as scheme, + COUNT(*) as count + FROM samples s + JOIN concepts c ON c.row_id = s.context_id + GROUP BY c.label, c.scheme_name + ORDER BY count DESC +) TO '{OUTPUT_PATH}' (FORMAT PARQUET); +""" +con.execute(query) +``` + +## Task 5: Generate Combined Facet Summary + +All facets in one file for easy loading. 
+ +```python +OUTPUT_PATH = "/tmp/facet_summaries_all.parquet" + +query = f""" +COPY ( + -- Source facet + SELECT 'source' as facet_type, n as facet_value, NULL as scheme, COUNT(*) as count + FROM read_parquet('{PARQUET_URL}') + WHERE otype = 'MaterialSampleRecord' + GROUP BY n + + UNION ALL + + -- Material facet + SELECT 'material' as facet_type, c.label as facet_value, c.scheme_name as scheme, COUNT(*) as count + FROM ( + SELECT UNNEST(p__has_material_category) as material_id + FROM read_parquet('{PARQUET_URL}') + WHERE otype = 'MaterialSampleRecord' AND p__has_material_category IS NOT NULL + ) s + JOIN (SELECT row_id, label, scheme_name FROM read_parquet('{PARQUET_URL}') WHERE otype = 'IdentifiedConcept') c + ON c.row_id = s.material_id + GROUP BY c.label, c.scheme_name + + UNION ALL + + -- Context facet + SELECT 'context' as facet_type, c.label as facet_value, c.scheme_name as scheme, COUNT(*) as count + FROM ( + SELECT UNNEST(p__has_context_category) as context_id + FROM read_parquet('{PARQUET_URL}') + WHERE otype = 'MaterialSampleRecord' AND p__has_context_category IS NOT NULL + ) s + JOIN (SELECT row_id, label, scheme_name FROM read_parquet('{PARQUET_URL}') WHERE otype = 'IdentifiedConcept') c + ON c.row_id = s.context_id + GROUP BY c.label, c.scheme_name + + UNION ALL + + -- Object type facet + SELECT 'object_type' as facet_type, c.label as facet_value, c.scheme_name as scheme, COUNT(*) as count + FROM ( + SELECT UNNEST(p__has_sample_object_type) as type_id + FROM read_parquet('{PARQUET_URL}') + WHERE otype = 'MaterialSampleRecord' AND p__has_sample_object_type IS NOT NULL + ) s + JOIN (SELECT row_id, label, scheme_name FROM read_parquet('{PARQUET_URL}') WHERE otype = 'IdentifiedConcept') c + ON c.row_id = s.type_id + GROUP BY c.label, c.scheme_name +) TO '{OUTPUT_PATH}' (FORMAT PARQUET); +""" +con.execute(query) +``` + +## Task 6: Generate Cross-Facet Summary (Source × Material) + +For "how many Rock samples from OPENCONTEXT?" 
+ +```python +OUTPUT_PATH = "/tmp/facet_source_material_cross.parquet" + +query = f""" +COPY ( + SELECT + s.source, + c.label as material, + COUNT(*) as count + FROM ( + SELECT n as source, UNNEST(p__has_material_category) as material_id + FROM read_parquet('{PARQUET_URL}') + WHERE otype = 'MaterialSampleRecord' AND p__has_material_category IS NOT NULL + ) s + JOIN (SELECT row_id, label FROM read_parquet('{PARQUET_URL}') WHERE otype = 'IdentifiedConcept') c + ON c.row_id = s.material_id + GROUP BY s.source, c.label + HAVING COUNT(*) > 100 -- Filter out tiny combinations + ORDER BY count DESC +) TO '{OUTPUT_PATH}' (FORMAT PARQUET); +""" +con.execute(query) +``` + +## Task 7: Benchmark Summary Table Queries + +Compare querying summary tables vs full data. + +```python +# Load summary and query +SUMMARY_PATH = "/tmp/facet_summaries_all.parquet" + +# This should be nearly instant +FAST_SOURCE_FACET = f""" +SELECT facet_value, count +FROM read_parquet('{SUMMARY_PATH}') +WHERE facet_type = 'source' +ORDER BY count DESC +""" + +FAST_MATERIAL_FACET = f""" +SELECT facet_value, count +FROM read_parquet('{SUMMARY_PATH}') +WHERE facet_type = 'material' +ORDER BY count DESC +""" +``` + +## Expected Output + +Generate a JSON results file: + +```json +{ + "baseline": { + "source_facet_ms": 2345, + "material_facet_ms": 5678, + "context_facet_ms": 4567, + "otype_counts_ms": 1234 + }, + "with_summary": { + "source_facet_ms": 5, + "material_facet_ms": 8, + "context_facet_ms": 7 + }, + "speedup": { + "source": 469, + "material": 710 + }, + "summary_files": { + "facet_summaries_all.parquet": { + "size_bytes": 12345, + "row_count": 234 + }, + "facet_source_material_cross.parquet": { + "size_bytes": 45678, + "row_count": 1234 + } + }, + "facet_counts": { + "source": { + "SESAR": 3100000, + "OPENCONTEXT": 1200000, + "GEOME": 1500000, + "SMITHSONIAN": 900000 + }, + "material_top10": ["Rock", "ite", "..."], + "context_top10": ["Earth interior", "..."] + } +} +``` + +## Output Files + +Save 
to `experiments/facet_optimization/results/`: +- `benchmark_results.json` - The JSON above +- `facet_summaries_all.parquet` - Combined facet counts +- `facet_source_material_cross.parquet` - Cross-tab counts +- `benchmark_log.txt` - Full execution log diff --git a/experiments/h3_optimization/TASK_SPEC.md b/experiments/h3_optimization/TASK_SPEC.md new file mode 100644 index 0000000..39d8e4e --- /dev/null +++ b/experiments/h3_optimization/TASK_SPEC.md @@ -0,0 +1,216 @@ +# H3 Geospatial Optimization Task + +**Issue:** https://github.com/isamplesorg/pqg/issues/17 +**Goal:** Add H3 index columns to iSamples parquet and benchmark speedup + +## Data Source + +```python +PARQUET_URL = "https://pub-a18234d962364c22a50c787b7ca09fa5.r2.dev/isamples_202601_wide.parquet" +``` + +- ~280 MB, ~20M rows +- Contains samples from OPENCONTEXT, SESAR, GEOME, SMITHSONIAN + +## Schema (Relevant Columns) + +| Column | Type | Description | +|--------|------|-------------| +| `row_id` | INTEGER | Unique identifier | +| `otype` | VARCHAR | Entity type - filter to `'MaterialSampleRecord'` for samples | +| `latitude` | DOUBLE | WGS84 latitude (nullable) | +| `longitude` | DOUBLE | WGS84 longitude (nullable) | +| `n` | VARCHAR | Source: OPENCONTEXT, SESAR, GEOME, SMITHSONIAN | +| `label` | VARCHAR | Human-readable name | + +## Environment Setup + +```python +import duckdb + +# Install and load H3 extension +con = duckdb.connect() +con.execute("INSTALL h3; LOAD h3;") +``` + +## Task 1: Baseline Benchmark + +Measure current geospatial query performance. 
+ +```python +PARQUET_URL = "https://pub-a18234d962364c22a50c787b7ca09fa5.r2.dev/isamples_202601_wide.parquet" + +# Query 1: Bounding box - Western US +BBOX_QUERY = f""" +SELECT COUNT(*) as cnt +FROM read_parquet('{PARQUET_URL}') +WHERE otype = 'MaterialSampleRecord' + AND latitude BETWEEN 32 AND 42 + AND longitude BETWEEN -125 AND -110 +""" + +# Query 2: Bounding box with facet +BBOX_FACET_QUERY = f""" +SELECT n as source, COUNT(*) as cnt +FROM read_parquet('{PARQUET_URL}') +WHERE otype = 'MaterialSampleRecord' + AND latitude BETWEEN 32 AND 42 + AND longitude BETWEEN -125 AND -110 +GROUP BY n +""" + +# Query 3: Point radius (approximate - 1 degree ≈ 111km) +# San Francisco area, ~50km radius +RADIUS_QUERY = f""" +SELECT COUNT(*) as cnt +FROM read_parquet('{PARQUET_URL}') +WHERE otype = 'MaterialSampleRecord' + AND latitude BETWEEN 37.3 AND 38.1 + AND longitude BETWEEN -122.8 AND -122.0 +""" +``` + +**Measure:** Execute each query 3 times, report median time in milliseconds. + +## Task 2: Generate H3-Enhanced Parquet + +Add H3 columns at resolutions 4, 6, and 8. 
+ +```python +import duckdb +import time + +con = duckdb.connect() +con.execute("INSTALL h3; LOAD h3;") + +PARQUET_URL = "https://pub-a18234d962364c22a50c787b7ca09fa5.r2.dev/isamples_202601_wide.parquet" +OUTPUT_PATH = "/tmp/isamples_wide_h3.parquet" + +# Generate with H3 columns +query = f""" +COPY ( + SELECT *, + CASE WHEN latitude IS NOT NULL AND longitude IS NOT NULL + THEN h3_latlng_to_cell(latitude, longitude, 4) END as h3_res4, + CASE WHEN latitude IS NOT NULL AND longitude IS NOT NULL + THEN h3_latlng_to_cell(latitude, longitude, 6) END as h3_res6, + CASE WHEN latitude IS NOT NULL AND longitude IS NOT NULL + THEN h3_latlng_to_cell(latitude, longitude, 8) END as h3_res8 + FROM read_parquet('{PARQUET_URL}') +) TO '{OUTPUT_PATH}' (FORMAT PARQUET, COMPRESSION ZSTD); +""" + +start = time.time() +con.execute(query) +elapsed = time.time() - start +print(f"Generated in {elapsed:.1f}s") +``` + +**Report:** +- Original file size (MB) +- New file size with H3 (MB) +- Size increase percentage +- Row count with valid H3 (non-null lat/lon) + +## Task 3: H3 Benchmark + +Re-run equivalent queries using H3 filters. 
+ +```python +OUTPUT_PATH = "/tmp/isamples_wide_h3.parquet" + +# Get H3 cells covering the Western US bbox at res 4 +# (In practice, use h3 library to get these) +# For now, query to find the cells: +FIND_CELLS = f""" +SELECT DISTINCT h3_res4 +FROM read_parquet('{OUTPUT_PATH}') +WHERE latitude BETWEEN 32 AND 42 + AND longitude BETWEEN -125 AND -110 + AND h3_res4 IS NOT NULL +""" + +# Then filter by H3 cell instead of lat/lon +# This should be faster because H3 is an integer column with good stats +H3_BBOX_QUERY = f""" +SELECT COUNT(*) as cnt +FROM read_parquet('{OUTPUT_PATH}') +WHERE otype = 'MaterialSampleRecord' + AND h3_res4 IN (SELECT DISTINCT h3_res4 + FROM read_parquet('{OUTPUT_PATH}') + WHERE latitude BETWEEN 32 AND 42 + AND longitude BETWEEN -125 AND -110) +""" + +# For aggregation by location (clustering for map display) +H3_CLUSTER_QUERY = f""" +SELECT h3_res6, COUNT(*) as cnt, + AVG(latitude) as center_lat, + AVG(longitude) as center_lon +FROM read_parquet('{OUTPUT_PATH}') +WHERE otype = 'MaterialSampleRecord' + AND h3_res4 IN (...cells from above...) +GROUP BY h3_res6 +""" +``` + +## Task 4: Resolution Analysis + +Determine optimal H3 resolutions. + +```python +# Count distinct cells at each resolution +RESOLUTION_STATS = f""" +SELECT + COUNT(DISTINCT h3_res4) as unique_res4, + COUNT(DISTINCT h3_res6) as unique_res6, + COUNT(DISTINCT h3_res8) as unique_res8, + COUNT(*) as total_rows, + COUNT(h3_res4) as rows_with_h3 +FROM read_parquet('{OUTPUT_PATH}') +WHERE otype = 'MaterialSampleRecord' +""" +``` + +**Report:** Unique cells per resolution, average points per cell. 
+ +## Expected Output + +Generate a JSON results file: + +```json +{ + "baseline": { + "bbox_query_ms": 1234, + "bbox_facet_ms": 1456, + "radius_query_ms": 1123 + }, + "with_h3": { + "bbox_query_ms": 234, + "bbox_facet_ms": 345, + "cluster_query_ms": 456 + }, + "speedup": { + "bbox": 5.3, + "facet": 4.2 + }, + "file_size": { + "original_mb": 282, + "with_h3_mb": 310, + "increase_pct": 9.9 + }, + "h3_stats": { + "rows_with_coords": 5400000, + "unique_res4_cells": 1234, + "unique_res6_cells": 45678, + "unique_res8_cells": 234567 + } +} +``` + +## Output Files + +Save to `experiments/h3_optimization/results/`: +- `benchmark_results.json` - The JSON above +- `isamples_wide_h3.parquet` - Enhanced parquet (or note if too large to include) +- `benchmark_log.txt` - Full execution log From 67ba4c76b60472f14e1a1b690dbc6a74f86d1690 Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 30 Jan 2026 18:25:35 +0000 Subject: [PATCH 3/5] Add facet optimization benchmark results MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Results from running all 7 facet optimization tasks: - Baseline benchmarks: source (34ms), material (490ms), otype (35ms) - Generated summary parquet files for source, material, context facets - Combined facet summary with 60 rows - Cross-facet summary (source × material) with 24 combinations - Speedup achieved: 8.7x for source, 140.1x for material facets https://claude.ai/code/session_016aGrEntdNnvpPjUqkpAtdC --- .../results/facet_results.json | 142 ++++++++++++++++++ 1 file changed, 142 insertions(+) create mode 100644 experiments/facet_optimization/results/facet_results.json diff --git a/experiments/facet_optimization/results/facet_results.json b/experiments/facet_optimization/results/facet_results.json new file mode 100644 index 0000000..4160856 --- /dev/null +++ b/experiments/facet_optimization/results/facet_results.json @@ -0,0 +1,142 @@ +{ + "baseline": { + "source_facet_ms": 34.14, + "material_facet_ms": 490.4, + 
"otype_counts_ms": 34.67 + }, + "with_summary": { + "source_facet_ms": 3.92, + "material_facet_ms": 3.5, + "context_facet_ms": 3.21 + }, + "speedup": { + "source": 8.7, + "material": 140.1 + }, + "summary_files": { + "facet_summaries_all.parquet": { + "size_bytes": 2118, + "row_count": 60 + }, + "facet_source_material_cross.parquet": { + "size_bytes": 1266, + "row_count": 24 + } + }, + "facet_counts": { + "source": { + "SESAR": 4688386, + "OPENCONTEXT": 1064831, + "GEOME": 605554, + "SMITHSONIAN": 322161 + }, + "material_top10": [ + "https://w3id.org/isample/vocabulary/material/1.0/earthmaterial", + "https://w3id.org/isample/vocabulary/material/1.0/organicmaterial", + "https://w3id.org/isample/vocabulary/material/1.0/rock", + "https://w3id.org/isample/vocabulary/material/1.0/biogenicnonorganicmaterial", + "https://w3id.org/isample/vocabulary/material/1.0/mixedsoilsedimentrock", + "https://w3id.org/isample/vocabulary/material/1.0/material", + "https://w3id.org/isample/vocabulary/material/1.0/mineral", + "https://w3id.org/isample/vocabulary/material/1.0/anthropogenicmetal", + "https://w3id.org/isample/opencontext/material/0.1/ceramicclay", + "https://w3id.org/isample/vocabulary/material/1.0/sediment" + ], + "otype_counts": { + "MaterialSampleRecord": 6680932, + "SamplingEvent": 6354171, + "GeospatialCoordLocation": 5980282, + "MaterialSampleCuration": 720254, + "SampleRelation": 501579, + "SamplingSite": 386160, + "IdentifiedConcept": 55893, + "Agent": 50087 + }, + "context_top10": [ + "https://w3id.org/isample/vocabulary/sampledfeature/1.0/anysampledfeature", + "https://w3id.org/isample/vocabulary/sampledfeature/1.0/pasthumanoccupationsite", + "https://w3id.org/isample/vocabulary/sampledfeature/1.0/earthinterior", + "https://w3id.org/isample/biology/biosampledfeature/1.0/Animalia", + "https://w3id.org/isample/vocabulary/sampledfeature/1.0/subaerialsurfaceenvironment", + "https://w3id.org/isample/vocabulary/sampledfeature/1.0/waterbody", + 
"https://w3id.org/isample/vocabulary/sampledfeature/1.0/marinewaterbodybottom", + "https://w3id.org/isample/vocabulary/sampledfeature/1.0/lakeriverstreambottom", + "https://w3id.org/isample/vocabulary/sampledfeature/1.0/marinewaterbody", + "https://w3id.org/isample/vocabulary/sampledfeature/1.0/terrestrialwaterbody" + ] + }, + "task_details": { + "task1": { + "status": "completed", + "queries_run": 3, + "runs_per_query": 3 + }, + "task2": { + "status": "completed", + "output_file": "/tmp/facet_source_counts.parquet", + "row_count": 4, + "size_bytes": 542, + "generation_time_ms": 32.17 + }, + "task3": { + "status": "completed", + "output_file": "/tmp/facet_material_counts.parquet", + "row_count": 19, + "size_bytes": 1284, + "generation_time_ms": 583.98 + }, + "task4": { + "status": "completed", + "output_file": "/tmp/facet_context_counts.parquet", + "row_count": 17, + "size_bytes": 1171, + "generation_time_ms": 569.76 + }, + "task5": { + "status": "completed", + "output_file": "/tmp/facet_summaries_all.parquet", + "row_count": 60, + "size_bytes": 2118, + "generation_time_ms": 984.68 + }, + "task6": { + "status": "completed", + "output_file": "/tmp/facet_source_material_cross.parquet", + "row_count": 24, + "size_bytes": 1266, + "generation_time_ms": 142.55, + "top_combinations": [ + { + "source": "SESAR", + "material": "https://w3id.org/isample/vocabulary/material/1.0/earthmaterial", + "count": 2233939 + }, + { + "source": "SESAR", + "material": "https://w3id.org/isample/vocabulary/material/1.0/rock", + "count": 912855 + }, + { + "source": "SESAR", + "material": "https://w3id.org/isample/vocabulary/material/1.0/mixedsoilsedimentrock", + "count": 838805 + }, + { + "source": "OPENCONTEXT", + "material": "https://w3id.org/isample/vocabulary/material/1.0/biogenicnonorganicmaterial", + "count": 745539 + }, + { + "source": "GEOME", + "material": "https://w3id.org/isample/vocabulary/material/1.0/organicmaterial", + "count": 605554 + } + ] + }, + "task7": { + "status": 
"completed", + "speedup_source": 8.7, + "speedup_material": 140.1 + } + } +} \ No newline at end of file From 745103f7466c19a9b61217ba6059024c3fa39482 Mon Sep 17 00:00:00 2001 From: Raymond Yee Date: Fri, 30 Jan 2026 11:09:24 -0800 Subject: [PATCH 4/5] Add H3 optimization benchmark results from fly.io sprite MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 4.85x speedup for bbox queries (170ms → 35ms) - 4.87x speedup for faceted geo queries - Only 3.7% file size increase (282MB → 292MB) - 5.98M samples with coords out of 6.68M total Co-Authored-By: Claude --- .../results/benchmark_results.json | 29 +++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 experiments/h3_optimization/results/benchmark_results.json diff --git a/experiments/h3_optimization/results/benchmark_results.json b/experiments/h3_optimization/results/benchmark_results.json new file mode 100644 index 0000000..e7c79c2 --- /dev/null +++ b/experiments/h3_optimization/results/benchmark_results.json @@ -0,0 +1,29 @@ +{ + "baseline": { + "bbox_query_ms": 170.0129508972168, + "bbox_facet_ms": 186.75613403320312, + "radius_query_ms": 179.39233779907227 + }, + "with_h3": { + "bbox_query_ms": 35.030364990234375, + "bbox_facet_ms": 38.38610649108887, + "cluster_query_ms": 51.71322822570801 + }, + "speedup": { + "bbox": 4.85, + "facet": 4.87 + }, + "file_size": { + "original_mb": 282, + "with_h3_mb": 292.4, + "increase_pct": 3.7, + "generation_time_s": 41.6 + }, + "h3_stats": { + "unique_res4_cells": 38406, + "unique_res6_cells": 111681, + "unique_res8_cells": 175653, + "total_sample_rows": 6680932, + "rows_with_coords": 5980282 + } +} \ No newline at end of file From aefd465b0f0554da3c19ddcea5bd0fca1ef04244 Mon Sep 17 00:00:00 2001 From: Raymond Yee Date: Fri, 30 Jan 2026 11:35:35 -0800 Subject: [PATCH 5/5] Add CLI commands for H3 indexing and facet summaries MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 
add-h3: Add H3 index columns at specified resolutions - Supports local files and remote URLs - Configurable lat/lon columns and resolutions - Uses H3 community extension - facet-summaries: Generate pre-computed facet summary tables - Combined summaries for source, material, context, object_type - Source × material cross-tabulation - Configurable otype filter and minimum cross-count Implements CLI support for optimizations benchmarked in #17 and #18. Co-Authored-By: Claude Opus 4.5 --- pqg/__main__.py | 251 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 251 insertions(+) diff --git a/pqg/__main__.py b/pqg/__main__.py index 72a8687..8c23098 100644 --- a/pqg/__main__.py +++ b/pqg/__main__.py @@ -4,12 +4,15 @@ import json import logging +import os +import time import typing import click import duckdb import rich import rich.tree +import rich.console import pqg import pqg.common @@ -186,5 +189,253 @@ def get_geo(ctx, store): print("]}") +@cli.command("add-h3") +@click.pass_context +@click.argument("input_parquet") +@click.option("-o", "--output", required=True, help="Output parquet file path") +@click.option( + "-r", + "--resolutions", + default="4,6,8", + help="Comma-separated H3 resolutions to add (default: 4,6,8)", +) +@click.option( + "--lat-col", default="latitude", help="Latitude column name (default: latitude)" +) +@click.option( + "--lon-col", default="longitude", help="Longitude column name (default: longitude)" +) +def add_h3( + ctx, + input_parquet: str, + output: str, + resolutions: str, + lat_col: str, + lon_col: str, +): + """Add H3 index columns to a parquet file. + + Creates a new parquet file with h3_resN columns for each specified resolution. + Only rows with valid lat/lon will have H3 values; others will be NULL. 
+ + Example: + pqg add-h3 input.parquet -o output_h3.parquet + pqg add-h3 input.parquet -o output.parquet -r 4,6 + """ + console = rich.console.Console() + logger = get_logger() + + # Parse resolutions + res_list = [int(r.strip()) for r in resolutions.split(",")] + logger.info(f"Adding H3 columns at resolutions: {res_list}") + + con = ctx.obj["dbinstance"] + + # Install and load H3 extension (community extension) + console.print("[blue]Installing H3 extension from community...[/blue]") + con.execute("INSTALL h3 FROM community; LOAD h3;") + + # Build H3 column expressions + h3_cols = [] + for res in res_list: + h3_cols.append( + f"CASE WHEN {lat_col} IS NOT NULL AND {lon_col} IS NOT NULL " + f"THEN h3_latlng_to_cell({lat_col}, {lon_col}, {res}) END as h3_res{res}" + ) + h3_select = ", ".join(h3_cols) + + # Determine source (local file or URL) + if input_parquet.startswith("http://") or input_parquet.startswith("https://"): + source = f"read_parquet('{input_parquet}')" + else: + source = f"read_parquet('{os.path.abspath(input_parquet)}')" + + query = f""" + COPY ( + SELECT *, {h3_select} + FROM {source} + ) TO '{os.path.abspath(output)}' (FORMAT PARQUET, COMPRESSION ZSTD); + """ + + console.print(f"[blue]Processing {input_parquet}...[/blue]") + start = time.time() + con.execute(query) + elapsed = time.time() - start + + # Get stats + stats = con.sql( + f"SELECT COUNT(*) as total, COUNT(h3_res{res_list[0]}) as with_h3 " + f"FROM read_parquet('{os.path.abspath(output)}')" + ).fetchone() + + output_size = os.path.getsize(output) / (1024 * 1024) + + console.print(f"[green]✓ Generated {output}[/green]") + console.print(f" Size: {output_size:.1f} MB") + console.print(f" Total rows: {stats[0]:,}") + console.print(f" Rows with H3: {stats[1]:,} ({100*stats[1]/stats[0]:.1f}%)") + console.print(f" Time: {elapsed:.1f}s") + + +@cli.command("facet-summaries") +@click.pass_context +@click.argument("input_parquet") +@click.option( + "-o", + "--output-dir", + required=True, + 
help="Output directory for summary files", +) +@click.option( + "--otype-filter", + default="MaterialSampleRecord", + help="Filter to this otype (default: MaterialSampleRecord)", +) +@click.option( + "--min-cross-count", + default=100, + type=int, + help="Minimum count for cross-facet combinations (default: 100)", +) +def facet_summaries( + ctx, + input_parquet: str, + output_dir: str, + otype_filter: str, + min_cross_count: int, +): + """Generate pre-computed facet summary tables from a wide parquet file. + + Creates two output files: + - facet_summaries_all.parquet: Combined counts for source, material, context, object_type + - facet_source_material_cross.parquet: Source × material cross-tabulation + + Example: + pqg facet-summaries wide.parquet -o summaries/ + """ + console = rich.console.Console() + logger = get_logger() + + con = ctx.obj["dbinstance"] + + # Ensure output directory exists + os.makedirs(output_dir, exist_ok=True) + + # Determine source + if input_parquet.startswith("http://") or input_parquet.startswith("https://"): + source = f"read_parquet('{input_parquet}')" + else: + source = f"read_parquet('{os.path.abspath(input_parquet)}')" + + otype_clause = f"otype = '{otype_filter}'" if otype_filter else "1=1" + + # Generate combined facet summaries + console.print("[blue]Generating combined facet summaries...[/blue]") + start = time.time() + + combined_path = os.path.join(output_dir, "facet_summaries_all.parquet") + combined_query = f""" + COPY ( + -- Source facet + SELECT 'source' as facet_type, n as facet_value, NULL as scheme, COUNT(*) as count + FROM {source} + WHERE {otype_clause} + GROUP BY n + + UNION ALL + + -- Material facet + SELECT 'material' as facet_type, c.label as facet_value, c.scheme_name as scheme, COUNT(*) as count + FROM ( + SELECT UNNEST(p__has_material_category) as material_id + FROM {source} + WHERE {otype_clause} AND p__has_material_category IS NOT NULL + ) s + JOIN (SELECT row_id, label, scheme_name FROM {source} WHERE otype = 
'IdentifiedConcept') c + ON c.row_id = s.material_id + GROUP BY c.label, c.scheme_name + + UNION ALL + + -- Context facet + SELECT 'context' as facet_type, c.label as facet_value, c.scheme_name as scheme, COUNT(*) as count + FROM ( + SELECT UNNEST(p__has_context_category) as context_id + FROM {source} + WHERE {otype_clause} AND p__has_context_category IS NOT NULL + ) s + JOIN (SELECT row_id, label, scheme_name FROM {source} WHERE otype = 'IdentifiedConcept') c + ON c.row_id = s.context_id + GROUP BY c.label, c.scheme_name + + UNION ALL + + -- Object type facet + SELECT 'object_type' as facet_type, c.label as facet_value, c.scheme_name as scheme, COUNT(*) as count + FROM ( + SELECT UNNEST(p__has_sample_object_type) as type_id + FROM {source} + WHERE {otype_clause} AND p__has_sample_object_type IS NOT NULL + ) s + JOIN (SELECT row_id, label, scheme_name FROM {source} WHERE otype = 'IdentifiedConcept') c + ON c.row_id = s.type_id + GROUP BY c.label, c.scheme_name + ) TO '{combined_path}' (FORMAT PARQUET); + """ + con.execute(combined_query) + + combined_stats = con.sql( + f"SELECT COUNT(*) FROM read_parquet('{combined_path}')" + ).fetchone() + combined_size = os.path.getsize(combined_path) + + console.print(f"[green]✓ {combined_path}[/green]") + console.print(f" Rows: {combined_stats[0]}, Size: {combined_size:,} bytes") + + # Generate cross-facet summary + console.print("[blue]Generating source × material cross-tabulation...[/blue]") + + cross_path = os.path.join(output_dir, "facet_source_material_cross.parquet") + cross_query = f""" + COPY ( + SELECT + s.source, + c.label as material, + COUNT(*) as count + FROM ( + SELECT n as source, UNNEST(p__has_material_category) as material_id + FROM {source} + WHERE {otype_clause} AND p__has_material_category IS NOT NULL + ) s + JOIN (SELECT row_id, label FROM {source} WHERE otype = 'IdentifiedConcept') c + ON c.row_id = s.material_id + GROUP BY s.source, c.label + HAVING COUNT(*) > {min_cross_count} + ORDER BY count DESC + ) 
TO '{cross_path}' (FORMAT PARQUET); + """ + con.execute(cross_query) + + cross_stats = con.sql( + f"SELECT COUNT(*) FROM read_parquet('{cross_path}')" + ).fetchone() + cross_size = os.path.getsize(cross_path) + + elapsed = time.time() - start + + console.print(f"[green]✓ {cross_path}[/green]") + console.print(f" Rows: {cross_stats[0]}, Size: {cross_size:,} bytes") + console.print(f"[green]Total time: {elapsed:.1f}s[/green]") + + # Print summary + console.print("\n[bold]Summary:[/bold]") + facet_counts = con.sql( + f"SELECT facet_type, COUNT(*) as n, SUM(count) as total " + f"FROM read_parquet('{combined_path}') GROUP BY facet_type" + ).fetchall() + for row in facet_counts: + console.print(f" {row[0]}: {row[1]} values, {row[2]:,} total records") + + if __name__ == "__main__": cli()