From 1daad6f519e75ef70319a136c8295b9806b04179 Mon Sep 17 00:00:00 2001 From: Raymond Yee Date: Wed, 14 Jan 2026 12:35:06 -0800 Subject: [PATCH 1/5] Add minimal example datasets for PQG education Hand-crafted small examples to help understand the iSamples PQG format: - JSON: 1-sample and 3-sample examples (validated against schema) - CSV: Flattened entity files (samples, events, locations, sites, agents, edges) - Parquet: Same data in all 3 formats: - Export (3 rows, nested structs) - Narrow (21 rows, explicit edge rows) - Wide (10 rows, p__* columns) Includes README with: - Entity relationship diagram - Example queries for each format - Format comparison table Idea from meeting with Stephen Richard - small examples make format differences much easier to understand. Co-Authored-By: Claude Opus 4.5 --- examples/minimal/README.md | 195 ++++++++++++++++++ examples/minimal/csv/agents.csv | 4 + examples/minimal/csv/edges.csv | 14 ++ examples/minimal/csv/events.csv | 4 + examples/minimal/csv/locations.csv | 3 + examples/minimal/csv/samples.csv | 4 + examples/minimal/csv/sites.csv | 2 + examples/minimal/json/1_sample.json | 86 ++++++++ examples/minimal/json/3_samples.json | 146 +++++++++++++ .../minimal/parquet/minimal_export.parquet | Bin 0 -> 1673 bytes .../minimal/parquet/minimal_narrow.parquet | Bin 0 -> 4831 bytes examples/minimal/parquet/minimal_wide.parquet | Bin 0 -> 4951 bytes 12 files changed, 458 insertions(+) create mode 100644 examples/minimal/README.md create mode 100644 examples/minimal/csv/agents.csv create mode 100644 examples/minimal/csv/edges.csv create mode 100644 examples/minimal/csv/events.csv create mode 100644 examples/minimal/csv/locations.csv create mode 100644 examples/minimal/csv/samples.csv create mode 100644 examples/minimal/csv/sites.csv create mode 100644 examples/minimal/json/1_sample.json create mode 100644 examples/minimal/json/3_samples.json create mode 100644 examples/minimal/parquet/minimal_export.parquet create mode 100644 
examples/minimal/parquet/minimal_narrow.parquet create mode 100644 examples/minimal/parquet/minimal_wide.parquet diff --git a/examples/minimal/README.md b/examples/minimal/README.md new file mode 100644 index 0000000..7a5943b --- /dev/null +++ b/examples/minimal/README.md @@ -0,0 +1,195 @@ +# Minimal PQG Example Data + +This directory contains small, hand-crafted example datasets to help understand the iSamples PQG format. The same data is represented in JSON, CSV, and all three parquet formats (export, narrow, wide). + +## Dataset Overview + +**Domain**: Geological rock samples from Mount Rainier volcanic monitoring project + +**Entities**: +- 3 MaterialSampleRecords (samples) +- 3 SamplingEvents (collection/preparation events) +- 2 GeospatialCoordLocations (coordinates) +- 1 SamplingSite (Mount Rainier Summit Area) +- 1 Agent (Jane Smith, collector) + +**Relationships demonstrated**: +- Sample → produced_by → SamplingEvent (how samples are created) +- Sample → derivedFrom → Sample (parent/child relationship) +- SamplingEvent → sample_location → GeospatialCoordLocation +- SamplingEvent → sampling_site → SamplingSite +- SamplingSite → site_location → GeospatialCoordLocation + +## File Structure + +``` +minimal/ +├── json/ +│ ├── 1_sample.json # Single sample (simplest case) +│ └── 3_samples.json # Three related samples +├── csv/ +│ ├── samples.csv # MaterialSampleRecords +│ ├── events.csv # SamplingEvents +│ ├── locations.csv # GeospatialCoordLocations +│ ├── sites.csv # SamplingSites +│ ├── agents.csv # Agents +│ └── edges.csv # Relationships (for narrow format) +└── parquet/ + ├── minimal_export.parquet # Export format (3 rows, nested) + ├── minimal_narrow.parquet # Narrow format (21 rows, with edges) + └── minimal_wide.parquet # Wide format (10 rows, p__* columns) +``` + +## The Three Parquet Formats + +### Export Format (`minimal_export.parquet`) +- **3 rows** - one per sample +- Sample-centric with nested structs for related entities +- Best for: Simple 
queries on sample properties +- Coordinates pre-extracted to `sample_location_latitude/longitude` + +### Narrow Format (`minimal_narrow.parquet`) +- **21 rows** - 10 entities + 11 edge rows +- Graph-normalized with explicit `_edge_` rows +- Columns `s` (subject), `p` (predicate), `o` (object array) +- Best for: Graph traversal, flexible relationship queries + +### Wide Format (`minimal_wide.parquet`) +- **10 rows** - one per entity (no edge rows) +- Relationships stored as `p__*` columns with row_id arrays +- Best for: Fast entity queries, smaller file size, analytical queries + +## Example Queries + +### Query 1: Find all samples (works in all formats) + +**Export format:** +```sql +SELECT sample_identifier, label +FROM read_parquet('parquet/minimal_export.parquet') +``` + +**Wide format:** +```sql +SELECT pid, label +FROM read_parquet('parquet/minimal_wide.parquet') +WHERE otype = 'MaterialSampleRecord' +``` + +**Narrow format:** +```sql +SELECT pid, label +FROM read_parquet('parquet/minimal_narrow.parquet') +WHERE otype = 'MaterialSampleRecord' +``` + +### Query 2: Find samples with their locations + +**Wide format (uses p__* columns):** +```sql +SELECT + s.pid as sample, + s.label, + loc.latitude, + loc.longitude +FROM read_parquet('parquet/minimal_wide.parquet') s +JOIN read_parquet('parquet/minimal_wide.parquet') e + ON e.otype = 'SamplingEvent' + AND list_contains(s.p__produced_by, e.row_id) +JOIN read_parquet('parquet/minimal_wide.parquet') loc + ON loc.otype = 'GeospatialCoordLocation' + AND list_contains(e.p__sample_location, loc.row_id) +WHERE s.otype = 'MaterialSampleRecord' +``` + +**Narrow format (uses edge rows):** +```sql +SELECT + s.pid as sample, + s.label, + loc.latitude, + loc.longitude +FROM read_parquet('parquet/minimal_narrow.parquet') s +JOIN read_parquet('parquet/minimal_narrow.parquet') e1 + ON e1.otype = '_edge_' + AND e1.s = s.row_id + AND e1.p = 'produced_by' +JOIN read_parquet('parquet/minimal_narrow.parquet') ev + ON ev.otype = 
'SamplingEvent' + AND list_contains(e1.o, ev.row_id) +JOIN read_parquet('parquet/minimal_narrow.parquet') e2 + ON e2.otype = '_edge_' + AND e2.s = ev.row_id + AND e2.p = 'sample_location' +JOIN read_parquet('parquet/minimal_narrow.parquet') loc + ON loc.otype = 'GeospatialCoordLocation' + AND list_contains(e2.o, loc.row_id) +WHERE s.otype = 'MaterialSampleRecord' +``` + +### Query 3: Count entities by type + +```sql +SELECT otype, COUNT(*) as count +FROM read_parquet('parquet/minimal_wide.parquet') +GROUP BY otype +ORDER BY count DESC +``` + +Expected output: +``` +MaterialSampleRecord 3 +SamplingEvent 3 +GeospatialCoordLocation 2 +SamplingSite 1 +Agent 1 +``` + +## JSON Schema Validation + +The JSON files validate against the iSamples Core 1.0 schema: + +```python +import json +from jsonschema import validate + +# Load schema (from isamplesorg-metadata repo) +with open('path/to/iSamplesSchemaCore1.0.json') as f: + schema = json.load(f) + +# Load and validate +with open('json/1_sample.json') as f: + sample = json.load(f) + +validate(instance=sample, schema=schema) # Raises if invalid +``` + +## Entity Relationship Diagram + +``` +MaterialSampleRecord ──produced_by──► SamplingEvent ──sample_location──► GeospatialCoordLocation + │ │ + │ └──sampling_site──► SamplingSite ──site_location──► GeospatialCoordLocation + │ + ├──registrant──► Agent + │ + └──derivedFrom──► MaterialSampleRecord (parent sample) +``` + +## Size Comparison + +| Format | Rows | File Size | Notes | +|--------|------|-----------|-------| +| Export | 3 | 1.7 KB | Nested structs, sample-centric | +| Narrow | 21 | 4.8 KB | Explicit edge rows | +| Wide | 10 | 5.0 KB | p__* columns | + +In production datasets: +- Wide is typically 60-70% smaller than narrow +- Export is smallest but less flexible for complex queries + +## See Also + +- [PQG Specification](../../docs/PQG_SPECIFICATION.md) - Full format specification +- [Edge Types](../../pqg/edge_types.py) - All 14 iSamples edge types +- [Schema 
Definitions](../../pqg/schemas/) - Python schema validators diff --git a/examples/minimal/csv/agents.csv b/examples/minimal/csv/agents.csv new file mode 100644 index 0000000..0e3c700 --- /dev/null +++ b/examples/minimal/csv/agents.csv @@ -0,0 +1,4 @@ +agent_id,name,role,affiliation,contact_information +agent:jsmith,Jane Smith,collector,University of Washington,jsmith@uw.edu +agent:labtech,Lab Technician,preparer,University of Washington, +agent:curator,Collections Manager,curator,Burke Museum, diff --git a/examples/minimal/csv/edges.csv b/examples/minimal/csv/edges.csv new file mode 100644 index 0000000..1090eab --- /dev/null +++ b/examples/minimal/csv/edges.csv @@ -0,0 +1,14 @@ +subject_id,predicate,object_id,description +ark:/99999/example001,produced_by,event:example001,Sample was produced by this sampling event +ark:/99999/example002,produced_by,event:example002,Sample was produced by this sampling event +ark:/99999/example003,produced_by,event:example003,Sample was produced by this sampling event +ark:/99999/example002,derivedFrom,ark:/99999/example001,Thin section derived from parent rock sample +ark:/99999/example003,relatedTo,ark:/99999/example001,Sibling sample from same site +event:example001,sample_location,loc:rainier001,Event occurred at this location +event:example003,sample_location,loc:rainier002,Event occurred at this location +event:example001,sampling_site,site:rainier001,Event occurred at this site +event:example003,sampling_site,site:rainier001,Event occurred at this site +site:rainier001,site_location,loc:rainier001,Site is at this location +ark:/99999/example001,registrant,agent:jsmith,Sample registered by this agent +ark:/99999/example002,registrant,agent:jsmith,Sample registered by this agent +ark:/99999/example003,registrant,agent:jsmith,Sample registered by this agent diff --git a/examples/minimal/csv/events.csv b/examples/minimal/csv/events.csv new file mode 100644 index 0000000..f9d8e04 --- /dev/null +++ 
b/examples/minimal/csv/events.csv @@ -0,0 +1,4 @@ +event_id,label,description,result_time,project,feature_of_interest,site_id,location_id,collector_id +event:example001,Mount Rainier Field Collection 2024-06-10,Field collection during summer geology survey,2024-06-10,Cascade Volcanic Monitoring Project,Recent lava flow on Mount Rainier,site:rainier001,loc:rainier001,agent:jsmith +event:example002,Lab Preparation 2024-07-01,Thin section preparation in petrology lab,2024-07-01,,,,,agent:labtech +event:example003,Mount Rainier Field Collection 2024-06-10 (Site B),Field collection 10m from first sample,2024-06-10,Cascade Volcanic Monitoring Project,Recent lava flow on Mount Rainier,site:rainier001,loc:rainier002,agent:jsmith diff --git a/examples/minimal/csv/locations.csv b/examples/minimal/csv/locations.csv new file mode 100644 index 0000000..d63ebb2 --- /dev/null +++ b/examples/minimal/csv/locations.csv @@ -0,0 +1,3 @@ +location_id,latitude,longitude,elevation,obfuscated +loc:rainier001,46.8523,-121.7603,4392 m above mean sea level,false +loc:rainier002,46.8524,-121.7601,4390 m above mean sea level,false diff --git a/examples/minimal/csv/samples.csv b/examples/minimal/csv/samples.csv new file mode 100644 index 0000000..b73ebd3 --- /dev/null +++ b/examples/minimal/csv/samples.csv @@ -0,0 +1,4 @@ +sample_id,label,description,last_modified_time,event_id,material_category,sample_object_type,registrant_id +ark:/99999/example001,Rock Sample MR-001 (Parent),"Basalt collected during 2024 field survey. 
Fresh, unweathered sample from recent lava flow.",2024-06-15T10:30:00Z,event:example001,rock,physicalspecimen,agent:jsmith +ark:/99999/example002,Rock Sample MR-001-A (Child - Thin Section),Thin section prepared from parent sample MR-001 for petrographic analysis.,2024-07-01T14:00:00Z,event:example002,rock,thinsection,agent:jsmith +ark:/99999/example003,Rock Sample MR-002,"Second basalt sample from same site, collected 10m away from MR-001.",2024-06-15T11:00:00Z,event:example003,rock,physicalspecimen,agent:jsmith diff --git a/examples/minimal/csv/sites.csv b/examples/minimal/csv/sites.csv new file mode 100644 index 0000000..d2573e4 --- /dev/null +++ b/examples/minimal/csv/sites.csv @@ -0,0 +1,2 @@ +site_id,label,description,place_name +site:rainier001,Mount Rainier Summit Area,Collection site near the summit crater rim,"Mount Rainier, Pierce County, Washington, USA" diff --git a/examples/minimal/json/1_sample.json b/examples/minimal/json/1_sample.json new file mode 100644 index 0000000..ac915fd --- /dev/null +++ b/examples/minimal/json/1_sample.json @@ -0,0 +1,86 @@ +{ + "sample_identifier": "ark:/99999/example001", + "label": "Rock Sample from Mount Rainier", + "description": "Basalt collected during 2024 field survey. 
Fresh, unweathered sample from recent lava flow.", + "last_modified_time": "2024-06-15T10:30:00Z", + "produced_by": { + "label": "Mount Rainier Field Collection 2024-06-10", + "description": "Field collection during summer geology survey", + "result_time": "2024-06-10", + "project": "Cascade Volcanic Monitoring Project", + "has_feature_of_interest": "Recent lava flow on Mount Rainier", + "sampling_site": { + "label": "Mount Rainier Summit Area", + "description": "Collection site near the summit crater rim", + "place_name": ["Mount Rainier", "Pierce County", "Washington", "USA"], + "sample_location": { + "latitude": 46.8523, + "longitude": -121.7603, + "elevation": "4392 m above mean sea level", + "obfuscated": false + } + }, + "responsibility": [ + { + "name": "Jane Smith", + "role": "collector", + "affiliation": "University of Washington", + "contact_information": "jsmith@uw.edu" + } + ] + }, + "has_material_category": [ + { + "identifier": "https://w3id.org/isample/vocabulary/material/1.0/rock", + "label": "Rock", + "scheme_name": "iSamples Material Type" + } + ], + "has_context_category": [ + { + "identifier": "https://w3id.org/isample/vocabulary/sampledfeature/1.0/earthinterior", + "label": "Earth interior", + "scheme_name": "iSamples Sampled Feature Type" + } + ], + "has_sample_object_type": [ + { + "identifier": "https://w3id.org/isample/vocabulary/specimentype/1.0/physicalspecimen", + "label": "Physical specimen", + "scheme_name": "iSamples Specimen Type" + } + ], + "keywords": [ + { + "keyword": "basalt", + "scheme_name": "Free text" + }, + { + "keyword": "volcanic rock", + "scheme_name": "Free text" + }, + { + "keyword": "Cascade Range", + "scheme_name": "Geographic" + } + ], + "registrant": { + "name": "Jane Smith", + "affiliation": "University of Washington", + "contact_information": "jsmith@uw.edu", + "role": "registrant" + }, + "curation": { + "label": "UW Geology Sample Collection", + "description": "Stored in climate-controlled facility", 
"curation_location": "University of Washington, Burke Museum, Room 142, Drawer B-15", + "access_constraints": ["By appointment only", "Research use only"], + "responsibility": [ + { + "name": "Collections Manager", + "role": "curator", + "affiliation": "Burke Museum" + } + ] + } +} diff --git a/examples/minimal/json/3_samples.json b/examples/minimal/json/3_samples.json new file mode 100644 index 0000000..02e47fe --- /dev/null +++ b/examples/minimal/json/3_samples.json @@ -0,0 +1,146 @@ +[ + { + "sample_identifier": "ark:/99999/example001", + "label": "Rock Sample MR-001 (Parent)", + "description": "Basalt collected during 2024 field survey. Fresh, unweathered sample from recent lava flow. This is the original field sample.", + "last_modified_time": "2024-06-15T10:30:00Z", + "produced_by": { + "label": "Mount Rainier Field Collection 2024-06-10", + "identifier": "event:example001", + "result_time": "2024-06-10", + "project": "Cascade Volcanic Monitoring Project", + "has_feature_of_interest": "Recent lava flow on Mount Rainier", + "sampling_site": { + "identifier": "site:rainier001", + "label": "Mount Rainier Summit Area", + "place_name": ["Mount Rainier", "Pierce County", "Washington", "USA"], + "sample_location": { + "latitude": 46.8523, + "longitude": -121.7603, + "elevation": "4392 m above mean sea level" + } + }, + "responsibility": [ + { + "name": "Jane Smith", + "role": "collector", + "affiliation": "University of Washington" + } + ] + }, + "has_material_category": [ + { + "identifier": "https://w3id.org/isample/vocabulary/material/1.0/rock", + "label": "Rock", + "scheme_name": "iSamples Material Type" + } + ], + "has_sample_object_type": [ + { + "identifier": "https://w3id.org/isample/vocabulary/specimentype/1.0/physicalspecimen", + "label": "Physical specimen" + } + ], + "registrant": { + "name": "Jane Smith", + "affiliation": "University of Washington" + } + }, + { + "sample_identifier": "ark:/99999/example002", + "label": "Rock Sample MR-001-A (Child - 
Thin Section)", + "description": "Thin section prepared from parent sample MR-001 for petrographic analysis.", + "last_modified_time": "2024-07-01T14:00:00Z", + "produced_by": { + "label": "Lab Preparation 2024-07-01", + "identifier": "event:example002", + "result_time": "2024-07-01", + "description": "Thin section preparation in petrology lab", + "responsibility": [ + { + "name": "Lab Technician", + "role": "preparer", + "affiliation": "University of Washington" + } + ] + }, + "has_material_category": [ + { + "identifier": "https://w3id.org/isample/vocabulary/material/1.0/rock", + "label": "Rock" + } + ], + "has_sample_object_type": [ + { + "identifier": "https://w3id.org/isample/vocabulary/specimentype/1.0/thinsection", + "label": "Thin section" + } + ], + "related_resource": [ + { + "label": "Parent sample", + "relationship": "derivedFrom", + "target": "ark:/99999/example001", + "description": "This thin section was prepared from the parent rock sample" + } + ], + "registrant": { + "name": "Jane Smith", + "affiliation": "University of Washington" + } + }, + { + "sample_identifier": "ark:/99999/example003", + "label": "Rock Sample MR-002", + "description": "Second basalt sample from same site, collected 10m away from MR-001.", + "last_modified_time": "2024-06-15T11:00:00Z", + "produced_by": { + "label": "Mount Rainier Field Collection 2024-06-10 (Site B)", + "identifier": "event:example003", + "result_time": "2024-06-10", + "project": "Cascade Volcanic Monitoring Project", + "has_feature_of_interest": "Recent lava flow on Mount Rainier", + "sampling_site": { + "identifier": "site:rainier001", + "label": "Mount Rainier Summit Area", + "place_name": ["Mount Rainier", "Pierce County", "Washington", "USA"], + "sample_location": { + "latitude": 46.8524, + "longitude": -121.7601, + "elevation": "4390 m above mean sea level" + } + }, + "responsibility": [ + { + "name": "Jane Smith", + "role": "collector", + "affiliation": "University of Washington" + } + ] + }, + 
"has_material_category": [ + { + "identifier": "https://w3id.org/isample/vocabulary/material/1.0/rock", + "label": "Rock" + } + ], + "has_sample_object_type": [ + { + "identifier": "https://w3id.org/isample/vocabulary/specimentype/1.0/physicalspecimen", + "label": "Physical specimen" + } + ], + "related_resource": [ + { + "label": "Sibling sample", + "relationship": "relatedTo", + "target": "ark:/99999/example001", + "description": "Collected from same site as MR-001" + } + ], + "registrant": { + "name": "Jane Smith", + "affiliation": "University of Washington" + } + } +] diff --git a/examples/minimal/parquet/minimal_export.parquet b/examples/minimal/parquet/minimal_export.parquet new file mode 100644 index 0000000000000000000000000000000000000000..d06011ed3d21eee72d6c0017bec63b4037df2eeb GIT binary patch literal 1673 zcmcIl&rj1(9Di-bhJYjxUeg8=p9$0sw$W}wVcEeUTnsD|7GsEU>DmXY>(-_17ECyJ z@nFJ*=gD>`XUhpv|qvr zN|1(48=yre`?gghUbF5J#}}KJ2QWyn>N+W_Bju2zGQy46JVYsR43{QA*gFOv_36sH zfvOou^{U6^yK!ZusK}X=oRQ@{fhYIkvQkvi89BqFbP!b_az0T>bI;3p|(=j#m(WQ*UZjyDh4d1y?}zqh^dyT+3Ahz)X{9C;cZ%HY}!;Ew^XxqFHbMI zMvZntQ;yZNHCk%#S38M?f_%H6X=$oQRx;t^Hg&o%hDY&4?5X#LsbUW-41ulsK@UCY zqbC8)g@rlcNJMjZLcmD4U|N{v!rWJeI}}H=JPuC#8vJ_4J$3Q0!XfEe_^Bxn)RPjd zqhJ0P!<{kWOD4-t4>=~u&kl|CTm%}fn=&Aye*=9npIGiAE1t1gK6+)l@QYXXAxQo| z!~+w%;1v4vJ7c(0M*JDd@-u-=yRP(&*ctuXyde4-=m90ehu#;`;PT>0ILk-c_U<;I z*w9#)lJ(E=rEc+=kGL)IJ1~d5%YYAg*8;~%eZ2v;%k9*q`w#&-@7CZ$-u>ax^?E)X zcRI#}#Zm4&f+T4J?v#yn(xSElUM4LikxrzDRBn1#y{_om>gxX5iX26+LI@S$Pv{Sq C0-dD* literal 0 HcmV?d00001 diff --git a/examples/minimal/parquet/minimal_narrow.parquet b/examples/minimal/parquet/minimal_narrow.parquet new file mode 100644 index 0000000000000000000000000000000000000000..64d12bb8a3e323e955ecdbbe8a4a0c76069d7f61 GIT binary patch literal 4831 zcmcIoUuYc18K2o(NvFR)$)5F|eX_^4ygtdca*`!S7NVSFyETq%RLM<(F%he~m2~#r zt=K!MF_dymi);K4{1i;+OHFA*D5ed02!*CljOjz^Ln+0W5K1wA2&plB=tD4Wf8Xrw zX?3<-HBiCq&dz-Eo8R|m=KFLrle0sDiI2JXgdY(D=r)UH#xC}0jIjaE@#Jk!UTeo& 
zpc!a8(e|SCqYa=PMsv^>(U#Db(F$mn(XODmXhpOVnumrxe$^!IYmx2lYnOukx-jp0 zxpChvlnRcY$qY^PFgd&p!{tK7={lViZwDLh?HKH1Y^2v@qxd|I z_A1&K+KFVx$WUfoeu%8~R2NU8*7{Vd2K31^-*UY2id`w#?n%${^V41qA71J5Zk8o_ zb40pOTAG78oxO>~Y0~9dH=Aj~IcA&p&Hp z@3t|a3q$Oh78<#xJ>7!m={G0O%}k#L$sS>lh4sLpt+M9O0kZ#HUHmg@|Jzz>9~MtO zjWM6I3hczrd8?(0oDKXS=T?g_sGRg2yML;eeelhK!y`vC$BzzW zc46_fy&z|NXT|pIN^(!))uWlAL)Tto_tT`coa%3b%vrnS$T@P;ZhXypxhqoH#*yb{ zkHRl9J!AVYNB;qKelJTOos{Vd%Y~AhbDDk;@9$w}55b#I@9&y;s7;GF+2Ggw$@>(N z5n>cJvgFvltSmdS9O#h)FUOaDp-9qGcG-3-G8bs$#>;|1u%#9@m(&e)OK8(4qhVlaZW6hQq@Be%x2$y2^$ z*Eq#Z6z8{B{_$`6`(5q*zu{_KqSsCMIg&DVIWKJ;>I-D7H3*9D5G(yqK+>mqWJ> z0P>_=&e?fKzV5j>yHv=@b6%-X@s$5(eD5;+4D=ZxBYe{V-?RefI+-&iWDT}!gt)B- zH&$N|eUa4zlJ@OjlRQ|j(R!t+I>ss2#PoMcg*C@Vgj|*0qI|mc}UmG>9kI{%V!BkeZ%g%^K5sBx66VGTDdKr(}b8mXI~#_X9kz zRU8~**n^;1x)FVfMp-tS!ILXy*?5*{Xgbl-8qgtCm4uIoVSQLgead!BO04FM;yNT~ zB914EXR#s>WpNIGMlzVw6-~gQA15C4!e3S5(TGknYL+I+s#)R;VKn)?xDwGZ$8?(J z)XUO?RcY~y(4k!bnFu5$Nn<(28^mS23X2nFJXX~;Cdqz!wu*hyAZH3K`BaQDvlB6T zCBHCV&MiAdTl6uZ9n{Qb-}~--A%Ac`6CL=Bt)LG^)Nk@CS67_yiM;B)(qnD8G!jE=)ak1bQHh}29=hg*!T)SMEFM9a`Qg6NvOy=O_&=PzF zc%vBcQ2aErQQg#C9BBdjlVA&Y7Lke}ZVV(-$)`OQOTpBFB}1DE%@{BqaN~1s#V1Ui z3R6=jKN&0?`ReY*5H0Rdi^&6OHi|pz&IMEYO9Z$jL%|Lwaa?e zmNNUdTydELTQB8|{YL!%1Wk7Q%bw7VdQ!kC^+Tuh`=w^wGagoS zAg8}*oRB}-yAgT)#73Y$NLXKN19Vu@nenj53!tALC+M3pghsT3^!jmE(AQ(eZ@Bfa z+xjYzGIzhE(o>U}P0B#Sf^J`XHPL+BK(jq^2PvX=HRI>py0P8*xF=<{1=R}`D`B7h zO)>gFW0N`BR*of?EP38#w`hkt;(W_l;@&|7Ggj=5*)^#Ignw!WUSYpPo;ck;7;*}794{*hH%QZWb|JkTR_To@ zPh&?q@`c6G+n%qeb??QDFS%9SJz;iIa;ulUW_zTV2GC1p>%fZ88240u4)b!b3pZ|X z>&q1A(a%Xi_8p7#6^3yyV+>ev<8d!br%$0sIyEKLzef1cVQpw|Wbl|wFRT{uYear1 VpBo!nJTaQ-XK&!&o*DeK^&dEk=W_r6 literal 0 HcmV?d00001 diff --git a/examples/minimal/parquet/minimal_wide.parquet b/examples/minimal/parquet/minimal_wide.parquet new file mode 100644 index 0000000000000000000000000000000000000000..3f7e38af7b39b2f0ba97639576098aff9e2e123f GIT binary patch literal 4951 zcmcIoU1%KF6~1?NR@${z@=DozJu9!R>mQ9*j-{+*DOJQitYo_lt|U}RbxQ~lqur4_ 
z_Gm`TuGAPx*|oO+{o+($Ii6z& zx!mYX9}#1Rp}Xu<>|=v3;vDw2?Nz4iSFMdr*UshUQ-q9N&JyE1^03xv-ndrYa4J_B zO(uiuQ_sBsET2n%=an?bVR0TPyd7cp!xSFSs-kcNFurB`@(@PejFgU#q4PZp(tco1~M0DA(7wsb8{Kg+G=>a>F}X<^<06Btim8Dqt0v z^S8W;SX4_VmbNxv4a9U{TSGI)$nG~C+bxL;zU$gWG)9c&#>R(p&kv90`ayZtS``a{ zy=etjC4D^kjp5wrnVoNwPqS#Gm>D_*GM6pS7E8FYN1?Uo7q1Js6tpQ=S`4ls`Ryn zrE*tgTmy!`>p9zY01mn#{5A1Cs|JK4!O-Msygb1uxu_$04^t6e;NYKa_29DGuuMq-$@Ov!@`Yq@>0Kr1Wh*l+UOQ}MV zL{POrs?~OMlov1>gObDwr=^O(I>lL9&|y?Zj_?!k#3W%=RI#H~9A%|E4kCLX0i`TN zySNn5Q9wt0inRh3q>9S>p~IpnNHh?B0t--w{uXhGi$euDuEYesKo&^0iA#k9DO3sG zo$z~Ip(izxZ4*bX$~^)_ffYV-Rk1Bv(2=2%2PG7iwvwuP3&OPYlYt3hbeFb@*GsD_ z<>FO)!(v&8!9mUF2>c(cIHkeUaJ)h@vI%uyg!QOjxv^>2Thy!9qOP@SyY*IA$u1WI zXA>`5?9)Iq=0YM-*(%wChj2%qdl^e$CI{oL@2xjSd*mo)L~7J_?QNNc44zapx&phr zm+m!?x}jC%UWA=Tt|V_Y|#VXZ3y(aR=KjW;g=k^GfQ>g3C&1u zuB_l?w*twTREGqngG}g#?mFcPgE&CzI)UN_T%SM?box+7-lJbfOU_iwujG^mn5hS5 z0=vPDKzjAY0W`WnBe2(jdtiCM|ErDRsdOg&BxRBe)6#@N!p5hNF-=$_OZ^Y^e zt^wOx{W!_))7#q%qX?FeU>I2v>3)RoXy*Hnz8=Q44kf`x!osX_N~N`c`qct06R2>N zQ+yKPUsCgzk-U+RnOiN1HKHpURP;hiX_dKlsFM9lZ% zd1L6nxu9|O;7KM=WdA?F4a2lFgktkJkyhQ&c$J#}OqdT68AHb$)X*^-w8%{D^a%7ma`wd0M0N`Fe9qh@jHe0fFVyALndD~n8410N$Q88oZvIc-=pTw zd(GWs#yE9=to1-bJPDEhKSM(!9v-VNr!qpG;L?J76#e_NKJ!s8EKzemgzLNy_&j9{ z(ceG57d?C>$Vq-j`5kKBd&Yd!mobc|WzkO`kp%t=JtX(Wz+Q6lZj!9-QvM_JZpwW8 zOvZ>elJQVgPP^|20)HL`h7jH7o|DsjLNN%xNac-GGXiB!F4OROuS(AAsXOnAa!iH-jNRy(dHUmQ~(+-M6EyanF3S-~2ocH1Uf=l}NMMLYjY(H2I?} zPnj|Pu%BeJ7a@1NI3wh@Cj6g Date: Fri, 30 Jan 2026 10:08:12 -0800 Subject: [PATCH 2/5] Add task specs for optimization experiments (#17, #18) Creates self-contained task specifications for: - H3 geospatial optimization (experiments/h3_optimization/) - Facet metadata optimization (experiments/facet_optimization/) These are formatted as Claude Code Web-ready prompts with: - Pinned data URLs - Exact column names - Step-by-step tasks - Expected output formats Co-Authored-By: Claude Opus 4.5 --- experiments/facet_optimization/TASK_SPEC.md | 334 ++++++++++++++++++++ 
experiments/h3_optimization/TASK_SPEC.md | 216 +++++++++++++ 2 files changed, 550 insertions(+) create mode 100644 experiments/facet_optimization/TASK_SPEC.md create mode 100644 experiments/h3_optimization/TASK_SPEC.md diff --git a/experiments/facet_optimization/TASK_SPEC.md b/experiments/facet_optimization/TASK_SPEC.md new file mode 100644 index 0000000..2e1183f --- /dev/null +++ b/experiments/facet_optimization/TASK_SPEC.md @@ -0,0 +1,334 @@ +# Facet Metadata Optimization Task + +**Issue:** https://github.com/isamplesorg/pqg/issues/18 +**Goal:** Generate pre-computed facet summary tables for instant dashboard queries + +## Data Source + +```python +PARQUET_URL = "https://pub-a18234d962364c22a50c787b7ca09fa5.r2.dev/isamples_202601_wide.parquet" +``` + +- ~280 MB, ~20M rows +- Contains samples from OPENCONTEXT, SESAR, GEOME, SMITHSONIAN + +## Schema (Relevant Columns) + +| Column | Type | Description | +|--------|------|-------------| +| `row_id` | INTEGER | Unique identifier | +| `otype` | VARCHAR | Entity type - `'MaterialSampleRecord'` for samples | +| `n` | VARCHAR | Source: OPENCONTEXT, SESAR, GEOME, SMITHSONIAN | +| `label` | VARCHAR | Human-readable name | +| `p__has_material_category` | INTEGER[] | Array of row_ids pointing to IdentifiedConcept | +| `p__has_context_category` | INTEGER[] | Array of row_ids pointing to IdentifiedConcept | +| `p__has_sample_object_type` | INTEGER[] | Array of row_ids pointing to IdentifiedConcept | + +For IdentifiedConcept rows (otype = 'IdentifiedConcept'): +| Column | Type | Description | +|--------|------|-------------| +| `row_id` | INTEGER | Unique identifier (referenced by p__* arrays) | +| `label` | VARCHAR | Concept label (e.g., "Rock", "Earth interior") | +| `scheme_name` | VARCHAR | Vocabulary name | + +## Task 1: Baseline Benchmark + +Measure current facet query performance. 
+ +```python +import duckdb +import time + +con = duckdb.connect() +PARQUET_URL = "https://pub-a18234d962364c22a50c787b7ca09fa5.r2.dev/isamples_202601_wide.parquet" + +# Query 1: Source facet counts +SOURCE_FACET = f""" +SELECT n as source, COUNT(*) as count +FROM read_parquet('{PARQUET_URL}') +WHERE otype = 'MaterialSampleRecord' +GROUP BY n +ORDER BY count DESC +""" + +# Query 2: Material category facet (requires join) +MATERIAL_FACET = f""" +WITH samples AS ( + SELECT row_id, UNNEST(p__has_material_category) as material_id + FROM read_parquet('{PARQUET_URL}') + WHERE otype = 'MaterialSampleRecord' + AND p__has_material_category IS NOT NULL +), +concepts AS ( + SELECT row_id, label + FROM read_parquet('{PARQUET_URL}') + WHERE otype = 'IdentifiedConcept' +) +SELECT c.label as material, COUNT(*) as count +FROM samples s +JOIN concepts c ON c.row_id = s.material_id +GROUP BY c.label +ORDER BY count DESC +LIMIT 50 +""" + +# Query 3: Entity type counts (quick sanity check) +OTYPE_COUNTS = f""" +SELECT otype, COUNT(*) as count +FROM read_parquet('{PARQUET_URL}') +GROUP BY otype +ORDER BY count DESC +""" +``` + +**Measure:** Execute each query 3 times, report median time in milliseconds. + +## Task 2: Generate Source Facet Summary + +Simple aggregation - should be tiny file. + +```python +OUTPUT_PATH = "/tmp/facet_source_counts.parquet" + +query = f""" +COPY ( + SELECT + 'source' as facet_type, + n as facet_value, + COUNT(*) as count + FROM read_parquet('{PARQUET_URL}') + WHERE otype = 'MaterialSampleRecord' + GROUP BY n + ORDER BY count DESC +) TO '{OUTPUT_PATH}' (FORMAT PARQUET); +""" +con.execute(query) +``` + +## Task 3: Generate Material Category Facet Summary + +Requires joining through the relationship arrays. 
+ +```python +OUTPUT_PATH = "/tmp/facet_material_counts.parquet" + +query = f""" +COPY ( + WITH samples AS ( + SELECT UNNEST(p__has_material_category) as material_id + FROM read_parquet('{PARQUET_URL}') + WHERE otype = 'MaterialSampleRecord' + AND p__has_material_category IS NOT NULL + ), + concepts AS ( + SELECT row_id, label, scheme_name + FROM read_parquet('{PARQUET_URL}') + WHERE otype = 'IdentifiedConcept' + ) + SELECT + 'material' as facet_type, + c.label as facet_value, + c.scheme_name as scheme, + COUNT(*) as count + FROM samples s + JOIN concepts c ON c.row_id = s.material_id + GROUP BY c.label, c.scheme_name + ORDER BY count DESC +) TO '{OUTPUT_PATH}' (FORMAT PARQUET); +""" +con.execute(query) +``` + +## Task 4: Generate Context Category Facet Summary + +```python +OUTPUT_PATH = "/tmp/facet_context_counts.parquet" + +query = f""" +COPY ( + WITH samples AS ( + SELECT UNNEST(p__has_context_category) as context_id + FROM read_parquet('{PARQUET_URL}') + WHERE otype = 'MaterialSampleRecord' + AND p__has_context_category IS NOT NULL + ), + concepts AS ( + SELECT row_id, label, scheme_name + FROM read_parquet('{PARQUET_URL}') + WHERE otype = 'IdentifiedConcept' + ) + SELECT + 'context' as facet_type, + c.label as facet_value, + c.scheme_name as scheme, + COUNT(*) as count + FROM samples s + JOIN concepts c ON c.row_id = s.context_id + GROUP BY c.label, c.scheme_name + ORDER BY count DESC +) TO '{OUTPUT_PATH}' (FORMAT PARQUET); +""" +con.execute(query) +``` + +## Task 5: Generate Combined Facet Summary + +All facets in one file for easy loading. 
+ +```python +OUTPUT_PATH = "/tmp/facet_summaries_all.parquet" + +query = f""" +COPY ( + -- Source facet + SELECT 'source' as facet_type, n as facet_value, NULL as scheme, COUNT(*) as count + FROM read_parquet('{PARQUET_URL}') + WHERE otype = 'MaterialSampleRecord' + GROUP BY n + + UNION ALL + + -- Material facet + SELECT 'material' as facet_type, c.label as facet_value, c.scheme_name as scheme, COUNT(*) as count + FROM ( + SELECT UNNEST(p__has_material_category) as material_id + FROM read_parquet('{PARQUET_URL}') + WHERE otype = 'MaterialSampleRecord' AND p__has_material_category IS NOT NULL + ) s + JOIN (SELECT row_id, label, scheme_name FROM read_parquet('{PARQUET_URL}') WHERE otype = 'IdentifiedConcept') c + ON c.row_id = s.material_id + GROUP BY c.label, c.scheme_name + + UNION ALL + + -- Context facet + SELECT 'context' as facet_type, c.label as facet_value, c.scheme_name as scheme, COUNT(*) as count + FROM ( + SELECT UNNEST(p__has_context_category) as context_id + FROM read_parquet('{PARQUET_URL}') + WHERE otype = 'MaterialSampleRecord' AND p__has_context_category IS NOT NULL + ) s + JOIN (SELECT row_id, label, scheme_name FROM read_parquet('{PARQUET_URL}') WHERE otype = 'IdentifiedConcept') c + ON c.row_id = s.context_id + GROUP BY c.label, c.scheme_name + + UNION ALL + + -- Object type facet + SELECT 'object_type' as facet_type, c.label as facet_value, c.scheme_name as scheme, COUNT(*) as count + FROM ( + SELECT UNNEST(p__has_sample_object_type) as type_id + FROM read_parquet('{PARQUET_URL}') + WHERE otype = 'MaterialSampleRecord' AND p__has_sample_object_type IS NOT NULL + ) s + JOIN (SELECT row_id, label, scheme_name FROM read_parquet('{PARQUET_URL}') WHERE otype = 'IdentifiedConcept') c + ON c.row_id = s.type_id + GROUP BY c.label, c.scheme_name +) TO '{OUTPUT_PATH}' (FORMAT PARQUET); +""" +con.execute(query) +``` + +## Task 6: Generate Cross-Facet Summary (Source × Material) + +For "how many Rock samples from OPENCONTEXT?" 
+ +```python +OUTPUT_PATH = "/tmp/facet_source_material_cross.parquet" + +query = f""" +COPY ( + SELECT + s.source, + c.label as material, + COUNT(*) as count + FROM ( + SELECT n as source, UNNEST(p__has_material_category) as material_id + FROM read_parquet('{PARQUET_URL}') + WHERE otype = 'MaterialSampleRecord' AND p__has_material_category IS NOT NULL + ) s + JOIN (SELECT row_id, label FROM read_parquet('{PARQUET_URL}') WHERE otype = 'IdentifiedConcept') c + ON c.row_id = s.material_id + GROUP BY s.source, c.label + HAVING COUNT(*) > 100 -- Filter out tiny combinations + ORDER BY count DESC +) TO '{OUTPUT_PATH}' (FORMAT PARQUET); +""" +con.execute(query) +``` + +## Task 7: Benchmark Summary Table Queries + +Compare querying summary tables vs full data. + +```python +# Load summary and query +SUMMARY_PATH = "/tmp/facet_summaries_all.parquet" + +# This should be nearly instant +FAST_SOURCE_FACET = f""" +SELECT facet_value, count +FROM read_parquet('{SUMMARY_PATH}') +WHERE facet_type = 'source' +ORDER BY count DESC +""" + +FAST_MATERIAL_FACET = f""" +SELECT facet_value, count +FROM read_parquet('{SUMMARY_PATH}') +WHERE facet_type = 'material' +ORDER BY count DESC +""" +``` + +## Expected Output + +Generate a JSON results file: + +```json +{ + "baseline": { + "source_facet_ms": 2345, + "material_facet_ms": 5678, + "context_facet_ms": 4567, + "otype_counts_ms": 1234 + }, + "with_summary": { + "source_facet_ms": 5, + "material_facet_ms": 8, + "context_facet_ms": 7 + }, + "speedup": { + "source": 469, + "material": 710 + }, + "summary_files": { + "facet_summaries_all.parquet": { + "size_bytes": 12345, + "row_count": 234 + }, + "facet_source_material_cross.parquet": { + "size_bytes": 45678, + "row_count": 1234 + } + }, + "facet_counts": { + "source": { + "SESAR": 3100000, + "OPENCONTEXT": 1200000, + "GEOME": 1500000, + "SMITHSONIAN": 900000 + }, + "material_top10": ["Rock", "ite", "..."], + "context_top10": ["Earth interior", "..."] + } +} +``` + +## Output Files + +Save 
to `experiments/facet_optimization/results/`: +- `benchmark_results.json` - The JSON above +- `facet_summaries_all.parquet` - Combined facet counts +- `facet_source_material_cross.parquet` - Cross-tab counts +- `benchmark_log.txt` - Full execution log diff --git a/experiments/h3_optimization/TASK_SPEC.md b/experiments/h3_optimization/TASK_SPEC.md new file mode 100644 index 0000000..39d8e4e --- /dev/null +++ b/experiments/h3_optimization/TASK_SPEC.md @@ -0,0 +1,216 @@ +# H3 Geospatial Optimization Task + +**Issue:** https://github.com/isamplesorg/pqg/issues/17 +**Goal:** Add H3 index columns to iSamples parquet and benchmark speedup + +## Data Source + +```python +PARQUET_URL = "https://pub-a18234d962364c22a50c787b7ca09fa5.r2.dev/isamples_202601_wide.parquet" +``` + +- ~280 MB, ~20M rows +- Contains samples from OPENCONTEXT, SESAR, GEOME, SMITHSONIAN + +## Schema (Relevant Columns) + +| Column | Type | Description | +|--------|------|-------------| +| `row_id` | INTEGER | Unique identifier | +| `otype` | VARCHAR | Entity type - filter to `'MaterialSampleRecord'` for samples | +| `latitude` | DOUBLE | WGS84 latitude (nullable) | +| `longitude` | DOUBLE | WGS84 longitude (nullable) | +| `n` | VARCHAR | Source: OPENCONTEXT, SESAR, GEOME, SMITHSONIAN | +| `label` | VARCHAR | Human-readable name | + +## Environment Setup + +```python +import duckdb + +# Install and load H3 extension +con = duckdb.connect() +con.execute("INSTALL h3; LOAD h3;") +``` + +## Task 1: Baseline Benchmark + +Measure current geospatial query performance. 
+ +```python +PARQUET_URL = "https://pub-a18234d962364c22a50c787b7ca09fa5.r2.dev/isamples_202601_wide.parquet" + +# Query 1: Bounding box - Western US +BBOX_QUERY = f""" +SELECT COUNT(*) as cnt +FROM read_parquet('{PARQUET_URL}') +WHERE otype = 'MaterialSampleRecord' + AND latitude BETWEEN 32 AND 42 + AND longitude BETWEEN -125 AND -110 +""" + +# Query 2: Bounding box with facet +BBOX_FACET_QUERY = f""" +SELECT n as source, COUNT(*) as cnt +FROM read_parquet('{PARQUET_URL}') +WHERE otype = 'MaterialSampleRecord' + AND latitude BETWEEN 32 AND 42 + AND longitude BETWEEN -125 AND -110 +GROUP BY n +""" + +# Query 3: Point radius (approximate - 1 degree ≈ 111km) +# San Francisco area, ~50km radius +RADIUS_QUERY = f""" +SELECT COUNT(*) as cnt +FROM read_parquet('{PARQUET_URL}') +WHERE otype = 'MaterialSampleRecord' + AND latitude BETWEEN 37.3 AND 38.1 + AND longitude BETWEEN -122.8 AND -122.0 +""" +``` + +**Measure:** Execute each query 3 times, report median time in milliseconds. + +## Task 2: Generate H3-Enhanced Parquet + +Add H3 columns at resolutions 4, 6, and 8. 
+ +```python +import duckdb +import time + +con = duckdb.connect() +con.execute("INSTALL h3; LOAD h3;") + +PARQUET_URL = "https://pub-a18234d962364c22a50c787b7ca09fa5.r2.dev/isamples_202601_wide.parquet" +OUTPUT_PATH = "/tmp/isamples_wide_h3.parquet" + +# Generate with H3 columns +query = f""" +COPY ( + SELECT *, + CASE WHEN latitude IS NOT NULL AND longitude IS NOT NULL + THEN h3_latlng_to_cell(latitude, longitude, 4) END as h3_res4, + CASE WHEN latitude IS NOT NULL AND longitude IS NOT NULL + THEN h3_latlng_to_cell(latitude, longitude, 6) END as h3_res6, + CASE WHEN latitude IS NOT NULL AND longitude IS NOT NULL + THEN h3_latlng_to_cell(latitude, longitude, 8) END as h3_res8 + FROM read_parquet('{PARQUET_URL}') +) TO '{OUTPUT_PATH}' (FORMAT PARQUET, COMPRESSION ZSTD); +""" + +start = time.time() +con.execute(query) +elapsed = time.time() - start +print(f"Generated in {elapsed:.1f}s") +``` + +**Report:** +- Original file size (MB) +- New file size with H3 (MB) +- Size increase percentage +- Row count with valid H3 (non-null lat/lon) + +## Task 3: H3 Benchmark + +Re-run equivalent queries using H3 filters. 
+ +```python +OUTPUT_PATH = "/tmp/isamples_wide_h3.parquet" + +# Get H3 cells covering the Western US bbox at res 4 +# (In practice, use h3 library to get these) +# For now, query to find the cells: +FIND_CELLS = f""" +SELECT DISTINCT h3_res4 +FROM read_parquet('{OUTPUT_PATH}') +WHERE latitude BETWEEN 32 AND 42 + AND longitude BETWEEN -125 AND -110 + AND h3_res4 IS NOT NULL +""" + +# Then filter by H3 cell instead of lat/lon +# This should be faster because H3 is an integer column with good stats +H3_BBOX_QUERY = f""" +SELECT COUNT(*) as cnt +FROM read_parquet('{OUTPUT_PATH}') +WHERE otype = 'MaterialSampleRecord' + AND h3_res4 IN (SELECT DISTINCT h3_res4 + FROM read_parquet('{OUTPUT_PATH}') + WHERE latitude BETWEEN 32 AND 42 + AND longitude BETWEEN -125 AND -110) +""" + +# For aggregation by location (clustering for map display) +H3_CLUSTER_QUERY = f""" +SELECT h3_res6, COUNT(*) as cnt, + AVG(latitude) as center_lat, + AVG(longitude) as center_lon +FROM read_parquet('{OUTPUT_PATH}') +WHERE otype = 'MaterialSampleRecord' + AND h3_res4 IN (...cells from above...) +GROUP BY h3_res6 +""" +``` + +## Task 4: Resolution Analysis + +Determine optimal H3 resolutions. + +```python +# Count distinct cells at each resolution +RESOLUTION_STATS = f""" +SELECT + COUNT(DISTINCT h3_res4) as unique_res4, + COUNT(DISTINCT h3_res6) as unique_res6, + COUNT(DISTINCT h3_res8) as unique_res8, + COUNT(*) as total_rows, + COUNT(h3_res4) as rows_with_h3 +FROM read_parquet('{OUTPUT_PATH}') +WHERE otype = 'MaterialSampleRecord' +""" +``` + +**Report:** Unique cells per resolution, average points per cell. 
+ +## Expected Output + +Generate a JSON results file: + +```json +{ + "baseline": { + "bbox_query_ms": 1234, + "bbox_facet_ms": 1456, + "radius_query_ms": 1123 + }, + "with_h3": { + "bbox_query_ms": 234, + "bbox_facet_ms": 345, + "cluster_query_ms": 456 + }, + "speedup": { + "bbox": 5.3, + "facet": 4.2 + }, + "file_size": { + "original_mb": 282, + "with_h3_mb": 310, + "increase_pct": 9.9 + }, + "h3_stats": { + "rows_with_coords": 5400000, + "unique_res4_cells": 1234, + "unique_res6_cells": 45678, + "unique_res8_cells": 234567 + } +} +``` + +## Output Files + +Save to `experiments/h3_optimization/results/`: +- `benchmark_results.json` - The JSON above +- `isamples_wide_h3.parquet` - Enhanced parquet (or note if too large to include) +- `benchmark_log.txt` - Full execution log From 67ba4c76b60472f14e1a1b690dbc6a74f86d1690 Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 30 Jan 2026 18:25:35 +0000 Subject: [PATCH 3/5] Add facet optimization benchmark results MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Results from running all 7 facet optimization tasks: - Baseline benchmarks: source (34ms), material (490ms), otype (35ms) - Generated summary parquet files for source, material, context facets - Combined facet summary with 60 rows - Cross-facet summary (source × material) with 24 combinations - Speedup achieved: 8.7x for source, 140.1x for material facets https://claude.ai/code/session_016aGrEntdNnvpPjUqkpAtdC --- .../results/facet_results.json | 142 ++++++++++++++++++ 1 file changed, 142 insertions(+) create mode 100644 experiments/facet_optimization/results/facet_results.json diff --git a/experiments/facet_optimization/results/facet_results.json b/experiments/facet_optimization/results/facet_results.json new file mode 100644 index 0000000..4160856 --- /dev/null +++ b/experiments/facet_optimization/results/facet_results.json @@ -0,0 +1,142 @@ +{ + "baseline": { + "source_facet_ms": 34.14, + "material_facet_ms": 490.4, + 
"otype_counts_ms": 34.67 + }, + "with_summary": { + "source_facet_ms": 3.92, + "material_facet_ms": 3.5, + "context_facet_ms": 3.21 + }, + "speedup": { + "source": 8.7, + "material": 140.1 + }, + "summary_files": { + "facet_summaries_all.parquet": { + "size_bytes": 2118, + "row_count": 60 + }, + "facet_source_material_cross.parquet": { + "size_bytes": 1266, + "row_count": 24 + } + }, + "facet_counts": { + "source": { + "SESAR": 4688386, + "OPENCONTEXT": 1064831, + "GEOME": 605554, + "SMITHSONIAN": 322161 + }, + "material_top10": [ + "https://w3id.org/isample/vocabulary/material/1.0/earthmaterial", + "https://w3id.org/isample/vocabulary/material/1.0/organicmaterial", + "https://w3id.org/isample/vocabulary/material/1.0/rock", + "https://w3id.org/isample/vocabulary/material/1.0/biogenicnonorganicmaterial", + "https://w3id.org/isample/vocabulary/material/1.0/mixedsoilsedimentrock", + "https://w3id.org/isample/vocabulary/material/1.0/material", + "https://w3id.org/isample/vocabulary/material/1.0/mineral", + "https://w3id.org/isample/vocabulary/material/1.0/anthropogenicmetal", + "https://w3id.org/isample/opencontext/material/0.1/ceramicclay", + "https://w3id.org/isample/vocabulary/material/1.0/sediment" + ], + "otype_counts": { + "MaterialSampleRecord": 6680932, + "SamplingEvent": 6354171, + "GeospatialCoordLocation": 5980282, + "MaterialSampleCuration": 720254, + "SampleRelation": 501579, + "SamplingSite": 386160, + "IdentifiedConcept": 55893, + "Agent": 50087 + }, + "context_top10": [ + "https://w3id.org/isample/vocabulary/sampledfeature/1.0/anysampledfeature", + "https://w3id.org/isample/vocabulary/sampledfeature/1.0/pasthumanoccupationsite", + "https://w3id.org/isample/vocabulary/sampledfeature/1.0/earthinterior", + "https://w3id.org/isample/biology/biosampledfeature/1.0/Animalia", + "https://w3id.org/isample/vocabulary/sampledfeature/1.0/subaerialsurfaceenvironment", + "https://w3id.org/isample/vocabulary/sampledfeature/1.0/waterbody", + 
"https://w3id.org/isample/vocabulary/sampledfeature/1.0/marinewaterbodybottom", + "https://w3id.org/isample/vocabulary/sampledfeature/1.0/lakeriverstreambottom", + "https://w3id.org/isample/vocabulary/sampledfeature/1.0/marinewaterbody", + "https://w3id.org/isample/vocabulary/sampledfeature/1.0/terrestrialwaterbody" + ] + }, + "task_details": { + "task1": { + "status": "completed", + "queries_run": 3, + "runs_per_query": 3 + }, + "task2": { + "status": "completed", + "output_file": "/tmp/facet_source_counts.parquet", + "row_count": 4, + "size_bytes": 542, + "generation_time_ms": 32.17 + }, + "task3": { + "status": "completed", + "output_file": "/tmp/facet_material_counts.parquet", + "row_count": 19, + "size_bytes": 1284, + "generation_time_ms": 583.98 + }, + "task4": { + "status": "completed", + "output_file": "/tmp/facet_context_counts.parquet", + "row_count": 17, + "size_bytes": 1171, + "generation_time_ms": 569.76 + }, + "task5": { + "status": "completed", + "output_file": "/tmp/facet_summaries_all.parquet", + "row_count": 60, + "size_bytes": 2118, + "generation_time_ms": 984.68 + }, + "task6": { + "status": "completed", + "output_file": "/tmp/facet_source_material_cross.parquet", + "row_count": 24, + "size_bytes": 1266, + "generation_time_ms": 142.55, + "top_combinations": [ + { + "source": "SESAR", + "material": "https://w3id.org/isample/vocabulary/material/1.0/earthmaterial", + "count": 2233939 + }, + { + "source": "SESAR", + "material": "https://w3id.org/isample/vocabulary/material/1.0/rock", + "count": 912855 + }, + { + "source": "SESAR", + "material": "https://w3id.org/isample/vocabulary/material/1.0/mixedsoilsedimentrock", + "count": 838805 + }, + { + "source": "OPENCONTEXT", + "material": "https://w3id.org/isample/vocabulary/material/1.0/biogenicnonorganicmaterial", + "count": 745539 + }, + { + "source": "GEOME", + "material": "https://w3id.org/isample/vocabulary/material/1.0/organicmaterial", + "count": 605554 + } + ] + }, + "task7": { + "status": 
"completed", + "speedup_source": 8.7, + "speedup_material": 140.1 + } + } +} \ No newline at end of file From 745103f7466c19a9b61217ba6059024c3fa39482 Mon Sep 17 00:00:00 2001 From: Raymond Yee Date: Fri, 30 Jan 2026 11:09:24 -0800 Subject: [PATCH 4/5] Add H3 optimization benchmark results from fly.io sprite MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 4.85x speedup for bbox queries (170ms → 35ms) - 4.87x speedup for faceted geo queries - Only 3.7% file size increase (282MB → 292MB) - 5.98M samples with coords out of 6.68M total Co-Authored-By: Claude --- .../results/benchmark_results.json | 29 +++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 experiments/h3_optimization/results/benchmark_results.json diff --git a/experiments/h3_optimization/results/benchmark_results.json b/experiments/h3_optimization/results/benchmark_results.json new file mode 100644 index 0000000..e7c79c2 --- /dev/null +++ b/experiments/h3_optimization/results/benchmark_results.json @@ -0,0 +1,29 @@ +{ + "baseline": { + "bbox_query_ms": 170.0129508972168, + "bbox_facet_ms": 186.75613403320312, + "radius_query_ms": 179.39233779907227 + }, + "with_h3": { + "bbox_query_ms": 35.030364990234375, + "bbox_facet_ms": 38.38610649108887, + "cluster_query_ms": 51.71322822570801 + }, + "speedup": { + "bbox": 4.85, + "facet": 4.87 + }, + "file_size": { + "original_mb": 282, + "with_h3_mb": 292.4, + "increase_pct": 3.7, + "generation_time_s": 41.6 + }, + "h3_stats": { + "unique_res4_cells": 38406, + "unique_res6_cells": 111681, + "unique_res8_cells": 175653, + "total_sample_rows": 6680932, + "rows_with_coords": 5980282 + } +} \ No newline at end of file From aefd465b0f0554da3c19ddcea5bd0fca1ef04244 Mon Sep 17 00:00:00 2001 From: Raymond Yee Date: Fri, 30 Jan 2026 11:35:35 -0800 Subject: [PATCH 5/5] Add CLI commands for H3 indexing and facet summaries MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 
add-h3: Add H3 index columns at specified resolutions - Supports local files and remote URLs - Configurable lat/lon columns and resolutions - Uses H3 community extension - facet-summaries: Generate pre-computed facet summary tables - Combined summaries for source, material, context, object_type - Source × material cross-tabulation - Configurable otype filter and minimum cross-count Implements CLI support for optimizations benchmarked in #17 and #18. Co-Authored-By: Claude Opus 4.5 --- pqg/__main__.py | 251 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 251 insertions(+) diff --git a/pqg/__main__.py b/pqg/__main__.py index 72a8687..8c23098 100644 --- a/pqg/__main__.py +++ b/pqg/__main__.py @@ -4,12 +4,15 @@ import json import logging +import os +import time import typing import click import duckdb import rich import rich.tree +import rich.console import pqg import pqg.common @@ -186,5 +189,253 @@ def get_geo(ctx, store): print("]}") +@cli.command("add-h3") +@click.pass_context +@click.argument("input_parquet") +@click.option("-o", "--output", required=True, help="Output parquet file path") +@click.option( + "-r", + "--resolutions", + default="4,6,8", + help="Comma-separated H3 resolutions to add (default: 4,6,8)", +) +@click.option( + "--lat-col", default="latitude", help="Latitude column name (default: latitude)" +) +@click.option( + "--lon-col", default="longitude", help="Longitude column name (default: longitude)" +) +def add_h3( + ctx, + input_parquet: str, + output: str, + resolutions: str, + lat_col: str, + lon_col: str, +): + """Add H3 index columns to a parquet file. + + Creates a new parquet file with h3_resN columns for each specified resolution. + Only rows with valid lat/lon will have H3 values; others will be NULL. 
+ + Example: + pqg add-h3 input.parquet -o output_h3.parquet + pqg add-h3 input.parquet -o output.parquet -r 4,6 + """ + console = rich.console.Console() + logger = get_logger() + + # Parse resolutions + res_list = [int(r.strip()) for r in resolutions.split(",")] + logger.info(f"Adding H3 columns at resolutions: {res_list}") + + con = ctx.obj["dbinstance"] + + # Install and load H3 extension (community extension) + console.print("[blue]Installing H3 extension from community...[/blue]") + con.execute("INSTALL h3 FROM community; LOAD h3;") + + # Build H3 column expressions + h3_cols = [] + for res in res_list: + h3_cols.append( + f"CASE WHEN {lat_col} IS NOT NULL AND {lon_col} IS NOT NULL " + f"THEN h3_latlng_to_cell({lat_col}, {lon_col}, {res}) END as h3_res{res}" + ) + h3_select = ", ".join(h3_cols) + + # Determine source (local file or URL) + if input_parquet.startswith("http://") or input_parquet.startswith("https://"): + source = f"read_parquet('{input_parquet}')" + else: + source = f"read_parquet('{os.path.abspath(input_parquet)}')" + + query = f""" + COPY ( + SELECT *, {h3_select} + FROM {source} + ) TO '{os.path.abspath(output)}' (FORMAT PARQUET, COMPRESSION ZSTD); + """ + + console.print(f"[blue]Processing {input_parquet}...[/blue]") + start = time.time() + con.execute(query) + elapsed = time.time() - start + + # Get stats + stats = con.sql( + f"SELECT COUNT(*) as total, COUNT(h3_res{res_list[0]}) as with_h3 " + f"FROM read_parquet('{os.path.abspath(output)}')" + ).fetchone() + + output_size = os.path.getsize(output) / (1024 * 1024) + + console.print(f"[green]✓ Generated {output}[/green]") + console.print(f" Size: {output_size:.1f} MB") + console.print(f" Total rows: {stats[0]:,}") + console.print(f" Rows with H3: {stats[1]:,} ({100*stats[1]/stats[0]:.1f}%)") + console.print(f" Time: {elapsed:.1f}s") + + +@cli.command("facet-summaries") +@click.pass_context +@click.argument("input_parquet") +@click.option( + "-o", + "--output-dir", + required=True, + 
help="Output directory for summary files", +) +@click.option( + "--otype-filter", + default="MaterialSampleRecord", + help="Filter to this otype (default: MaterialSampleRecord)", +) +@click.option( + "--min-cross-count", + default=100, + type=int, + help="Minimum count for cross-facet combinations (default: 100)", +) +def facet_summaries( + ctx, + input_parquet: str, + output_dir: str, + otype_filter: str, + min_cross_count: int, +): + """Generate pre-computed facet summary tables from a wide parquet file. + + Creates two output files: + - facet_summaries_all.parquet: Combined counts for source, material, context, object_type + - facet_source_material_cross.parquet: Source × material cross-tabulation + + Example: + pqg facet-summaries wide.parquet -o summaries/ + """ + console = rich.console.Console() + logger = get_logger() + + con = ctx.obj["dbinstance"] + + # Ensure output directory exists + os.makedirs(output_dir, exist_ok=True) + + # Determine source + if input_parquet.startswith("http://") or input_parquet.startswith("https://"): + source = f"read_parquet('{input_parquet}')" + else: + source = f"read_parquet('{os.path.abspath(input_parquet)}')" + + otype_clause = f"otype = '{otype_filter}'" if otype_filter else "1=1" + + # Generate combined facet summaries + console.print("[blue]Generating combined facet summaries...[/blue]") + start = time.time() + + combined_path = os.path.join(output_dir, "facet_summaries_all.parquet") + combined_query = f""" + COPY ( + -- Source facet + SELECT 'source' as facet_type, n as facet_value, NULL as scheme, COUNT(*) as count + FROM {source} + WHERE {otype_clause} + GROUP BY n + + UNION ALL + + -- Material facet + SELECT 'material' as facet_type, c.label as facet_value, c.scheme_name as scheme, COUNT(*) as count + FROM ( + SELECT UNNEST(p__has_material_category) as material_id + FROM {source} + WHERE {otype_clause} AND p__has_material_category IS NOT NULL + ) s + JOIN (SELECT row_id, label, scheme_name FROM {source} WHERE otype = 
'IdentifiedConcept') c + ON c.row_id = s.material_id + GROUP BY c.label, c.scheme_name + + UNION ALL + + -- Context facet + SELECT 'context' as facet_type, c.label as facet_value, c.scheme_name as scheme, COUNT(*) as count + FROM ( + SELECT UNNEST(p__has_context_category) as context_id + FROM {source} + WHERE {otype_clause} AND p__has_context_category IS NOT NULL + ) s + JOIN (SELECT row_id, label, scheme_name FROM {source} WHERE otype = 'IdentifiedConcept') c + ON c.row_id = s.context_id + GROUP BY c.label, c.scheme_name + + UNION ALL + + -- Object type facet + SELECT 'object_type' as facet_type, c.label as facet_value, c.scheme_name as scheme, COUNT(*) as count + FROM ( + SELECT UNNEST(p__has_sample_object_type) as type_id + FROM {source} + WHERE {otype_clause} AND p__has_sample_object_type IS NOT NULL + ) s + JOIN (SELECT row_id, label, scheme_name FROM {source} WHERE otype = 'IdentifiedConcept') c + ON c.row_id = s.type_id + GROUP BY c.label, c.scheme_name + ) TO '{combined_path}' (FORMAT PARQUET); + """ + con.execute(combined_query) + + combined_stats = con.sql( + f"SELECT COUNT(*) FROM read_parquet('{combined_path}')" + ).fetchone() + combined_size = os.path.getsize(combined_path) + + console.print(f"[green]✓ {combined_path}[/green]") + console.print(f" Rows: {combined_stats[0]}, Size: {combined_size:,} bytes") + + # Generate cross-facet summary + console.print("[blue]Generating source × material cross-tabulation...[/blue]") + + cross_path = os.path.join(output_dir, "facet_source_material_cross.parquet") + cross_query = f""" + COPY ( + SELECT + s.source, + c.label as material, + COUNT(*) as count + FROM ( + SELECT n as source, UNNEST(p__has_material_category) as material_id + FROM {source} + WHERE {otype_clause} AND p__has_material_category IS NOT NULL + ) s + JOIN (SELECT row_id, label FROM {source} WHERE otype = 'IdentifiedConcept') c + ON c.row_id = s.material_id + GROUP BY s.source, c.label + HAVING COUNT(*) > {min_cross_count} + ORDER BY count DESC + ) 
TO '{cross_path}' (FORMAT PARQUET); + """ + con.execute(cross_query) + + cross_stats = con.sql( + f"SELECT COUNT(*) FROM read_parquet('{cross_path}')" + ).fetchone() + cross_size = os.path.getsize(cross_path) + + elapsed = time.time() - start + + console.print(f"[green]✓ {cross_path}[/green]") + console.print(f" Rows: {cross_stats[0]}, Size: {cross_size:,} bytes") + console.print(f"[green]Total time: {elapsed:.1f}s[/green]") + + # Print summary + console.print("\n[bold]Summary:[/bold]") + facet_counts = con.sql( + f"SELECT facet_type, COUNT(*) as n, SUM(count) as total " + f"FROM read_parquet('{combined_path}') GROUP BY facet_type" + ).fetchall() + for row in facet_counts: + console.print(f" {row[0]}: {row[1]} values, {row[2]:,} total records") + + if __name__ == "__main__": cli()