diff --git a/examples/minimal/README.md b/examples/minimal/README.md new file mode 100644 index 0000000..7a5943b --- /dev/null +++ b/examples/minimal/README.md @@ -0,0 +1,195 @@ +# Minimal PQG Example Data + +This directory contains small, hand-crafted example datasets to help understand the iSamples PQG format. The same data is represented in JSON, CSV, and all three parquet formats (export, narrow, wide). + +## Dataset Overview + +**Domain**: Geological rock samples from Mount Rainier volcanic monitoring project + +**Entities**: +- 3 MaterialSampleRecords (samples) +- 3 SamplingEvents (collection/preparation events) +- 2 GeospatialCoordLocations (coordinates) +- 1 SamplingSite (Mount Rainier Summit Area) +- 1 Agent (Jane Smith, collector) + +**Relationships demonstrated**: +- Sample → produced_by → SamplingEvent (how samples are created) +- Sample → derivedFrom → Sample (parent/child relationship) +- SamplingEvent → sample_location → GeospatialCoordLocation +- SamplingEvent → sampling_site → SamplingSite +- SamplingSite → site_location → GeospatialCoordLocation + +## File Structure + +``` +minimal/ +├── json/ +│ ├── 1_sample.json # Single sample (simplest case) +│ └── 3_samples.json # Three related samples +├── csv/ +│ ├── samples.csv # MaterialSampleRecords +│ ├── events.csv # SamplingEvents +│ ├── locations.csv # GeospatialCoordLocations +│ ├── sites.csv # SamplingSites +│ ├── agents.csv # Agents +│ └── edges.csv # Relationships (for narrow format) +└── parquet/ + ├── minimal_export.parquet # Export format (3 rows, nested) + ├── minimal_narrow.parquet # Narrow format (21 rows, with edges) + └── minimal_wide.parquet # Wide format (10 rows, p__* columns) +``` + +## The Three Parquet Formats + +### Export Format (`minimal_export.parquet`) +- **3 rows** - one per sample +- Sample-centric with nested structs for related entities +- Best for: Simple queries on sample properties +- Coordinates pre-extracted to `sample_location_latitude/longitude` + +### Narrow Format 
(`minimal_narrow.parquet`) +- **21 rows** - 10 entities + 11 edge rows +- Graph-normalized with explicit `_edge_` rows +- Columns `s` (subject), `p` (predicate), `o` (object array) +- Best for: Graph traversal, flexible relationship queries + +### Wide Format (`minimal_wide.parquet`) +- **10 rows** - one per entity (no edge rows) +- Relationships stored as `p__*` columns with row_id arrays +- Best for: Fast entity queries, smaller file size, analytical queries + +## Example Queries + +### Query 1: Find all samples (works in all formats) + +**Export format:** +```sql +SELECT sample_identifier, label +FROM read_parquet('parquet/minimal_export.parquet') +``` + +**Wide format:** +```sql +SELECT pid, label +FROM read_parquet('parquet/minimal_wide.parquet') +WHERE otype = 'MaterialSampleRecord' +``` + +**Narrow format:** +```sql +SELECT pid, label +FROM read_parquet('parquet/minimal_narrow.parquet') +WHERE otype = 'MaterialSampleRecord' +``` + +### Query 2: Find samples with their locations + +**Wide format (uses p__* columns):** +```sql +SELECT + s.pid as sample, + s.label, + loc.latitude, + loc.longitude +FROM read_parquet('parquet/minimal_wide.parquet') s +JOIN read_parquet('parquet/minimal_wide.parquet') e + ON e.otype = 'SamplingEvent' + AND list_contains(s.p__produced_by, e.row_id) +JOIN read_parquet('parquet/minimal_wide.parquet') loc + ON loc.otype = 'GeospatialCoordLocation' + AND list_contains(e.p__sample_location, loc.row_id) +WHERE s.otype = 'MaterialSampleRecord' +``` + +**Narrow format (uses edge rows):** +```sql +SELECT + s.pid as sample, + s.label, + loc.latitude, + loc.longitude +FROM read_parquet('parquet/minimal_narrow.parquet') s +JOIN read_parquet('parquet/minimal_narrow.parquet') e1 + ON e1.otype = '_edge_' + AND e1.s = s.row_id + AND e1.p = 'produced_by' +JOIN read_parquet('parquet/minimal_narrow.parquet') ev + ON ev.otype = 'SamplingEvent' + AND list_contains(e1.o, ev.row_id) +JOIN read_parquet('parquet/minimal_narrow.parquet') e2 + ON e2.otype = 
'_edge_' + AND e2.s = ev.row_id + AND e2.p = 'sample_location' +JOIN read_parquet('parquet/minimal_narrow.parquet') loc + ON loc.otype = 'GeospatialCoordLocation' + AND list_contains(e2.o, loc.row_id) +WHERE s.otype = 'MaterialSampleRecord' +``` + +### Query 3: Count entities by type + +```sql +SELECT otype, COUNT(*) as count +FROM read_parquet('parquet/minimal_wide.parquet') +GROUP BY otype +ORDER BY count DESC +``` + +Expected output: +``` +MaterialSampleRecord 3 +SamplingEvent 3 +GeospatialCoordLocation 2 +SamplingSite 1 +Agent 1 +``` + +## JSON Schema Validation + +The JSON files validate against the iSamples Core 1.0 schema: + +```python +import json +from jsonschema import validate + +# Load schema (from isamplesorg-metadata repo) +with open('path/to/iSamplesSchemaCore1.0.json') as f: + schema = json.load(f) + +# Load and validate +with open('json/1_sample.json') as f: + sample = json.load(f) + +validate(instance=sample, schema=schema) # Raises if invalid +``` + +## Entity Relationship Diagram + +``` +MaterialSampleRecord ──produced_by──► SamplingEvent ──sample_location──► GeospatialCoordLocation + │ │ + │ └──sampling_site──► SamplingSite ──site_location──► GeospatialCoordLocation + │ + ├──registrant──► Agent + │ + └──derivedFrom──► MaterialSampleRecord (parent sample) +``` + +## Size Comparison + +| Format | Rows | File Size | Notes | +|--------|------|-----------|-------| +| Export | 3 | 1.7 KB | Nested structs, sample-centric | +| Narrow | 21 | 4.8 KB | Explicit edge rows | +| Wide | 10 | 5.0 KB | p__* columns | + +In production datasets: +- Wide is typically 60-70% smaller than narrow +- Export is smallest but less flexible for complex queries + +## See Also + +- [PQG Specification](../../docs/PQG_SPECIFICATION.md) - Full format specification +- [Edge Types](../../pqg/edge_types.py) - All 14 iSamples edge types +- [Schema Definitions](../../pqg/schemas/) - Python schema validators diff --git a/examples/minimal/csv/agents.csv 
b/examples/minimal/csv/agents.csv new file mode 100644 index 0000000..0e3c700 --- /dev/null +++ b/examples/minimal/csv/agents.csv @@ -0,0 +1,4 @@ +agent_id,name,role,affiliation,contact_information +agent:jsmith,Jane Smith,collector,University of Washington,jsmith@uw.edu +agent:labtech,Lab Technician,preparer,University of Washington, +agent:curator,Collections Manager,curator,Burke Museum, diff --git a/examples/minimal/csv/edges.csv b/examples/minimal/csv/edges.csv new file mode 100644 index 0000000..1090eab --- /dev/null +++ b/examples/minimal/csv/edges.csv @@ -0,0 +1,14 @@ +subject_id,predicate,object_id,description +ark:/99999/example001,produced_by,event:example001,Sample was produced by this sampling event +ark:/99999/example002,produced_by,event:example002,Sample was produced by this sampling event +ark:/99999/example003,produced_by,event:example003,Sample was produced by this sampling event +ark:/99999/example002,derivedFrom,ark:/99999/example001,Thin section derived from parent rock sample +ark:/99999/example003,relatedTo,ark:/99999/example001,Sibling sample from same site +event:example001,sample_location,loc:rainier001,Event occurred at this location +event:example003,sample_location,loc:rainier002,Event occurred at this location +event:example001,sampling_site,site:rainier001,Event occurred at this site +event:example003,sampling_site,site:rainier001,Event occurred at this site +site:rainier001,site_location,loc:rainier001,Site is at this location +ark:/99999/example001,registrant,agent:jsmith,Sample registered by this agent +ark:/99999/example002,registrant,agent:jsmith,Sample registered by this agent +ark:/99999/example003,registrant,agent:jsmith,Sample registered by this agent diff --git a/examples/minimal/csv/events.csv b/examples/minimal/csv/events.csv new file mode 100644 index 0000000..f9d8e04 --- /dev/null +++ b/examples/minimal/csv/events.csv @@ -0,0 +1,4 @@ 
+event_id,label,description,result_time,project,feature_of_interest,site_id,location_id,collector_id +event:example001,Mount Rainier Field Collection 2024-06-10,Field collection during summer geology survey,2024-06-10,Cascade Volcanic Monitoring Project,Recent lava flow on Mount Rainier,site:rainier001,loc:rainier001,agent:jsmith +event:example002,Lab Preparation 2024-07-01,Thin section preparation in petrology lab,2024-07-01,,,,,agent:labtech +event:example003,Mount Rainier Field Collection 2024-06-10 (Site B),Field collection 10m from first sample,2024-06-10,Cascade Volcanic Monitoring Project,Recent lava flow on Mount Rainier,site:rainier001,loc:rainier002,agent:jsmith diff --git a/examples/minimal/csv/locations.csv b/examples/minimal/csv/locations.csv new file mode 100644 index 0000000..d63ebb2 --- /dev/null +++ b/examples/minimal/csv/locations.csv @@ -0,0 +1,3 @@ +location_id,latitude,longitude,elevation,obfuscated +loc:rainier001,46.8523,-121.7603,4392 m above mean sea level,false +loc:rainier002,46.8524,-121.7601,4390 m above mean sea level,false diff --git a/examples/minimal/csv/samples.csv b/examples/minimal/csv/samples.csv new file mode 100644 index 0000000..b73ebd3 --- /dev/null +++ b/examples/minimal/csv/samples.csv @@ -0,0 +1,4 @@ +sample_id,label,description,last_modified_time,event_id,material_category,sample_object_type,registrant_id +ark:/99999/example001,Rock Sample MR-001 (Parent),"Basalt collected during 2024 field survey. 
Fresh, unweathered sample from recent lava flow.",2024-06-15T10:30:00Z,event:example001,rock,physicalspecimen,agent:jsmith +ark:/99999/example002,Rock Sample MR-001-A (Child - Thin Section),Thin section prepared from parent sample MR-001 for petrographic analysis.,2024-07-01T14:00:00Z,event:example002,rock,thinsection,agent:jsmith +ark:/99999/example003,Rock Sample MR-002,"Second basalt sample from same site, collected 10m away from MR-001.",2024-06-15T11:00:00Z,event:example003,rock,physicalspecimen,agent:jsmith diff --git a/examples/minimal/csv/sites.csv b/examples/minimal/csv/sites.csv new file mode 100644 index 0000000..d2573e4 --- /dev/null +++ b/examples/minimal/csv/sites.csv @@ -0,0 +1,2 @@ +site_id,label,description,place_name +site:rainier001,Mount Rainier Summit Area,Collection site near the summit crater rim,"Mount Rainier, Pierce County, Washington, USA" diff --git a/examples/minimal/json/1_sample.json b/examples/minimal/json/1_sample.json new file mode 100644 index 0000000..ac915fd --- /dev/null +++ b/examples/minimal/json/1_sample.json @@ -0,0 +1,86 @@ +{ + "sample_identifier": "ark:/99999/example001", + "label": "Rock Sample from Mount Rainier", + "description": "Basalt collected during 2024 field survey. 
Fresh, unweathered sample from recent lava flow.", + "last_modified_time": "2024-06-15T10:30:00Z", + "produced_by": { + "label": "Mount Rainier Field Collection 2024-06-10", + "description": "Field collection during summer geology survey", + "result_time": "2024-06-10", + "project": "Cascade Volcanic Monitoring Project", + "has_feature_of_interest": "Recent lava flow on Mount Rainier", + "sampling_site": { + "label": "Mount Rainier Summit Area", + "description": "Collection site near the summit crater rim", + "place_name": ["Mount Rainier", "Pierce County", "Washington", "USA"], + "sample_location": { + "latitude": 46.8523, + "longitude": -121.7603, + "elevation": "4392 m above mean sea level", + "obfuscated": false + } + }, + "responsibility": [ + { + "name": "Jane Smith", + "role": "collector", + "affiliation": "University of Washington", + "contact_information": "jsmith@uw.edu" + } + ] + }, + "has_material_category": [ + { + "identifier": "https://w3id.org/isample/vocabulary/material/1.0/rock", + "label": "Rock", + "scheme_name": "iSamples Material Type" + } + ], + "has_context_category": [ + { + "identifier": "https://w3id.org/isample/vocabulary/sampledfeature/1.0/earthinterior", + "label": "Earth interior", + "scheme_name": "iSamples Sampled Feature Type" + } + ], + "has_sample_object_type": [ + { + "identifier": "https://w3id.org/isample/vocabulary/specimentype/1.0/physicalspecimen", + "label": "Physical specimen", + "scheme_name": "iSamples Specimen Type" + } + ], + "keywords": [ + { + "keyword": "basalt", + "scheme_name": "Free text" + }, + { + "keyword": "volcanic rock", + "scheme_name": "Free text" + }, + { + "keyword": "Cascade Range", + "scheme_name": "Geographic" + } + ], + "registrant": { + "name": "Jane Smith", + "affiliation": "University of Washington", + "contact_information": "jsmith@uw.edu", + "role": "registrant" + }, + "curation": { + "label": "UW Geology Sample Collection", + "description": "Stored in climate-controlled facility", + 
"curation_location": "University of Washington, Burke Museum, Room 142, Drawer B-15", + "access_constraints": ["By appointment only", "Research use only"], + "responsibility": [ + { + "name": "Collections Manager", + "role": "curator", + "affiliation": "Burke Museum" + } + ] + } +} diff --git a/examples/minimal/json/3_samples.json b/examples/minimal/json/3_samples.json new file mode 100644 index 0000000..02e47fe --- /dev/null +++ b/examples/minimal/json/3_samples.json @@ -0,0 +1,146 @@ +[ + { + "sample_identifier": "ark:/99999/example001", + "label": "Rock Sample MR-001 (Parent)", + "description": "Basalt collected during 2024 field survey. Fresh, unweathered sample from recent lava flow. This is the original field sample.", + "last_modified_time": "2024-06-15T10:30:00Z", + "produced_by": { + "label": "Mount Rainier Field Collection 2024-06-10", + "identifier": "event:example001", + "result_time": "2024-06-10", + "project": "Cascade Volcanic Monitoring Project", + "has_feature_of_interest": "Recent lava flow on Mount Rainier", + "sampling_site": { + "identifier": "site:rainier001", + "label": "Mount Rainier Summit Area", + "place_name": ["Mount Rainier", "Pierce County", "Washington", "USA"], + "sample_location": { + "latitude": 46.8523, + "longitude": -121.7603, + "elevation": "4392 m above mean sea level" + } + }, + "responsibility": [ + { + "name": "Jane Smith", + "role": "collector", + "affiliation": "University of Washington" + } + ] + }, + "has_material_category": [ + { + "identifier": "https://w3id.org/isample/vocabulary/material/1.0/rock", + "label": "Rock", + "scheme_name": "iSamples Material Type" + } + ], + "has_sample_object_type": [ + { + "identifier": "https://w3id.org/isample/vocabulary/specimentype/1.0/physicalspecimen", + "label": "Physical specimen" + } + ], + "registrant": { + "name": "Jane Smith", + "affiliation": "University of Washington" + } + }, + { + "sample_identifier": "ark:/99999/example002", + "label": "Rock Sample MR-001-A (Child - 
Thin Section)", + "description": "Thin section prepared from parent sample MR-001 for petrographic analysis.", + "last_modified_time": "2024-07-01T14:00:00Z", + "produced_by": { + "label": "Lab Preparation 2024-07-01", + "identifier": "event:example002", + "result_time": "2024-07-01", + "description": "Thin section preparation in petrology lab", + "responsibility": [ + { + "name": "Lab Technician", + "role": "preparer", + "affiliation": "University of Washington" + } + ] + }, + "has_material_category": [ + { + "identifier": "https://w3id.org/isample/vocabulary/material/1.0/rock", + "label": "Rock" + } + ], + "has_sample_object_type": [ + { + "identifier": "https://w3id.org/isample/vocabulary/specimentype/1.0/thinsection", + "label": "Thin section" + } + ], + "related_resource": [ + { + "label": "Parent sample", + "relationship": "derivedFrom", + "target": "ark:/99999/example001", + "description": "This thin section was prepared from the parent rock sample" + } + ], + "registrant": { + "name": "Jane Smith", + "affiliation": "University of Washington" + } + }, + { + "sample_identifier": "ark:/99999/example003", + "label": "Rock Sample MR-002", + "description": "Second basalt sample from same site, collected 10m away from MR-001.", + "last_modified_time": "2024-06-15T11:00:00Z", + "produced_by": { + "label": "Mount Rainier Field Collection 2024-06-10 (Site B)", + "identifier": "event:example003", + "result_time": "2024-06-10", + "project": "Cascade Volcanic Monitoring Project", + "has_feature_of_interest": "Recent lava flow on Mount Rainier", + "sampling_site": { + "identifier": "site:rainier001", + "label": "Mount Rainier Summit Area", + "place_name": ["Mount Rainier", "Pierce County", "Washington", "USA"], + "sample_location": { + "latitude": 46.8524, + "longitude": -121.7601, + "elevation": "4390 m above mean sea level" + } + }, + "responsibility": [ + { + "name": "Jane Smith", + "role": "collector", + "affiliation": "University of Washington" + } + ] + }, + 
"has_material_category": [ + { + "identifier": "https://w3id.org/isample/vocabulary/material/1.0/rock", + "label": "Rock" + } + ], + "has_sample_object_type": [ + { + "identifier": "https://w3id.org/isample/vocabulary/specimentype/1.0/physicalspecimen", + "label": "Physical specimen" + } + ], + "related_resource": [ + { + "label": "Sibling sample", + "relationship": "relatedTo", + "target": "ark:/99999/example001", + "description": "Collected from same site as MR-001" + } + ], + "registrant": { + "name": "Jane Smith", + "affiliation": "University of Washington" + } + } +] diff --git a/examples/minimal/parquet/minimal_export.parquet b/examples/minimal/parquet/minimal_export.parquet new file mode 100644 index 0000000..d06011e Binary files /dev/null and b/examples/minimal/parquet/minimal_export.parquet differ diff --git a/examples/minimal/parquet/minimal_narrow.parquet b/examples/minimal/parquet/minimal_narrow.parquet new file mode 100644 index 0000000..64d12bb Binary files /dev/null and b/examples/minimal/parquet/minimal_narrow.parquet differ diff --git a/examples/minimal/parquet/minimal_wide.parquet b/examples/minimal/parquet/minimal_wide.parquet new file mode 100644 index 0000000..3f7e38a Binary files /dev/null and b/examples/minimal/parquet/minimal_wide.parquet differ diff --git a/experiments/facet_optimization/TASK_SPEC.md b/experiments/facet_optimization/TASK_SPEC.md new file mode 100644 index 0000000..2e1183f --- /dev/null +++ b/experiments/facet_optimization/TASK_SPEC.md @@ -0,0 +1,334 @@ +# Facet Metadata Optimization Task + +**Issue:** https://github.com/isamplesorg/pqg/issues/18 +**Goal:** Generate pre-computed facet summary tables for instant dashboard queries + +## Data Source + +```python +PARQUET_URL = "https://pub-a18234d962364c22a50c787b7ca09fa5.r2.dev/isamples_202601_wide.parquet" +``` + +- ~280 MB, ~20M rows +- Contains samples from OPENCONTEXT, SESAR, GEOME, SMITHSONIAN + +## Schema (Relevant Columns) + +| Column | Type | Description | 
+|--------|------|-------------| +| `row_id` | INTEGER | Unique identifier | +| `otype` | VARCHAR | Entity type - `'MaterialSampleRecord'` for samples | +| `n` | VARCHAR | Source: OPENCONTEXT, SESAR, GEOME, SMITHSONIAN | +| `label` | VARCHAR | Human-readable name | +| `p__has_material_category` | INTEGER[] | Array of row_ids pointing to IdentifiedConcept | +| `p__has_context_category` | INTEGER[] | Array of row_ids pointing to IdentifiedConcept | +| `p__has_sample_object_type` | INTEGER[] | Array of row_ids pointing to IdentifiedConcept | + +For IdentifiedConcept rows (otype = 'IdentifiedConcept'): +| Column | Type | Description | +|--------|------|-------------| +| `row_id` | INTEGER | Unique identifier (referenced by p__* arrays) | +| `label` | VARCHAR | Concept label (e.g., "Rock", "Earth interior") | +| `scheme_name` | VARCHAR | Vocabulary name | + +## Task 1: Baseline Benchmark + +Measure current facet query performance. + +```python +import duckdb +import time + +con = duckdb.connect() +PARQUET_URL = "https://pub-a18234d962364c22a50c787b7ca09fa5.r2.dev/isamples_202601_wide.parquet" + +# Query 1: Source facet counts +SOURCE_FACET = f""" +SELECT n as source, COUNT(*) as count +FROM read_parquet('{PARQUET_URL}') +WHERE otype = 'MaterialSampleRecord' +GROUP BY n +ORDER BY count DESC +""" + +# Query 2: Material category facet (requires join) +MATERIAL_FACET = f""" +WITH samples AS ( + SELECT row_id, UNNEST(p__has_material_category) as material_id + FROM read_parquet('{PARQUET_URL}') + WHERE otype = 'MaterialSampleRecord' + AND p__has_material_category IS NOT NULL +), +concepts AS ( + SELECT row_id, label + FROM read_parquet('{PARQUET_URL}') + WHERE otype = 'IdentifiedConcept' +) +SELECT c.label as material, COUNT(*) as count +FROM samples s +JOIN concepts c ON c.row_id = s.material_id +GROUP BY c.label +ORDER BY count DESC +LIMIT 50 +""" + +# Query 3: Entity type counts (quick sanity check) +OTYPE_COUNTS = f""" +SELECT otype, COUNT(*) as count +FROM 
read_parquet('{PARQUET_URL}') +GROUP BY otype +ORDER BY count DESC +""" +``` + +**Measure:** Execute each query 3 times, report median time in milliseconds. + +## Task 2: Generate Source Facet Summary + +Simple aggregation - should be tiny file. + +```python +OUTPUT_PATH = "/tmp/facet_source_counts.parquet" + +query = f""" +COPY ( + SELECT + 'source' as facet_type, + n as facet_value, + COUNT(*) as count + FROM read_parquet('{PARQUET_URL}') + WHERE otype = 'MaterialSampleRecord' + GROUP BY n + ORDER BY count DESC +) TO '{OUTPUT_PATH}' (FORMAT PARQUET); +""" +con.execute(query) +``` + +## Task 3: Generate Material Category Facet Summary + +Requires joining through the relationship arrays. + +```python +OUTPUT_PATH = "/tmp/facet_material_counts.parquet" + +query = f""" +COPY ( + WITH samples AS ( + SELECT UNNEST(p__has_material_category) as material_id + FROM read_parquet('{PARQUET_URL}') + WHERE otype = 'MaterialSampleRecord' + AND p__has_material_category IS NOT NULL + ), + concepts AS ( + SELECT row_id, label, scheme_name + FROM read_parquet('{PARQUET_URL}') + WHERE otype = 'IdentifiedConcept' + ) + SELECT + 'material' as facet_type, + c.label as facet_value, + c.scheme_name as scheme, + COUNT(*) as count + FROM samples s + JOIN concepts c ON c.row_id = s.material_id + GROUP BY c.label, c.scheme_name + ORDER BY count DESC +) TO '{OUTPUT_PATH}' (FORMAT PARQUET); +""" +con.execute(query) +``` + +## Task 4: Generate Context Category Facet Summary + +```python +OUTPUT_PATH = "/tmp/facet_context_counts.parquet" + +query = f""" +COPY ( + WITH samples AS ( + SELECT UNNEST(p__has_context_category) as context_id + FROM read_parquet('{PARQUET_URL}') + WHERE otype = 'MaterialSampleRecord' + AND p__has_context_category IS NOT NULL + ), + concepts AS ( + SELECT row_id, label, scheme_name + FROM read_parquet('{PARQUET_URL}') + WHERE otype = 'IdentifiedConcept' + ) + SELECT + 'context' as facet_type, + c.label as facet_value, + c.scheme_name as scheme, + COUNT(*) as count + FROM 
samples s + JOIN concepts c ON c.row_id = s.context_id + GROUP BY c.label, c.scheme_name + ORDER BY count DESC +) TO '{OUTPUT_PATH}' (FORMAT PARQUET); +""" +con.execute(query) +``` + +## Task 5: Generate Combined Facet Summary + +All facets in one file for easy loading. + +```python +OUTPUT_PATH = "/tmp/facet_summaries_all.parquet" + +query = f""" +COPY ( + -- Source facet + SELECT 'source' as facet_type, n as facet_value, NULL as scheme, COUNT(*) as count + FROM read_parquet('{PARQUET_URL}') + WHERE otype = 'MaterialSampleRecord' + GROUP BY n + + UNION ALL + + -- Material facet + SELECT 'material' as facet_type, c.label as facet_value, c.scheme_name as scheme, COUNT(*) as count + FROM ( + SELECT UNNEST(p__has_material_category) as material_id + FROM read_parquet('{PARQUET_URL}') + WHERE otype = 'MaterialSampleRecord' AND p__has_material_category IS NOT NULL + ) s + JOIN (SELECT row_id, label, scheme_name FROM read_parquet('{PARQUET_URL}') WHERE otype = 'IdentifiedConcept') c + ON c.row_id = s.material_id + GROUP BY c.label, c.scheme_name + + UNION ALL + + -- Context facet + SELECT 'context' as facet_type, c.label as facet_value, c.scheme_name as scheme, COUNT(*) as count + FROM ( + SELECT UNNEST(p__has_context_category) as context_id + FROM read_parquet('{PARQUET_URL}') + WHERE otype = 'MaterialSampleRecord' AND p__has_context_category IS NOT NULL + ) s + JOIN (SELECT row_id, label, scheme_name FROM read_parquet('{PARQUET_URL}') WHERE otype = 'IdentifiedConcept') c + ON c.row_id = s.context_id + GROUP BY c.label, c.scheme_name + + UNION ALL + + -- Object type facet + SELECT 'object_type' as facet_type, c.label as facet_value, c.scheme_name as scheme, COUNT(*) as count + FROM ( + SELECT UNNEST(p__has_sample_object_type) as type_id + FROM read_parquet('{PARQUET_URL}') + WHERE otype = 'MaterialSampleRecord' AND p__has_sample_object_type IS NOT NULL + ) s + JOIN (SELECT row_id, label, scheme_name FROM read_parquet('{PARQUET_URL}') WHERE otype = 'IdentifiedConcept') c 
+ ON c.row_id = s.type_id + GROUP BY c.label, c.scheme_name +) TO '{OUTPUT_PATH}' (FORMAT PARQUET); +""" +con.execute(query) +``` + +## Task 6: Generate Cross-Facet Summary (Source × Material) + +For "how many Rock samples from OPENCONTEXT?" + +```python +OUTPUT_PATH = "/tmp/facet_source_material_cross.parquet" + +query = f""" +COPY ( + SELECT + s.source, + c.label as material, + COUNT(*) as count + FROM ( + SELECT n as source, UNNEST(p__has_material_category) as material_id + FROM read_parquet('{PARQUET_URL}') + WHERE otype = 'MaterialSampleRecord' AND p__has_material_category IS NOT NULL + ) s + JOIN (SELECT row_id, label FROM read_parquet('{PARQUET_URL}') WHERE otype = 'IdentifiedConcept') c + ON c.row_id = s.material_id + GROUP BY s.source, c.label + HAVING COUNT(*) > 100 -- Filter out tiny combinations + ORDER BY count DESC +) TO '{OUTPUT_PATH}' (FORMAT PARQUET); +""" +con.execute(query) +``` + +## Task 7: Benchmark Summary Table Queries + +Compare querying summary tables vs full data. 
+ +```python +# Load summary and query +SUMMARY_PATH = "/tmp/facet_summaries_all.parquet" + +# This should be nearly instant +FAST_SOURCE_FACET = f""" +SELECT facet_value, count +FROM read_parquet('{SUMMARY_PATH}') +WHERE facet_type = 'source' +ORDER BY count DESC +""" + +FAST_MATERIAL_FACET = f""" +SELECT facet_value, count +FROM read_parquet('{SUMMARY_PATH}') +WHERE facet_type = 'material' +ORDER BY count DESC +""" +``` + +## Expected Output + +Generate a JSON results file: + +```json +{ + "baseline": { + "source_facet_ms": 2345, + "material_facet_ms": 5678, + "context_facet_ms": 4567, + "otype_counts_ms": 1234 + }, + "with_summary": { + "source_facet_ms": 5, + "material_facet_ms": 8, + "context_facet_ms": 7 + }, + "speedup": { + "source": 469, + "material": 710 + }, + "summary_files": { + "facet_summaries_all.parquet": { + "size_bytes": 12345, + "row_count": 234 + }, + "facet_source_material_cross.parquet": { + "size_bytes": 45678, + "row_count": 1234 + } + }, + "facet_counts": { + "source": { + "SESAR": 3100000, + "OPENCONTEXT": 1200000, + "GEOME": 1500000, + "SMITHSONIAN": 900000 + }, + "material_top10": ["Rock", "ite", "..."], + "context_top10": ["Earth interior", "..."] + } +} +``` + +## Output Files + +Save to `experiments/facet_optimization/results/`: +- `benchmark_results.json` - The JSON above +- `facet_summaries_all.parquet` - Combined facet counts +- `facet_source_material_cross.parquet` - Cross-tab counts +- `benchmark_log.txt` - Full execution log diff --git a/experiments/facet_optimization/results/facet_results.json b/experiments/facet_optimization/results/facet_results.json new file mode 100644 index 0000000..4160856 --- /dev/null +++ b/experiments/facet_optimization/results/facet_results.json @@ -0,0 +1,142 @@ +{ + "baseline": { + "source_facet_ms": 34.14, + "material_facet_ms": 490.4, + "otype_counts_ms": 34.67 + }, + "with_summary": { + "source_facet_ms": 3.92, + "material_facet_ms": 3.5, + "context_facet_ms": 3.21 + }, + "speedup": { + 
"source": 8.7, + "material": 140.1 + }, + "summary_files": { + "facet_summaries_all.parquet": { + "size_bytes": 2118, + "row_count": 60 + }, + "facet_source_material_cross.parquet": { + "size_bytes": 1266, + "row_count": 24 + } + }, + "facet_counts": { + "source": { + "SESAR": 4688386, + "OPENCONTEXT": 1064831, + "GEOME": 605554, + "SMITHSONIAN": 322161 + }, + "material_top10": [ + "https://w3id.org/isample/vocabulary/material/1.0/earthmaterial", + "https://w3id.org/isample/vocabulary/material/1.0/organicmaterial", + "https://w3id.org/isample/vocabulary/material/1.0/rock", + "https://w3id.org/isample/vocabulary/material/1.0/biogenicnonorganicmaterial", + "https://w3id.org/isample/vocabulary/material/1.0/mixedsoilsedimentrock", + "https://w3id.org/isample/vocabulary/material/1.0/material", + "https://w3id.org/isample/vocabulary/material/1.0/mineral", + "https://w3id.org/isample/vocabulary/material/1.0/anthropogenicmetal", + "https://w3id.org/isample/opencontext/material/0.1/ceramicclay", + "https://w3id.org/isample/vocabulary/material/1.0/sediment" + ], + "otype_counts": { + "MaterialSampleRecord": 6680932, + "SamplingEvent": 6354171, + "GeospatialCoordLocation": 5980282, + "MaterialSampleCuration": 720254, + "SampleRelation": 501579, + "SamplingSite": 386160, + "IdentifiedConcept": 55893, + "Agent": 50087 + }, + "context_top10": [ + "https://w3id.org/isample/vocabulary/sampledfeature/1.0/anysampledfeature", + "https://w3id.org/isample/vocabulary/sampledfeature/1.0/pasthumanoccupationsite", + "https://w3id.org/isample/vocabulary/sampledfeature/1.0/earthinterior", + "https://w3id.org/isample/biology/biosampledfeature/1.0/Animalia", + "https://w3id.org/isample/vocabulary/sampledfeature/1.0/subaerialsurfaceenvironment", + "https://w3id.org/isample/vocabulary/sampledfeature/1.0/waterbody", + "https://w3id.org/isample/vocabulary/sampledfeature/1.0/marinewaterbodybottom", + "https://w3id.org/isample/vocabulary/sampledfeature/1.0/lakeriverstreambottom", + 
"https://w3id.org/isample/vocabulary/sampledfeature/1.0/marinewaterbody", + "https://w3id.org/isample/vocabulary/sampledfeature/1.0/terrestrialwaterbody" + ] + }, + "task_details": { + "task1": { + "status": "completed", + "queries_run": 3, + "runs_per_query": 3 + }, + "task2": { + "status": "completed", + "output_file": "/tmp/facet_source_counts.parquet", + "row_count": 4, + "size_bytes": 542, + "generation_time_ms": 32.17 + }, + "task3": { + "status": "completed", + "output_file": "/tmp/facet_material_counts.parquet", + "row_count": 19, + "size_bytes": 1284, + "generation_time_ms": 583.98 + }, + "task4": { + "status": "completed", + "output_file": "/tmp/facet_context_counts.parquet", + "row_count": 17, + "size_bytes": 1171, + "generation_time_ms": 569.76 + }, + "task5": { + "status": "completed", + "output_file": "/tmp/facet_summaries_all.parquet", + "row_count": 60, + "size_bytes": 2118, + "generation_time_ms": 984.68 + }, + "task6": { + "status": "completed", + "output_file": "/tmp/facet_source_material_cross.parquet", + "row_count": 24, + "size_bytes": 1266, + "generation_time_ms": 142.55, + "top_combinations": [ + { + "source": "SESAR", + "material": "https://w3id.org/isample/vocabulary/material/1.0/earthmaterial", + "count": 2233939 + }, + { + "source": "SESAR", + "material": "https://w3id.org/isample/vocabulary/material/1.0/rock", + "count": 912855 + }, + { + "source": "SESAR", + "material": "https://w3id.org/isample/vocabulary/material/1.0/mixedsoilsedimentrock", + "count": 838805 + }, + { + "source": "OPENCONTEXT", + "material": "https://w3id.org/isample/vocabulary/material/1.0/biogenicnonorganicmaterial", + "count": 745539 + }, + { + "source": "GEOME", + "material": "https://w3id.org/isample/vocabulary/material/1.0/organicmaterial", + "count": 605554 + } + ] + }, + "task7": { + "status": "completed", + "speedup_source": 8.7, + "speedup_material": 140.1 + } + } +} \ No newline at end of file diff --git a/experiments/h3_optimization/TASK_SPEC.md 
b/experiments/h3_optimization/TASK_SPEC.md new file mode 100644 index 0000000..39d8e4e --- /dev/null +++ b/experiments/h3_optimization/TASK_SPEC.md @@ -0,0 +1,216 @@ +# H3 Geospatial Optimization Task + +**Issue:** https://github.com/isamplesorg/pqg/issues/17 +**Goal:** Add H3 index columns to iSamples parquet and benchmark speedup + +## Data Source + +```python +PARQUET_URL = "https://pub-a18234d962364c22a50c787b7ca09fa5.r2.dev/isamples_202601_wide.parquet" +``` + +- ~280 MB, ~20M rows +- Contains samples from OPENCONTEXT, SESAR, GEOME, SMITHSONIAN + +## Schema (Relevant Columns) + +| Column | Type | Description | +|--------|------|-------------| +| `row_id` | INTEGER | Unique identifier | +| `otype` | VARCHAR | Entity type - filter to `'MaterialSampleRecord'` for samples | +| `latitude` | DOUBLE | WGS84 latitude (nullable) | +| `longitude` | DOUBLE | WGS84 longitude (nullable) | +| `n` | VARCHAR | Source: OPENCONTEXT, SESAR, GEOME, SMITHSONIAN | +| `label` | VARCHAR | Human-readable name | + +## Environment Setup + +```python +import duckdb + +# Install and load H3 extension +con = duckdb.connect() +con.execute("INSTALL h3; LOAD h3;") +``` + +## Task 1: Baseline Benchmark + +Measure current geospatial query performance. 
+ +```python +PARQUET_URL = "https://pub-a18234d962364c22a50c787b7ca09fa5.r2.dev/isamples_202601_wide.parquet" + +# Query 1: Bounding box - Western US +BBOX_QUERY = f""" +SELECT COUNT(*) as cnt +FROM read_parquet('{PARQUET_URL}') +WHERE otype = 'MaterialSampleRecord' + AND latitude BETWEEN 32 AND 42 + AND longitude BETWEEN -125 AND -110 +""" + +# Query 2: Bounding box with facet +BBOX_FACET_QUERY = f""" +SELECT n as source, COUNT(*) as cnt +FROM read_parquet('{PARQUET_URL}') +WHERE otype = 'MaterialSampleRecord' + AND latitude BETWEEN 32 AND 42 + AND longitude BETWEEN -125 AND -110 +GROUP BY n +""" + +# Query 3: Point radius (approximate - 1 degree ≈ 111km) +# San Francisco area, ~50km radius +RADIUS_QUERY = f""" +SELECT COUNT(*) as cnt +FROM read_parquet('{PARQUET_URL}') +WHERE otype = 'MaterialSampleRecord' + AND latitude BETWEEN 37.3 AND 38.1 + AND longitude BETWEEN -122.8 AND -122.0 +""" +``` + +**Measure:** Execute each query 3 times, report median time in milliseconds. + +## Task 2: Generate H3-Enhanced Parquet + +Add H3 columns at resolutions 4, 6, and 8. 
+ +```python +import duckdb +import time + +con = duckdb.connect() +con.execute("INSTALL h3; LOAD h3;") + +PARQUET_URL = "https://pub-a18234d962364c22a50c787b7ca09fa5.r2.dev/isamples_202601_wide.parquet" +OUTPUT_PATH = "/tmp/isamples_wide_h3.parquet" + +# Generate with H3 columns +query = f""" +COPY ( + SELECT *, + CASE WHEN latitude IS NOT NULL AND longitude IS NOT NULL + THEN h3_latlng_to_cell(latitude, longitude, 4) END as h3_res4, + CASE WHEN latitude IS NOT NULL AND longitude IS NOT NULL + THEN h3_latlng_to_cell(latitude, longitude, 6) END as h3_res6, + CASE WHEN latitude IS NOT NULL AND longitude IS NOT NULL + THEN h3_latlng_to_cell(latitude, longitude, 8) END as h3_res8 + FROM read_parquet('{PARQUET_URL}') +) TO '{OUTPUT_PATH}' (FORMAT PARQUET, COMPRESSION ZSTD); +""" + +start = time.time() +con.execute(query) +elapsed = time.time() - start +print(f"Generated in {elapsed:.1f}s") +``` + +**Report:** +- Original file size (MB) +- New file size with H3 (MB) +- Size increase percentage +- Row count with valid H3 (non-null lat/lon) + +## Task 3: H3 Benchmark + +Re-run equivalent queries using H3 filters. 
+ +```python +OUTPUT_PATH = "/tmp/isamples_wide_h3.parquet" + +# Get H3 cells covering the Western US bbox at res 4 +# (In practice, use h3 library to get these) +# For now, query to find the cells: +FIND_CELLS = f""" +SELECT DISTINCT h3_res4 +FROM read_parquet('{OUTPUT_PATH}') +WHERE latitude BETWEEN 32 AND 42 + AND longitude BETWEEN -125 AND -110 + AND h3_res4 IS NOT NULL +""" + +# Then filter by H3 cell instead of lat/lon +# This should be faster because H3 is an integer column with good stats +H3_BBOX_QUERY = f""" +SELECT COUNT(*) as cnt +FROM read_parquet('{OUTPUT_PATH}') +WHERE otype = 'MaterialSampleRecord' + AND h3_res4 IN (SELECT DISTINCT h3_res4 + FROM read_parquet('{OUTPUT_PATH}') + WHERE latitude BETWEEN 32 AND 42 + AND longitude BETWEEN -125 AND -110) +""" + +# For aggregation by location (clustering for map display) +H3_CLUSTER_QUERY = f""" +SELECT h3_res6, COUNT(*) as cnt, + AVG(latitude) as center_lat, + AVG(longitude) as center_lon +FROM read_parquet('{OUTPUT_PATH}') +WHERE otype = 'MaterialSampleRecord' + AND h3_res4 IN (...cells from above...) +GROUP BY h3_res6 +""" +``` + +## Task 4: Resolution Analysis + +Determine optimal H3 resolutions. + +```python +# Count distinct cells at each resolution +RESOLUTION_STATS = f""" +SELECT + COUNT(DISTINCT h3_res4) as unique_res4, + COUNT(DISTINCT h3_res6) as unique_res6, + COUNT(DISTINCT h3_res8) as unique_res8, + COUNT(*) as total_rows, + COUNT(h3_res4) as rows_with_h3 +FROM read_parquet('{OUTPUT_PATH}') +WHERE otype = 'MaterialSampleRecord' +""" +``` + +**Report:** Unique cells per resolution, average points per cell. 
+ +## Expected Output + +Generate a JSON results file: + +```json +{ + "baseline": { + "bbox_query_ms": 1234, + "bbox_facet_ms": 1456, + "radius_query_ms": 1123 + }, + "with_h3": { + "bbox_query_ms": 234, + "bbox_facet_ms": 345, + "cluster_query_ms": 456 + }, + "speedup": { + "bbox": 5.3, + "facet": 4.2 + }, + "file_size": { + "original_mb": 282, + "with_h3_mb": 310, + "increase_pct": 9.9 + }, + "h3_stats": { + "rows_with_coords": 5400000, + "unique_res4_cells": 1234, + "unique_res6_cells": 45678, + "unique_res8_cells": 234567 + } +} +``` + +## Output Files + +Save to `experiments/h3_optimization/results/`: +- `benchmark_results.json` - The JSON above +- `isamples_wide_h3.parquet` - Enhanced parquet (or note if too large to include) +- `benchmark_log.txt` - Full execution log diff --git a/experiments/h3_optimization/results/benchmark_results.json b/experiments/h3_optimization/results/benchmark_results.json new file mode 100644 index 0000000..e7c79c2 --- /dev/null +++ b/experiments/h3_optimization/results/benchmark_results.json @@ -0,0 +1,29 @@ +{ + "baseline": { + "bbox_query_ms": 170.0129508972168, + "bbox_facet_ms": 186.75613403320312, + "radius_query_ms": 179.39233779907227 + }, + "with_h3": { + "bbox_query_ms": 35.030364990234375, + "bbox_facet_ms": 38.38610649108887, + "cluster_query_ms": 51.71322822570801 + }, + "speedup": { + "bbox": 4.85, + "facet": 4.87 + }, + "file_size": { + "original_mb": 282, + "with_h3_mb": 292.4, + "increase_pct": 3.7, + "generation_time_s": 41.6 + }, + "h3_stats": { + "unique_res4_cells": 38406, + "unique_res6_cells": 111681, + "unique_res8_cells": 175653, + "total_sample_rows": 6680932, + "rows_with_coords": 5980282 + } +} \ No newline at end of file diff --git a/pqg/__main__.py b/pqg/__main__.py index 72a8687..8c23098 100644 --- a/pqg/__main__.py +++ b/pqg/__main__.py @@ -4,12 +4,15 @@ import json import logging +import os +import time import typing import click import duckdb import rich import rich.tree +import rich.console 
def _quote_sql_literal(value: str) -> str:
    """Escape single quotes so *value* can be embedded safely in a SQL string literal."""
    return value.replace("'", "''")


def _parquet_source(input_parquet: str) -> str:
    """Return a DuckDB ``read_parquet(...)`` expression for a local path or URL.

    Local paths are made absolute so queries work regardless of the process
    working directory; http(s) URLs are passed through unchanged.  The
    location is quoted so a path containing ``'`` cannot break the query.
    """
    if input_parquet.startswith(("http://", "https://")):
        location = input_parquet
    else:
        location = os.path.abspath(input_parquet)
    return f"read_parquet('{_quote_sql_literal(location)}')"


@cli.command("add-h3")
@click.pass_context
@click.argument("input_parquet")
@click.option("-o", "--output", required=True, help="Output parquet file path")
@click.option(
    "-r",
    "--resolutions",
    default="4,6,8",
    help="Comma-separated H3 resolutions to add (default: 4,6,8)",
)
@click.option(
    "--lat-col", default="latitude", help="Latitude column name (default: latitude)"
)
@click.option(
    "--lon-col", default="longitude", help="Longitude column name (default: longitude)"
)
def add_h3(
    ctx,
    input_parquet: str,
    output: str,
    resolutions: str,
    lat_col: str,
    lon_col: str,
):
    """Add H3 index columns to a parquet file.

    Creates a new parquet file with h3_resN columns for each specified resolution.
    Only rows with valid lat/lon will have H3 values; others will be NULL.

    Example:
        pqg add-h3 input.parquet -o output_h3.parquet
        pqg add-h3 input.parquet -o output.parquet -r 4,6
    """
    console = rich.console.Console()
    logger = get_logger()

    # Parse and validate requested resolutions.  H3 defines resolutions 0-15;
    # reject anything else up front with a clear CLI error instead of a raw
    # ValueError / DuckDB failure mid-run.
    try:
        res_list = [int(r.strip()) for r in resolutions.split(",") if r.strip()]
    except ValueError as err:
        raise click.BadParameter(
            f"Invalid --resolutions value: {resolutions!r}"
        ) from err
    if not res_list:
        raise click.BadParameter("At least one H3 resolution is required.")
    out_of_range = [r for r in res_list if not 0 <= r <= 15]
    if out_of_range:
        raise click.BadParameter(
            f"H3 resolutions must be in 0..15, got: {out_of_range}"
        )
    logger.info(f"Adding H3 columns at resolutions: {res_list}")

    con = ctx.obj["dbinstance"]

    # Install and load H3 extension (community extension)
    console.print("[blue]Installing H3 extension from community...[/blue]")
    con.execute("INSTALL h3 FROM community; LOAD h3;")

    # One CASE expression per resolution; rows without coordinates fall
    # through the CASE and yield NULL h3_resN values.
    h3_select = ", ".join(
        f"CASE WHEN {lat_col} IS NOT NULL AND {lon_col} IS NOT NULL "
        f"THEN h3_latlng_to_cell({lat_col}, {lon_col}, {res}) END as h3_res{res}"
        for res in res_list
    )

    source = _parquet_source(input_parquet)
    output_abs = os.path.abspath(output)

    query = f"""
    COPY (
        SELECT *,
               {h3_select}
        FROM {source}
    ) TO '{_quote_sql_literal(output_abs)}' (FORMAT PARQUET, COMPRESSION ZSTD);
    """

    console.print(f"[blue]Processing {input_parquet}...[/blue]")
    start = time.time()
    con.execute(query)
    elapsed = time.time() - start

    # COUNT(col) skips NULLs, so with_h3 counts only rows that had coordinates.
    stats = con.sql(
        f"SELECT COUNT(*) as total, COUNT(h3_res{res_list[0]}) as with_h3 "
        f"FROM read_parquet('{_quote_sql_literal(output_abs)}')"
    ).fetchone()
    total_rows, rows_with_h3 = stats

    output_size = os.path.getsize(output) / (1024 * 1024)
    # Guard against division by zero on an empty input file.
    h3_pct = (100 * rows_with_h3 / total_rows) if total_rows else 0.0

    console.print(f"[green]✓ Generated {output}[/green]")
    console.print(f"  Size: {output_size:.1f} MB")
    console.print(f"  Total rows: {total_rows:,}")
    console.print(f"  Rows with H3: {rows_with_h3:,} ({h3_pct:.1f}%)")
    console.print(f"  Time: {elapsed:.1f}s")


@cli.command("facet-summaries")
@click.pass_context
@click.argument("input_parquet")
@click.option(
    "-o",
    "--output-dir",
    required=True,
    help="Output directory for summary files",
)
@click.option(
    "--otype-filter",
    default="MaterialSampleRecord",
    help="Filter to this otype (default: MaterialSampleRecord)",
)
@click.option(
    "--min-cross-count",
    default=100,
    type=int,
    help="Keep only cross-facet combinations with count greater than this (default: 100)",
)
def facet_summaries(
    ctx,
    input_parquet: str,
    output_dir: str,
    otype_filter: str,
    min_cross_count: int,
):
    """Generate pre-computed facet summary tables from a wide parquet file.

    Creates two output files:
    - facet_summaries_all.parquet: Combined counts for source, material, context, object_type
    - facet_source_material_cross.parquet: Source × material cross-tabulation

    Example:
        pqg facet-summaries wide.parquet -o summaries/
    """
    console = rich.console.Console()
    con = ctx.obj["dbinstance"]

    # Ensure output directory exists
    os.makedirs(output_dir, exist_ok=True)

    source = _parquet_source(input_parquet)

    # Empty filter means "all otypes".  Quote the value so a filter string
    # containing ' cannot break out of the SQL literal.
    otype_clause = (
        f"otype = '{_quote_sql_literal(otype_filter)}'" if otype_filter else "1=1"
    )

    # Generate combined facet summaries
    console.print("[blue]Generating combined facet summaries...[/blue]")
    start = time.time()

    combined_path = os.path.join(output_dir, "facet_summaries_all.parquet")

    # The material/context/object_type facets share one shape: unnest a list
    # of IdentifiedConcept row_ids, join back to the concept rows for labels,
    # and count.  Build them from a single template instead of copy-pasting.
    concept_facets = {
        "material": "p__has_material_category",
        "context": "p__has_context_category",
        "object_type": "p__has_sample_object_type",
    }
    concept_selects = [
        f"""
        SELECT '{facet}' as facet_type, c.label as facet_value, c.scheme_name as scheme, COUNT(*) as count
        FROM (
            SELECT UNNEST({column}) as concept_id
            FROM {source}
            WHERE {otype_clause} AND {column} IS NOT NULL
        ) s
        JOIN (SELECT row_id, label, scheme_name FROM {source} WHERE otype = 'IdentifiedConcept') c
            ON c.row_id = s.concept_id
        GROUP BY c.label, c.scheme_name
        """
        for facet, column in concept_facets.items()
    ]

    source_select = f"""
        SELECT 'source' as facet_type, n as facet_value, NULL as scheme, COUNT(*) as count
        FROM {source}
        WHERE {otype_clause}
        GROUP BY n
    """

    combined_query = (
        "COPY (\n"
        + "\nUNION ALL\n".join([source_select, *concept_selects])
        + f"\n) TO '{_quote_sql_literal(combined_path)}' (FORMAT PARQUET);"
    )
    con.execute(combined_query)

    combined_stats = con.sql(
        f"SELECT COUNT(*) FROM read_parquet('{_quote_sql_literal(combined_path)}')"
    ).fetchone()
    combined_size = os.path.getsize(combined_path)

    console.print(f"[green]✓ {combined_path}[/green]")
    console.print(f"  Rows: {combined_stats[0]}, Size: {combined_size:,} bytes")

    # Generate cross-facet summary
    console.print("[blue]Generating source × material cross-tabulation...[/blue]")

    cross_path = os.path.join(output_dir, "facet_source_material_cross.parquet")
    cross_query = f"""
    COPY (
        SELECT
            s.source,
            c.label as material,
            COUNT(*) as count
        FROM (
            SELECT n as source, UNNEST(p__has_material_category) as material_id
            FROM {source}
            WHERE {otype_clause} AND p__has_material_category IS NOT NULL
        ) s
        JOIN (SELECT row_id, label FROM {source} WHERE otype = 'IdentifiedConcept') c
            ON c.row_id = s.material_id
        GROUP BY s.source, c.label
        HAVING COUNT(*) > {min_cross_count}
        ORDER BY count DESC
    ) TO '{_quote_sql_literal(cross_path)}' (FORMAT PARQUET);
    """
    con.execute(cross_query)

    cross_stats = con.sql(
        f"SELECT COUNT(*) FROM read_parquet('{_quote_sql_literal(cross_path)}')"
    ).fetchone()
    cross_size = os.path.getsize(cross_path)

    elapsed = time.time() - start

    console.print(f"[green]✓ {cross_path}[/green]")
    console.print(f"  Rows: {cross_stats[0]}, Size: {cross_size:,} bytes")
    console.print(f"[green]Total time: {elapsed:.1f}s[/green]")

    # Print summary
    console.print("\n[bold]Summary:[/bold]")
    facet_counts = con.sql(
        f"SELECT facet_type, COUNT(*) as n, SUM(count) as total "
        f"FROM read_parquet('{_quote_sql_literal(combined_path)}') GROUP BY facet_type"
    ).fetchall()
    for row in facet_counts:
        console.print(f"  {row[0]}: {row[1]} values, {row[2]:,} total records")


if __name__ == "__main__":
    cli()