diff --git a/examples/minimal/README.md b/examples/minimal/README.md new file mode 100644 index 0000000..7a5943b --- /dev/null +++ b/examples/minimal/README.md @@ -0,0 +1,195 @@ +# Minimal PQG Example Data + +This directory contains small, hand-crafted example datasets to help understand the iSamples PQG format. The same data is represented in JSON, CSV, and all three parquet formats (export, narrow, wide). + +## Dataset Overview + +**Domain**: Geological rock samples from Mount Rainier volcanic monitoring project + +**Entities**: +- 3 MaterialSampleRecords (samples) +- 3 SamplingEvents (collection/preparation events) +- 2 GeospatialCoordLocations (coordinates) +- 1 SamplingSite (Mount Rainier Summit Area) +- 1 Agent (Jane Smith, collector) + +**Relationships demonstrated**: +- Sample → produced_by → SamplingEvent (how samples are created) +- Sample → derivedFrom → Sample (parent/child relationship) +- SamplingEvent → sample_location → GeospatialCoordLocation +- SamplingEvent → sampling_site → SamplingSite +- SamplingSite → site_location → GeospatialCoordLocation + +## File Structure + +``` +minimal/ +├── json/ +│ ├── 1_sample.json # Single sample (simplest case) +│ └── 3_samples.json # Three related samples +├── csv/ +│ ├── samples.csv # MaterialSampleRecords +│ ├── events.csv # SamplingEvents +│ ├── locations.csv # GeospatialCoordLocations +│ ├── sites.csv # SamplingSites +│ ├── agents.csv # Agents +│ └── edges.csv # Relationships (for narrow format) +└── parquet/ + ├── minimal_export.parquet # Export format (3 rows, nested) + ├── minimal_narrow.parquet # Narrow format (21 rows, with edges) + └── minimal_wide.parquet # Wide format (10 rows, p__* columns) +``` + +## The Three Parquet Formats + +### Export Format (`minimal_export.parquet`) +- **3 rows** - one per sample +- Sample-centric with nested structs for related entities +- Best for: Simple queries on sample properties +- Coordinates pre-extracted to `sample_location_latitude/longitude` + +### Narrow Format 
(`minimal_narrow.parquet`) +- **21 rows** - 10 entities + 11 edge rows +- Graph-normalized with explicit `_edge_` rows +- Columns `s` (subject), `p` (predicate), `o` (object array) +- Best for: Graph traversal, flexible relationship queries + +### Wide Format (`minimal_wide.parquet`) +- **10 rows** - one per entity (no edge rows) +- Relationships stored as `p__*` columns with row_id arrays +- Best for: Fast entity queries, smaller file size, analytical queries + +## Example Queries + +### Query 1: Find all samples (works in all formats) + +**Export format:** +```sql +SELECT sample_identifier, label +FROM read_parquet('parquet/minimal_export.parquet') +``` + +**Wide format:** +```sql +SELECT pid, label +FROM read_parquet('parquet/minimal_wide.parquet') +WHERE otype = 'MaterialSampleRecord' +``` + +**Narrow format:** +```sql +SELECT pid, label +FROM read_parquet('parquet/minimal_narrow.parquet') +WHERE otype = 'MaterialSampleRecord' +``` + +### Query 2: Find samples with their locations + +**Wide format (uses p__* columns):** +```sql +SELECT + s.pid as sample, + s.label, + loc.latitude, + loc.longitude +FROM read_parquet('parquet/minimal_wide.parquet') s +JOIN read_parquet('parquet/minimal_wide.parquet') e + ON e.otype = 'SamplingEvent' + AND list_contains(s.p__produced_by, e.row_id) +JOIN read_parquet('parquet/minimal_wide.parquet') loc + ON loc.otype = 'GeospatialCoordLocation' + AND list_contains(e.p__sample_location, loc.row_id) +WHERE s.otype = 'MaterialSampleRecord' +``` + +**Narrow format (uses edge rows):** +```sql +SELECT + s.pid as sample, + s.label, + loc.latitude, + loc.longitude +FROM read_parquet('parquet/minimal_narrow.parquet') s +JOIN read_parquet('parquet/minimal_narrow.parquet') e1 + ON e1.otype = '_edge_' + AND e1.s = s.row_id + AND e1.p = 'produced_by' +JOIN read_parquet('parquet/minimal_narrow.parquet') ev + ON ev.otype = 'SamplingEvent' + AND list_contains(e1.o, ev.row_id) +JOIN read_parquet('parquet/minimal_narrow.parquet') e2 + ON e2.otype = 
'_edge_' + AND e2.s = ev.row_id + AND e2.p = 'sample_location' +JOIN read_parquet('parquet/minimal_narrow.parquet') loc + ON loc.otype = 'GeospatialCoordLocation' + AND list_contains(e2.o, loc.row_id) +WHERE s.otype = 'MaterialSampleRecord' +``` + +### Query 3: Count entities by type + +```sql +SELECT otype, COUNT(*) as count +FROM read_parquet('parquet/minimal_wide.parquet') +GROUP BY otype +ORDER BY count DESC +``` + +Expected output: +``` +MaterialSampleRecord 3 +SamplingEvent 3 +GeospatialCoordLocation 2 +SamplingSite 1 +Agent 1 +``` + +## JSON Schema Validation + +The JSON files validate against the iSamples Core 1.0 schema: + +```python +import json +from jsonschema import validate + +# Load schema (from isamplesorg-metadata repo) +with open('path/to/iSamplesSchemaCore1.0.json') as f: + schema = json.load(f) + +# Load and validate +with open('json/1_sample.json') as f: + sample = json.load(f) + +validate(instance=sample, schema=schema) # Raises if invalid +``` + +## Entity Relationship Diagram + +``` +MaterialSampleRecord ──produced_by──► SamplingEvent ──sample_location──► GeospatialCoordLocation + │ │ + │ └──sampling_site──► SamplingSite ──site_location──► GeospatialCoordLocation + │ + ├──registrant──► Agent + │ + └──derivedFrom──► MaterialSampleRecord (parent sample) +``` + +## Size Comparison + +| Format | Rows | File Size | Notes | +|--------|------|-----------|-------| +| Export | 3 | 1.7 KB | Nested structs, sample-centric | +| Narrow | 21 | 4.8 KB | Explicit edge rows | +| Wide | 10 | 5.0 KB | p__* columns | + +In production datasets: +- Wide is typically 60-70% smaller than narrow +- Export is smallest but less flexible for complex queries + +## See Also + +- [PQG Specification](../../docs/PQG_SPECIFICATION.md) - Full format specification +- [Edge Types](../../pqg/edge_types.py) - All 14 iSamples edge types +- [Schema Definitions](../../pqg/schemas/) - Python schema validators diff --git a/examples/minimal/csv/agents.csv 
b/examples/minimal/csv/agents.csv new file mode 100644 index 0000000..0e3c700 --- /dev/null +++ b/examples/minimal/csv/agents.csv @@ -0,0 +1,4 @@ +agent_id,name,role,affiliation,contact_information +agent:jsmith,Jane Smith,collector,University of Washington,jsmith@uw.edu +agent:labtech,Lab Technician,preparer,University of Washington, +agent:curator,Collections Manager,curator,Burke Museum, diff --git a/examples/minimal/csv/edges.csv b/examples/minimal/csv/edges.csv new file mode 100644 index 0000000..1090eab --- /dev/null +++ b/examples/minimal/csv/edges.csv @@ -0,0 +1,14 @@ +subject_id,predicate,object_id,description +ark:/99999/example001,produced_by,event:example001,Sample was produced by this sampling event +ark:/99999/example002,produced_by,event:example002,Sample was produced by this sampling event +ark:/99999/example003,produced_by,event:example003,Sample was produced by this sampling event +ark:/99999/example002,derivedFrom,ark:/99999/example001,Thin section derived from parent rock sample +ark:/99999/example003,relatedTo,ark:/99999/example001,Sibling sample from same site +event:example001,sample_location,loc:rainier001,Event occurred at this location +event:example003,sample_location,loc:rainier002,Event occurred at this location +event:example001,sampling_site,site:rainier001,Event occurred at this site +event:example003,sampling_site,site:rainier001,Event occurred at this site +site:rainier001,site_location,loc:rainier001,Site is at this location +ark:/99999/example001,registrant,agent:jsmith,Sample registered by this agent +ark:/99999/example002,registrant,agent:jsmith,Sample registered by this agent +ark:/99999/example003,registrant,agent:jsmith,Sample registered by this agent diff --git a/examples/minimal/csv/events.csv b/examples/minimal/csv/events.csv new file mode 100644 index 0000000..f9d8e04 --- /dev/null +++ b/examples/minimal/csv/events.csv @@ -0,0 +1,4 @@ 
+event_id,label,description,result_time,project,feature_of_interest,site_id,location_id,collector_id +event:example001,Mount Rainier Field Collection 2024-06-10,Field collection during summer geology survey,2024-06-10,Cascade Volcanic Monitoring Project,Recent lava flow on Mount Rainier,site:rainier001,loc:rainier001,agent:jsmith +event:example002,Lab Preparation 2024-07-01,Thin section preparation in petrology lab,2024-07-01,,,,,agent:labtech +event:example003,Mount Rainier Field Collection 2024-06-10 (Site B),Field collection 10m from first sample,2024-06-10,Cascade Volcanic Monitoring Project,Recent lava flow on Mount Rainier,site:rainier001,loc:rainier002,agent:jsmith diff --git a/examples/minimal/csv/locations.csv b/examples/minimal/csv/locations.csv new file mode 100644 index 0000000..d63ebb2 --- /dev/null +++ b/examples/minimal/csv/locations.csv @@ -0,0 +1,3 @@ +location_id,latitude,longitude,elevation,obfuscated +loc:rainier001,46.8523,-121.7603,4392 m above mean sea level,false +loc:rainier002,46.8524,-121.7601,4390 m above mean sea level,false diff --git a/examples/minimal/csv/samples.csv b/examples/minimal/csv/samples.csv new file mode 100644 index 0000000..b73ebd3 --- /dev/null +++ b/examples/minimal/csv/samples.csv @@ -0,0 +1,4 @@ +sample_id,label,description,last_modified_time,event_id,material_category,sample_object_type,registrant_id +ark:/99999/example001,Rock Sample MR-001 (Parent),"Basalt collected during 2024 field survey. 
Fresh, unweathered sample from recent lava flow.",2024-06-15T10:30:00Z,event:example001,rock,physicalspecimen,agent:jsmith +ark:/99999/example002,Rock Sample MR-001-A (Child - Thin Section),Thin section prepared from parent sample MR-001 for petrographic analysis.,2024-07-01T14:00:00Z,event:example002,rock,thinsection,agent:jsmith +ark:/99999/example003,Rock Sample MR-002,"Second basalt sample from same site, collected 10m away from MR-001.",2024-06-15T11:00:00Z,event:example003,rock,physicalspecimen,agent:jsmith diff --git a/examples/minimal/csv/sites.csv b/examples/minimal/csv/sites.csv new file mode 100644 index 0000000..d2573e4 --- /dev/null +++ b/examples/minimal/csv/sites.csv @@ -0,0 +1,2 @@ +site_id,label,description,place_name +site:rainier001,Mount Rainier Summit Area,Collection site near the summit crater rim,"Mount Rainier, Pierce County, Washington, USA" diff --git a/examples/minimal/json/1_sample.json b/examples/minimal/json/1_sample.json new file mode 100644 index 0000000..ac915fd --- /dev/null +++ b/examples/minimal/json/1_sample.json @@ -0,0 +1,86 @@ +{ + "sample_identifier": "ark:/99999/example001", + "label": "Rock Sample from Mount Rainier", + "description": "Basalt collected during 2024 field survey. 
Fresh, unweathered sample from recent lava flow.", + "last_modified_time": "2024-06-15T10:30:00Z", + "produced_by": { + "label": "Mount Rainier Field Collection 2024-06-10", + "description": "Field collection during summer geology survey", + "result_time": "2024-06-10", + "project": "Cascade Volcanic Monitoring Project", + "has_feature_of_interest": "Recent lava flow on Mount Rainier", + "sampling_site": { + "label": "Mount Rainier Summit Area", + "description": "Collection site near the summit crater rim", + "place_name": ["Mount Rainier", "Pierce County", "Washington", "USA"], + "sample_location": { + "latitude": 46.8523, + "longitude": -121.7603, + "elevation": "4392 m above mean sea level", + "obfuscated": false + } + }, + "responsibility": [ + { + "name": "Jane Smith", + "role": "collector", + "affiliation": "University of Washington", + "contact_information": "jsmith@uw.edu" + } + ] + }, + "has_material_category": [ + { + "identifier": "https://w3id.org/isample/vocabulary/material/1.0/rock", + "label": "Rock", + "scheme_name": "iSamples Material Type" + } + ], + "has_context_category": [ + { + "identifier": "https://w3id.org/isample/vocabulary/sampledfeature/1.0/earthinterior", + "label": "Earth interior", + "scheme_name": "iSamples Sampled Feature Type" + } + ], + "has_sample_object_type": [ + { + "identifier": "https://w3id.org/isample/vocabulary/specimentype/1.0/physicalspecimen", + "label": "Physical specimen", + "scheme_name": "iSamples Specimen Type" + } + ], + "keywords": [ + { + "keyword": "basalt", + "scheme_name": "Free text" + }, + { + "keyword": "volcanic rock", + "scheme_name": "Free text" + }, + { + "keyword": "Cascade Range", + "scheme_name": "Geographic" + } + ], + "registrant": { + "name": "Jane Smith", + "affiliation": "University of Washington", + "contact_information": "jsmith@uw.edu", + "role": "registrant" + }, + "curation": { + "label": "UW Geology Sample Collection", + "description": "Stored in climate-controlled facility", + 
"curation_location": "University of Washington, Burke Museum, Room 142, Drawer B-15", + "access_constraints": ["By appointment only", "Research use only"], + "responsibility": [ + { + "name": "Collections Manager", + "role": "curator", + "affiliation": "Burke Museum" + } + ] + } +} diff --git a/examples/minimal/json/3_samples.json b/examples/minimal/json/3_samples.json new file mode 100644 index 0000000..02e47fe --- /dev/null +++ b/examples/minimal/json/3_samples.json @@ -0,0 +1,146 @@ +[ + { + "sample_identifier": "ark:/99999/example001", + "label": "Rock Sample MR-001 (Parent)", + "description": "Basalt collected during 2024 field survey. Fresh, unweathered sample from recent lava flow. This is the original field sample.", + "last_modified_time": "2024-06-15T10:30:00Z", + "produced_by": { + "label": "Mount Rainier Field Collection 2024-06-10", + "identifier": "event:example001", + "result_time": "2024-06-10", + "project": "Cascade Volcanic Monitoring Project", + "has_feature_of_interest": "Recent lava flow on Mount Rainier", + "sampling_site": { + "identifier": "site:rainier001", + "label": "Mount Rainier Summit Area", + "place_name": ["Mount Rainier", "Pierce County", "Washington", "USA"], + "sample_location": { + "latitude": 46.8523, + "longitude": -121.7603, + "elevation": "4392 m above mean sea level" + } + }, + "responsibility": [ + { + "name": "Jane Smith", + "role": "collector", + "affiliation": "University of Washington" + } + ] + }, + "has_material_category": [ + { + "identifier": "https://w3id.org/isample/vocabulary/material/1.0/rock", + "label": "Rock", + "scheme_name": "iSamples Material Type" + } + ], + "has_sample_object_type": [ + { + "identifier": "https://w3id.org/isample/vocabulary/specimentype/1.0/physicalspecimen", + "label": "Physical specimen" + } + ], + "registrant": { + "name": "Jane Smith", + "affiliation": "University of Washington" + } + }, + { + "sample_identifier": "ark:/99999/example002", + "label": "Rock Sample MR-001-A (Child - 
Thin Section)", + "description": "Thin section prepared from parent sample MR-001 for petrographic analysis.", + "last_modified_time": "2024-07-01T14:00:00Z", + "produced_by": { + "label": "Lab Preparation 2024-07-01", + "identifier": "event:example002", + "result_time": "2024-07-01", + "description": "Thin section preparation in petrology lab", + "responsibility": [ + { + "name": "Lab Technician", + "role": "preparer", + "affiliation": "University of Washington" + } + ] + }, + "has_material_category": [ + { + "identifier": "https://w3id.org/isample/vocabulary/material/1.0/rock", + "label": "Rock" + } + ], + "has_sample_object_type": [ + { + "identifier": "https://w3id.org/isample/vocabulary/specimentype/1.0/thinsection", + "label": "Thin section" + } + ], + "related_resource": [ + { + "label": "Parent sample", + "relationship": "derivedFrom", + "target": "ark:/99999/example001", + "description": "This thin section was prepared from the parent rock sample" + } + ], + "registrant": { + "name": "Jane Smith", + "affiliation": "University of Washington" + } + }, + { + "sample_identifier": "ark:/99999/example003", + "label": "Rock Sample MR-002", + "description": "Second basalt sample from same site, collected 10m away from MR-001.", + "last_modified_time": "2024-06-15T11:00:00Z", + "produced_by": { + "label": "Mount Rainier Field Collection 2024-06-10 (Site B)", + "identifier": "event:example003", + "result_time": "2024-06-10", + "project": "Cascade Volcanic Monitoring Project", + "has_feature_of_interest": "Recent lava flow on Mount Rainier", + "sampling_site": { + "identifier": "site:rainier001", + "label": "Mount Rainier Summit Area", + "place_name": ["Mount Rainier", "Pierce County", "Washington", "USA"], + "sample_location": { + "latitude": 46.8524, + "longitude": -121.7601, + "elevation": "4390 m above mean sea level" + } + }, + "responsibility": [ + { + "name": "Jane Smith", + "role": "collector", + "affiliation": "University of Washington" + } + ] + }, + 
"has_material_category": [ + { + "identifier": "https://w3id.org/isample/vocabulary/material/1.0/rock", + "label": "Rock" + } + ], + "has_sample_object_type": [ + { + "identifier": "https://w3id.org/isample/vocabulary/specimentype/1.0/physicalspecimen", + "label": "Physical specimen" + } + ], + "related_resource": [ + { + "label": "Sibling sample", + "relationship": "relatedTo", + "target": "ark:/99999/example001", + "description": "Collected from same site as MR-001" + } + ], + "registrant": { + "name": "Jane Smith", + "affiliation": "University of Washington" + } + } +] diff --git a/examples/minimal/parquet/minimal_export.parquet b/examples/minimal/parquet/minimal_export.parquet new file mode 100644 index 0000000..d06011e Binary files /dev/null and b/examples/minimal/parquet/minimal_export.parquet differ diff --git a/examples/minimal/parquet/minimal_narrow.parquet b/examples/minimal/parquet/minimal_narrow.parquet new file mode 100644 index 0000000..64d12bb Binary files /dev/null and b/examples/minimal/parquet/minimal_narrow.parquet differ diff --git a/examples/minimal/parquet/minimal_wide.parquet b/examples/minimal/parquet/minimal_wide.parquet new file mode 100644 index 0000000..3f7e38a Binary files /dev/null and b/examples/minimal/parquet/minimal_wide.parquet differ diff --git a/experiments/facet_optimization/TASK_SPEC.md b/experiments/facet_optimization/TASK_SPEC.md new file mode 100644 index 0000000..2e1183f --- /dev/null +++ b/experiments/facet_optimization/TASK_SPEC.md @@ -0,0 +1,334 @@ +# Facet Metadata Optimization Task + +**Issue:** https://github.com/isamplesorg/pqg/issues/18 +**Goal:** Generate pre-computed facet summary tables for instant dashboard queries + +## Data Source + +```python +PARQUET_URL = "https://pub-a18234d962364c22a50c787b7ca09fa5.r2.dev/isamples_202601_wide.parquet" +``` + +- ~280 MB, ~20M rows +- Contains samples from OPENCONTEXT, SESAR, GEOME, SMITHSONIAN + +## Schema (Relevant Columns) + +| Column | Type | Description | 
+|--------|------|-------------| +| `row_id` | INTEGER | Unique identifier | +| `otype` | VARCHAR | Entity type - `'MaterialSampleRecord'` for samples | +| `n` | VARCHAR | Source: OPENCONTEXT, SESAR, GEOME, SMITHSONIAN | +| `label` | VARCHAR | Human-readable name | +| `p__has_material_category` | INTEGER[] | Array of row_ids pointing to IdentifiedConcept | +| `p__has_context_category` | INTEGER[] | Array of row_ids pointing to IdentifiedConcept | +| `p__has_sample_object_type` | INTEGER[] | Array of row_ids pointing to IdentifiedConcept | + +For IdentifiedConcept rows (otype = 'IdentifiedConcept'): +| Column | Type | Description | +|--------|------|-------------| +| `row_id` | INTEGER | Unique identifier (referenced by p__* arrays) | +| `label` | VARCHAR | Concept label (e.g., "Rock", "Earth interior") | +| `scheme_name` | VARCHAR | Vocabulary name | + +## Task 1: Baseline Benchmark + +Measure current facet query performance. + +```python +import duckdb +import time + +con = duckdb.connect() +PARQUET_URL = "https://pub-a18234d962364c22a50c787b7ca09fa5.r2.dev/isamples_202601_wide.parquet" + +# Query 1: Source facet counts +SOURCE_FACET = f""" +SELECT n as source, COUNT(*) as count +FROM read_parquet('{PARQUET_URL}') +WHERE otype = 'MaterialSampleRecord' +GROUP BY n +ORDER BY count DESC +""" + +# Query 2: Material category facet (requires join) +MATERIAL_FACET = f""" +WITH samples AS ( + SELECT row_id, UNNEST(p__has_material_category) as material_id + FROM read_parquet('{PARQUET_URL}') + WHERE otype = 'MaterialSampleRecord' + AND p__has_material_category IS NOT NULL +), +concepts AS ( + SELECT row_id, label + FROM read_parquet('{PARQUET_URL}') + WHERE otype = 'IdentifiedConcept' +) +SELECT c.label as material, COUNT(*) as count +FROM samples s +JOIN concepts c ON c.row_id = s.material_id +GROUP BY c.label +ORDER BY count DESC +LIMIT 50 +""" + +# Query 3: Entity type counts (quick sanity check) +OTYPE_COUNTS = f""" +SELECT otype, COUNT(*) as count +FROM 
read_parquet('{PARQUET_URL}') +GROUP BY otype +ORDER BY count DESC +""" +``` + +**Measure:** Execute each query 3 times, report median time in milliseconds. + +## Task 2: Generate Source Facet Summary + +Simple aggregation - should be tiny file. + +```python +OUTPUT_PATH = "/tmp/facet_source_counts.parquet" + +query = f""" +COPY ( + SELECT + 'source' as facet_type, + n as facet_value, + COUNT(*) as count + FROM read_parquet('{PARQUET_URL}') + WHERE otype = 'MaterialSampleRecord' + GROUP BY n + ORDER BY count DESC +) TO '{OUTPUT_PATH}' (FORMAT PARQUET); +""" +con.execute(query) +``` + +## Task 3: Generate Material Category Facet Summary + +Requires joining through the relationship arrays. + +```python +OUTPUT_PATH = "/tmp/facet_material_counts.parquet" + +query = f""" +COPY ( + WITH samples AS ( + SELECT UNNEST(p__has_material_category) as material_id + FROM read_parquet('{PARQUET_URL}') + WHERE otype = 'MaterialSampleRecord' + AND p__has_material_category IS NOT NULL + ), + concepts AS ( + SELECT row_id, label, scheme_name + FROM read_parquet('{PARQUET_URL}') + WHERE otype = 'IdentifiedConcept' + ) + SELECT + 'material' as facet_type, + c.label as facet_value, + c.scheme_name as scheme, + COUNT(*) as count + FROM samples s + JOIN concepts c ON c.row_id = s.material_id + GROUP BY c.label, c.scheme_name + ORDER BY count DESC +) TO '{OUTPUT_PATH}' (FORMAT PARQUET); +""" +con.execute(query) +``` + +## Task 4: Generate Context Category Facet Summary + +```python +OUTPUT_PATH = "/tmp/facet_context_counts.parquet" + +query = f""" +COPY ( + WITH samples AS ( + SELECT UNNEST(p__has_context_category) as context_id + FROM read_parquet('{PARQUET_URL}') + WHERE otype = 'MaterialSampleRecord' + AND p__has_context_category IS NOT NULL + ), + concepts AS ( + SELECT row_id, label, scheme_name + FROM read_parquet('{PARQUET_URL}') + WHERE otype = 'IdentifiedConcept' + ) + SELECT + 'context' as facet_type, + c.label as facet_value, + c.scheme_name as scheme, + COUNT(*) as count + FROM 
samples s + JOIN concepts c ON c.row_id = s.context_id + GROUP BY c.label, c.scheme_name + ORDER BY count DESC +) TO '{OUTPUT_PATH}' (FORMAT PARQUET); +""" +con.execute(query) +``` + +## Task 5: Generate Combined Facet Summary + +All facets in one file for easy loading. + +```python +OUTPUT_PATH = "/tmp/facet_summaries_all.parquet" + +query = f""" +COPY ( + -- Source facet + SELECT 'source' as facet_type, n as facet_value, NULL as scheme, COUNT(*) as count + FROM read_parquet('{PARQUET_URL}') + WHERE otype = 'MaterialSampleRecord' + GROUP BY n + + UNION ALL + + -- Material facet + SELECT 'material' as facet_type, c.label as facet_value, c.scheme_name as scheme, COUNT(*) as count + FROM ( + SELECT UNNEST(p__has_material_category) as material_id + FROM read_parquet('{PARQUET_URL}') + WHERE otype = 'MaterialSampleRecord' AND p__has_material_category IS NOT NULL + ) s + JOIN (SELECT row_id, label, scheme_name FROM read_parquet('{PARQUET_URL}') WHERE otype = 'IdentifiedConcept') c + ON c.row_id = s.material_id + GROUP BY c.label, c.scheme_name + + UNION ALL + + -- Context facet + SELECT 'context' as facet_type, c.label as facet_value, c.scheme_name as scheme, COUNT(*) as count + FROM ( + SELECT UNNEST(p__has_context_category) as context_id + FROM read_parquet('{PARQUET_URL}') + WHERE otype = 'MaterialSampleRecord' AND p__has_context_category IS NOT NULL + ) s + JOIN (SELECT row_id, label, scheme_name FROM read_parquet('{PARQUET_URL}') WHERE otype = 'IdentifiedConcept') c + ON c.row_id = s.context_id + GROUP BY c.label, c.scheme_name + + UNION ALL + + -- Object type facet + SELECT 'object_type' as facet_type, c.label as facet_value, c.scheme_name as scheme, COUNT(*) as count + FROM ( + SELECT UNNEST(p__has_sample_object_type) as type_id + FROM read_parquet('{PARQUET_URL}') + WHERE otype = 'MaterialSampleRecord' AND p__has_sample_object_type IS NOT NULL + ) s + JOIN (SELECT row_id, label, scheme_name FROM read_parquet('{PARQUET_URL}') WHERE otype = 'IdentifiedConcept') c 
+ ON c.row_id = s.type_id + GROUP BY c.label, c.scheme_name +) TO '{OUTPUT_PATH}' (FORMAT PARQUET); +""" +con.execute(query) +``` + +## Task 6: Generate Cross-Facet Summary (Source × Material) + +For "how many Rock samples from OPENCONTEXT?" + +```python +OUTPUT_PATH = "/tmp/facet_source_material_cross.parquet" + +query = f""" +COPY ( + SELECT + s.source, + c.label as material, + COUNT(*) as count + FROM ( + SELECT n as source, UNNEST(p__has_material_category) as material_id + FROM read_parquet('{PARQUET_URL}') + WHERE otype = 'MaterialSampleRecord' AND p__has_material_category IS NOT NULL + ) s + JOIN (SELECT row_id, label FROM read_parquet('{PARQUET_URL}') WHERE otype = 'IdentifiedConcept') c + ON c.row_id = s.material_id + GROUP BY s.source, c.label + HAVING COUNT(*) > 100 -- Filter out tiny combinations + ORDER BY count DESC +) TO '{OUTPUT_PATH}' (FORMAT PARQUET); +""" +con.execute(query) +``` + +## Task 7: Benchmark Summary Table Queries + +Compare querying summary tables vs full data. 
+ +```python +# Load summary and query +SUMMARY_PATH = "/tmp/facet_summaries_all.parquet" + +# This should be nearly instant +FAST_SOURCE_FACET = f""" +SELECT facet_value, count +FROM read_parquet('{SUMMARY_PATH}') +WHERE facet_type = 'source' +ORDER BY count DESC +""" + +FAST_MATERIAL_FACET = f""" +SELECT facet_value, count +FROM read_parquet('{SUMMARY_PATH}') +WHERE facet_type = 'material' +ORDER BY count DESC +""" +``` + +## Expected Output + +Generate a JSON results file: + +```json +{ + "baseline": { + "source_facet_ms": 2345, + "material_facet_ms": 5678, + "context_facet_ms": 4567, + "otype_counts_ms": 1234 + }, + "with_summary": { + "source_facet_ms": 5, + "material_facet_ms": 8, + "context_facet_ms": 7 + }, + "speedup": { + "source": 469, + "material": 710 + }, + "summary_files": { + "facet_summaries_all.parquet": { + "size_bytes": 12345, + "row_count": 234 + }, + "facet_source_material_cross.parquet": { + "size_bytes": 45678, + "row_count": 1234 + } + }, + "facet_counts": { + "source": { + "SESAR": 3100000, + "OPENCONTEXT": 1200000, + "GEOME": 1500000, + "SMITHSONIAN": 900000 + }, + "material_top10": ["Rock", "ite", "..."], + "context_top10": ["Earth interior", "..."] + } +} +``` + +## Output Files + +Save to `experiments/facet_optimization/results/`: +- `benchmark_results.json` - The JSON above +- `facet_summaries_all.parquet` - Combined facet counts +- `facet_source_material_cross.parquet` - Cross-tab counts +- `benchmark_log.txt` - Full execution log diff --git a/experiments/facet_optimization/results/facet_results.json b/experiments/facet_optimization/results/facet_results.json new file mode 100644 index 0000000..4160856 --- /dev/null +++ b/experiments/facet_optimization/results/facet_results.json @@ -0,0 +1,142 @@ +{ + "baseline": { + "source_facet_ms": 34.14, + "material_facet_ms": 490.4, + "otype_counts_ms": 34.67 + }, + "with_summary": { + "source_facet_ms": 3.92, + "material_facet_ms": 3.5, + "context_facet_ms": 3.21 + }, + "speedup": { + 
"source": 8.7, + "material": 140.1 + }, + "summary_files": { + "facet_summaries_all.parquet": { + "size_bytes": 2118, + "row_count": 60 + }, + "facet_source_material_cross.parquet": { + "size_bytes": 1266, + "row_count": 24 + } + }, + "facet_counts": { + "source": { + "SESAR": 4688386, + "OPENCONTEXT": 1064831, + "GEOME": 605554, + "SMITHSONIAN": 322161 + }, + "material_top10": [ + "https://w3id.org/isample/vocabulary/material/1.0/earthmaterial", + "https://w3id.org/isample/vocabulary/material/1.0/organicmaterial", + "https://w3id.org/isample/vocabulary/material/1.0/rock", + "https://w3id.org/isample/vocabulary/material/1.0/biogenicnonorganicmaterial", + "https://w3id.org/isample/vocabulary/material/1.0/mixedsoilsedimentrock", + "https://w3id.org/isample/vocabulary/material/1.0/material", + "https://w3id.org/isample/vocabulary/material/1.0/mineral", + "https://w3id.org/isample/vocabulary/material/1.0/anthropogenicmetal", + "https://w3id.org/isample/opencontext/material/0.1/ceramicclay", + "https://w3id.org/isample/vocabulary/material/1.0/sediment" + ], + "otype_counts": { + "MaterialSampleRecord": 6680932, + "SamplingEvent": 6354171, + "GeospatialCoordLocation": 5980282, + "MaterialSampleCuration": 720254, + "SampleRelation": 501579, + "SamplingSite": 386160, + "IdentifiedConcept": 55893, + "Agent": 50087 + }, + "context_top10": [ + "https://w3id.org/isample/vocabulary/sampledfeature/1.0/anysampledfeature", + "https://w3id.org/isample/vocabulary/sampledfeature/1.0/pasthumanoccupationsite", + "https://w3id.org/isample/vocabulary/sampledfeature/1.0/earthinterior", + "https://w3id.org/isample/biology/biosampledfeature/1.0/Animalia", + "https://w3id.org/isample/vocabulary/sampledfeature/1.0/subaerialsurfaceenvironment", + "https://w3id.org/isample/vocabulary/sampledfeature/1.0/waterbody", + "https://w3id.org/isample/vocabulary/sampledfeature/1.0/marinewaterbodybottom", + "https://w3id.org/isample/vocabulary/sampledfeature/1.0/lakeriverstreambottom", + 
"https://w3id.org/isample/vocabulary/sampledfeature/1.0/marinewaterbody", + "https://w3id.org/isample/vocabulary/sampledfeature/1.0/terrestrialwaterbody" + ] + }, + "task_details": { + "task1": { + "status": "completed", + "queries_run": 3, + "runs_per_query": 3 + }, + "task2": { + "status": "completed", + "output_file": "/tmp/facet_source_counts.parquet", + "row_count": 4, + "size_bytes": 542, + "generation_time_ms": 32.17 + }, + "task3": { + "status": "completed", + "output_file": "/tmp/facet_material_counts.parquet", + "row_count": 19, + "size_bytes": 1284, + "generation_time_ms": 583.98 + }, + "task4": { + "status": "completed", + "output_file": "/tmp/facet_context_counts.parquet", + "row_count": 17, + "size_bytes": 1171, + "generation_time_ms": 569.76 + }, + "task5": { + "status": "completed", + "output_file": "/tmp/facet_summaries_all.parquet", + "row_count": 60, + "size_bytes": 2118, + "generation_time_ms": 984.68 + }, + "task6": { + "status": "completed", + "output_file": "/tmp/facet_source_material_cross.parquet", + "row_count": 24, + "size_bytes": 1266, + "generation_time_ms": 142.55, + "top_combinations": [ + { + "source": "SESAR", + "material": "https://w3id.org/isample/vocabulary/material/1.0/earthmaterial", + "count": 2233939 + }, + { + "source": "SESAR", + "material": "https://w3id.org/isample/vocabulary/material/1.0/rock", + "count": 912855 + }, + { + "source": "SESAR", + "material": "https://w3id.org/isample/vocabulary/material/1.0/mixedsoilsedimentrock", + "count": 838805 + }, + { + "source": "OPENCONTEXT", + "material": "https://w3id.org/isample/vocabulary/material/1.0/biogenicnonorganicmaterial", + "count": 745539 + }, + { + "source": "GEOME", + "material": "https://w3id.org/isample/vocabulary/material/1.0/organicmaterial", + "count": 605554 + } + ] + }, + "task7": { + "status": "completed", + "speedup_source": 8.7, + "speedup_material": 140.1 + } + } +} \ No newline at end of file diff --git a/experiments/h3_optimization/TASK_SPEC.md 
b/experiments/h3_optimization/TASK_SPEC.md new file mode 100644 index 0000000..39d8e4e --- /dev/null +++ b/experiments/h3_optimization/TASK_SPEC.md @@ -0,0 +1,216 @@ +# H3 Geospatial Optimization Task + +**Issue:** https://github.com/isamplesorg/pqg/issues/17 +**Goal:** Add H3 index columns to iSamples parquet and benchmark speedup + +## Data Source + +```python +PARQUET_URL = "https://pub-a18234d962364c22a50c787b7ca09fa5.r2.dev/isamples_202601_wide.parquet" +``` + +- ~280 MB, ~20M rows +- Contains samples from OPENCONTEXT, SESAR, GEOME, SMITHSONIAN + +## Schema (Relevant Columns) + +| Column | Type | Description | +|--------|------|-------------| +| `row_id` | INTEGER | Unique identifier | +| `otype` | VARCHAR | Entity type - filter to `'MaterialSampleRecord'` for samples | +| `latitude` | DOUBLE | WGS84 latitude (nullable) | +| `longitude` | DOUBLE | WGS84 longitude (nullable) | +| `n` | VARCHAR | Source: OPENCONTEXT, SESAR, GEOME, SMITHSONIAN | +| `label` | VARCHAR | Human-readable name | + +## Environment Setup + +```python +import duckdb + +# Install and load H3 extension +con = duckdb.connect() +con.execute("INSTALL h3; LOAD h3;") +``` + +## Task 1: Baseline Benchmark + +Measure current geospatial query performance. 
+ +```python +PARQUET_URL = "https://pub-a18234d962364c22a50c787b7ca09fa5.r2.dev/isamples_202601_wide.parquet" + +# Query 1: Bounding box - Western US +BBOX_QUERY = f""" +SELECT COUNT(*) as cnt +FROM read_parquet('{PARQUET_URL}') +WHERE otype = 'MaterialSampleRecord' + AND latitude BETWEEN 32 AND 42 + AND longitude BETWEEN -125 AND -110 +""" + +# Query 2: Bounding box with facet +BBOX_FACET_QUERY = f""" +SELECT n as source, COUNT(*) as cnt +FROM read_parquet('{PARQUET_URL}') +WHERE otype = 'MaterialSampleRecord' + AND latitude BETWEEN 32 AND 42 + AND longitude BETWEEN -125 AND -110 +GROUP BY n +""" + +# Query 3: Point radius (approximate - 1 degree ≈ 111km) +# San Francisco area, ~50km radius +RADIUS_QUERY = f""" +SELECT COUNT(*) as cnt +FROM read_parquet('{PARQUET_URL}') +WHERE otype = 'MaterialSampleRecord' + AND latitude BETWEEN 37.3 AND 38.1 + AND longitude BETWEEN -122.8 AND -122.0 +""" +``` + +**Measure:** Execute each query 3 times, report median time in milliseconds. + +## Task 2: Generate H3-Enhanced Parquet + +Add H3 columns at resolutions 4, 6, and 8. 
+ +```python +import duckdb +import time + +con = duckdb.connect() +con.execute("INSTALL h3; LOAD h3;") + +PARQUET_URL = "https://pub-a18234d962364c22a50c787b7ca09fa5.r2.dev/isamples_202601_wide.parquet" +OUTPUT_PATH = "/tmp/isamples_wide_h3.parquet" + +# Generate with H3 columns +query = f""" +COPY ( + SELECT *, + CASE WHEN latitude IS NOT NULL AND longitude IS NOT NULL + THEN h3_latlng_to_cell(latitude, longitude, 4) END as h3_res4, + CASE WHEN latitude IS NOT NULL AND longitude IS NOT NULL + THEN h3_latlng_to_cell(latitude, longitude, 6) END as h3_res6, + CASE WHEN latitude IS NOT NULL AND longitude IS NOT NULL + THEN h3_latlng_to_cell(latitude, longitude, 8) END as h3_res8 + FROM read_parquet('{PARQUET_URL}') +) TO '{OUTPUT_PATH}' (FORMAT PARQUET, COMPRESSION ZSTD); +""" + +start = time.time() +con.execute(query) +elapsed = time.time() - start +print(f"Generated in {elapsed:.1f}s") +``` + +**Report:** +- Original file size (MB) +- New file size with H3 (MB) +- Size increase percentage +- Row count with valid H3 (non-null lat/lon) + +## Task 3: H3 Benchmark + +Re-run equivalent queries using H3 filters. 
+ +```python +OUTPUT_PATH = "/tmp/isamples_wide_h3.parquet" + +# Get H3 cells covering the Western US bbox at res 4 +# (In practice, use h3 library to get these) +# For now, query to find the cells: +FIND_CELLS = f""" +SELECT DISTINCT h3_res4 +FROM read_parquet('{OUTPUT_PATH}') +WHERE latitude BETWEEN 32 AND 42 + AND longitude BETWEEN -125 AND -110 + AND h3_res4 IS NOT NULL +""" + +# Then filter by H3 cell instead of lat/lon +# This should be faster because H3 is an integer column with good stats +H3_BBOX_QUERY = f""" +SELECT COUNT(*) as cnt +FROM read_parquet('{OUTPUT_PATH}') +WHERE otype = 'MaterialSampleRecord' + AND h3_res4 IN (SELECT DISTINCT h3_res4 + FROM read_parquet('{OUTPUT_PATH}') + WHERE latitude BETWEEN 32 AND 42 + AND longitude BETWEEN -125 AND -110) +""" + +# For aggregation by location (clustering for map display) +H3_CLUSTER_QUERY = f""" +SELECT h3_res6, COUNT(*) as cnt, + AVG(latitude) as center_lat, + AVG(longitude) as center_lon +FROM read_parquet('{OUTPUT_PATH}') +WHERE otype = 'MaterialSampleRecord' + AND h3_res4 IN (...cells from above...) +GROUP BY h3_res6 +""" +``` + +## Task 4: Resolution Analysis + +Determine optimal H3 resolutions. + +```python +# Count distinct cells at each resolution +RESOLUTION_STATS = f""" +SELECT + COUNT(DISTINCT h3_res4) as unique_res4, + COUNT(DISTINCT h3_res6) as unique_res6, + COUNT(DISTINCT h3_res8) as unique_res8, + COUNT(*) as total_rows, + COUNT(h3_res4) as rows_with_h3 +FROM read_parquet('{OUTPUT_PATH}') +WHERE otype = 'MaterialSampleRecord' +""" +``` + +**Report:** Unique cells per resolution, average points per cell. 
+ +## Expected Output + +Generate a JSON results file: + +```json +{ + "baseline": { + "bbox_query_ms": 1234, + "bbox_facet_ms": 1456, + "radius_query_ms": 1123 + }, + "with_h3": { + "bbox_query_ms": 234, + "bbox_facet_ms": 345, + "cluster_query_ms": 456 + }, + "speedup": { + "bbox": 5.3, + "facet": 4.2 + }, + "file_size": { + "original_mb": 282, + "with_h3_mb": 310, + "increase_pct": 9.9 + }, + "h3_stats": { + "rows_with_coords": 5400000, + "unique_res4_cells": 1234, + "unique_res6_cells": 45678, + "unique_res8_cells": 234567 + } +} +``` + +## Output Files + +Save to `experiments/h3_optimization/results/`: +- `benchmark_results.json` - The JSON above +- `isamples_wide_h3.parquet` - Enhanced parquet (or note if too large to include) +- `benchmark_log.txt` - Full execution log diff --git a/experiments/h3_optimization/results/benchmark_results.json b/experiments/h3_optimization/results/benchmark_results.json new file mode 100644 index 0000000..e7c79c2 --- /dev/null +++ b/experiments/h3_optimization/results/benchmark_results.json @@ -0,0 +1,29 @@ +{ + "baseline": { + "bbox_query_ms": 170.0129508972168, + "bbox_facet_ms": 186.75613403320312, + "radius_query_ms": 179.39233779907227 + }, + "with_h3": { + "bbox_query_ms": 35.030364990234375, + "bbox_facet_ms": 38.38610649108887, + "cluster_query_ms": 51.71322822570801 + }, + "speedup": { + "bbox": 4.85, + "facet": 4.87 + }, + "file_size": { + "original_mb": 282, + "with_h3_mb": 292.4, + "increase_pct": 3.7, + "generation_time_s": 41.6 + }, + "h3_stats": { + "unique_res4_cells": 38406, + "unique_res6_cells": 111681, + "unique_res8_cells": 175653, + "total_sample_rows": 6680932, + "rows_with_coords": 5980282 + } +} \ No newline at end of file diff --git a/pqg/__main__.py b/pqg/__main__.py index 72a8687..8c23098 100644 --- a/pqg/__main__.py +++ b/pqg/__main__.py @@ -4,12 +4,15 @@ import json import logging +import os +import time import typing import click import duckdb import rich import rich.tree +import rich.console 
def _quote_sql_literal(value: str) -> str:
    """Escape single quotes so *value* can be embedded safely in a SQL string literal."""
    return value.replace("'", "''")


def _parquet_source(input_parquet: str) -> str:
    """Return a DuckDB ``read_parquet(...)`` expression for a local path or URL.

    Local paths are made absolute so queries work regardless of the process
    working directory; http(s) URLs are passed through unchanged.  The
    location is quoted so a path containing ``'`` cannot break the query.
    """
    if input_parquet.startswith(("http://", "https://")):
        location = input_parquet
    else:
        location = os.path.abspath(input_parquet)
    return f"read_parquet('{_quote_sql_literal(location)}')"


@cli.command("add-h3")
@click.pass_context
@click.argument("input_parquet")
@click.option("-o", "--output", required=True, help="Output parquet file path")
@click.option(
    "-r",
    "--resolutions",
    default="4,6,8",
    help="Comma-separated H3 resolutions to add (default: 4,6,8)",
)
@click.option(
    "--lat-col", default="latitude", help="Latitude column name (default: latitude)"
)
@click.option(
    "--lon-col", default="longitude", help="Longitude column name (default: longitude)"
)
def add_h3(
    ctx,
    input_parquet: str,
    output: str,
    resolutions: str,
    lat_col: str,
    lon_col: str,
):
    """Add H3 index columns to a parquet file.

    Creates a new parquet file with h3_resN columns for each specified resolution.
    Only rows with valid lat/lon will have H3 values; others will be NULL.

    Example:
        pqg add-h3 input.parquet -o output_h3.parquet
        pqg add-h3 input.parquet -o output.parquet -r 4,6
    """
    console = rich.console.Console()
    logger = get_logger()

    # Parse and validate requested resolutions.  H3 defines resolutions 0-15;
    # reject anything else up front with a clear CLI error instead of a raw
    # ValueError / DuckDB failure mid-run.
    try:
        res_list = [int(r.strip()) for r in resolutions.split(",") if r.strip()]
    except ValueError as err:
        raise click.BadParameter(
            f"Invalid --resolutions value: {resolutions!r}"
        ) from err
    if not res_list:
        raise click.BadParameter("At least one H3 resolution is required.")
    out_of_range = [r for r in res_list if not 0 <= r <= 15]
    if out_of_range:
        raise click.BadParameter(
            f"H3 resolutions must be in 0..15, got: {out_of_range}"
        )
    logger.info(f"Adding H3 columns at resolutions: {res_list}")

    con = ctx.obj["dbinstance"]

    # Install and load H3 extension (community extension)
    console.print("[blue]Installing H3 extension from community...[/blue]")
    con.execute("INSTALL h3 FROM community; LOAD h3;")

    # One CASE expression per resolution; rows without coordinates fall
    # through the CASE and yield NULL h3_resN values.
    h3_select = ", ".join(
        f"CASE WHEN {lat_col} IS NOT NULL AND {lon_col} IS NOT NULL "
        f"THEN h3_latlng_to_cell({lat_col}, {lon_col}, {res}) END as h3_res{res}"
        for res in res_list
    )

    source = _parquet_source(input_parquet)
    output_abs = os.path.abspath(output)

    query = f"""
    COPY (
        SELECT *,
               {h3_select}
        FROM {source}
    ) TO '{_quote_sql_literal(output_abs)}' (FORMAT PARQUET, COMPRESSION ZSTD);
    """

    console.print(f"[blue]Processing {input_parquet}...[/blue]")
    start = time.time()
    con.execute(query)
    elapsed = time.time() - start

    # COUNT(col) skips NULLs, so with_h3 counts only rows that had coordinates.
    stats = con.sql(
        f"SELECT COUNT(*) as total, COUNT(h3_res{res_list[0]}) as with_h3 "
        f"FROM read_parquet('{_quote_sql_literal(output_abs)}')"
    ).fetchone()
    total_rows, rows_with_h3 = stats

    output_size = os.path.getsize(output) / (1024 * 1024)
    # Guard against division by zero on an empty input file.
    h3_pct = (100 * rows_with_h3 / total_rows) if total_rows else 0.0

    console.print(f"[green]✓ Generated {output}[/green]")
    console.print(f"  Size: {output_size:.1f} MB")
    console.print(f"  Total rows: {total_rows:,}")
    console.print(f"  Rows with H3: {rows_with_h3:,} ({h3_pct:.1f}%)")
    console.print(f"  Time: {elapsed:.1f}s")


@cli.command("facet-summaries")
@click.pass_context
@click.argument("input_parquet")
@click.option(
    "-o",
    "--output-dir",
    required=True,
    help="Output directory for summary files",
)
@click.option(
    "--otype-filter",
    default="MaterialSampleRecord",
    help="Filter to this otype (default: MaterialSampleRecord)",
)
@click.option(
    "--min-cross-count",
    default=100,
    type=int,
    help="Keep only cross-facet combinations with count greater than this (default: 100)",
)
def facet_summaries(
    ctx,
    input_parquet: str,
    output_dir: str,
    otype_filter: str,
    min_cross_count: int,
):
    """Generate pre-computed facet summary tables from a wide parquet file.

    Creates two output files:
    - facet_summaries_all.parquet: Combined counts for source, material, context, object_type
    - facet_source_material_cross.parquet: Source × material cross-tabulation

    Example:
        pqg facet-summaries wide.parquet -o summaries/
    """
    console = rich.console.Console()
    con = ctx.obj["dbinstance"]

    # Ensure output directory exists
    os.makedirs(output_dir, exist_ok=True)

    source = _parquet_source(input_parquet)

    # Empty filter means "all otypes".  Quote the value so a filter string
    # containing ' cannot break out of the SQL literal.
    otype_clause = (
        f"otype = '{_quote_sql_literal(otype_filter)}'" if otype_filter else "1=1"
    )

    # Generate combined facet summaries
    console.print("[blue]Generating combined facet summaries...[/blue]")
    start = time.time()

    combined_path = os.path.join(output_dir, "facet_summaries_all.parquet")

    # The material/context/object_type facets share one shape: unnest a list
    # of IdentifiedConcept row_ids, join back to the concept rows for labels,
    # and count.  Build them from a single template instead of copy-pasting.
    concept_facets = {
        "material": "p__has_material_category",
        "context": "p__has_context_category",
        "object_type": "p__has_sample_object_type",
    }
    concept_selects = [
        f"""
        SELECT '{facet}' as facet_type, c.label as facet_value, c.scheme_name as scheme, COUNT(*) as count
        FROM (
            SELECT UNNEST({column}) as concept_id
            FROM {source}
            WHERE {otype_clause} AND {column} IS NOT NULL
        ) s
        JOIN (SELECT row_id, label, scheme_name FROM {source} WHERE otype = 'IdentifiedConcept') c
            ON c.row_id = s.concept_id
        GROUP BY c.label, c.scheme_name
        """
        for facet, column in concept_facets.items()
    ]

    source_select = f"""
        SELECT 'source' as facet_type, n as facet_value, NULL as scheme, COUNT(*) as count
        FROM {source}
        WHERE {otype_clause}
        GROUP BY n
    """

    combined_query = (
        "COPY (\n"
        + "\nUNION ALL\n".join([source_select, *concept_selects])
        + f"\n) TO '{_quote_sql_literal(combined_path)}' (FORMAT PARQUET);"
    )
    con.execute(combined_query)

    combined_stats = con.sql(
        f"SELECT COUNT(*) FROM read_parquet('{_quote_sql_literal(combined_path)}')"
    ).fetchone()
    combined_size = os.path.getsize(combined_path)

    console.print(f"[green]✓ {combined_path}[/green]")
    console.print(f"  Rows: {combined_stats[0]}, Size: {combined_size:,} bytes")

    # Generate cross-facet summary
    console.print("[blue]Generating source × material cross-tabulation...[/blue]")

    cross_path = os.path.join(output_dir, "facet_source_material_cross.parquet")
    cross_query = f"""
    COPY (
        SELECT
            s.source,
            c.label as material,
            COUNT(*) as count
        FROM (
            SELECT n as source, UNNEST(p__has_material_category) as material_id
            FROM {source}
            WHERE {otype_clause} AND p__has_material_category IS NOT NULL
        ) s
        JOIN (SELECT row_id, label FROM {source} WHERE otype = 'IdentifiedConcept') c
            ON c.row_id = s.material_id
        GROUP BY s.source, c.label
        HAVING COUNT(*) > {min_cross_count}
        ORDER BY count DESC
    ) TO '{_quote_sql_literal(cross_path)}' (FORMAT PARQUET);
    """
    con.execute(cross_query)

    cross_stats = con.sql(
        f"SELECT COUNT(*) FROM read_parquet('{_quote_sql_literal(cross_path)}')"
    ).fetchone()
    cross_size = os.path.getsize(cross_path)

    elapsed = time.time() - start

    console.print(f"[green]✓ {cross_path}[/green]")
    console.print(f"  Rows: {cross_stats[0]}, Size: {cross_size:,} bytes")
    console.print(f"[green]Total time: {elapsed:.1f}s[/green]")

    # Print summary
    console.print("\n[bold]Summary:[/bold]")
    facet_counts = con.sql(
        f"SELECT facet_type, COUNT(*) as n, SUM(count) as total "
        f"FROM read_parquet('{_quote_sql_literal(combined_path)}') GROUP BY facet_type"
    ).fetchall()
    for row in facet_counts:
        console.print(f"  {row[0]}: {row[1]} values, {row[2]:,} total records")


if __name__ == "__main__":
    cli()