From d20b17297783a971268c3afc5e926dd3096b17a9 Mon Sep 17 00:00:00 2001 From: Alexander Belikov Date: Mon, 6 Apr 2026 01:15:22 +0200 Subject: [PATCH] update docs --- README.md | 98 ++++++--------- docs/concepts/backend_indexes.md | 10 +- docs/concepts/index.md | 16 +-- docs/examples/example-5.md | 2 +- docs/getting_started/creating_manifest.md | 23 +++- docs/index.md | 94 +++++++++------ .../contract/declarations/resource.py | 7 +- graflo/architecture/database_features.py | 97 ++++++++++++++- graflo/architecture/schema/document.py | 5 +- graflo/db/tigergraph/conn.py | 112 +++++++++++++++--- graflo/db/tigergraph/gsql_literals.py | 28 +++++ ...atabase_profile_default_property_values.py | 70 +++++++++++ test/db/tigergraph/test_gsql_literals.py | 33 ++++++ .../test_tigergraph_ddl_default_clauses.py | 52 ++++++++ 14 files changed, 515 insertions(+), 132 deletions(-) create mode 100644 graflo/db/tigergraph/gsql_literals.py create mode 100644 test/architecture/test_database_profile_default_property_values.py create mode 100644 test/db/tigergraph/test_gsql_literals.py create mode 100644 test/db/tigergraph/test_tigergraph_ddl_default_clauses.py diff --git a/README.md b/README.md index 710d6eec..ce3a07a3 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,5 @@ -# GraFlo graflo logo +# GraFlo — Graph Schema & Transformation Language (GSTL) graflo logo -**GraFlo** is a **Graph Schema & Transformation Language (GSTL)** for **labeled property graphs (LPGs)**. You describe the graph once—**vertices and edges**, typed **`properties`**, identity, and optional backend hints—in **YAML or Python**. You describe how raw records become that graph using **resource** pipelines (an expressive sequence of **actors**: descend, transform, vertex, edge, and routers). **Connectors** attach files, SQL tables, SPARQL/RDF, APIs, or in-memory data to those pipelines. **`GraphEngine`** and **`Caster`** then infer schema when possible, project the logical model for a chosen database, and ingest. - -**Why it matters:** the **logical graph** is **database-agnostic**; the same manifest can target **ArangoDB, Neo4j, TigerGraph, FalkorDB, Memgraph, or NebulaGraph** without rewriting your transformation story. Backend-specific names, defaults, and indexes are applied only at **DB-aware projection** (`Schema.resolve_db_aware(...)`). - -> **Package Renamed**: This package was formerly known as `graphcast`. ![Python](https://img.shields.io/badge/python-3.11%2B-blue.svg) [![PyPI version](https://badge.fury.io/py/graflo.svg)](https://badge.fury.io/py/graflo) @@ -13,60 +8,43 @@ [![pre-commit](https://github.com/growgraph/graflo/actions/workflows/pre-commit.yml/badge.svg)](https://github.com/growgraph/graflo/actions/workflows/pre-commit.yml) [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.15446131.svg)]( https://doi.org/10.5281/zenodo.15446131) -## Core ideas - -| Idea | What you get | -|------|----------------| -| **Logical LPG first** | One declarative **schema** (`Vertex` / `Edge` with **`properties`**) is the source of truth—not a particular vendor’s DDL. | -| **Expressive transformation** | **`Resource`** pipelines compose small **actors** so wide tables, nested JSON, RDF, or API payloads map cleanly to vertices and edges—reusable across sources. | -| **Separation of concerns** | **Sources** (connectors + `DataSourceRegistry`), **shape of the graph** (`Schema`), and **ingestion steps** (`IngestionModel`) evolve independently. | -| **Safe wiring** | Optional **`connector_connection`** maps connectors to **`conn_proxy`** labels so manifests stay free of secrets; a runtime **`ConnectionProvider`** supplies credentials. | - -## Overview - -GraFlo separates *what the graph looks like* from *where data comes from* and *which database stores it*. - -```mermaid -%%{ init: { - "theme": "base", - "themeVariables": { - "primaryColor": "#90CAF9", - "primaryTextColor": "#111111", - "primaryBorderColor": "#1E88E5", - "lineColor": "#546E7A", - "secondaryColor": "#A5D6A7", - "tertiaryColor": "#CE93D8" - } -} }%% - -flowchart LR - SI["Source Instance
File · SQL · SPARQL · API"] - R["Resource
Actor Pipeline"] - GS["Logical Graph Schema
Vertex/Edge Definitions
Identities · DB Profile"] - DBA["DB-aware Projection
DatabaseProfile
VertexConfigDBAware · EdgeConfigDBAware"] - GC["GraphContainer
Covariant Graph Representation"] - DB["Graph DB (LPG)
ArangoDB · Neo4j · TigerGraph · Others"] - - SI --> R --> GS --> GC --> DBA --> DB -``` +**GraFlo** is a manifest-driven toolkit for **labeled property graphs (LPGs)**: describe vertices, edges, and ingestion (`GraphManifest` — YAML or Python), then project and load into a target graph database. + +### What you get + +- **One pipeline, several graph databases** — The same manifest targets ArangoDB, Neo4j, TigerGraph, FalkorDB, Memgraph, or NebulaGraph; `DatabaseProfile` and DB-aware types absorb naming, defaults, and indexing differences. +- **Explicit identities** — Vertex identity fields and indexes back upserts so reloads merge on keys instead of blindly duplicating nodes. +- **Reusable ingestion** — `Resource` actor pipelines (including **vertex_router** / **edge_router** steps) bind to files, SQL, SPARQL/RDF, APIs, or in-memory batches via `Bindings` and the `DataSourceRegistry`. + +### What’s in the manifest + +- **`schema`** — `Schema`: metadata, **`core_schema`** (vertices, edges, typed **`properties`**, identities), and **`db_profile`** (`DatabaseProfile`: target flavor, storage names, secondary indexes, TigerGraph `default_property_values`, …). +- **`ingestion_model`** — `IngestionModel`: named **`resources`** (actor sequences: *descend*, *transform*, *vertex*, *edge*, …) and a registry of reusable **`transforms`**. +- **`bindings`** — Connectors (e.g. `FileConnector`, `TableConnector`, `SparqlConnector`) plus **`resource_connector`** wiring. Optional **`connector_connection`** maps connectors to **`conn_proxy`** labels so YAML stays secret-free; a runtime **`ConnectionProvider`** supplies credentials. + +### Runtime path -**Source Instance** → **Resource** (actors) → **Logical Graph Schema** → **Covariant Graph Representation** (`GraphContainer`) → **DB-aware Projection** → **Graph DB** +1. **Source instance** — Batches from a `DataSourceType` adapter (`FileDataSource`, `SQLDataSource`, `SparqlEndpointDataSource`, `APIDataSource`, …). +2. **Resource (actors)** — Maps records to graph elements against the logical schema (validated during `IngestionModel.finish_init` / pipeline execution). +3. **`GraphContainer`** — Intermediate, database-agnostic vertex/edge batches. +4. **DB-aware projection** — `Schema.resolve_db_aware()` plus `VertexConfigDBAware` / `EdgeConfigDBAware` for the active `DBType`. +5. **Graph DB** — `DBWriter` + `ConnectionManager` and the backend-specific `Connection` implementation. -| Stage | Role | Code | +| Piece | Role | Code | |-------|------|------| -| **Source Instance** | A concrete data artifact — a CSV file, a PostgreSQL table, a SPARQL endpoint, a `.ttl` file. | `AbstractDataSource` subclasses (`FileDataSource`, `SQLDataSource`, `SparqlEndpointDataSource`, …) with a `DataSourceType`. | -| **Resource** | A reusable transformation pipeline — actor steps (descend, transform, vertex, edge, vertex_router, edge_router) that map raw records to graph elements. Data sources bind to Resources by name via the `DataSourceRegistry`. | `Resource` (part of `IngestionModel`). | -| **Graph Schema** | Declarative logical vertex/edge definitions, identities, typed **properties**, and DB profile — defined in YAML or Python. | `Schema`, `VertexConfig`, `EdgeConfig`. | -| **Covariant Graph Representation** | A database-independent collection of vertices and edges. | `GraphContainer`. | -| **DB-aware Projection** | Resolves DB-specific naming/default/index behavior from logical schema + `DatabaseProfile`. | `Schema.resolve_db_aware()`, `VertexConfigDBAware`, `EdgeConfigDBAware`. | -| **Graph DB** | The target LPG store — same API for all supported databases. | `ConnectionManager`, `DBWriter`, DB connectors. | +| **Logical graph schema** | Manifest `schema`: vertex/edge definitions, identities, typed **properties**, DB profile. Constrains pipeline output and projection; not a separate queue between steps. | `Schema`, `VertexConfig`, `EdgeConfig` (under `core_schema`). | +| **Source instance** | Concrete input: file, SQL table, SPARQL endpoint, API payload, in-memory rows. | `AbstractDataSource` + `DataSourceType`. | +| **Resource** | Ordered actors; resources are looked up by name when sources are registered. | `Resource` in `IngestionModel`. | +| **Covariant graph** (`GraphContainer`) | Batches of vertices/edges before load. | `GraphContainer`. | +| **DB-aware projection** | Physical names, defaults, indexes for the target. | `Schema.resolve_db_aware()`, `VertexConfigDBAware`, `EdgeConfigDBAware`. | +| **Graph DB** | Target LPG; each `DBType` has its own connector, orchestrated the same way. | `ConnectionManager`, `DBWriter`, per-backend `Connection`. | ### Supported source types (`DataSourceType`) | DataSourceType | Connector | DataSource | Schema inference | |---|---|---|---| | `FILE` — CSV / JSON / JSONL / Parquet | `FileConnector` | `FileDataSource` | manual | -| `SQL` — PostgreSQL tables | `TableConnector` | `SQLDataSource` | automatic (3NF with PK/FK) | +| `SQL` — relational tables (docs focus on PostgreSQL; other engines via SQLAlchemy where supported) | `TableConnector` | `SQLDataSource` | automatic for PostgreSQL-style 3NF (PK/FK heuristics) | | `SPARQL` — RDF files (`.ttl`, `.rdf`, `.n3`) | `SparqlConnector` | `RdfFileDataSource` | automatic (OWL/RDFS ontology) | | `SPARQL` — SPARQL endpoints (Fuseki, …) | `SparqlConnector` | `SparqlEndpointDataSource` | automatic (OWL/RDFS ontology) | | `API` — REST APIs | — | `APIDataSource` | manual | @@ -74,18 +52,16 @@ flowchart LR ### Supported targets -ArangoDB, Neo4j, TigerGraph, FalkorDB, Memgraph, NebulaGraph — same API for all. +The graph engines listed in **What you get** are the supported **output** `DBType` values in `graflo.onto`. Each backend uses its own `Connection` implementation under the shared `ConnectionManager` / `DBWriter` / `GraphEngine` flow. -## Features +## More capabilities -- **Declarative LPG schema** — Define vertices, edges, vertex identity, secondary DB indexes, edge **properties**, and transforms in YAML or Python. The `Schema` is the single source of truth, independent of source or target. -- **Database abstraction** — One logical schema, one API. Target ArangoDB, Neo4j, TigerGraph, FalkorDB, Memgraph, or NebulaGraph without rewriting pipelines. DB idiosyncrasies are handled in DB-aware projection (`Schema.resolve_db_aware(...)`) and connector/writer stages. -- **Resource abstraction** — Each `Resource` defines a reusable actor pipeline (descend, transform, vertex, edge, plus **VertexRouter** and **EdgeRouter** for dynamic type-based routing) that maps raw records to graph elements. Data sources bind to Resources by name via the `DataSourceRegistry`, decoupling transformation logic from data retrieval. -- **SPARQL & RDF support** — Query SPARQL endpoints (e.g. Apache Fuseki), read `.ttl`/`.rdf`/`.n3` files, and auto-infer schemas from OWL/RDFS ontologies (`rdflib` and `SPARQLWrapper` ship with the default package). -- **Schema inference** — Generate graph schemas from PostgreSQL 3NF databases (PK/FK heuristics) or from OWL/RDFS ontologies (`owl:Class` → vertices, `owl:ObjectProperty` → edges, `owl:DatatypeProperty` → vertex properties). -- **Typed properties** — Vertex and edge **`properties`** may carry types (`INT`, `FLOAT`, `STRING`, `DATETIME`, `BOOL`) for validation and database-specific optimisation. -- **Parallel batch processing** — Configurable batch sizes and multi-core execution. -- **Credential-free source contracts** — `Bindings.connector_connection` maps each `TableConnector` / `SparqlConnector` (by **connector name** or **hash**) to a `conn_proxy` label. Manifests stay free of secrets; a runtime `ConnectionProvider` resolves each proxy to concrete `GeneralizedConnConfig` (for example PostgreSQL or SPARQL endpoint settings). Ingestion resource names are separate and may map to multiple connectors. +- **SPARQL & RDF** — Endpoints and RDF files (`.ttl`, `.rdf`, `.n3`, …); optional OWL/RDFS schema inference (`rdflib`, `SPARQLWrapper` in the default install). +- **Schema inference** — From PostgreSQL-style 3NF layouts (PK/FK heuristics) or from OWL/RDFS (`owl:Class` → vertices, `owl:ObjectProperty` → edges, `owl:DatatypeProperty` → vertex fields). +- **Schema migrations** — Plan and apply guarded schema deltas (`migrate_schema` console script → `graflo.cli.migrate_schema`; library in `graflo.migrate`; see docs). +- **Typed `properties`** — Optional field types (`INT`, `FLOAT`, `STRING`, `DATETIME`, `BOOL`) on vertices and edges. +- **Batching & concurrency** — Configurable batch sizes, worker counts, and DB write concurrency on `IngestionParams` / `DBWriter`. +- **`GraphEngine`** — High-level orchestration for infer, define schema, and ingest (`define_and_ingest`, …); `Caster` stays available for lower-level control. ## Documentation Full documentation is available at: [growgraph.github.io/graflo](https://growgraph.github.io/graflo) diff --git a/docs/concepts/backend_indexes.md b/docs/concepts/backend_indexes.md index 063f54e8..6fae4416 100644 --- a/docs/concepts/backend_indexes.md +++ b/docs/concepts/backend_indexes.md @@ -2,12 +2,14 @@ This document describes how vertex and edge indexes are handled across different graph database backends. Understanding this helps ensure your schema has the right indexes for efficient lookups and MERGE operations. +In manifests, physical index and naming configuration lives under **`schema.db_profile`** (the `DatabaseProfile` model; Python module `graflo.architecture.database_features`). Below, **`db_profile`** refers to that object—whether loaded from YAML or constructed in code. + ## Identity vs Secondary Indexes - **Identity index**: Required for vertex matching/upserts. Uses `Vertex.identity` (or `_key`/`id` for blank vertices). Each backend handles this differently. -- **Secondary indexes**: Optional indexes for query performance. Configured in `database_features.vertex_indexes` and `database_features.edge_specs[*].indexes`. +- **Secondary indexes**: Optional indexes for query performance. Configured in `db_profile.vertex_indexes` and `db_profile.edge_specs[*].indexes`. -The `vertex_indexes` in `database_features` is for **secondary** indexes only. Identity is handled by the backend during `define_vertex_indexes` or at collection/vertex-type creation. +The `vertex_indexes` on **`db_profile`** are for **secondary** indexes only. Identity is handled by the backend during `define_vertex_indexes` or at collection/vertex-type creation. ## Backend Summary @@ -22,7 +24,7 @@ The `vertex_indexes` in `database_features` is for **secondary** indexes only. I ## Implications -- **Neo4j, Memgraph, FalkorDB**: If you omit `database_features.vertex_indexes` for a vertex, the identity index is still created automatically when `define_vertex_indexes` runs with a schema. You only need `vertex_indexes` for **additional** (secondary) indexes. +- **Neo4j, Memgraph, FalkorDB**: If you omit `db_profile.vertex_indexes` for a vertex, the identity index is still created automatically when `define_vertex_indexes` runs with a schema. You only need `vertex_indexes` for **additional** (secondary) indexes. - **ArangoDB, TigerGraph**: Identity is covered at collection/vertex-type creation. `define_vertex_indexes` adds only secondary indexes from `vertex_indexes`. - **Nebula**: Identity index is always created in `define_vertex_indexes`; `vertex_indexes` adds secondary indexes. @@ -34,4 +36,4 @@ When `schema` is `None` in `define_vertex_indexes`, identity indexes cannot be e Vertex upserts use node keys from `Vertex` identity. For edges, endpoints are matched on those vertex keys; the relationship itself is merged using a **relationship property map** so parallel edges remain distinct. -GraFlo chooses property names for that map from the edge’s logical identity policy: the **first** entry in `Edge.identities` (excluding `source` / `target` tokens; including a `relation` token as the relationship’s `relation` property when applicable). If `identities` is empty or does not name any relationship fields, **all** declared edge **`properties`** names are used instead. Compile-time edge **indexes** from `identities` (via `database_features`) remain separate from this writer-time `MERGE` key selection; both should agree with your intended uniqueness for a given edge definition. +GraFlo chooses property names for that map from the edge’s logical identity policy: the **first** entry in `Edge.identities` (excluding `source` / `target` tokens; including a `relation` token as the relationship’s `relation` property when applicable). If `identities` is empty or does not name any relationship fields, **all** declared edge **`properties`** names are used instead. Compile-time edge **indexes** from `identities` (via **`db_profile` / `EdgeConfigDBAware`**) remain separate from this writer-time `MERGE` key selection; both should agree with your intended uniqueness for a given edge definition. diff --git a/docs/concepts/index.md b/docs/concepts/index.md index 46653a74..185bc182 100644 --- a/docs/concepts/index.md +++ b/docs/concepts/index.md @@ -96,7 +96,7 @@ flowchart LR - **Bindings** (`FileConnector`, `TableConnector`, `SparqlConnector`) describe *where* data comes from (file paths, SQL tables, SPARQL endpoints). Multiple connectors may attach to the same ingestion resource name; optional **`connector_connection`** entries assign each SQL/SPARQL connector a **`conn_proxy`** by **connector `name` or `hash`** (not by resource name). The `ConnectionProvider` turns that label into real connection config at runtime so manifests stay credential-free. - **DataSources** (`AbstractDataSource` subclasses) handle *how* to read data in batches. Each carries a `DataSourceType` and is registered in the `DataSourceRegistry`. -- **Resources** define *what* to extract — each `Resource` is a reusable actor pipeline (descend → transform → vertex → edge) that maps raw records to graph elements. Set **`drop_trivial_input_fields`: `true`** on a resource to strip top-level `null` / `""` fields from each row before the pipeline (optional, default `false`). +- **Resources** define *what* to extract — each `Resource` is a reusable actor pipeline (descend → transform → vertex → edge) that maps raw records to graph elements. Optional **`drop_trivial_input_fields`: `true`** removes top-level keys whose value is `null` or `""` **before** actors run (shallow only; `0` and `false` stay). **TigerGraph** physical defaults for missing attributes belong in **`schema.db_profile.default_property_values`** (GSQL `DEFAULT` at DDL time), not in the covariant `GraphContainer` assembly path. - **GraphContainer** (covariant graph representation) collects the resulting vertices and edges in a database-independent format. - **DBWriter** pushes the graph data into the target LPG store (ArangoDB, Neo4j, TigerGraph, FalkorDB, Memgraph, NebulaGraph). @@ -538,11 +538,11 @@ Ingestion-only controls (**`relation_field`**, **`relation_from_key`**, **`match Vertex fields that should appear on edges are configured via **edge actor** options (e.g. **`vertex_weights`**, maps), not via a `weights` block on the logical `Edge`. DB layers may still use an internal `WeightConfig` built from `Edge.properties` for backends that need it. #### Edge behavior control -- Edge physical variants should be modeled with `database_features.edge_specs[*].purpose`. +- Edge physical variants should be modeled with `schema.db_profile.edge_specs[*].purpose` (YAML) / `db_profile.edge_specs[*].purpose` (in code). - `Edge.aux` is no longer a behavior switch. > DB-only physical edge metadata (including `purpose`) is configured under -> `database_features.edge_specs`, not on `Edge`. +> **`schema.db_profile.edge_specs`**, not on `Edge`. #### Matching and filtering (ingestion) - **`match_source`** / **`match_target`** / **`match`**: edge **actor** options for branch selection when building edges from hierarchical documents @@ -550,7 +550,7 @@ Vertex fields that should appear on edges are configured via **edge actor** opti #### Advanced logical configuration - **`type`**: Edge type (DIRECT or INDIRECT) - **`by`**: Vertex name for indirect edges -- DB-specific edge storage/type names are resolved from `database_features` +- DB-specific edge storage/type names are resolved from **`schema.db_profile`** through DB-aware wrappers (`EdgeConfigDBAware`), not stored on `Edge`. #### When to use what @@ -794,12 +794,12 @@ Transform steps are executed in the order they appear in `apply`. ### Schema & Abstraction - **Declarative LPG schema** — `Schema` defines vertices, edges, identity rules, and edge **`properties`** in YAML or Python; the single source of truth for graph structure. Transforms/resources are defined in `IngestionModel`. -- **Database abstraction** — one logical schema, multiple backends; DB-specific behavior is applied in DB-aware projection/writer stages (`Schema.resolve_db_aware(...)`, `VertexConfigDBAware`, `EdgeConfigDBAware`). +- **Database abstraction** — one logical schema, multiple backends; each target uses its own `Connection` type behind `ConnectionManager` / `DBWriter`, with DB-specific behavior applied in DB-aware projection (`Schema.resolve_db_aware(...)`, `VertexConfigDBAware`, `EdgeConfigDBAware`). - **Resource abstraction** — each `Resource` is a reusable actor pipeline that maps raw records to graph elements, decoupled from data retrieval. - **DataSourceRegistry** — pluggable `AbstractDataSource` adapters (`FILE`, `SQL`, `API`, `SPARQL`, `IN_MEMORY`) bound to Resources by name. ### Schema Features -- **Flexible Identity + Indexing** — logical identity plus DB-specific secondary indexes. +- **Flexible Identity + Indexing** — logical identity plus DB-specific secondary indexes (`schema.db_profile.vertex_indexes`, `edge_specs`, …). - **Typed properties** — optional type information on vertex and edge **`properties`** (INT, FLOAT, STRING, DATETIME, BOOL). - **Hierarchical Edge Definition** — define edges at any level of nested documents (via resource **edge** steps and actors). - **Relationship payload** — logical edges declare **`properties`**; additional payload from vertices or row shape is wired in **edge actors** (`vertex_weights`, maps, etc.) with optional types. @@ -893,9 +893,9 @@ Schema comparison gives you a predictable transition path between versions. Inst - **Smart Caching**: Minimize redundant operations ## Best Practices -1. Use compound identity fields for natural keys, and `database_features` indexes for query performance +1. Use compound identity fields for natural keys, and **`schema.db_profile`** secondary indexes for query performance 2. Leverage blank vertices for complex relationship modeling -3. Define transforms at the schema level for reusability +3. Define reusable transforms in **`ingestion_model.transforms`** and reference them from resource steps 4. Configure appropriate batch sizes based on your data volume 5. Enable parallel processing for large datasets 6. Choose the right relationship attribute based on your data format: diff --git a/docs/examples/example-5.md b/docs/examples/example-5.md index 610332e2..8ea56fd4 100644 --- a/docs/examples/example-5.md +++ b/docs/examples/example-5.md @@ -121,7 +121,7 @@ The inferred schema automatically includes: - `users → products` (from `purchases` table) with **properties**: `purchase_date`, `quantity`, `total_amount` - `users → users` (from `follows` table) with **properties**: `created_at` - **Resources**: Automatically created for each table with appropriate actors -- **Indexes**: Primary keys drive vertex identity / indexing; foreign keys drive edge mappings (see `database_features` for secondary indexes) +- **Indexes**: Primary keys drive vertex identity / indexing; foreign keys drive edge mappings (see **`schema.db_profile`** / `vertex_indexes` for secondary indexes) - **Edge payload**: Additional columns in edge tables become edge **properties** on the logical `Edge` ### Graph Structure Visualization diff --git a/docs/getting_started/creating_manifest.md b/docs/getting_started/creating_manifest.md index 3b386fc0..caf4944d 100644 --- a/docs/getting_started/creating_manifest.md +++ b/docs/getting_started/creating_manifest.md @@ -64,7 +64,7 @@ Defines the graph contract. - `metadata`: human-facing identity (`name`, optional `version`) - `graph.vertex_config`: vertex types, **`properties`**, identity keys - `graph.edge_config`: source/target relationships, optional `relation`, edge **`properties`**, `identities` -- `db_profile`: DB-specific physical behavior (indexes, naming, backend details) +- `db_profile`: DB-specific physical behavior (indexes, naming, **`default_property_values`** for TigerGraph GSQL `DEFAULT` on vertex/edge attributes, backend details) Use `schema` for **what graph exists**. @@ -74,7 +74,26 @@ Defines ingestion behavior. - `resources`: named pipelines (`name`) with ordered actor steps - `transforms`: reusable named transforms as a **list** (each entry must define `name`) and referenced from resources via `transform.call.use` -- Optional per-resource flags include **`drop_trivial_input_fields`** (default `false`): when `true`, top-level `null` or `""` fields are removed from each row before the pipeline—handy for sparse wide tables without extra transforms (shallow only; nested objects are unchanged). +- Optional per-resource flags include **`drop_trivial_input_fields`** (default `false`): when `true`, top-level keys whose value is `null` or `""` are removed **before** the actor pipeline runs. Only the top-level dict is filtered (nested structures are not recursed); numeric zero and boolean false are kept. Useful for sparse wide tables (CSV/SQL) without custom transforms. + +**TigerGraph attribute defaults (schema / `db_profile`, not ingestion):** under `schema.db_profile`, optional **`default_property_values`** declares GSQL `DEFAULT` literals per logical vertex property and per logical edge type, for example: + +```yaml +db_profile: + db_flavor: tigergraph + default_property_values: + vertices: + Sensor: + reading: -1.0 + edges: + - source: Person + target: Company + relation: works_at + values: + since_year: 0 +``` + +This corresponds to overriding TigerGraph’s built-in defaults (e.g. `reading FLOAT DEFAULT -1.0`); see the [TigerGraph “Defining a Graph Schema”](https://docs.tigergraph.com/gsql-ref/4.2/ddl-and-loading/defining-a-graph-schema) documentation. Use `ingestion_model` for **how source records become vertices/edges**. diff --git a/docs/index.md b/docs/index.md index 7f8d2bae..9129ca9a 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1,13 +1,4 @@ -# GraFlo graflo logo - -**GraFlo** is a **Python package** and **manifest format** (`GraphManifest`: YAML **`schema`** + **`ingestion_model`** + **`bindings`**) for **labeled property graphs**. It is a **Graph Schema & Transformation Language (GSTL)**: you encode the LPG **once** at the logical layer (vertices, edges, typed **`properties`**, identity), express **how** records become graph elements with **`Resource`** actor pipelines, and **project** that model per backend before load. **`GraphEngine`** covers inference, DDL, and ingest; **`Caster`** focuses on batching records into a **`GraphContainer`** and **`DBWriter`**. - -## Why GraFlo - -- **DB-agnostic LPG** — The **logical schema** describes an LPG independent of ArangoDB, Neo4j, Cypher-family stores, TigerGraph, and so on. You do not fork your “graph design” per vendor; you fork only **projection** and connectors. -- **Expressive, composable transforms** — **`Resource`** pipelines chain **actors** (descend into nested data, apply named **transforms**, emit **vertices** and **edges**, route by type with **VertexRouter** / **EdgeRouter**). The same pipeline can be bound to CSV, PostgreSQL, SPARQL, or an API via **`Bindings`**. -- **Clear boundaries** — **`Schema`** is structure only. **`IngestionModel`** holds resources and shared transforms. **`Bindings`** map ingestion resource names to one or more **connectors** and optional **`conn_proxy`** labels—so manifests stay credential-free at rest. -- **Multi-target ingestion** — One code path and manifest can target **ArangoDB, Neo4j, TigerGraph, FalkorDB, Memgraph, or NebulaGraph**; backend quirks are handled in **DB-aware** types and writers, not in your logical model. +# GraFlo — Graph Schema & Transformation Language (GSTL) graflo logo ![Python](https://img.shields.io/badge/python-3.11%2B-blue.svg) [![PyPI version](https://badge.fury.io/py/graflo.svg)](https://badge.fury.io/py/graflo) @@ -16,21 +7,55 @@ [![pre-commit](https://github.com/growgraph/graflo/actions/workflows/pre-commit.yml/badge.svg)](https://github.com/growgraph/graflo/actions/workflows/pre-commit.yml) [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.15446131.svg)]( https://doi.org/10.5281/zenodo.15446131) - +**GraFlo** is a manifest-driven toolkit for **labeled property graphs (LPGs)**: describe vertices, edges, and ingestion (`GraphManifest` — YAML or Python), then project and load into a target graph database. + +It is a **Python package** and **Graph Schema & Transformation Language (GSTL)**. **`GraphEngine`** covers inference, DDL, and ingest; **`Caster`** focuses on batching records into a **`GraphContainer`** and **`DBWriter`**. + +### What you get + +- **One pipeline, several graph databases** — The same manifest targets ArangoDB, Neo4j, TigerGraph, FalkorDB, Memgraph, or NebulaGraph; `DatabaseProfile` and DB-aware types absorb naming, defaults, and indexing differences. +- **Explicit identities** — Vertex identity fields and indexes back upserts so reloads merge on keys instead of blindly duplicating nodes. +- **Reusable ingestion** — `Resource` actor pipelines (including **vertex_router** / **edge_router** steps and the **`VertexRouterActor`** / **`EdgeRouterActor`** implementations) bind to files, SQL, SPARQL/RDF, APIs, or in-memory batches via `Bindings` and the `DataSourceRegistry`. + +### What’s in the manifest +- **`schema`** — `Schema`: metadata, **`core_schema`** (vertices, edges, typed **`properties`**, identities), and **`db_profile`** (`DatabaseProfile`: target flavor, storage names, secondary indexes, TigerGraph `default_property_values`, …). +- **`ingestion_model`** — `IngestionModel`: named **`resources`** (actor sequences: *descend*, *transform*, *vertex*, *edge*, …) and a registry of reusable **`transforms`**. +- **`bindings`** — Connectors (e.g. `FileConnector`, `TableConnector`, `SparqlConnector`) plus **`resource_connector`** wiring. Optional **`connector_connection`** maps connectors to **`conn_proxy`** labels so YAML stays secret-free; a runtime **`ConnectionProvider`** supplies credentials. -## Pipeline +### Runtime path -**Source Instance** → **Resource** (actor pipeline) → **Logical Graph Schema** → **Covariant Graph Representation** (`GraphContainer`) → **DB-aware Projection** → **Graph DB** +1. **Source instance** — Batches from a `DataSourceType` adapter (`FileDataSource`, `SQLDataSource`, `SparqlEndpointDataSource`, `APIDataSource`, …). +2. **Resource (actors)** — Maps records to graph elements against the logical schema (validated during `IngestionModel.finish_init` / pipeline execution). +3. **`GraphContainer`** — Intermediate, database-agnostic vertex/edge batches. +4. **DB-aware projection** — `Schema.resolve_db_aware()` plus `VertexConfigDBAware` / `EdgeConfigDBAware` for the active `DBType`. +5. **Graph DB** — `DBWriter` + `ConnectionManager` and the backend-specific `Connection` implementation. -| Stage | Role | Code | +| Piece | Role | Code | |-------|------|------| -| **Source Instance** | A concrete data artifact — a CSV file, a PostgreSQL table, a SPARQL endpoint, a `.ttl` file. | `AbstractDataSource` subclasses with a `DataSourceType` (`FILE`, `SQL`, `SPARQL`, `API`, `IN_MEMORY`). | -| **Resource** | A reusable transformation pipeline — actor steps (descend, transform, vertex, edge, vertex_router, edge_router) that map raw records to graph elements. Data sources bind to Resources by name via the `DataSourceRegistry`. | `Resource` (part of `IngestionModel`). | -| **Graph Schema** | Declarative logical vertex/edge definitions, identities, typed **properties**, and DB profile. | `Schema`, `VertexConfig`, `EdgeConfig`. | -| **Covariant Graph Representation** | A database-independent collection of vertices and edges. | `GraphContainer`. | -| **DB-aware Projection** | Resolves DB-specific naming/default/index behavior from logical schema + `DatabaseProfile`. | `Schema.resolve_db_aware()`, `VertexConfigDBAware`, `EdgeConfigDBAware`. | -| **Graph DB** | The target LPG store — same API for all supported databases. | `ConnectionManager`, `DBWriter`, DB connectors. | +| **Logical graph schema** | Manifest `schema`: vertex/edge definitions, identities, typed **properties**, DB profile. Constrains pipeline output and projection; not a separate queue between steps. | `Schema`, `VertexConfig`, `EdgeConfig` (under `core_schema`). | +| **Source instance** | Concrete input: file, SQL table, SPARQL endpoint, API payload, in-memory rows. | `AbstractDataSource` + `DataSourceType`. | +| **Resource** | Ordered actors; resources are looked up by name when sources are registered. | `Resource` in `IngestionModel`. | +| **Covariant graph** (`GraphContainer`) | Batches of vertices/edges before load. | `GraphContainer`. | +| **DB-aware projection** | Physical names, defaults, indexes for the target. | `Schema.resolve_db_aware()`, `VertexConfigDBAware`, `EdgeConfigDBAware`. | +| **Graph DB** | Target LPG; each `DBType` has its own connector, orchestrated the same way. | `ConnectionManager`, `DBWriter`, per-backend `Connection`. | + +### Supported source types (`DataSourceType`) + +| DataSourceType | Adapter | DataSource | Schema inference | +|---|---|---|---| +| `FILE` — CSV / JSON / JSONL / Parquet | `FileConnector` | `FileDataSource` | manual | +| `SQL` — relational tables (docs focus on PostgreSQL; other engines via SQLAlchemy where supported) | `TableConnector` | `SQLDataSource` | automatic for PostgreSQL-style 3NF (PK/FK heuristics) | +| `SPARQL` — RDF files (`.ttl`, `.rdf`, `.n3`) | `SparqlConnector` | `RdfFileDataSource` | automatic (OWL/RDFS ontology) | +| `SPARQL` — SPARQL endpoints (Fuseki, …) | `SparqlConnector` | `SparqlEndpointDataSource` | automatic (OWL/RDFS ontology) | +| `API` — REST APIs | — | `APIDataSource` | manual | +| `IN_MEMORY` — list / DataFrame | — | `InMemoryDataSource` | manual | + +### Supported targets + +The graph engines listed in **What you get** are the supported **output** `DBType` values in `graflo.onto`. Each backend uses its own `Connection` implementation under the shared `ConnectionManager` / `DBWriter` / `GraphEngine` flow. + + ## Core Concepts @@ -43,9 +68,9 @@ GraFlo targets the LPG model: ### Schema -The Schema is the single source of truth for the graph structure: +The `Schema` is the single source of truth for **graph structure** (not for ingestion transforms): -- **Vertex definitions** — vertex types, **`properties`** (optionally typed: `INT`, `FLOAT`, `STRING`, `DATETIME`, `BOOL`), identity, and filters; secondary indexes live under **`database_features`**. +- **Vertex definitions** — vertex types, **`properties`** (optionally typed: `INT`, `FLOAT`, `STRING`, `DATETIME`, `BOOL`), identity, and filters. **Secondary indexes** and physical naming live under **`schema.db_profile`** (`DatabaseProfile`: e.g. `vertex_indexes`, `edge_specs`; see [Backend indexes](concepts/backend_indexes.md)). - **Edge definitions** — source/target (and optional `relation`), **`properties`** for relationship payload, and optional **`identities`** for parallel-edge / MERGE semantics. - **Schema inference** — generate schemas from PostgreSQL 3NF databases (PK/FK heuristics) or from OWL/RDFS ontologies. @@ -62,7 +87,9 @@ Resources and transforms are part of `IngestionModel`, not `Schema`. A `Resource` is the central abstraction that bridges data sources and the graph schema. Each Resource defines a reusable pipeline of actors (descend, transform, vertex, edge) that maps raw records to graph elements. Data sources bind to Resources by name via the `DataSourceRegistry`, so the same transformation logic applies regardless of whether data arrives from a file, an API, or a SPARQL endpoint. -For wide rows with many empty or null columns, **`drop_trivial_input_fields`** (default `false`) removes only **top-level** keys whose value is `null` or `""` before the pipeline runs—no recursion into nested structures. +For wide rows with many empty or null columns, **`drop_trivial_input_fields`** (default `false`) removes only **top-level** keys whose value is `null` or `""` before the pipeline runs. The filter is **shallow**: nested dicts and lists are not walked, and empty `{}` / `[]` values are kept because they are not `null` or `""`. **`0`** and **`false`** are kept. + +For **TigerGraph**, optional attribute defaults belong in the covariant physical layer: **`schema.db_profile.default_property_values`** maps logical vertex/edge properties to YAML literals that GraFlo turns into GSQL **`DEFAULT`** clauses when defining the graph schema (same idea as `CREATE VERTEX Sensor (id STRING PRIMARY KEY, reading FLOAT DEFAULT -1.0)` in the [TigerGraph schema reference](https://docs.tigergraph.com/gsql-ref/4.2/ddl-and-loading/defining-a-graph-schema)). ### DataSourceRegistry @@ -81,20 +108,15 @@ The `DataSourceRegistry` manages `AbstractDataSource` adapters, each carrying a `GraphEngine` orchestrates end-to-end operations: schema inference, schema definition in the target database, connector creation from data sources, and data ingestion. -## Key Features - -- **Declarative LPG schema DSL** — Define vertices, edges, indexes, edge **properties**, and transforms in YAML or Python. The `Schema` is the single source of truth, independent of source or target. -- **Database abstraction** — One logical schema and transformation DSL, one API. Target ArangoDB, Neo4j, TigerGraph, FalkorDB, Memgraph, or NebulaGraph without rewriting pipelines. DB idiosyncrasies are handled in DB-aware projection (`Schema.resolve_db_aware(...)`) and connector/writer stages. -- **Resource abstraction** — Each `Resource` defines a reusable actor pipeline that maps raw records to graph elements. Actor types include descend, transform, vertex, edge, plus **VertexRouter** and **EdgeRouter** for dynamic type-based routing (see [Concepts — Actor](concepts/index.md#actor)). Data sources bind to Resources by name via the `DataSourceRegistry`, decoupling transformation logic from data retrieval. -- **DataSourceRegistry** — Register `FILE`, `SQL`, `API`, `IN_MEMORY`, or `SPARQL` data sources. Each `DataSourceType` plugs into the same Resource pipeline. -- **SPARQL & RDF support** — Query SPARQL endpoints (e.g. Apache Fuseki), read `.ttl`/`.rdf`/`.n3` files, and auto-infer schemas from OWL/RDFS ontologies (`rdflib` and `SPARQLWrapper` are included in the default install). -- **Schema inference** — Generate graph schemas from PostgreSQL 3NF databases (PK/FK heuristics) or from OWL/RDFS ontologies. See [Example 5](examples/example-5.md). -- **Schema migration planning/execution** — Generate typed migration plans between schema versions, apply low-risk additive changes with risk gates, and track revision history via `migrate_schema`. - - Compare `from` and `to` schemas before execution to preview structural deltas and blocked high-risk operations. -- **Typed properties** — Vertex and edge **`properties`** carry optional types for validation and database-specific optimisation. -- **Parallel batch processing** — Configurable batch sizes and multi-core execution. +## More capabilities + +- **SPARQL & RDF** — Endpoints and RDF files; optional OWL/RDFS schema inference (`rdflib`, `SPARQLWrapper` in the default install). +- **Schema inference** — From PostgreSQL-style 3NF layouts (PK/FK heuristics) or from OWL/RDFS (`owl:Class` → vertices, `owl:ObjectProperty` → edges, `owl:DatatypeProperty` → vertex fields). See [Example 5](examples/example-5.md). +- **Schema migrations** — Plan and apply guarded schema deltas (`migrate_schema` console script → `graflo.cli.migrate_schema`; library in `graflo.migrate`). Compare `from` / `to` schemas before execution to preview deltas and blocked high-risk operations. See [Concepts — Schema Migration](concepts/index.md#schema-migration-v1). +- **Typed `properties`** — Optional field types (`INT`, `FLOAT`, `STRING`, `DATETIME`, `BOOL`) on vertices and edges. +- **Batching & concurrency** — Configurable batch sizes, worker counts (`IngestionParams.n_cores`), and DB write concurrency (`IngestionParams.max_concurrent_db_ops` / `DBWriter`). - **Advanced filtering** — Server-side filtering (e.g. TigerGraph REST++ API), client-side filter expressions, and **SelectSpec** for declarative SQL view/filter control before data reaches Resources. -- **Blank vertices** — Create intermediate nodes for complex relationship modelling. +- **Blank vertices** — Intermediate nodes for complex relationship modelling. ## Quick Links diff --git a/graflo/architecture/contract/declarations/resource.py b/graflo/architecture/contract/declarations/resource.py index 1151a92a..8c6baeb2 100644 --- a/graflo/architecture/contract/declarations/resource.py +++ b/graflo/architecture/contract/declarations/resource.py @@ -246,8 +246,11 @@ class Resource(ConfigBaseModel): drop_trivial_input_fields: bool = PydanticField( default=False, description=( - "If True, drop top-level input keys whose value is None or '' before the pipeline runs. " - "Does not recurse into nested dicts or lists. Default False." + "If True, remove top-level input keys whose value is None or the empty string before " + "the actor pipeline runs. Only the outer dict is filtered: nested dicts and list " + "elements are left unchanged, and keys whose values are containers (dict/list) are " + "kept even when empty. Numeric 0 and boolean False are kept. Use with wide or " + "sparse tabular rows so VertexActor projection sees fewer irrelevant columns." ), ) diff --git a/graflo/architecture/database_features.py b/graflo/architecture/database_features.py index eea11f60..9a24d93a 100644 --- a/graflo/architecture/database_features.py +++ b/graflo/architecture/database_features.py @@ -5,10 +5,9 @@ from __future__ import annotations -from typing import Literal +from typing import Any, Literal -from pydantic import Field as PydanticField -from pydantic import model_validator +from pydantic import AliasChoices, Field as PydanticField, model_validator from graflo.architecture.base import ConfigBaseModel from graflo.architecture.graph_types import EdgeId, Index @@ -49,6 +48,41 @@ def edge_id(self) -> EdgeId: return (self.source, self.target, self.relation) +class EdgePropertyDefaults(ConfigBaseModel): + """Per logical edge type: optional GSQL ``DEFAULT`` values for edge attributes.""" + + source: str = PydanticField(..., description="Logical source vertex name.") + target: str = PydanticField(..., description="Logical target vertex name.") + relation: str | None = PydanticField( + default=None, + description="Logical relation; must match edge identity (use null for default relation).", + ) + values: dict[str, Any] = PydanticField( + default_factory=dict, + description="Edge attribute name to default value (YAML/JSON literals).", + validation_alias=AliasChoices("values", "properties"), + ) + + +class DefaultPropertyValues(ConfigBaseModel): + """TigerGraph-style attribute defaults for physical schema DDL (covariant profile). + + Maps to GSQL ``attribute_name type [DEFAULT default_value]`` on vertices and edges; + see TigerGraph `Defining a Graph Schema`_. + + .. _Defining a Graph Schema: https://docs.tigergraph.com/gsql-ref/4.2/ddl-and-loading/defining-a-graph-schema + """ + + vertices: dict[str, dict[str, Any]] = PydanticField( + default_factory=dict, + description="Logical vertex name -> property name -> default value for GSQL DEFAULT.", + ) + edges: list[EdgePropertyDefaults] = PydanticField( + default_factory=list, + description="Per (source, target, relation) edge type: attribute defaults.", + ) + + class DatabaseProfile(ConfigBaseModel): """Container for DB-only physical features such as secondary indexes.""" @@ -76,6 +110,14 @@ class DatabaseProfile(ConfigBaseModel): default_factory=list, description="Unified edge physical specs keyed by edge identity + purpose.", ) + default_property_values: DefaultPropertyValues | None = PydanticField( + default=None, + description=( + "Optional per-attribute GSQL DEFAULT values for TigerGraph (and similar) DDL. " + "Vertex keys are logical vertex names; edge entries match logical (source, target, relation). " + "Does not change logical LPG types—only physical schema projection." + ), + ) @model_validator(mode="after") def _normalize_edge_specs(self) -> "DatabaseProfile": @@ -129,6 +171,55 @@ def _ensure_variant( object.__setattr__(self, "edge_specs", list(merged.values())) return self + def vertex_property_default( + self, vertex_name: str, property_name: str + ) -> Any | None: + """Return declared default for a vertex property, or None if not specified.""" + dpv = self.default_property_values + if dpv is None: + return None + per_vertex = dpv.vertices.get(vertex_name) + if per_vertex is None: + return None + return per_vertex.get(property_name) + + def has_vertex_property_default(self, vertex_name: str, property_name: str) -> bool: + dpv = self.default_property_values + if dpv is None: + return False + per_vertex = dpv.vertices.get(vertex_name) + return per_vertex is not None and property_name in per_vertex + + def edge_property_default(self, edge_id: EdgeId, property_name: str) -> Any | None: + """Return declared default for an edge attribute, or None if not specified.""" + dpv = self.default_property_values + if dpv is None or not dpv.edges: + return None + source, target, relation = edge_id + for spec in reversed(dpv.edges): + if spec.source != source or spec.target != target: + continue + if spec.relation != relation: + continue + if property_name not in spec.values: + continue + return spec.values[property_name] + return None + + def has_edge_property_default(self, edge_id: EdgeId, property_name: str) -> bool: + dpv = self.default_property_values + if dpv is None or not dpv.edges: + return False + source, target, relation = edge_id + for spec in reversed(dpv.edges): + if spec.source != source or spec.target != target: + continue + if spec.relation != relation: + continue + if property_name in spec.values: + return True + return False + def _edge_variant_spec( self, edge_id: EdgeId, diff --git a/graflo/architecture/schema/document.py b/graflo/architecture/schema/document.py index d10b3a23..097d9ed9 100644 --- a/graflo/architecture/schema/document.py +++ b/graflo/architecture/schema/document.py @@ -33,7 +33,10 @@ class Schema(ConfigBaseModel): ) db_profile: DatabaseProfile = PydanticField( default_factory=DatabaseProfile, - description="Database-specific physical profile (secondary indexes, naming, etc.).", + description=( + "Database-specific physical profile (secondary indexes, naming, TigerGraph GSQL " + "DEFAULT overrides via default_property_values, etc.)." + ), ) @model_validator(mode="after") diff --git a/graflo/db/tigergraph/conn.py b/graflo/db/tigergraph/conn.py index ec1cde9c..02041000 100644 --- a/graflo/db/tigergraph/conn.py +++ b/graflo/db/tigergraph/conn.py @@ -42,11 +42,12 @@ from graflo.architecture.schema.edge import DEFAULT_TIGERGRAPH_RELATION_WEIGHTNAME, Edge from graflo.architecture.database_features import DatabaseProfile from graflo.architecture.schema import VertexConfigDBAware -from graflo.architecture.graph_types import Index +from graflo.architecture.graph_types import EdgeId, Index from graflo.architecture.schema import Schema from graflo.architecture.schema.vertex import FieldType, Vertex, VertexConfig from graflo.db.conn import Connection, SchemaExistsError, consume_insert_edges_kwargs from graflo.db.connection import TigergraphConfig +from graflo.db.tigergraph.gsql_literals import gsql_default_literal from graflo.db.tigergraph.onto import ( TIGERGRAPH_TYPE_ALIASES, VALID_TIGERGRAPH_TYPES, @@ -1618,17 +1619,49 @@ def close(self): """Close connection - no cleanup needed (using direct REST API calls).""" pass - def _get_vertex_add_statement(self, vertex: Vertex, vertex_config) -> str: + def _gsql_vertex_field_def( + self, + *, + logical_vertex_name: str, + field_name: str, + tg_type: str, + db_profile: DatabaseProfile | None, + ) -> str: + """Single attribute fragment: ``name TYPE`` or ``name TYPE DEFAULT ...``.""" + line = f"{field_name} {tg_type}" + if db_profile is None or not db_profile.has_vertex_property_default( + logical_vertex_name, field_name + ): + return line + raw = db_profile.vertex_property_default(logical_vertex_name, field_name) + if raw is None: + return line + lit = gsql_default_literal(raw) + return f"{line} DEFAULT {lit}" + + def _get_vertex_add_statement( + self, + vertex: Vertex, + vertex_config, + *, + db_profile: DatabaseProfile | None = None, + ) -> str: """Generate ADD VERTEX statement for a schema change job. Args: vertex: Vertex object to generate statement for vertex_config: Vertex configuration + db_profile: Optional profile for ``default_property_values`` (GSQL DEFAULT clauses). Returns: str: GSQL ADD VERTEX statement """ + profile = db_profile + if profile is None and hasattr(vertex_config, "db_profile"): + profile = getattr(vertex_config, "db_profile", None) + vertex_dbname = vertex_config.vertex_dbname(vertex.name) + logical = vertex.name index_fields = vertex_config.identity_fields(vertex.name) if len(index_fields) == 0: @@ -1672,9 +1705,22 @@ def _get_vertex_add_statement(self, vertex: Vertex, vertex_config) -> str: if name != primary_field_name ] - # Build field list: PRIMARY_ID comes first, then other fields - field_parts = [f"PRIMARY_ID {primary_field_name} {primary_field_type}"] - field_parts.extend([f"{name} {ftype}" for name, ftype in other_fields]) + primary_attr = self._gsql_vertex_field_def( + logical_vertex_name=logical, + field_name=primary_field_name, + tg_type=primary_field_type, + db_profile=profile, + ) + field_parts = [f"PRIMARY_ID {primary_attr}"] + for name, ftype in other_fields: + field_parts.append( + self._gsql_vertex_field_def( + logical_vertex_name=logical, + field_name=name, + tg_type=ftype, + db_profile=profile, + ) + ) field_definitions = ",\n ".join(field_parts) @@ -1685,7 +1731,15 @@ def _get_vertex_add_statement(self, vertex: Vertex, vertex_config) -> str: ) else: # Composite key: use PRIMARY KEY syntax - field_parts = [f"{name} {ftype}" for name, ftype in all_fields] + field_parts = [ + self._gsql_vertex_field_def( + logical_vertex_name=logical, + field_name=name, + tg_type=ftype, + db_profile=profile, + ) + for name, ftype in all_fields + ] vindex = "(" + ", ".join(index_fields) + ")" field_parts.append(f"PRIMARY KEY {vindex}") @@ -1708,13 +1762,20 @@ def _edge_for_tigergraph_ddl(self, edge: Edge, ec_db: EdgeConfigDBAware) -> Edge return edge_copy def _format_edge_attributes( - self, edge: Edge, exclude_fields: set[str] | None = None + self, + edge: Edge, + exclude_fields: set[str] | None = None, + *, + db_profile: DatabaseProfile | None = None, + edge_id: EdgeId | None = None, ) -> str: """Format edge attributes for GSQL ADD DIRECTED EDGE statement. Args: edge: Edge object to format attributes for exclude_fields: Optional set of field names to exclude from attributes + db_profile: Optional profile for ``default_property_values`` (GSQL DEFAULT). + edge_id: Logical edge identity; defaults to ``edge.edge_id``. Returns: str: Formatted attribute string (e.g., " date STRING,\n relation STRING") @@ -1725,12 +1786,22 @@ def _format_edge_attributes( if exclude_fields is None: exclude_fields = set() + eid = edge_id if edge_id is not None else edge.edge_id + attr_parts = [] for field in edge.properties: field_name = field.name if field_name not in exclude_fields: field_type = self._get_tigergraph_type(field.type) - attr_parts.append(f" {field_name} {field_type}") + segment = f"{field_name} {field_type}" + if db_profile is not None and db_profile.has_edge_property_default( + eid, field_name + ): + raw = db_profile.edge_property_default(eid, field_name) + if raw is not None: + lit = gsql_default_literal(raw) + segment = f"{segment} DEFAULT {lit}" + attr_parts.append(f" {segment}") return ",\n".join(attr_parts) @@ -1755,6 +1826,7 @@ def _get_edge_add_statement( relation_name: str, source_vertex: str, target_vertex: str, + db_profile: DatabaseProfile | None = None, ) -> str: """Generate ADD DIRECTED EDGE statement for a schema change job. @@ -1784,7 +1856,10 @@ def _get_edge_add_statement( # Format edge attributes, excluding discriminator fields (they're in DISCRIMINATOR clause) edge_attrs = self._format_edge_attributes( - edge, exclude_fields=indexed_field_names + edge, + exclude_fields=indexed_field_names, + db_profile=db_profile, + edge_id=edge.edge_id, ) # Build discriminator clause with all indexed fields @@ -1849,6 +1924,7 @@ def _get_edge_group_create_statement( relation_name: str, source_vertices: dict[int, str], target_vertices: dict[int, str], + db_profile: DatabaseProfile | None = None, ) -> str: """Generate ADD DIRECTED EDGE statement for a group of edges with the same relation. @@ -1886,7 +1962,10 @@ def _get_edge_group_create_statement( # Format edge attributes, excluding discriminator fields edge_attrs = self._format_edge_attributes( - first_edge, exclude_fields=indexed_field_names + first_edge, + exclude_fields=indexed_field_names, + db_profile=db_profile, + edge_id=first_edge.edge_id, ) # Get field types for discriminator fields @@ -2056,7 +2135,11 @@ def _define_schema_local(self, schema: Schema) -> None: # Validate vertex name vertex_dbname = db_schema.vertex_config.vertex_dbname(vertex.name) _validate_tigergraph_schema_name(vertex_dbname, "vertex") - stmt = self._get_vertex_add_statement(vertex, db_schema.vertex_config) + stmt = self._get_vertex_add_statement( + vertex, + db_schema.vertex_config, + db_profile=db_schema.db_profile, + ) vertex_stmts.append(stmt) # Edges - group by relation since TigerGraph requires edges of the same type @@ -2098,6 +2181,7 @@ def _define_schema_local(self, schema: Schema) -> None: relation_name=relation, source_vertices=ddl_source_vertices, target_vertices=ddl_target_vertices, + db_profile=db_schema.db_profile, ) edge_stmts.append(stmt) @@ -2512,8 +2596,7 @@ def _format_vertex_fields(self, vertex: Vertex) -> str: for field in fields: # Field type should already be set (STRING if was None) field_type = field.type or FieldType.STRING.value - # Format as: field_name TYPE - # TODO: Add DEFAULT clause support if needed in the future + # Format as: field_name TYPE (DEFAULT clauses live in schema.db_profile.default_property_values) field_list.append(f"{field.name} {field_type}") return ",\n ".join(field_list) @@ -3168,8 +3251,9 @@ def _generate_upsert_payload( # 4. Format attributes for TigerGraph REST++ API # TigerGraph requires attribute values to be wrapped in {"value": ...} + # Include falsy but valid values (0, False, "") — only None is omitted. formatted_attributes = { - k: {"value": v} for k, v in clean_record.items() if v + k: {"value": v} for k, v in clean_record.items() if v is not None } # 5. Add the record attributes to the map using the composite ID as the key diff --git a/graflo/db/tigergraph/gsql_literals.py b/graflo/db/tigergraph/gsql_literals.py new file mode 100644 index 00000000..fac0b0c3 --- /dev/null +++ b/graflo/db/tigergraph/gsql_literals.py @@ -0,0 +1,28 @@ +"""GSQL literal formatting for schema DDL (e.g. DEFAULT clauses).""" + +from __future__ import annotations + +import json +from decimal import Decimal +from typing import Any + + +def gsql_default_literal(value: Any) -> str: + """Format a manifest/JSON-friendly value as a GSQL literal for ``DEFAULT``.""" + if isinstance(value, bool): + return "true" if value else "false" + if isinstance(value, int): + return str(value) + if isinstance(value, float): + if value != value: # NaN + raise ValueError("NaN is not a valid GSQL default literal") + return json.dumps(value) + if isinstance(value, Decimal): + return format(value, "f") + if isinstance(value, str): + escaped = value.replace("\\", "\\\\").replace('"', '\\"') + return f'"{escaped}"' + raise TypeError( + f"Unsupported type for GSQL DEFAULT literal: {type(value).__name__}. " + "Use bool, int, float, Decimal, or str." + ) diff --git a/test/architecture/test_database_profile_default_property_values.py b/test/architecture/test_database_profile_default_property_values.py new file mode 100644 index 00000000..bc479890 --- /dev/null +++ b/test/architecture/test_database_profile_default_property_values.py @@ -0,0 +1,70 @@ +"""Tests for DatabaseProfile.default_property_values.""" + +from __future__ import annotations + +from graflo.architecture.database_features import ( + DatabaseProfile, + DefaultPropertyValues, + EdgePropertyDefaults, +) +from graflo.onto import DBType + + +def test_database_profile_parses_default_property_values_yaml_shape() -> None: + dp = DatabaseProfile( + db_flavor=DBType.TIGERGRAPH, + default_property_values=DefaultPropertyValues( + vertices={"Sensor": {"reading": -1.0}}, + edges=[ + EdgePropertyDefaults( + source="Person", + target="Company", + relation="works_at", + values={"since_year": 0}, + ) + ], + ), + ) + assert dp.vertex_property_default("Sensor", "reading") == -1.0 + assert dp.has_vertex_property_default("Sensor", "reading") + assert not dp.has_vertex_property_default("Sensor", "missing") + + eid = ("Person", "Company", "works_at") + assert dp.edge_property_default(eid, "since_year") == 0 + assert dp.has_edge_property_default(eid, "since_year") + assert not dp.has_edge_property_default(eid, "nope") + + +def test_default_property_values_accepts_properties_alias_in_raw_dict() -> None: + raw = { + "db_flavor": "tigergraph", + "default_property_values": { + "vertices": {"Sensor": {"reading": -1.0}}, + "edges": [ + { + "source": "A", + "target": "B", + "relation": None, + "properties": {"x": 1}, + } + ], + }, + } + dp = DatabaseProfile.model_validate(raw) + assert dp.edge_property_default(("A", "B", None), "x") == 1 + + +def test_edge_property_defaults_last_spec_wins() -> None: + dp = DatabaseProfile( + default_property_values=DefaultPropertyValues( + edges=[ + EdgePropertyDefaults( + source="A", target="B", relation=None, values={"w": 1} + ), + EdgePropertyDefaults( + source="A", target="B", relation=None, values={"w": 2} + ), + ] + ) + ) + assert dp.edge_property_default(("A", "B", None), "w") == 2 diff --git a/test/db/tigergraph/test_gsql_literals.py b/test/db/tigergraph/test_gsql_literals.py new file mode 100644 index 00000000..fc797d64 --- /dev/null +++ b/test/db/tigergraph/test_gsql_literals.py @@ -0,0 +1,33 @@ +"""Tests for GSQL DEFAULT literal formatting.""" + +from __future__ import annotations + +import pytest + +from graflo.db.tigergraph.gsql_literals import gsql_default_literal + + +def test_gsql_default_literal_bool() -> None: + assert gsql_default_literal(True) == "true" + assert gsql_default_literal(False) == "false" + + +def test_gsql_default_literal_numbers() -> None: + assert gsql_default_literal(0) == "0" + assert gsql_default_literal(-1) == "-1" + assert gsql_default_literal(-1.0) == "-1.0" + + +def test_gsql_default_literal_string() -> None: + assert gsql_default_literal("a") == '"a"' + assert gsql_default_literal('say "hi"') == '"say \\"hi\\""' + + +def test_gsql_default_literal_rejects_nan() -> None: + with pytest.raises(ValueError, match="NaN"): + gsql_default_literal(float("nan")) + + +def test_gsql_default_literal_rejects_unsupported() -> None: + with pytest.raises(TypeError): + gsql_default_literal([1]) diff --git a/test/db/tigergraph/test_tigergraph_ddl_default_clauses.py b/test/db/tigergraph/test_tigergraph_ddl_default_clauses.py new file mode 100644 index 00000000..7cf41c4e --- /dev/null +++ b/test/db/tigergraph/test_tigergraph_ddl_default_clauses.py @@ -0,0 +1,52 @@ +"""TigerGraph DDL: GSQL DEFAULT from db_profile.default_property_values.""" + +from __future__ import annotations + +from graflo.architecture.database_features import DatabaseProfile, DefaultPropertyValues +from graflo.architecture.schema.db_aware import VertexConfigDBAware +from graflo.architecture.schema.vertex import Field, FieldType, Vertex, VertexConfig +from graflo.db.tigergraph.conn import TigerGraphConnection +from graflo.onto import DBType + + +def _bare_tg_conn() -> TigerGraphConnection: + return TigerGraphConnection.__new__(TigerGraphConnection) + + +def test_vertex_add_statement_primary_key_float_default() -> None: + """Mirrors: CREATE VERTEX Sensor (id STRING PRIMARY KEY, reading FLOAT DEFAULT -1.0).""" + conn = _bare_tg_conn() + vertex = Vertex( + name="Sensor", + properties=[ + Field(name="id", type=FieldType.STRING), + Field(name="reading", type=FieldType.FLOAT), + ], + identity=["id"], + ) + profile = DatabaseProfile( + db_flavor=DBType.TIGERGRAPH, + default_property_values=DefaultPropertyValues( + vertices={"Sensor": {"reading": -1.0}} + ), + ) + vc = VertexConfigDBAware(VertexConfig(vertices=[vertex]), profile) + stmt = conn._get_vertex_add_statement(vertex, vc, db_profile=profile) + assert "reading FLOAT DEFAULT -1.0" in stmt.replace("\n", " ") + assert "PRIMARY_ID" in stmt or "PRIMARY KEY" in stmt + + +def test_vertex_field_def_helper() -> None: + conn = _bare_tg_conn() + profile = DatabaseProfile( + default_property_values=DefaultPropertyValues( + vertices={"Sensor": {"reading": -1.25}} + ), + ) + line = conn._gsql_vertex_field_def( + logical_vertex_name="Sensor", + field_name="reading", + tg_type="FLOAT", + db_profile=profile, + ) + assert line == "reading FLOAT DEFAULT -1.25"