diff --git a/.github/workflows/_build_rust_artifacts.yml b/.github/workflows/_build_rust_artifacts.yml index db6910e263..cd528dc6f2 100644 --- a/.github/workflows/_build_rust_artifacts.yml +++ b/.github/workflows/_build_rust_artifacts.yml @@ -46,7 +46,7 @@ on: connector_plugins: type: string required: false - default: "iggy_connector_elasticsearch_sink,iggy_connector_elasticsearch_source,iggy_connector_iceberg_sink,iggy_connector_postgres_sink,iggy_connector_postgres_source,iggy_connector_quickwit_sink,iggy_connector_random_source,iggy_connector_stdout_sink" + default: "iggy_connector_elasticsearch_sink,iggy_connector_elasticsearch_source,iggy_connector_iceberg_sink,iggy_connector_jdbc_sink,iggy_connector_jdbc_source,iggy_connector_postgres_sink,iggy_connector_postgres_source,iggy_connector_quickwit_sink,iggy_connector_random_source,iggy_connector_stdout_sink" description: "Comma-separated list of connector plugin crates to build as shared libraries" outputs: artifact_name: diff --git a/.github/workflows/edge-release.yml b/.github/workflows/edge-release.yml index 7ca84fc5da..4eeb899aa1 100644 --- a/.github/workflows/edge-release.yml +++ b/.github/workflows/edge-release.yml @@ -104,6 +104,8 @@ jobs: - `iggy_connector_elasticsearch_sink` - `iggy_connector_elasticsearch_source` - `iggy_connector_iceberg_sink` + - `iggy_connector_jdbc_sink` + - `iggy_connector_jdbc_source` - `iggy_connector_postgres_sink` - `iggy_connector_postgres_source` - `iggy_connector_quickwit_sink` diff --git a/Cargo.lock b/Cargo.lock index be96205a9e..d7be6c7c42 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2691,7 +2691,7 @@ checksum = "0b023947811758c97c59bf9d1c188fd619ad4718dcaa767947df1cadb14f39f4" dependencies = [ "glob", "libc", - "libloading", + "libloading 0.8.9", ] [[package]] @@ -6129,6 +6129,16 @@ version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "135b12329e5e3ce057a9f972339ea52bc954fe1e9358ef27f95e89716fbc5424" +[[package]] +name = 
"humantime-serde" +version = "1.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57a3db5ea5923d99402c94e9feb261dc5ee9b4efa158b0315f788cf549cc200c" +dependencies = [ + "humantime", + "serde", +] + [[package]] name = "hwlocality" version = "1.0.0-alpha.12" @@ -6931,6 +6941,45 @@ dependencies = [ "uuid", ] +[[package]] +name = "iggy_connector_jdbc_sink" +version = "0.1.0" +dependencies = [ + "async-trait", + "base64", + "dashmap", + "humantime", + "iggy_connector_sdk", + "jni 0.21.1", + "regex", + "secrecy", + "serde", + "serde_json", + "toml 1.1.2+spec-1.1.0", + "tracing", +] + +[[package]] +name = "iggy_connector_jdbc_source" +version = "0.1.0" +dependencies = [ + "async-trait", + "base64", + "chrono", + "dashmap", + "humantime-serde", + "iggy_connector_sdk", + "jni 0.21.1", + "regex", + "secrecy", + "serde", + "serde_json", + "tokio", + "toml 1.1.2+spec-1.1.0", + "tracing", + "uuid", +] + [[package]] name = "iggy_connector_mongodb_sink" version = "0.4.1-edge.1" @@ -7392,6 +7441,15 @@ version = "1.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682" +[[package]] +name = "java-locator" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09c46c1fe465c59b1474e665e85e1256c3893dd00927b8d55f63b09044c1e64f" +dependencies = [ + "glob", +] + [[package]] name = "jiff" version = "0.2.29" @@ -7443,7 +7501,9 @@ dependencies = [ "cesu8", "cfg-if", "combine", + "java-locator", "jni-sys 0.3.1", + "libloading 0.7.4", "log", "thiserror 1.0.69", "walkdir", @@ -7772,6 +7832,16 @@ dependencies = [ "pkg-config", ] +[[package]] +name = "libloading" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b67380fd3b2fbe7527a606e18729d21c6f3951633d0500574c4dc22d2d638b9f" +dependencies = [ + "cfg-if", + "winapi", +] + [[package]] name = "libloading" version = "0.8.9" diff --git 
a/Cargo.toml b/Cargo.toml index 4d196d9632..0238c7ea0d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -39,12 +39,14 @@ members = [ "core/connectors/sinks/http_sink", "core/connectors/sinks/iceberg_sink", "core/connectors/sinks/influxdb_sink", + "core/connectors/sinks/jdbc_sink", "core/connectors/sinks/mongodb_sink", "core/connectors/sinks/postgres_sink", "core/connectors/sinks/quickwit_sink", "core/connectors/sinks/stdout_sink", "core/connectors/sources/elasticsearch_source", "core/connectors/sources/influxdb_source", + "core/connectors/sources/jdbc_source", "core/connectors/sources/postgres_source", "core/connectors/sources/random_source", "core/consensus", diff --git a/core/connectors/README.md b/core/connectors/README.md index b7d24cfbab..9cee15944d 100644 --- a/core/connectors/README.md +++ b/core/connectors/README.md @@ -83,6 +83,7 @@ Each sink should have its own, custom configuration, which is passed along with - **Doris Sink** - loads JSON messages into Apache Doris tables via the Stream Load HTTP API - **Elasticsearch Sink** - sends messages to Elasticsearch indices - **Iceberg Sink** - writes data to Apache Iceberg tables via REST catalog +- **JDBC Sink** - writes messages as rows into any JDBC-compliant database (PostgreSQL, MySQL, Oracle, SQL Server, H2) via an embedded JVM - **PostgreSQL Sink** - stores messages in PostgreSQL database tables - **Quickwit Sink** - indexes messages in Quickwit search engine - **Stdout Sink** - prints messages to standard output (useful for debugging/development) @@ -96,6 +97,7 @@ Please refer to the **[Source documentation](https://github.com/apache/iggy/tree ### Available Sources - **Elasticsearch Source** - polls documents from Elasticsearch indices +- **JDBC Source** - reads rows from any JDBC-compliant database (PostgreSQL, MySQL, Oracle, SQL Server, H2) via an embedded JVM; bulk and incremental modes - **PostgreSQL Source** - reads rows from PostgreSQL tables with multiple consumption strategies (delete after read, 
mark as processed, timestamp tracking) - **Random Source** - generates random test messages (useful for testing/development) diff --git a/core/connectors/runtime/example_config/connectors/jdbc_bulk_mode.toml b/core/connectors/runtime/example_config/connectors/jdbc_bulk_mode.toml new file mode 100644 index 0000000000..de9e2f7f65 --- /dev/null +++ b/core/connectors/runtime/example_config/connectors/jdbc_bulk_mode.toml @@ -0,0 +1,81 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# Example JDBC Source Connector Configuration - BULK MODE +# Bulk mode works with ALL JDBC databases without any special requirements +# No tracking column needed - just executes your query and fetches results + +type = "source" +key = "jdbc_bulk_example" +enabled = true +version = 0 +name = "JDBC Bulk Mode Source" +path = "target/release/libiggy_connector_jdbc_source" +plugin_config_format = "toml" + +[plugin_config] +# This example uses PostgreSQL, but bulk mode works identically with: +# MySQL, Oracle, SQL Server, H2, Derby, DB2, etc. 
+jdbc_url = "jdbc:postgresql://localhost:5432/warehouse" + +# Database credentials can be embedded in the URL or supplied separately +# jdbc_url = "jdbc:postgresql://localhost:5432/warehouse?user=myuser&password=mypass" +username = "warehouse_user" +password = "secret" + +driver_class = "org.postgresql.Driver" +driver_jar_path = "/opt/jdbc-drivers/postgresql-42.6.0.jar" + +# Bulk mode: Any valid SELECT query +# Can include JOINs, aggregations, complex WHERE clauses, etc. +query = """ +SELECT + p.product_id, + p.product_name, + p.category, + p.price, + COUNT(o.order_id) as total_orders, + SUM(o.quantity) as total_quantity +FROM products p +LEFT JOIN orders o ON p.product_id = o.product_id +GROUP BY p.product_id, p.product_name, p.category, p.price +""" + +# Poll once per hour for hourly snapshots +poll_interval = "1h" + +# Large batch size for full table scans +batch_size = 10000 + +# BULK MODE - no tracking column needed! +mode = "bulk" + +# Bulk mode benefits: +# - No tracking column required +# - Works with any SELECT query +# - Supports complex queries (JOINs, aggregations, window functions) +# - Perfect for periodic snapshots +# - Universal compatibility with all databases + +snake_case_columns = true +include_metadata = false + +[[streams]] +stream = "warehouse" +topic = "product_summary" +partition_id = 1 +schema = "json" diff --git a/core/connectors/runtime/example_config/connectors/jdbc_h2.toml b/core/connectors/runtime/example_config/connectors/jdbc_h2.toml new file mode 100644 index 0000000000..82ee459ff5 --- /dev/null +++ b/core/connectors/runtime/example_config/connectors/jdbc_h2.toml @@ -0,0 +1,60 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# Example JDBC Source Connector Configuration for H2 Database +# H2 is useful for testing and development as it's an embedded Java database + +type = "source" +key = "jdbc_h2_example" +enabled = true +version = 0 +name = "JDBC H2 Source" +path = "target/release/libiggy_connector_jdbc_source" +plugin_config_format = "toml" + +[plugin_config] +# H2 connection URL (in-memory database) +jdbc_url = "jdbc:h2:mem:testdb;DB_CLOSE_DELAY=-1" + +# H2 JDBC driver +driver_class = "org.h2.Driver" + +# Path to H2 driver JAR +# Download from: https://repo1.maven.org/maven2/com/h2database/h2/2.2.224/h2-2.2.224.jar +# Note: Update this path to match where you downloaded the JAR file +driver_jar_path = "/tmp/jdbc-drivers/h2-2.2.224.jar" + +# H2 credentials (default) +username = "sa" +password = "" + +# Simple query for testing +query = "SELECT * FROM users WHERE id > {last_offset} ORDER BY id" + +poll_interval = "10s" +batch_size = 100 +tracking_column = "id" +initial_offset = "0" +mode = "incremental" +snake_case_columns = false +include_metadata = true + +[[streams]] +stream = "test" +topic = "users" +partition_id = 1 +schema = "json" diff --git a/core/connectors/runtime/example_config/connectors/jdbc_mysql.toml b/core/connectors/runtime/example_config/connectors/jdbc_mysql.toml new file mode 100644 index 0000000000..fd2ed5fc48 --- /dev/null +++ b/core/connectors/runtime/example_config/connectors/jdbc_mysql.toml @@ -0,0 +1,85 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# Example JDBC Source Connector Configuration for MySQL +# This file demonstrates how to configure the JDBC source connector +# to read data from a MySQL database and publish to Iggy streams. + +type = "source" +key = "jdbc_mysql_example" +enabled = true +version = 0 +name = "JDBC MySQL Source" +path = "target/release/libiggy_connector_jdbc_source" +plugin_config_format = "toml" + +[plugin_config] +# JDBC connection URL +# Option 1: Separate credentials (recommended) +jdbc_url = "jdbc:mysql://localhost:3306/ecommerce?useSSL=false&serverTimezone=UTC" + +# Option 2: Embedded credentials in URL (alternative) +# jdbc_url = "jdbc:mysql://iggy_user:iggy_password@localhost:3306/ecommerce?useSSL=false&serverTimezone=UTC" + +# JDBC driver class name +driver_class = "com.mysql.cj.jdbc.Driver" + +# Path to JDBC driver JAR file +# Download from: https://repo1.maven.org/maven2/com/mysql/mysql-connector-j/8.0.33/mysql-connector-j-8.0.33.jar +driver_jar_path = "/opt/jdbc-drivers/mysql-connector-j-8.0.33.jar" + +# Database credentials (optional if included in jdbc_url) +username = "iggy_user" +password = "iggy_password" + +# SQL query to execute +# Use {last_offset} placeholder for incremental reads +query = "SELECT * FROM orders WHERE updated_at > {last_offset} ORDER BY updated_at ASC" + +# How often 
to poll the database +poll_interval = "30s" + +# Maximum number of rows to fetch per poll +batch_size = 1000 + +# Column to track for incremental reads (must be in query result) +tracking_column = "updated_at" + +# Initial offset value for the first poll +initial_offset = "2024-01-01 00:00:00" + +# Source mode: "incremental" or "bulk" +# Note: Both modes work with ALL JDBC databases (MySQL, Oracle, PostgreSQL, etc.) +# - incremental: Tracks last offset, avoids duplicate reads +# - bulk: Full table scan, no offset tracking +mode = "incremental" + +# Convert column names to snake_case (e.g., OrderDate -> order_date) +snake_case_columns = true + +# Include metadata wrapper in output messages +include_metadata = true + +# Custom JVM options (optional) +jvm_options = ["-Xmx512m", "-Xms128m"] + +# Target Iggy stream and topic +[[streams]] +stream = "ecommerce" +topic = "orders" +partition_id = 1 +schema = "json" diff --git a/core/connectors/runtime/example_config/connectors/jdbc_oracle.toml b/core/connectors/runtime/example_config/connectors/jdbc_oracle.toml new file mode 100644 index 0000000000..da6c74eaa5 --- /dev/null +++ b/core/connectors/runtime/example_config/connectors/jdbc_oracle.toml @@ -0,0 +1,87 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. + +# Example JDBC Source Connector Configuration for Oracle Database + +type = "source" +key = "jdbc_oracle_example" +enabled = true +version = 0 +name = "JDBC Oracle Source" +path = "target/release/libiggy_connector_jdbc_source" +plugin_config_format = "toml" + +[plugin_config] +# JDBC connection URL +# Option 1: Separate credentials +jdbc_url = "jdbc:oracle:thin:@localhost:1521:XE" + +# Option 2: Embedded credentials in URL (Oracle uses / separator) +# jdbc_url = "jdbc:oracle:thin:system/oracle@localhost:1521:XE" + +# JDBC driver class name +driver_class = "oracle.jdbc.OracleDriver" + +# Path to JDBC driver JAR file +# Download from: https://www.oracle.com/database/technologies/appdev/jdbc-downloads.html +driver_jar_path = "/opt/jdbc-drivers/ojdbc11.jar" + +# Database credentials (optional if included in jdbc_url) +username = "system" +password = "oracle" + +# SQL query to execute +# Oracle example: filter and order by a numeric/timestamp column (not ROWNUM, which is assigned per result set) +query = "SELECT * FROM CUSTOMERS WHERE ID > {last_offset} ORDER BY ID" + +# How often to poll the database +poll_interval = "1m" + +# Maximum number of rows to fetch per poll +batch_size = 500 + +# Column to track for incremental reads (must be in query result) +tracking_column = "ID" + +# Initial offset value for the first poll +initial_offset = "0" + +# Source mode: "incremental" or "bulk" +# Works with ALL JDBC databases universally +mode = "incremental" + +# Convert column names to snake_case (e.g., OrderDate -> order_date) +snake_case_columns = true + +# Include metadata wrapper in output messages +include_metadata = true + +# Connection pooling (optional, requires HikariCP JAR in classpath) +enable_connection_pool = false +# max_pool_size = 10 +# min_idle = 2 +# connection_timeout_ms = 30000 + +# Custom JVM options (optional) +jvm_options = ["-Xmx512m", "-Xms256m"] + +# Target Iggy stream and topic +[[streams]] 
+stream = "crm" +topic = "customers" +partition_id = 1 +schema = "json" diff --git a/core/connectors/runtime/example_config/connectors/jdbc_sink_mysql.toml b/core/connectors/runtime/example_config/connectors/jdbc_sink_mysql.toml new file mode 100644 index 0000000000..84512bd2e0 --- /dev/null +++ b/core/connectors/runtime/example_config/connectors/jdbc_sink_mysql.toml @@ -0,0 +1,63 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# Example JDBC Sink Connector Configuration for MySQL, with HikariCP pooling. 
+ +type = "sink" +key = "jdbc_sink_mysql_example" +enabled = true +version = 0 +name = "JDBC MySQL Sink" +path = "target/release/libiggy_connector_jdbc_sink" +plugin_config_format = "toml" + +[[streams]] +stream = "test" +topics = ["events"] +schema = "json" + +[plugin_config] +jdbc_url = "jdbc:mysql://localhost:3306/mydb" +driver_class = "com.mysql.cj.jdbc.Driver" + +# Download from: https://repo1.maven.org/maven2/com/mysql/mysql-connector-j/8.3.0/mysql-connector-j-8.3.0.jar +driver_jar_path = "/tmp/jdbc-drivers/mysql-connector-j-8.3.0.jar" + +username = "root" +password = "root" + +target_table = "iggy_events" +auto_create_table = false + +batch_size = 500 +max_retries = 5 +retry_delay = "2s" + +payload_format = "text" +include_metadata = true +include_checksum = true +include_origin_timestamp = true + +# HikariCP connection pool (requires HikariCP on the classpath alongside the +# MySQL driver JAR, or a shaded driver JAR that bundles it). +enable_connection_pool = true +max_pool_size = 20 +min_idle = 5 +connection_timeout_ms = 30000 + +# Extra JVM options +jvm_options = ["-Xmx512m"] diff --git a/core/connectors/runtime/example_config/connectors/jdbc_sink_postgres.toml b/core/connectors/runtime/example_config/connectors/jdbc_sink_postgres.toml new file mode 100644 index 0000000000..f597da2fe1 --- /dev/null +++ b/core/connectors/runtime/example_config/connectors/jdbc_sink_postgres.toml @@ -0,0 +1,71 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# Example JDBC Sink Connector Configuration for PostgreSQL. +# Consumes messages from the configured stream/topic and INSERTs each one as a +# row into the target table. + +type = "sink" +key = "jdbc_sink_postgres_example" +enabled = true +version = 0 +name = "JDBC PostgreSQL Sink" +path = "target/release/libiggy_connector_jdbc_sink" +plugin_config_format = "toml" + +[[streams]] +stream = "test" +topics = ["events"] +schema = "json" + +[plugin_config] +# PostgreSQL connection URL +jdbc_url = "jdbc:postgresql://localhost:5432/mydb" + +# PostgreSQL JDBC driver +driver_class = "org.postgresql.Driver" + +# Path to the PostgreSQL driver JAR +# Download from: https://jdbc.postgresql.org/download/postgresql-42.7.1.jar +driver_jar_path = "/tmp/jdbc-drivers/postgresql-42.7.1.jar" + +# Credentials (alternatively embed them in jdbc_url) +username = "postgres" +password = "postgres" + +# Destination table (created automatically when auto_create_table = true) +target_table = "iggy_events" +auto_create_table = true + +# Write tuning +batch_size = 100 +max_retries = 3 +retry_delay = "1s" + +# Store the payload as JSON text; set to "text" or "bytes" as needed +payload_format = "json" + +# Iggy metadata columns +include_metadata = true +include_checksum = true +include_origin_timestamp = true + +# Optional HikariCP connection pool +enable_connection_pool = false +max_pool_size = 10 +min_idle = 2 +connection_timeout_ms = 30000 diff --git a/core/connectors/runtime/example_config/connectors/jdbc_sqlserver.toml 
b/core/connectors/runtime/example_config/connectors/jdbc_sqlserver.toml new file mode 100644 index 0000000000..aa2da3fc22 --- /dev/null +++ b/core/connectors/runtime/example_config/connectors/jdbc_sqlserver.toml @@ -0,0 +1,70 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +# Example JDBC Source Connector Configuration for Microsoft SQL Server + +type = "source" +key = "jdbc_sqlserver_example" +enabled = true +version = 0 +name = "JDBC SQL Server Source" +path = "target/release/libiggy_connector_jdbc_source" +plugin_config_format = "toml" + +[plugin_config] +# JDBC connection URL +# Option 1: Separate credentials +jdbc_url = "jdbc:sqlserver://localhost:1433;databaseName=Sales;encrypt=false" + +# Option 2: Embedded credentials in URL +# jdbc_url = "jdbc:sqlserver://localhost:1433;databaseName=Sales;user=sa;password=YourPassword123;encrypt=false" + +# JDBC driver class name +driver_class = "com.microsoft.sqlserver.jdbc.SQLServerDriver" + +# Path to JDBC driver JAR file +# Download from: https://repo1.maven.org/maven2/com/microsoft/sqlserver/mssql-jdbc/ +driver_jar_path = "/opt/jdbc-drivers/mssql-jdbc-12.4.1.jre11.jar" + +# Database credentials (optional if included in jdbc_url) +username = "sa" +password = "YourPassword123" + +# SQL query to execute +query = "SELECT * FROM Orders WHERE OrderDate > {last_offset} ORDER BY OrderDate" + +poll_interval = "15s" +batch_size = 2000 +tracking_column = "OrderDate" +initial_offset = "2024-01-01" +mode = "incremental" + +# Convert SQL Server naming to snake_case +snake_case_columns = true +include_metadata = true + +# Connection pooling for high-throughput scenarios +enable_connection_pool = true +max_pool_size = 20 +min_idle = 5 +connection_timeout_ms = 30000 + +[[streams]] +stream = "sales" +topic = "orders" +partition_id = 1 +schema = "json" diff --git a/core/connectors/runtime/example_config/connectors/test_jdbc_h2.toml b/core/connectors/runtime/example_config/connectors/test_jdbc_h2.toml new file mode 100644 index 0000000000..06b65c21b2 --- /dev/null +++ b/core/connectors/runtime/example_config/connectors/test_jdbc_h2.toml @@ -0,0 +1,45 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +type = "source" +key = "jdbc_h2_test" +enabled = true +version = 0 +name = "JDBC H2 Test Source" +path = "target/release/libiggy_connector_jdbc_source" +plugin_config_format = "toml" + +[plugin_config] +jdbc_url = "jdbc:h2:mem:testdb;DB_CLOSE_DELAY=-1;INIT=CREATE TABLE IF NOT EXISTS users (id INT PRIMARY KEY AUTO_INCREMENT, name VARCHAR(100), email VARCHAR(100), created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP)\\;INSERT INTO users (name, email) SELECT 'User ' || x, 'user' || x || '@test.com' FROM SYSTEM_RANGE(1, 100) WHERE NOT EXISTS (SELECT 1 FROM users);" +driver_class = "org.h2.Driver" +driver_jar_path = "/tmp/jdbc-drivers/h2-2.2.224.jar" +username = "sa" +password = "" +query = "SELECT * FROM users WHERE id > {last_offset} ORDER BY id" +poll_interval = "5s" +batch_size = 10 +tracking_column = "id" +initial_offset = "0" +mode = "incremental" +snake_case_columns = true +include_metadata = true + +[[streams]] +stream = "test" +topic = "users" +partition_id = 1 +schema = "json" diff --git a/core/connectors/sinks/README.md b/core/connectors/sinks/README.md index 367a220287..bb23159a51 100644 --- a/core/connectors/sinks/README.md +++ b/core/connectors/sinks/README.md @@ -12,6 +12,7 @@ Sink connectors are responsible for writing data from Iggy streams to external s | **elasticsearch_sink** | 
Sends messages to Elasticsearch indices for full-text search and analytics | | **iceberg_sink** | Writes data to Apache Iceberg tables via REST catalog with S3/GCS/Azure storage | | **influxdb_sink** | Writes messages to InfluxDB as line-protocol points; supports both V2 (org/bucket, Flux) and V3 (db, SQL) | +| **jdbc_sink** | Writes messages as rows into any JDBC-compliant database (PostgreSQL, MySQL, Oracle, SQL Server, H2) via an embedded JVM | | **postgres_sink** | Stores messages in PostgreSQL database tables with configurable schemas | | **quickwit_sink** | Indexes messages in Quickwit search engine for log analytics | | **stdout_sink** | Prints messages to standard output (useful for debugging and development) | diff --git a/core/connectors/sinks/jdbc_sink/Cargo.toml b/core/connectors/sinks/jdbc_sink/Cargo.toml new file mode 100644 index 0000000000..35ea5fea5a --- /dev/null +++ b/core/connectors/sinks/jdbc_sink/Cargo.toml @@ -0,0 +1,62 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +[package] +name = "iggy_connector_jdbc_sink" +version = "0.1.0" +edition = "2024" +license = "Apache-2.0" +keywords = ["iggy", "messaging", "streaming", "jdbc", "sink"] +categories = ["database"] +description = "Generic JDBC sink connector for Iggy - writes stream messages into MySQL, Oracle, SQL Server, H2, and any JDBC-compliant database" +readme = "README.md" +publish = false + +[package.metadata.cargo-machete] +ignored = ["dashmap"] + +[lib] +crate-type = ["cdylib", "rlib"] + +[features] +default = [] + +[dependencies] +async-trait = { workspace = true } +base64 = { workspace = true } + +# Required by sink_connector! macro +dashmap = { workspace = true } + +# For parsing duration strings (retry_delay) +humantime = { workspace = true } + +# Connector SDK +iggy_connector_sdk = { workspace = true } + +# JNI for Java interop with invocation support +jni = { version = "0.21", features = ["invocation"] } + +# For sanitizing passwords in logs +regex = { workspace = true } +secrecy = { workspace = true } +serde = { workspace = true, features = ["derive"] } +serde_json = { workspace = true } +tracing = { workspace = true } + +[dev-dependencies] +toml = { workspace = true } diff --git a/core/connectors/sinks/jdbc_sink/README.md b/core/connectors/sinks/jdbc_sink/README.md new file mode 100644 index 0000000000..a4cc35dbb1 --- /dev/null +++ b/core/connectors/sinks/jdbc_sink/README.md @@ -0,0 +1,171 @@ + + +# JDBC Sink Connector + +A generic sink connector that consumes messages from Iggy topics and writes +them into any JDBC-compliant database (PostgreSQL, MySQL, Oracle, SQL Server, +H2, and others). It is the write-side counterpart to the +[JDBC source connector](../../sources/jdbc_source/README.md) and uses the same +embedded-JVM/JNI bridge, so a single JDBC driver JAR serves both directions. 
+ +## How it works + +The connector starts an embedded JVM (loading your JDBC driver JAR onto the +classpath), opens a connection (optionally via a HikariCP pool), and on every +batch of consumed messages performs a **batched `INSERT`** using a JDBC +`PreparedStatement` (`addBatch` / `executeBatch`). + +### Write semantics + +**INSERT-only.** Each message becomes one row. This matches the behaviour of +the other Iggy sink connectors (PostgreSQL, MongoDB, Doris, Elasticsearch) — +none of which perform upserts or deletes. If you need upsert/merge semantics, +pre-create the table with an appropriate constraint and handle conflicts at the +database level, or open a feature request. + +### Row layout + +Each row contains the message `id` plus the payload, and (optionally) Iggy +metadata columns: + +| Column | Type (auto-create) | Included when | +|-------------------------|----------------------|-----------------------------------| +| `id` | `VARCHAR(40)` | always (message id as decimal) | +| `iggy_offset` | `BIGINT` | `include_metadata = true` | +| `iggy_timestamp` | `BIGINT` (µs epoch) | `include_metadata = true` | +| `iggy_stream` | `VARCHAR(255)` | `include_metadata = true` | +| `iggy_topic` | `VARCHAR(255)` | `include_metadata = true` | +| `iggy_partition_id` | `INTEGER` | `include_metadata = true` | +| `iggy_checksum` | `BIGINT` | `include_checksum = true` | +| `iggy_origin_timestamp` | `BIGINT` (µs epoch) | `include_origin_timestamp = true` | +| `payload` | `TEXT` / `VARBINARY` | always | + +Timestamps are stored as epoch microseconds (`BIGINT`) for cross-database +portability — `BIGINT` and `setLong` behave identically across every JDBC +driver (including SQL Server, where the `TIMESTAMP` keyword is a rowversion +type rather than a datetime), and the value is lossless with no timezone +coercion. 
+ +To read the timestamp columns as native dates, convert at query time: + +```sql +-- PostgreSQL +SELECT to_timestamp(iggy_timestamp / 1000000.0) AS ts FROM iggy_events; + +-- MySQL +SELECT FROM_UNIXTIME(iggy_timestamp / 1000000) AS ts FROM iggy_events; + +-- H2 +SELECT + DATEADD('MICROSECOND', iggy_timestamp, TIMESTAMP '1970-01-01 00:00:00') AS ts +FROM iggy_events; +``` + +## Configuration + +| Field | Type | Required | Default | Description | +|---------------------------|----------------|----------|-------------|--------------------------------------------------------------| +| `jdbc_url` | string | yes | — | JDBC connection URL (masked in logs) | +| `driver_class` | string | yes | — | JDBC driver class, e.g. `org.postgresql.Driver` | +| `driver_jar_path` | string | yes | — | Absolute path to the driver JAR | +| `target_table` | string | yes | — | Destination table | +| `username` | string | no | — | DB username (or embed in URL) | +| `password` | string | no | — | DB password (masked in logs) | +| `batch_size` | int | no | `100` | Messages per `INSERT` batch | +| `auto_create_table` | bool | no | `false` | Create the table on open if missing | +| `include_metadata` | bool | no | `true` | Add offset/timestamp/stream/topic/partition columns | +| `include_checksum` | bool | no | `true` | Add `iggy_checksum` column | +| `include_origin_timestamp`| bool | no | `true` | Add `iggy_origin_timestamp` column | +| `payload_format` | string | no | `"text"` | `text`, `json` (validated), or `bytes` | +| `payload_column` | string | no | `"payload"` | Name of the payload column | +| `verbose_logging` | bool | no | `false` | Log per-batch progress at INFO | +| `max_retries` | int | no | `3` | Retry attempts for a failing batch | +| `retry_delay` | string | no | `"1s"` | Delay between retries (humantime) | +| `jvm_options` | array | no | `[]` | Extra JVM flags, e.g.
`["-Xmx512m"]` | +| `enable_connection_pool` | bool | no | `false` | Use a HikariCP pool | +| `max_pool_size` | int | no | `10` | Max pool size (pool mode) | +| `min_idle` | int | no | `2` | Min idle connections (pool mode) | +| `connection_timeout_ms` | int | no | `30000` | Connection timeout (pool mode) | + +### Example (PostgreSQL) + +```toml +type = "sink" +key = "jdbc_sink_pg" +enabled = true +name = "JDBC PostgreSQL Sink" +path = "target/release/libiggy_connector_jdbc_sink" +plugin_config_format = "toml" + +[[streams]] +stream = "test" +topic = "events" +schema = "json" + +[plugin_config] +jdbc_url = "jdbc:postgresql://localhost:5432/mydb" +driver_class = "org.postgresql.Driver" +driver_jar_path = "/tmp/jdbc-drivers/postgresql-42.7.1.jar" +username = "postgres" +password = "postgres" +target_table = "iggy_events" +auto_create_table = true +payload_format = "json" +batch_size = 100 +``` + +See [`example_config/connectors`](../../runtime/example_config/connectors) for +ready-to-use PostgreSQL and MySQL configurations. + +## Security notes + +- Never commit credentials. Prefer environment-variable overrides + (`IGGY_CONNECTORS_SINK__PLUGIN_CONFIG_*`) or embed credentials in a + secret-managed `jdbc_url`. +- `jdbc_url` and `password` are wrapped in `SecretString`; the connector masks + passwords in all log output and never serializes secrets back out. + +## Building + +```bash +cargo build --release -p iggy_connector_jdbc_sink +# produces target/release/libiggy_connector_jdbc_sink.{so,dylib,dll} +``` + +The connector requires a JVM (JDK 8+) at runtime. Point `driver_jar_path` at the +JDBC driver for your database; for pooled mode the JAR (or classpath) must also +provide HikariCP. + +## Runtime notes & limitations + +- **Embedded JVM, one per process.** JNI permits a single `JavaVM` per OS + process. All JDBC *sink* instances in the connectors runtime share one JVM + (the first instance's `jvm_options`/classpath win). 
A JDBC source and a JDBC + sink are separate shared libraries and **cannot both create a JVM in the same + runtime process** — run them in separate connectors-runtime processes. +- **Blocking I/O.** JDBC calls go through JNI and are synchronous; each + `consume()` runs blocking work on the runtime worker thread (the same model as + the JDBC source). Size the runtime / batch sizes accordingly. +- **Error handling.** Transient failures (SQLState class `08`/`40`/`53`/`57`/`58` + — connectivity, deadlock, resource/operator) are retried up to `max_retries` + and then surfaced so a restart can re-process the batch. Permanent failures + (constraint/syntax/data errors, e.g. SQLState `22`/`23`/`42`) are logged, + counted, and the offending batch is **skipped** so one poison batch does not + permanently halt the sink. diff --git a/core/connectors/sinks/jdbc_sink/config.toml b/core/connectors/sinks/jdbc_sink/config.toml new file mode 100644 index 0000000000..2bc087626b --- /dev/null +++ b/core/connectors/sinks/jdbc_sink/config.toml @@ -0,0 +1,47 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +type = "sink" +key = "jdbc" +enabled = true +version = 0 +name = "JDBC sink" +path = "../../target/release/libiggy_connector_jdbc_sink" +verbose = false + +[[streams]] +stream = "user_events" +topics = ["users", "orders"] +schema = "json" +batch_length = 100 +poll_interval = "5ms" +consumer_group = "jdbc_sink" + +[plugin_config] +jdbc_url = "jdbc:postgresql://localhost:5432/database" +driver_class = "org.postgresql.Driver" +driver_jar_path = "/tmp/jdbc-drivers/postgresql-42.7.1.jar" +username = "postgres" +password = "postgres" +target_table = "iggy_messages" +batch_size = 100 +auto_create_table = true +include_metadata = true +include_checksum = true +include_origin_timestamp = true +payload_format = "json" +enable_connection_pool = false diff --git a/core/connectors/sinks/jdbc_sink/src/lib.rs b/core/connectors/sinks/jdbc_sink/src/lib.rs new file mode 100644 index 0000000000..6a1604f28b --- /dev/null +++ b/core/connectors/sinks/jdbc_sink/src/lib.rs @@ -0,0 +1,1595 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Generic JDBC sink connector for Iggy. +//! +//! Consumes messages from Iggy topics and writes them, in batches, into any +//! 
JDBC-compliant database (MySQL, PostgreSQL, Oracle, SQL Server, H2, ...). +//! +//! The connector talks to the database through an embedded JVM via JNI, using +//! the standard `java.sql` API and (optionally) a HikariCP connection pool, +//! mirroring the JDBC *source* connector so a single JDBC driver JAR works for +//! both directions. +//! +//! Write semantics are **INSERT-only** (matching the other Iggy sink +//! connectors): each message becomes one row. The payload is stored in a +//! single column (`text`/`json`/`bytes`), alongside optional Iggy metadata +//! columns. + +use async_trait::async_trait; +use iggy_connector_sdk::{ + ConsumedMessage, Error, MessagesMetadata, Sink, TopicMetadata, sink_connector, +}; +use jni::objects::{GlobalRef, JObject, JString, JThrowable, JValue}; +use jni::{JNIEnv, JavaVM}; +use regex::Regex; +use secrecy::{ExposeSecret, SecretString}; +use serde::{Deserialize, Serialize}; +use std::sync::{Arc, Mutex}; +use std::time::Duration; +use tracing::{debug, error, info, warn}; + +const CONNECTOR_NAME: &str = "JDBC sink"; +const DEFAULT_BATCH_SIZE: u32 = 100; +const DEFAULT_MAX_RETRIES: u32 = 3; +const DEFAULT_RETRY_DELAY: &str = "1s"; +const DEFAULT_PAYLOAD_COLUMN: &str = "payload"; + +mod secret_string_serde { + use secrecy::SecretString; + use serde::{Deserialize, Deserializer, Serialize, Serializer}; + pub fn deserialize<'de, D: Deserializer<'de>>(d: D) -> Result { + let s = String::deserialize(d)?; + Ok(SecretString::from(s)) + } + #[allow(unused_variables)] + pub fn serialize(val: &SecretString, s: S) -> Result { + "".serialize(s) + } +} + +mod opt_secret_string_serde { + use secrecy::SecretString; + use serde::{Deserialize, Deserializer, Serialize, Serializer}; + pub fn deserialize<'de, D: Deserializer<'de>>(d: D) -> Result, D::Error> { + let s: Option = Option::deserialize(d)?; + Ok(s.map(SecretString::from)) + } + #[allow(unused_variables)] + pub fn serialize(val: &Option, s: S) -> Result { + Option::::None.serialize(s) + } 
+} + +/// Cached compiled regex patterns for password sanitization in log output. +static RE_USER_PASS_AT: std::sync::LazyLock = + std::sync::LazyLock::new(|| Regex::new(r"://([^:]+):([^@?;/]+)@").unwrap()); +static RE_PASSWORD_PARAM: std::sync::LazyLock = + std::sync::LazyLock::new(|| Regex::new(r"(?i)(password|pwd|pass)=([^;&\s]+)").unwrap()); +static RE_ORACLE_PASS: std::sync::LazyLock = + std::sync::LazyLock::new(|| Regex::new(r"thin:([^/]+)/([^@]+)@").unwrap()); + +/// How the message payload is written into the destination `payload` column. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)] +pub enum PayloadFormat { + /// Store the raw payload bytes as a UTF-8 string (default). + #[default] + Text, + /// Validate the payload as JSON, then store the JSON text as a string. + Json, + /// Store the raw payload bytes as a binary column. + Bytes, +} + +impl PayloadFormat { + fn from_config(s: Option<&str>) -> Self { + match s.map(|s| s.to_lowercase()).as_deref() { + Some("json") | Some("jsonb") => PayloadFormat::Json, + Some("bytes") | Some("binary") | Some("bytea") => PayloadFormat::Bytes, + _ => PayloadFormat::Text, + } + } + + /// Portable-ish DDL type used by `auto_create_table`. Production users are + /// encouraged to pre-create tables with database-native types instead. + fn ddl_type(&self) -> &'static str { + match self { + PayloadFormat::Text | PayloadFormat::Json => "TEXT", + PayloadFormat::Bytes => "VARBINARY(65535)", + } + } +} + +/// Configuration for the JDBC sink connector. +#[derive(Clone, Deserialize, Serialize)] +pub struct JdbcSinkConfig { + /// JDBC connection URL (e.g. "jdbc:mysql://localhost:3306/mydb"). + #[serde(with = "secret_string_serde")] + pub jdbc_url: SecretString, + + /// JDBC driver class name (e.g. "com.mysql.cj.jdbc.Driver"). + pub driver_class: String, + + /// Path to the JDBC driver JAR file. + pub driver_jar_path: String, + + /// Database username (optional if embedded in `jdbc_url`). 
+ #[serde(default)] + pub username: Option, + + /// Database password (optional if embedded in `jdbc_url`); masked in logs. + #[serde(default, with = "opt_secret_string_serde")] + pub password: Option, + + /// Target table to insert rows into. + pub target_table: String, + + /// Max messages per INSERT batch (default 100). + #[serde(default)] + pub batch_size: Option, + + /// Create the target table on `open` if it does not exist (default false). + #[serde(default)] + pub auto_create_table: Option, + + /// Include Iggy metadata columns: offset, timestamp, stream, topic, + /// partition id (default true). + #[serde(default)] + pub include_metadata: Option, + + /// Include the `iggy_checksum` column (default true). + #[serde(default)] + pub include_checksum: Option, + + /// Include the `iggy_origin_timestamp` column (default true). + #[serde(default)] + pub include_origin_timestamp: Option, + + /// Payload column format: "text" (default), "json", or "bytes". + #[serde(default)] + pub payload_format: Option, + + /// Name of the payload column (default "payload"). + #[serde(default)] + pub payload_column: Option, + + /// Log at INFO instead of DEBUG for per-batch progress (default false). + #[serde(default)] + pub verbose_logging: Option, + + /// Max retry attempts for a failing batch (default 3). + #[serde(default)] + pub max_retries: Option, + + /// Delay between retries, e.g. "1s", "500ms" (default "1s"). + #[serde(default)] + pub retry_delay: Option, + + /// Extra JVM options (e.g. ["-Xmx512m"]). + #[serde(default)] + pub jvm_options: Vec, + + /// Enable HikariCP connection pooling (default false). + #[serde(default)] + pub enable_connection_pool: bool, + + /// Maximum pool size (default 10). + #[serde(default = "default_pool_size")] + pub max_pool_size: u32, + + /// Minimum idle connections (default 2). + #[serde(default = "default_min_idle")] + pub min_idle: u32, + + /// Connection timeout in milliseconds (default 30000). 
+ #[serde(default = "default_connection_timeout")] + pub connection_timeout_ms: u64, +} + +fn default_pool_size() -> u32 { + 10 +} + +fn default_min_idle() -> u32 { + 2 +} + +fn default_connection_timeout() -> u64 { + 30000 +} + +impl std::fmt::Debug for JdbcSinkConfig { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("JdbcSinkConfig") + .field( + "jdbc_url", + &sanitize_jdbc_url(self.jdbc_url.expose_secret()), + ) + .field("driver_class", &self.driver_class) + .field("driver_jar_path", &self.driver_jar_path) + .field("username", &self.username) + .field("password", &self.password.as_ref().map(|_| "***")) + .field("target_table", &self.target_table) + .field("batch_size", &self.batch_size) + .field("auto_create_table", &self.auto_create_table) + .field("include_metadata", &self.include_metadata) + .field("include_checksum", &self.include_checksum) + .field("include_origin_timestamp", &self.include_origin_timestamp) + .field("payload_format", &self.payload_format) + .field("payload_column", &self.payload_column) + .field("enable_connection_pool", &self.enable_connection_pool) + .field("max_pool_size", &self.max_pool_size) + .field("min_idle", &self.min_idle) + .field("connection_timeout_ms", &self.connection_timeout_ms) + .finish() + } +} + +/// Runtime counters for observability. +#[derive(Debug, Default)] +struct State { + messages_processed: u64, + insertion_errors: u64, +} + +/// JDBC sink connector. +#[derive(Debug)] +pub struct JdbcSink { + id: u32, + config: JdbcSinkConfig, + jvm: Option>, + connection: Option, + connection_pool: Option, // HikariDataSource if pooling enabled + state: Mutex, + verbose: bool, + retry_delay: Duration, +} + +/// Sanitize a JDBC URL by masking passwords for logging. 
+fn sanitize_jdbc_url(url: &str) -> String { + let url = RE_USER_PASS_AT.replace_all(url, "://$1:***@"); + let url = RE_PASSWORD_PARAM.replace_all(&url, "$1=***"); + let url = RE_ORACLE_PASS.replace_all(&url, "thin:$1/***@"); + url.to_string() +} + +/// Quote a SQL identifier (table/column name) with double quotes, escaping any +/// embedded quotes, and reject names containing null bytes. Prevents SQL +/// injection through configured table/column names. +fn quote_identifier(name: &str) -> Result { + if name.is_empty() { + return Err(Error::InvalidConfigValue( + "Identifier cannot be empty".to_string(), + )); + } + if name.contains('\0') { + return Err(Error::InvalidConfigValue( + "Identifier cannot contain null characters".to_string(), + )); + } + let escaped = name.replace('"', "\"\""); + Ok(format!("\"{escaped}\"")) +} + +impl JdbcSink { + /// Construct a new JDBC sink. Invoked by the `sink_connector!` macro with + /// the deserialized plugin config. + pub fn new(id: u32, config: JdbcSinkConfig) -> Self { + let verbose = config.verbose_logging.unwrap_or(false); + let delay_str = config.retry_delay.as_deref().unwrap_or(DEFAULT_RETRY_DELAY); + let retry_delay = humantime::parse_duration(delay_str).unwrap_or(Duration::from_secs(1)); + Self { + id, + config, + jvm: None, + connection: None, + connection_pool: None, + state: Mutex::new(State::default()), + verbose, + retry_delay, + } + } + + fn batch_size(&self) -> usize { + self.config.batch_size.unwrap_or(DEFAULT_BATCH_SIZE).max(1) as usize + } + + fn include_metadata(&self) -> bool { + self.config.include_metadata.unwrap_or(true) + } + + fn include_checksum(&self) -> bool { + self.config.include_checksum.unwrap_or(true) + } + + fn include_origin_timestamp(&self) -> bool { + self.config.include_origin_timestamp.unwrap_or(true) + } + + fn payload_format(&self) -> PayloadFormat { + PayloadFormat::from_config(self.config.payload_format.as_deref()) + } + + fn payload_column(&self) -> &str { + self.config + 
.payload_column + .as_deref() + .unwrap_or(DEFAULT_PAYLOAD_COLUMN) + } + + fn max_retries(&self) -> u32 { + self.config + .max_retries + .unwrap_or(DEFAULT_MAX_RETRIES) + .max(1) + } + + /// Obtain the process-wide JVM, creating it on first use. JNI permits only a + /// single JVM per OS process, so this is shared across all JDBC sink + /// instances (see [`get_or_create_jvm`]). + fn initialize_jvm(&mut self) -> Result<(), Error> { + info!("Initializing JVM for JDBC sink connector [{}]", self.id); + let jvm = get_or_create_jvm(&self.config.driver_jar_path, &self.config.jvm_options)?; + self.jvm = Some(jvm); + Ok(()) + } + + /// Load the JDBC driver and create a direct connection or a HikariCP pool. + fn create_connection(&mut self) -> Result<(), Error> { + let jvm = self + .jvm + .as_ref() + .ok_or_else(|| Error::InitError("JVM not initialized".to_string()))?; + + let mut env = jvm + .attach_current_thread() + .map_err(|e| Error::InitError(format!("Failed to attach thread to JVM: {e}")))?; + + info!( + "Loading JDBC driver via Class.forName: {}", + self.config.driver_class + ); + + let class_class = env + .find_class("java/lang/Class") + .map_err(|e| Error::InitError(format!("Failed to find Class: {e}")))?; + let driver_class_name = env + .new_string(&self.config.driver_class) + .map_err(|e| Error::InitError(format!("Failed to create class name string: {e}")))?; + env.call_static_method( + class_class, + "forName", + "(Ljava/lang/String;)Ljava/lang/Class;", + &[JValue::Object(&driver_class_name.into())], + ) + .map_err(|e| { + Error::InitError(format!( + "Failed to load driver class '{}': {e}", + self.config.driver_class + )) + })?; + + info!("JDBC driver loaded and registered successfully"); + + if self.config.enable_connection_pool { + info!( + "Setting up HikariCP connection pool to: {}", + sanitize_jdbc_url(self.config.jdbc_url.expose_secret()) + ); + let pool = self.create_connection_pool_internal(&mut env)?; + self.connection_pool = Some(pool); + } else { + 
info!( + "Creating direct JDBC connection to: {}", + sanitize_jdbc_url(self.config.jdbc_url.expose_secret()) + ); + let conn = self.create_direct_connection_internal(&mut env)?; + self.connection = Some(conn); + } + + Ok(()) + } + + /// Create a direct JDBC connection via DriverManager. + fn create_direct_connection_internal(&self, env: &mut JNIEnv) -> Result { + // Set the thread context class loader to the driver's loader so + // DriverManager can locate the driver loaded from the JAR. + let thread_class = env + .find_class("java/lang/Thread") + .map_err(|e| Error::InitError(format!("Failed to find Thread class: {e}")))?; + let current_thread = env + .call_static_method(thread_class, "currentThread", "()Ljava/lang/Thread;", &[]) + .map_err(|e| Error::InitError(format!("Failed to get current thread: {e}")))? + .l() + .map_err(|e| Error::InitError(format!("Failed to extract thread object: {e}")))?; + + let driver_class = env + .find_class(self.config.driver_class.replace('.', "/")) + .map_err(|e| Error::InitError(format!("Failed to find driver class: {e}")))?; + let driver_class_loader = env + .call_method( + &driver_class, + "getClassLoader", + "()Ljava/lang/ClassLoader;", + &[], + ) + .map_err(|e| Error::InitError(format!("Failed to get driver class loader: {e}")))? 
+ .l() + .map_err(|e| Error::InitError(format!("Failed to extract class loader: {e}")))?; + env.call_method( + &current_thread, + "setContextClassLoader", + "(Ljava/lang/ClassLoader;)V", + &[JValue::Object(&driver_class_loader)], + ) + .map_err(|e| Error::InitError(format!("Failed to set context class loader: {e}")))?; + + let driver_manager = env + .find_class("java/sql/DriverManager") + .map_err(|e| Error::InitError(format!("Failed to find DriverManager: {e}")))?; + let jdbc_url = env + .new_string(self.config.jdbc_url.expose_secret()) + .map_err(|e| Error::InitError(format!("Failed to create JDBC URL string: {e}")))?; + + let connection = if let (Some(username), Some(password)) = + (&self.config.username, &self.config.password) + { + let username_jstring = env + .new_string(username) + .map_err(|e| Error::InitError(format!("Failed to create username string: {e}")))?; + let password_jstring = env + .new_string(password.expose_secret()) + .map_err(|e| Error::InitError(format!("Failed to create password string: {e}")))?; + env.call_static_method( + driver_manager, + "getConnection", + "(Ljava/lang/String;Ljava/lang/String;Ljava/lang/String;)Ljava/sql/Connection;", + &[ + JValue::Object(&jdbc_url.into()), + JValue::Object(&username_jstring.into()), + JValue::Object(&password_jstring.into()), + ], + ) + .map_err(|e| { + Error::InitError(format!( + "Failed to create JDBC connection with credentials: {e}" + )) + })? + } else { + env.call_static_method( + driver_manager, + "getConnection", + "(Ljava/lang/String;)Ljava/sql/Connection;", + &[JValue::Object(&jdbc_url.into())], + ) + .map_err(|e| { + Error::InitError(format!("Failed to create JDBC connection from URL: {e}")) + })? 
+ }; + + let connection_obj = connection + .l() + .map_err(|e| Error::InitError(format!("Failed to get connection object: {e}")))?; + let global_ref = env + .new_global_ref(connection_obj) + .map_err(|e| Error::InitError(format!("Failed to create global reference: {e}")))?; + + info!("Direct database connection established successfully"); + Ok(global_ref) + } + + /// Create a HikariCP connection pool. + fn create_connection_pool_internal(&self, env: &mut JNIEnv) -> Result { + info!( + "Initializing HikariCP with max_pool_size={}, min_idle={}", + self.config.max_pool_size, self.config.min_idle + ); + + let hikari_config_class = env.find_class("com/zaxxer/hikari/HikariConfig").map_err( + |e| { + Error::InitError(format!( + "Failed to find HikariConfig class. Ensure HikariCP JAR is in classpath: {e}" + )) + }, + )?; + let hikari_config = env + .new_object(hikari_config_class, "()V", &[]) + .map_err(|e| Error::InitError(format!("Failed to create HikariConfig: {e}")))?; + + let jdbc_url_jstring = env + .new_string(self.config.jdbc_url.expose_secret()) + .map_err(|e| Error::InitError(format!("Failed to create JDBC URL: {e}")))?; + env.call_method( + &hikari_config, + "setJdbcUrl", + "(Ljava/lang/String;)V", + &[JValue::Object(&jdbc_url_jstring.into())], + ) + .map_err(|e| Error::InitError(format!("Failed to set JDBC URL: {e}")))?; + + if let Some(username) = &self.config.username { + let username_jstring = env + .new_string(username) + .map_err(|e| Error::InitError(format!("Failed to create username: {e}")))?; + env.call_method( + &hikari_config, + "setUsername", + "(Ljava/lang/String;)V", + &[JValue::Object(&username_jstring.into())], + ) + .map_err(|e| Error::InitError(format!("Failed to set username: {e}")))?; + } + + if let Some(password) = &self.config.password { + let password_jstring = env + .new_string(password.expose_secret()) + .map_err(|e| Error::InitError(format!("Failed to create password: {e}")))?; + env.call_method( + &hikari_config, + "setPassword", + 
"(Ljava/lang/String;)V", + &[JValue::Object(&password_jstring.into())], + ) + .map_err(|e| Error::InitError(format!("Failed to set password: {e}")))?; + } + + let driver_class_jstring = env + .new_string(&self.config.driver_class) + .map_err(|e| Error::InitError(format!("Failed to create driver class name: {e}")))?; + env.call_method( + &hikari_config, + "setDriverClassName", + "(Ljava/lang/String;)V", + &[JValue::Object(&driver_class_jstring.into())], + ) + .map_err(|e| Error::InitError(format!("Failed to set driver class: {e}")))?; + + env.call_method( + &hikari_config, + "setMaximumPoolSize", + "(I)V", + &[JValue::Int( + self.config.max_pool_size.min(i32::MAX as u32) as i32 + )], + ) + .map_err(|e| Error::InitError(format!("Failed to set max pool size: {e}")))?; + env.call_method( + &hikari_config, + "setMinimumIdle", + "(I)V", + &[JValue::Int(self.config.min_idle.min(i32::MAX as u32) as i32)], + ) + .map_err(|e| Error::InitError(format!("Failed to set min idle: {e}")))?; + env.call_method( + &hikari_config, + "setConnectionTimeout", + "(J)V", + &[JValue::Long(self.config.connection_timeout_ms as i64)], + ) + .map_err(|e| Error::InitError(format!("Failed to set connection timeout: {e}")))?; + + let hikari_datasource_class = env + .find_class("com/zaxxer/hikari/HikariDataSource") + .map_err(|e| Error::InitError(format!("Failed to find HikariDataSource class: {e}")))?; + let datasource = env + .new_object( + hikari_datasource_class, + "(Lcom/zaxxer/hikari/HikariConfig;)V", + &[JValue::Object(&hikari_config)], + ) + .map_err(|e| Error::InitError(format!("Failed to create HikariDataSource: {e}")))?; + let global_ref = env.new_global_ref(datasource).map_err(|e| { + Error::InitError(format!("Failed to create global reference for pool: {e}")) + })?; + + info!("HikariCP connection pool created successfully"); + Ok(global_ref) + } + + /// Acquire a connection. 
Returns the connection object and whether it was + /// borrowed from the pool (and therefore must be closed after use). + fn acquire_connection<'local>( + &self, + env: &mut JNIEnv<'local>, + ) -> Result<(JObject<'local>, bool), Error> { + if let Some(pool) = &self.connection_pool { + let connection = env + .call_method( + pool.as_obj(), + "getConnection", + "()Ljava/sql/Connection;", + &[], + ) + .map_err(|e| Error::Connection(format!("Failed to get connection from pool: {e}")))? + .l() + .map_err(|e| { + Error::Connection(format!("Failed to extract connection object: {e}")) + })?; + Ok((connection, true)) + } else if let Some(conn) = &self.connection { + let local_ref = env + .new_local_ref(conn.as_obj()) + .map_err(|e| Error::Connection(format!("Failed to create local ref: {e}")))?; + Ok((local_ref, false)) + } else { + Err(Error::Connection( + "No connection or pool available".to_string(), + )) + } + } + + /// Create the target table if `auto_create_table` is enabled. Uses + /// portable-ish ANSI types; production deployments should pre-create the + /// table with database-native types. + fn ensure_table_exists(&self, env: &mut JNIEnv) -> Result<(), Error> { + if !self.config.auto_create_table.unwrap_or(false) { + return Ok(()); + } + + let sql = self.build_create_table_sql()?; + info!("Ensuring target table exists: {sql}"); + + let (connection, pooled) = self.acquire_connection(env)?; + let result = self.execute_update(env, &connection, &sql); + if pooled { + let _ = env.call_method(&connection, "close", "()V", &[]); + } + result.map(|_| ()).map_err(|e| { + Error::InitError(format!( + "Failed to create table '{}': {e}", + self.config.target_table + )) + }) + } + + /// Validate connectivity at open time with a trivial `SELECT 1`, so a bad + /// URL/credentials fails fast on open rather than on the first batch. 
+ fn smoke_test(&self, env: &mut JNIEnv) -> Result<(), Error> { + let (connection, pooled) = self.acquire_connection(env)?; + let result = self.execute_update(env, &connection, "SELECT 1"); + if pooled { + let _ = env.call_method(&connection, "close", "()V", &[]); + } + result.map(|_| ()).map_err(|e| { + Error::InitError(format!("Database connectivity test (SELECT 1) failed: {e}")) + }) + } + + /// Build the `CREATE TABLE IF NOT EXISTS` statement for the configured + /// column layout. + fn build_create_table_sql(&self) -> Result { + let quoted_table = quote_identifier(&self.config.target_table)?; + let quoted_payload = quote_identifier(self.payload_column())?; + + let mut sql = format!("CREATE TABLE IF NOT EXISTS {quoted_table} ("); + sql.push_str("id VARCHAR(40)"); + + if self.include_metadata() { + sql.push_str(", iggy_offset BIGINT"); + sql.push_str(", iggy_timestamp BIGINT"); + sql.push_str(", iggy_stream VARCHAR(255)"); + sql.push_str(", iggy_topic VARCHAR(255)"); + sql.push_str(", iggy_partition_id INTEGER"); + } + if self.include_checksum() { + sql.push_str(", iggy_checksum BIGINT"); + } + if self.include_origin_timestamp() { + sql.push_str(", iggy_origin_timestamp BIGINT"); + } + + sql.push_str(&format!( + ", {quoted_payload} {}", + self.payload_format().ddl_type() + )); + sql.push(')'); + Ok(sql) + } + + /// Build a parameterized single-row INSERT statement (one `?` placeholder + /// per bound column) and report how many parameters each row binds. 
+ fn build_insert_sql(&self) -> Result<(String, u32), Error> { + let quoted_table = quote_identifier(&self.config.target_table)?; + let quoted_payload = quote_identifier(self.payload_column())?; + + let mut columns = vec!["id".to_string()]; + if self.include_metadata() { + columns.push("iggy_offset".to_string()); + columns.push("iggy_timestamp".to_string()); + columns.push("iggy_stream".to_string()); + columns.push("iggy_topic".to_string()); + columns.push("iggy_partition_id".to_string()); + } + if self.include_checksum() { + columns.push("iggy_checksum".to_string()); + } + if self.include_origin_timestamp() { + columns.push("iggy_origin_timestamp".to_string()); + } + columns.push(quoted_payload); + + let params_per_row = columns.len() as u32; + let placeholders = vec!["?"; columns.len()].join(", "); + let sql = format!( + "INSERT INTO {quoted_table} ({}) VALUES ({placeholders})", + columns.join(", ") + ); + Ok((sql, params_per_row)) + } + + /// Write all messages, chunked into batches. Errors on a batch are counted + /// and logged; the call only returns `Err` when a batch ultimately fails so + /// the runtime can surface the failure. + fn write_messages( + &self, + env: &mut JNIEnv, + topic_metadata: &TopicMetadata, + messages_metadata: &MessagesMetadata, + messages: &[ConsumedMessage], + ) -> Result<(), Error> { + if messages.is_empty() { + return Ok(()); + } + + let (sql, _params_per_row) = self.build_insert_sql()?; + let (connection, pooled) = self.acquire_connection(env)?; + + // A transient failure (connectivity, deadlock) is surfaced so the + // runtime stops the sink and a restart retries the batch. A permanent + // failure (bad data, constraint/syntax error) is counted and SKIPPED so + // one poison batch does not permanently halt ingestion. 
+ let mut first_transient_error: Option = None; + let table = &self.config.target_table; + for batch in messages.chunks(self.batch_size()) { + if let Err((e, is_transient)) = self.insert_batch_with_retry( + env, + &connection, + &sql, + batch, + topic_metadata, + messages_metadata, + ) { + let mut state = self.state.lock().expect("state mutex poisoned"); + state.insertion_errors += batch.len() as u64; + if is_transient { + error!("Transient failure inserting batch into '{table}': {e}"); + if first_transient_error.is_none() { + first_transient_error = Some(e); + } + } else { + error!( + "Permanent failure inserting batch into '{table}'; skipping {} message(s): {e}", + batch.len() + ); + } + } + } + + if pooled { + let _ = env.call_method(&connection, "close", "()V", &[]); + } + + if let Some(e) = first_transient_error { + return Err(e); + } + + let msg_count = messages.len(); + { + let mut state = self.state.lock().expect("state mutex poisoned"); + state.messages_processed += msg_count as u64; + } + let table = &self.config.target_table; + if self.verbose { + info!( + "JDBC sink [{}] wrote {msg_count} messages to '{table}'", + self.id + ); + } else { + debug!( + "JDBC sink [{}] wrote {msg_count} messages to '{table}'", + self.id + ); + } + Ok(()) + } + + /// Execute one batch insert, retrying on transient failures. Returns + /// `(error, is_transient)` so the caller can decide whether to surface + /// (transient → restart retries) or skip (permanent data error). 
+ fn insert_batch_with_retry( + &self, + env: &mut JNIEnv, + connection: &JObject, + sql: &str, + messages: &[ConsumedMessage], + topic_metadata: &TopicMetadata, + messages_metadata: &MessagesMetadata, + ) -> Result<(), (Error, bool)> { + let max_retries = self.max_retries(); + let mut attempts = 0u32; + loop { + attempts += 1; + match self.insert_batch( + env, + connection, + sql, + messages, + topic_metadata, + messages_metadata, + ) { + Ok(()) => return Ok(()), + Err((e, is_transient)) => { + if !is_transient || attempts >= max_retries { + return Err((e, is_transient)); + } + warn!( + "Transient error inserting batch (attempt {attempts}/{max_retries}): {e}. Retrying..." + ); + std::thread::sleep(self.retry_delay * attempts); + } + } + } + } + + /// Prepare the statement, bind every message as a batch row, and execute. + /// Returns `(error, is_transient)` on failure so the caller can decide + /// whether to retry. + fn insert_batch( + &self, + env: &mut JNIEnv, + connection: &JObject, + sql: &str, + messages: &[ConsumedMessage], + topic_metadata: &TopicMetadata, + messages_metadata: &MessagesMetadata, + ) -> Result<(), (Error, bool)> { + let sql_jstring = env.new_string(sql).map_err(|e| { + ( + Error::Connection(format!("Failed to create SQL string: {e}")), + false, + ) + })?; + let statement = match env + .call_method( + connection, + "prepareStatement", + "(Ljava/lang/String;)Ljava/sql/PreparedStatement;", + &[JValue::Object(&sql_jstring.into())], + ) + .and_then(|v| v.l()) + { + Ok(s) => s, + Err(_) => return Err(classify_jni_failure(env, "prepare statement")), + }; + + let format = self.payload_format(); + for message in messages { + // Bind + addBatch each message inside its own JNI local-reference + // frame: the bound parameter refs (id/stream/topic strings, payload + // string/byte[]) are reclaimed every iteration. addBatch copies the + // parameters into the batch, so nothing needs to escape the frame. 
+ // Without this a large batch would overflow the local reference + // table and abort the JVM. + if env.push_local_frame(32).is_err() { + let _ = env.call_method(&statement, "close", "()V", &[]); + return Err(( + Error::CannotStoreData("Failed to push JNI local frame".to_string()), + true, + )); + } + let bind = self.bind_row( + env, + &statement, + message, + topic_metadata, + messages_metadata, + format, + ); + let add_failed = + bind.is_ok() && env.call_method(&statement, "addBatch", "()V", &[]).is_err(); + // SAFETY: addBatch already copied the parameter values into the + // batch; no JNI local reference from this iteration needs to escape. + let _ = unsafe { env.pop_local_frame(&JObject::null()) }; + + if let Err(e) = bind { + let _ = env.call_method(&statement, "close", "()V", &[]); + // Binding errors (bad UTF-8/JSON) are data problems, permanent. + return Err((e, false)); + } + if add_failed { + let err = classify_jni_failure(env, "add batch"); + let _ = env.call_method(&statement, "close", "()V", &[]); + return Err(err); + } + } + + let failed = env + .call_method(&statement, "executeBatch", "()[I", &[]) + .is_err(); + let result = if failed { + Err(classify_jni_failure(env, "execute batch")) + } else { + Ok(()) + }; + let _ = env.call_method(&statement, "close", "()V", &[]); + result + } + + /// Bind a single message's columns onto the prepared statement (1-indexed). 
+ fn bind_row( + &self, + env: &mut JNIEnv, + statement: &JObject, + message: &ConsumedMessage, + topic_metadata: &TopicMetadata, + messages_metadata: &MessagesMetadata, + format: PayloadFormat, + ) -> Result<(), Error> { + let mut idx: i32 = 1; + + set_string(env, statement, idx, &message.id.to_string())?; + idx += 1; + + if self.include_metadata() { + set_long(env, statement, idx, message.offset as i64)?; + idx += 1; + set_long(env, statement, idx, message.timestamp as i64)?; + idx += 1; + set_string(env, statement, idx, &topic_metadata.stream)?; + idx += 1; + set_string(env, statement, idx, &topic_metadata.topic)?; + idx += 1; + set_int(env, statement, idx, messages_metadata.partition_id as i32)?; + idx += 1; + } + if self.include_checksum() { + set_long(env, statement, idx, message.checksum as i64)?; + idx += 1; + } + if self.include_origin_timestamp() { + set_long(env, statement, idx, message.origin_timestamp as i64)?; + idx += 1; + } + + let payload_bytes = message + .payload + .try_to_bytes() + .map_err(|e| Error::Serialization(format!("Failed to read payload bytes: {e}")))?; + + match format { + PayloadFormat::Text => { + // Tolerant of binary schemas (Raw/Avro/FlatBuffer): valid UTF-8 is + // stored verbatim, otherwise the bytes are base64-encoded so a + // binary payload is preserved rather than rejected. + let text = payload_text_repr(payload_bytes); + set_string(env, statement, idx, &text)?; + } + PayloadFormat::Json => { + // Validate it parses as JSON, then store the (compact) text. + let value: serde_json::Value = + serde_json::from_slice(&payload_bytes).map_err(|e| { + Error::InvalidRecordValue(format!("Payload is not valid JSON: {e}")) + })?; + set_string(env, statement, idx, &value.to_string())?; + } + PayloadFormat::Bytes => { + set_bytes(env, statement, idx, &payload_bytes)?; + } + } + Ok(()) + } + + /// Execute a non-query statement (DDL) via `Statement.execute`. 
+ fn execute_update( + &self, + env: &mut JNIEnv, + connection: &JObject, + sql: &str, + ) -> Result<(), Error> { + let statement = env + .call_method(connection, "createStatement", "()Ljava/sql/Statement;", &[]) + .and_then(|v| v.l()) + .map_err(|e| Error::Connection(format!("Failed to create statement: {e}")))?; + let sql_jstring = env + .new_string(sql) + .map_err(|e| Error::Connection(format!("Failed to create SQL string: {e}")))?; + let result = env + .call_method( + &statement, + "execute", + "(Ljava/lang/String;)Z", + &[JValue::Object(&sql_jstring.into())], + ) + .map(|_| ()) + .map_err(|e| Error::CannotStoreData(format!("Failed to execute statement: {e}"))); + let _ = env.call_method(&statement, "close", "()V", &[]); + result + } +} + +fn set_string(env: &mut JNIEnv, statement: &JObject, idx: i32, value: &str) -> Result<(), Error> { + let jstr = env + .new_string(value) + .map_err(|e| Error::Connection(format!("Failed to create string param: {e}")))?; + env.call_method( + statement, + "setString", + "(ILjava/lang/String;)V", + &[JValue::Int(idx), JValue::Object(&jstr.into())], + ) + .map(|_| ()) + .map_err(|e| Error::CannotStoreData(format!("Failed to set string param {idx}: {e}"))) +} + +fn set_long(env: &mut JNIEnv, statement: &JObject, idx: i32, value: i64) -> Result<(), Error> { + env.call_method( + statement, + "setLong", + "(IJ)V", + &[JValue::Int(idx), JValue::Long(value)], + ) + .map(|_| ()) + .map_err(|e| Error::CannotStoreData(format!("Failed to set long param {idx}: {e}"))) +} + +fn set_int(env: &mut JNIEnv, statement: &JObject, idx: i32, value: i32) -> Result<(), Error> { + env.call_method( + statement, + "setInt", + "(II)V", + &[JValue::Int(idx), JValue::Int(value)], + ) + .map(|_| ()) + .map_err(|e| Error::CannotStoreData(format!("Failed to set int param {idx}: {e}"))) +} + +fn set_bytes(env: &mut JNIEnv, statement: &JObject, idx: i32, value: &[u8]) -> Result<(), Error> { + let byte_array = env + .byte_array_from_slice(value) + .map_err(|e| 
Error::Connection(format!("Failed to create byte array param: {e}")))?; + env.call_method( + statement, + "setBytes", + "(I[B)V", + &[JValue::Int(idx), JValue::Object(&byte_array.into())], + ) + .map(|_| ()) + .map_err(|e| Error::CannotStoreData(format!("Failed to set bytes param {idx}: {e}"))) +} + +/// Process-wide JVM. JNI allows only one `JavaVM` per OS process, so every JDBC +/// sink instance in this dynamic library shares this one. +static GLOBAL_JVM: Mutex>> = Mutex::new(None); + +/// Return the process JVM, creating it on first use within this dynamic library. +/// The first caller's `jvm_options`/classpath win; later callers (e.g. a second +/// JDBC sink) reuse the existing VM instead of failing with `JNI_EEXIST`. +/// +/// Limitation: a JDBC source and a JDBC sink are separate dynamic libraries and +/// do not share this static, so configuring both in the *same* connectors +/// runtime process is not supported. Run them in separate runtime processes. +fn get_or_create_jvm(driver_jar_path: &str, jvm_options: &[String]) -> Result, Error> { + let mut guard = GLOBAL_JVM.lock().expect("jvm mutex poisoned"); + if let Some(jvm) = guard.as_ref() { + info!("Reusing existing process JVM"); + return Ok(jvm.clone()); + } + + let classpath_option = format!("-Djava.class.path={driver_jar_path}"); + let mut args_builder = jni::InitArgsBuilder::new() + .version(jni::JNIVersion::V8) + .option(&classpath_option); + for option in jvm_options { + args_builder = args_builder.option(option); + } + let jvm_args = args_builder + .build() + .map_err(|e| Error::InitError(format!("Failed to build JVM arguments: {e:?}")))?; + let jvm = JavaVM::new(jvm_args) + .map_err(|e| Error::InitError(format!("Failed to create JVM: {e:?}")))?; + + info!("JVM initialized successfully (classpath: {driver_jar_path})"); + let arc = Arc::new(jvm); + *guard = Some(arc.clone()); + Ok(arc) +} + +/// Render payload bytes for a text column: valid UTF-8 is returned verbatim, +/// otherwise the raw bytes 
are base64-encoded so binary payloads (e.g. Avro, +/// FlatBuffer, arbitrary Raw) survive instead of being rejected. +fn payload_text_repr(bytes: Vec) -> String { + match String::from_utf8(bytes) { + Ok(text) => text, + Err(e) => { + use base64::Engine; + base64::engine::general_purpose::STANDARD.encode(e.as_bytes()) + } + } +} + +/// Classify a JDBC `SQLState` (its 2-char class) as transient (worth retrying) +/// vs permanent. `08` = connection exception, `40` = transaction rollback +/// (serialization failure / deadlock), `53` = insufficient resources, `57` = +/// operator intervention (e.g. admin shutdown), `58` = system error. Everything +/// else (`22` data exception, `23` integrity-constraint violation, `42` +/// syntax/access) is permanent (retrying will not help). Unknown/absent state +/// is treated as permanent so bad data does not loop forever. +fn is_transient_sql_state(sql_state: Option<&str>) -> bool { + match sql_state { + Some(s) if s.len() >= 2 => matches!(&s[..2], "08" | "40" | "53" | "57" | "58"), + _ => false, + } +} + +/// Inspect and CLEAR the pending Java exception after a failed JNI call, build a +/// classified `(Error, is_transient)` from its `SQLState` and message. Must be +/// called immediately after the failing call (before any other JNI call), and +/// clearing is required so subsequent JNI calls on this thread are not aborted. +fn classify_jni_failure(env: &mut JNIEnv, action: &str) -> (Error, bool) { + let (sql_state, message) = take_pending_sql_exception(env); + let transient = is_transient_sql_state(sql_state.as_deref()); + let state = sql_state.as_deref().unwrap_or("?"); + let msg = format!("Failed to {action} (SQLState {state}): {message}"); + if transient { + (Error::CannotStoreData(msg), true) + } else { + (Error::InvalidRecordValue(msg), false) + } +} + +/// Take the pending Java exception, clearing it, and return its `SQLState` (if it +/// is a `java.sql.SQLException`) and its message. 
+fn take_pending_sql_exception(env: &mut JNIEnv) -> (Option, String) { + let throwable = match env.exception_occurred() { + Ok(t) if !t.is_null() => t, + _ => return (None, "unknown error".to_string()), + }; + // Clear immediately so subsequent JNI calls on this thread work. + let _ = env.exception_clear(); + + let message = throwable_string_method(env, &throwable, "getMessage") + .unwrap_or_else(|| "unknown error".to_string()); + let sql_state = if env + .is_instance_of(&throwable, "java/sql/SQLException") + .unwrap_or(false) + { + throwable_string_method(env, &throwable, "getSQLState") + } else { + None + }; + (sql_state, message) +} + +/// Call a no-arg `String`-returning method on a throwable, returning None on any +/// JNI error or null result. +fn throwable_string_method( + env: &mut JNIEnv, + throwable: &JThrowable, + method: &str, +) -> Option { + let obj = env + .call_method(throwable, method, "()Ljava/lang/String;", &[]) + .ok()? + .l() + .ok()?; + if obj.is_null() { + return None; + } + env.get_string(&JString::from(obj)).ok().map(|s| s.into()) +} + +#[async_trait] +impl Sink for JdbcSink { + async fn open(&mut self) -> Result<(), Error> { + info!( + "Opening {CONNECTOR_NAME} connector [{}]. 
Target table: {}, URL: {}", + self.id, + self.config.target_table, + sanitize_jdbc_url(self.config.jdbc_url.expose_secret()) + ); + + if self.config.target_table.is_empty() { + return Err(Error::InvalidConfigValue( + "target_table must not be empty".to_string(), + )); + } + + self.initialize_jvm()?; + self.create_connection()?; + + let jvm = self + .jvm + .as_ref() + .ok_or_else(|| Error::InitError("JVM not initialized".to_string()))?; + let mut env = jvm + .attach_current_thread() + .map_err(|e| Error::InitError(format!("Failed to attach thread: {e}")))?; + self.smoke_test(&mut env)?; + self.ensure_table_exists(&mut env)?; + + info!( + "{CONNECTOR_NAME} connector [{}] opened successfully", + self.id + ); + Ok(()) + } + + async fn consume( + &self, + topic_metadata: &TopicMetadata, + messages_metadata: MessagesMetadata, + messages: Vec, + ) -> Result<(), Error> { + let jvm = self + .jvm + .as_ref() + .ok_or_else(|| Error::InitError("JVM not initialized".to_string()))?; + let mut env = jvm + .attach_current_thread() + .map_err(|e| Error::InitError(format!("Failed to attach thread: {e}")))?; + + self.write_messages(&mut env, topic_metadata, &messages_metadata, &messages) + } + + async fn close(&mut self) -> Result<(), Error> { + info!("Closing {CONNECTOR_NAME} connector [{}]", self.id); + + if let Some(jvm) = &self.jvm + && let Ok(mut env) = jvm.attach_current_thread() + { + if let Some(pool) = &self.connection_pool { + let _ = env.call_method(pool.as_obj(), "close", "()V", &[]); + info!("Connection pool closed"); + } + if let Some(connection) = &self.connection { + let _ = env.call_method(connection.as_obj(), "close", "()V", &[]); + info!("Database connection closed"); + } + } + + let state = self.state.lock().expect("state mutex poisoned"); + info!( + "{CONNECTOR_NAME} connector [{}] closed. Processed {} messages with {} errors", + self.id, state.messages_processed, state.insertion_errors + ); + Ok(()) + } +} + +// Export the connector via SDK macro. 
+sink_connector!(JdbcSink); + +#[cfg(test)] +mod tests { + use super::*; + + fn test_config() -> JdbcSinkConfig { + JdbcSinkConfig { + jdbc_url: SecretString::from("jdbc:h2:mem:test"), + driver_class: "org.h2.Driver".to_string(), + driver_jar_path: "/tmp/h2.jar".to_string(), + username: None, + password: None, + target_table: "messages".to_string(), + batch_size: None, + auto_create_table: None, + include_metadata: None, + include_checksum: None, + include_origin_timestamp: None, + payload_format: None, + payload_column: None, + verbose_logging: None, + max_retries: None, + retry_delay: None, + jvm_options: vec![], + enable_connection_pool: false, + max_pool_size: 10, + min_idle: 2, + connection_timeout_ms: 30000, + } + } + + #[test] + fn given_mysql_url_should_mask_password() { + let url = "jdbc:mysql://root:SuperSecret123@localhost:3306/mydb"; + let sanitized = sanitize_jdbc_url(url); + assert_eq!(sanitized, "jdbc:mysql://root:***@localhost:3306/mydb"); + assert!(!sanitized.contains("SuperSecret123")); + } + + #[test] + fn given_query_param_password_should_mask_case_insensitive() { + for url in [ + "jdbc:postgresql://localhost?password=secret", + "jdbc:postgresql://localhost?PASSWORD=secret", + "jdbc:postgresql://localhost?pwd=secret", + ] { + let sanitized = sanitize_jdbc_url(url); + assert!(!sanitized.contains("secret"), "failed for {url}"); + assert!(sanitized.contains("***")); + } + } + + #[test] + fn given_oracle_url_should_mask_password() { + let url = "jdbc:oracle:thin:system/oracle123@localhost:1521:XE"; + let sanitized = sanitize_jdbc_url(url); + assert_eq!(sanitized, "jdbc:oracle:thin:system/***@localhost:1521:XE"); + assert!(!sanitized.contains("oracle123")); + } + + #[test] + fn given_url_without_password_should_be_unchanged() { + let url = "jdbc:h2:mem:testdb"; + assert_eq!(sanitize_jdbc_url(url), url); + } + + #[test] + fn given_text_payload_repr_should_passthrough_utf8_and_base64_binary() { + // Valid UTF-8 passes through unchanged. 
+ assert_eq!(payload_text_repr(b"hello".to_vec()), "hello"); + assert_eq!(payload_text_repr(br#"{"a":1}"#.to_vec()), r#"{"a":1}"#); + // Invalid UTF-8 (binary) is base64-encoded rather than rejected. + let binary = vec![0xff, 0xfe, 0x00, 0x01]; + let encoded = payload_text_repr(binary.clone()); + use base64::Engine; + assert_eq!( + encoded, + base64::engine::general_purpose::STANDARD.encode(&binary) + ); + } + + #[test] + fn given_sql_state_should_classify_transient_vs_permanent() { + // Transient classes: connection (08), rollback/serialization (40), + // resources (53), operator intervention (57), system error (58). + for s in [ + "08001", "08006", "40001", "40P01", "53300", "57P01", "58030", + ] { + assert!(is_transient_sql_state(Some(s)), "{s} should be transient"); + } + // Permanent: data (22), constraint (23), syntax/access (42), plus + // unknown/absent. + for s in ["22001", "23505", "42601", "42P01", "99999"] { + assert!(!is_transient_sql_state(Some(s)), "{s} should be permanent"); + } + assert!(!is_transient_sql_state(None)); + assert!(!is_transient_sql_state(Some(""))); + assert!(!is_transient_sql_state(Some("0"))); + } + + #[test] + fn given_payload_format_strings_should_map_correctly() { + assert_eq!( + PayloadFormat::from_config(Some("json")), + PayloadFormat::Json + ); + assert_eq!( + PayloadFormat::from_config(Some("JSONB")), + PayloadFormat::Json + ); + assert_eq!( + PayloadFormat::from_config(Some("bytes")), + PayloadFormat::Bytes + ); + assert_eq!( + PayloadFormat::from_config(Some("binary")), + PayloadFormat::Bytes + ); + assert_eq!( + PayloadFormat::from_config(Some("text")), + PayloadFormat::Text + ); + assert_eq!(PayloadFormat::from_config(None), PayloadFormat::Text); + assert_eq!( + PayloadFormat::from_config(Some("weird")), + PayloadFormat::Text + ); + } + + #[test] + fn given_all_columns_enabled_should_build_full_insert() { + let sink = JdbcSink::new(1, test_config()); + let (sql, params) = sink.build_insert_sql().expect("build insert"); + 
assert!(sql.starts_with("INSERT INTO \"messages\" (")); + assert!(sql.contains("iggy_offset")); + assert!(sql.contains("iggy_timestamp")); + assert!(sql.contains("iggy_stream")); + assert!(sql.contains("iggy_topic")); + assert!(sql.contains("iggy_partition_id")); + assert!(sql.contains("iggy_checksum")); + assert!(sql.contains("iggy_origin_timestamp")); + assert!(sql.contains("\"payload\"")); + // id + 5 metadata + checksum + origin_ts + payload = 9 + assert_eq!(params, 9); + assert_eq!(sql.matches('?').count(), 9); + } + + #[test] + fn given_metadata_disabled_should_build_minimal_insert() { + let mut config = test_config(); + config.include_metadata = Some(false); + config.include_checksum = Some(false); + config.include_origin_timestamp = Some(false); + let sink = JdbcSink::new(1, config); + let (sql, params) = sink.build_insert_sql().expect("build insert"); + assert!(!sql.contains("iggy_offset")); + assert!(!sql.contains("iggy_checksum")); + assert!(!sql.contains("iggy_origin_timestamp")); + assert!(sql.contains("\"payload\"")); + // id + payload = 2 + assert_eq!(params, 2); + assert_eq!(sql.matches('?').count(), 2); + } + + #[test] + fn given_custom_payload_column_should_use_it() { + let mut config = test_config(); + config.payload_column = Some("body".to_string()); + let sink = JdbcSink::new(1, config); + let (sql, _) = sink.build_insert_sql().expect("build insert"); + assert!(sql.contains("\"body\"")); + assert!(!sql.contains("\"payload\"")); + } + + #[test] + fn given_auto_create_should_build_create_table_with_payload_type() { + let mut config = test_config(); + config.payload_format = Some("text".to_string()); + let sink = JdbcSink::new(1, config); + let sql = sink.build_create_table_sql().expect("build create"); + assert!(sql.starts_with("CREATE TABLE IF NOT EXISTS \"messages\" (")); + assert!(sql.contains("id VARCHAR(40)")); + assert!(sql.contains("\"payload\" TEXT")); + } + + #[test] + fn given_bytes_format_create_table_should_use_binary_type() { + let 
mut config = test_config(); + config.payload_format = Some("bytes".to_string()); + let sink = JdbcSink::new(1, config); + let sql = sink.build_create_table_sql().expect("build create"); + assert!(sql.contains("\"payload\" VARBINARY")); + } + + #[test] + fn given_table_name_with_quotes_should_escape() { + assert_eq!( + quote_identifier("tbl\"name").expect("quote"), + "\"tbl\"\"name\"" + ); + } + + #[test] + fn given_injection_attempt_in_table_should_escape() { + let q = quote_identifier("messages\"; DROP TABLE users; --").expect("quote"); + assert_eq!(q, "\"messages\"\"; DROP TABLE users; --\""); + } + + #[test] + fn given_empty_identifier_should_fail() { + assert!(quote_identifier("").is_err()); + } + + #[test] + fn given_null_byte_identifier_should_fail() { + assert!(quote_identifier("a\0b").is_err()); + } + + #[test] + fn given_default_config_should_use_defaults() { + let sink = JdbcSink::new(1, test_config()); + assert_eq!(sink.batch_size(), DEFAULT_BATCH_SIZE as usize); + assert_eq!(sink.max_retries(), DEFAULT_MAX_RETRIES); + assert_eq!(sink.retry_delay, Duration::from_secs(1)); + assert!(sink.include_metadata()); + assert!(sink.include_checksum()); + assert!(sink.include_origin_timestamp()); + assert_eq!(sink.payload_column(), DEFAULT_PAYLOAD_COLUMN); + assert_eq!(sink.payload_format(), PayloadFormat::Text); + } + + #[test] + fn given_zero_batch_size_should_floor_to_one() { + let mut config = test_config(); + config.batch_size = Some(0); + let sink = JdbcSink::new(1, config); + assert_eq!(sink.batch_size(), 1); + } + + #[test] + fn given_custom_retry_delay_should_parse_humantime() { + let mut config = test_config(); + config.retry_delay = Some("500ms".to_string()); + let sink = JdbcSink::new(1, config); + assert_eq!(sink.retry_delay, Duration::from_millis(500)); + } + + #[test] + fn given_debug_output_should_not_leak_secrets() { + let mut config = test_config(); + config.jdbc_url = SecretString::from("jdbc:mysql://root:TopSecret@host:3306/db"); + 
config.password = Some(SecretString::from("TopSecret")); + let debug = format!("{config:?}"); + assert!(!debug.contains("TopSecret")); + assert!(debug.contains("***")); + } + + #[test] + fn given_minimal_toml_should_deserialize_with_defaults() { + let toml_str = r#" + jdbc_url = "jdbc:h2:mem:test" + driver_class = "org.h2.Driver" + driver_jar_path = "/tmp/h2.jar" + target_table = "events" + "#; + let config: JdbcSinkConfig = toml::from_str(toml_str).expect("parse minimal toml"); + assert_eq!(config.target_table, "events"); + assert_eq!(config.driver_class, "org.h2.Driver"); + assert!(config.batch_size.is_none()); + assert!(config.username.is_none()); + assert!(config.password.is_none()); + assert!(!config.enable_connection_pool); + assert_eq!(config.max_pool_size, 10); + assert_eq!(config.min_idle, 2); + assert_eq!(config.connection_timeout_ms, 30000); + } + + #[test] + fn given_full_toml_should_deserialize_all_fields() { + let toml_str = r#" + jdbc_url = "jdbc:mysql://localhost:3306/mydb" + driver_class = "com.mysql.cj.jdbc.Driver" + driver_jar_path = "/opt/drivers/mysql.jar" + username = "admin" + password = "s3cret" + target_table = "orders" + batch_size = 500 + auto_create_table = true + include_metadata = false + include_checksum = false + include_origin_timestamp = false + payload_format = "json" + payload_column = "doc" + verbose_logging = true + max_retries = 5 + retry_delay = "2s" + jvm_options = ["-Xmx512m"] + enable_connection_pool = true + max_pool_size = 20 + min_idle = 5 + connection_timeout_ms = 60000 + "#; + let config: JdbcSinkConfig = toml::from_str(toml_str).expect("parse full toml"); + assert_eq!(config.username.as_deref(), Some("admin")); + assert!(config.password.is_some()); + assert_eq!(config.batch_size, Some(500)); + assert_eq!(config.auto_create_table, Some(true)); + assert_eq!(config.payload_format.as_deref(), Some("json")); + assert_eq!(config.payload_column.as_deref(), Some("doc")); + assert_eq!(config.max_retries, Some(5)); + 
assert_eq!(config.jvm_options, vec!["-Xmx512m"]); + assert!(config.enable_connection_pool); + assert_eq!(config.max_pool_size, 20); + + let sink = JdbcSink::new(1, config); + // metadata/checksum/origin all disabled → id + payload = 2 params + let (_, params) = sink.build_insert_sql().expect("build insert"); + assert_eq!(params, 2); + assert_eq!(sink.batch_size(), 500); + } +} diff --git a/core/connectors/sources/README.md b/core/connectors/sources/README.md index 34989aef00..cea774355a 100644 --- a/core/connectors/sources/README.md +++ b/core/connectors/sources/README.md @@ -10,6 +10,7 @@ Source connectors are responsible for ingesting data from external sources into | ------ | ----------- | | **elasticsearch_source** | Polls documents from Elasticsearch indices with timestamp-based tracking | | **influxdb_source** | Polls InfluxDB with cursor-based timestamp tracking; supports V2 (Flux, annotated CSV) and V3 (SQL, JSONL) | +| **jdbc_source** | Reads rows from any JDBC-compliant database (PostgreSQL, MySQL, Oracle, SQL Server, H2) via an embedded JVM; bulk and incremental modes | | **postgres_source** | Reads rows from PostgreSQL tables with multiple strategies: delete after read, mark as processed, or timestamp tracking | | **random_source** | Generates random test messages (useful for testing and development) | diff --git a/core/connectors/sources/jdbc_source/Cargo.toml b/core/connectors/sources/jdbc_source/Cargo.toml new file mode 100644 index 0000000000..34662f1865 --- /dev/null +++ b/core/connectors/sources/jdbc_source/Cargo.toml @@ -0,0 +1,64 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +[package] +name = "iggy_connector_jdbc_source" +version = "0.1.0" +edition = "2024" +license = "Apache-2.0" +keywords = ["iggy", "messaging", "streaming", "jdbc", "source"] +categories = ["database"] +description = "Generic JDBC source connector for Iggy - supports MySQL, Oracle, SQL Server, H2, and any JDBC-compliant database" +readme = "README.md" + +[package.metadata.cargo-machete] +ignored = ["dashmap", "humantime-serde"] + +[lib] +crate-type = ["cdylib", "rlib"] + +[features] +default = [] + +[dependencies] +async-trait = { workspace = true } +base64 = { workspace = true } +chrono = { workspace = true } + +# Required by source_connector! 
macro +dashmap = { workspace = true } + +# For parsing duration strings +humantime-serde = "1.1" + +# Connector SDK +iggy_connector_sdk = { workspace = true } + +# JNI for Java interop with invocation support +jni = { version = "0.21", features = ["invocation"] } + +# For sanitizing passwords in logs +regex = { workspace = true } +secrecy = { workspace = true } +serde = { workspace = true, features = ["derive"] } +serde_json = { workspace = true } +tokio = { workspace = true, features = ["full"] } +tracing = { workspace = true } +uuid = { workspace = true, features = ["v4"] } + +[dev-dependencies] +toml = { workspace = true } diff --git a/core/connectors/sources/jdbc_source/README.md b/core/connectors/sources/jdbc_source/README.md new file mode 100644 index 0000000000..ed2e61be3e --- /dev/null +++ b/core/connectors/sources/jdbc_source/README.md @@ -0,0 +1,471 @@ +# JDBC Source Connector + +A generic JDBC source connector for Iggy that supports any JDBC-compliant database including MySQL, PostgreSQL, Oracle, SQL Server, H2, Derby, and more. + +## Overview + +This connector reads data from relational databases using JDBC (Java Database Connectivity) and publishes it as messages to Iggy streams. It supports both bulk and incremental data synchronization modes. 
+ +## Features + +- **Universal Database Support**: Works with any database that has a JDBC driver +- **Incremental Sync**: Track changes using timestamps or auto-increment IDs +- **Bulk Mode**: Full table scans for initial loads or snapshots +- **Type Mapping**: Automatic conversion of SQL types to JSON +- **Configurable Polling**: Control how frequently data is fetched +- **State Management**: Automatically tracks offsets to prevent duplicate reads +- **Flexible Queries**: Support for custom SQL queries with placeholders + +## Supported Databases + +**ALL JDBC-compliant databases are supported for both bulk and incremental modes:** + +- MySQL / MariaDB +- PostgreSQL +- Oracle Database +- Microsoft SQL Server +- H2 Database +- Apache Derby +- IBM DB2 +- SQLite (via JDBC) +- SAP HANA +- Teradata +- Snowflake +- Amazon Redshift +- Google BigQuery +- Any other JDBC-compliant database + +**Key Point:** The JDBC connector provides a **single, universal implementation** that works with all these databases. You don't need separate connectors for MySQL, Oracle, etc. Just swap the JDBC driver JAR and connection string! + +## Prerequisites + +1. **Java Runtime Environment (JRE)**: JRE 8 or later must be installed +2. 
**JDBC Driver**: Download the appropriate JDBC driver JAR for your database + +### Downloading JDBC Drivers + +**MySQL:** + +```bash +wget https://repo1.maven.org/maven2/com/mysql/mysql-connector-j/8.0.33/mysql-connector-j-8.0.33.jar +``` + +**PostgreSQL:** + +```bash +wget https://jdbc.postgresql.org/download/postgresql-42.6.0.jar +``` + +**Oracle:** + +- Download from [Oracle JDBC Driver Downloads](https://www.oracle.com/database/technologies/appdev/jdbc-downloads.html) + +**SQL Server:** + +```bash +wget https://repo1.maven.org/maven2/com/microsoft/sqlserver/mssql-jdbc/12.4.1.jre11/mssql-jdbc-12.4.1.jre11.jar +``` + +**H2:** + +```bash +wget https://repo1.maven.org/maven2/com/h2database/h2/2.2.224/h2-2.2.224.jar +``` + +## Configuration + +### Basic Configuration (Incremental Sync) + +```toml +type = "source" +key = "jdbc_mysql_source" +enabled = true + +[plugin_config] +jdbc_url = "jdbc:mysql://localhost:3306/ecommerce" +driver_class = "com.mysql.cj.jdbc.Driver" +driver_jar_path = "/opt/jdbc-drivers/mysql-connector-j-8.0.33.jar" +username = "iggy_user" +password = "secret_password" +query = "SELECT * FROM orders WHERE updated_at > {last_offset} ORDER BY updated_at ASC" +poll_interval = "30s" +batch_size = 1000 +tracking_column = "updated_at" +initial_offset = "2024-01-01 00:00:00" +mode = "incremental" +snake_case_columns = true +include_metadata = true + +[[streams]] +stream = "ecommerce" +topic = "orders" +partition_id = 1 +``` + +### Bulk Mode Configuration + +```toml +type = "source" +key = "jdbc_bulk_source" +enabled = true + +[plugin_config] +jdbc_url = "jdbc:postgresql://localhost:5432/warehouse" +driver_class = "org.postgresql.Driver" +driver_jar_path = "/opt/jdbc-drivers/postgresql-42.6.0.jar" +username = "warehouse_user" +password = "secret" +query = "SELECT * FROM product_catalog" +poll_interval = "1h" +batch_size = 5000 +mode = "bulk" +snake_case_columns = false +include_metadata = true + +[[streams]] +stream = "warehouse" +topic = "products" +``` + 
+### Oracle Database Example + +```toml +type = "source" +key = "jdbc_oracle_source" +enabled = true + +[plugin_config] +jdbc_url = "jdbc:oracle:thin:@localhost:1521:XE" +driver_class = "oracle.jdbc.OracleDriver" +driver_jar_path = "/opt/jdbc-drivers/ojdbc11.jar" +username = "system" +password = "oracle" +query = "SELECT * FROM CUSTOMERS WHERE ID > {last_offset} ORDER BY ID" +poll_interval = "1m" +batch_size = 500 +tracking_column = "ID" +initial_offset = "0" +mode = "incremental" +jvm_options = ["-Xmx256m", "-Xms128m"] + +[[streams]] +stream = "crm" +topic = "customers" +``` + +### SQL Server Example + +```toml +type = "source" +key = "jdbc_sqlserver_source" +enabled = true + +[plugin_config] +jdbc_url = "jdbc:sqlserver://localhost:1433;databaseName=Sales;encrypt=false" +driver_class = "com.microsoft.sqlserver.jdbc.SQLServerDriver" +driver_jar_path = "/opt/jdbc-drivers/mssql-jdbc-12.4.1.jre11.jar" +username = "sa" +password = "YourPassword123" +query = "SELECT * FROM Orders WHERE OrderDate > {last_offset} ORDER BY OrderDate" +poll_interval = "15s" +batch_size = 2000 +tracking_column = "OrderDate" +initial_offset = "2024-01-01" +mode = "incremental" + +[[streams]] +stream = "sales" +topic = "orders" +``` + +## Configuration Parameters + +| Parameter | Type | Required | Default | Description | +| ----------- | ------ | ---------- | --------- | ------------- | +| `jdbc_url` | string | Yes | - | JDBC connection URL (can include credentials) | +| `driver_class` | string | Yes | - | JDBC driver class name | +| `driver_jar_path` | string | Yes | - | Path to JDBC driver JAR file | +| `username` | string | No | - | Database username (optional if in jdbc_url) | +| `password` | string | No | - | Database password (optional if in jdbc_url) | +| `query` | string | Yes | - | SQL query to execute (supports `{last_offset}` placeholder) | +| `poll_interval` | duration | Yes | - | How often to poll (e.g., "30s", "5m", "1h") | +| `batch_size` | u32 | No | 1000 | Maximum rows to 
fetch per poll | +| `tracking_column` | string | No | - | Column to track for incremental reads | +| `initial_offset` | string | No | - | Starting offset value for first poll | +| `mode` | string | No | "incremental" | Sync mode: "incremental" or "bulk" (bulk works with ALL databases) | +| `enable_connection_pool` | bool | No | false | Enable HikariCP connection pooling | +| `max_pool_size` | u32 | No | 10 | Maximum connections in pool | +| `min_idle` | u32 | No | 2 | Minimum idle connections | +| `connection_timeout_ms` | u64 | No | 30000 | Connection timeout in milliseconds | +| `jvm_options` | array | No | [] | Custom JVM options (e.g., ["-Xmx1g"]) | +| `snake_case_columns` | bool | No | false | Convert column names to snake_case | +| `include_metadata` | bool | No | true | Include metadata (table, operation, timestamp) | + +## Query Placeholders + +The `query` parameter supports placeholders for dynamic queries: + +- `{last_offset}`: Replaced with the last tracked offset value +- Automatically wrapped in quotes for string types + +**Example:** + +```sql +-- Configuration +tracking_column = "id" +query = "SELECT * FROM users WHERE id > {last_offset} ORDER BY id" + +-- First poll (no offset yet) +SELECT * FROM users WHERE id > '0' ORDER BY id + +-- After processing rows up to id=100 +SELECT * FROM users WHERE id > '100' ORDER BY id +``` + +## Output Format + +Each database row is converted to a JSON message: + +### With Metadata (default) + +```json +{ + "table_name": null, + "operation_type": "SELECT", + "timestamp": "2024-01-09T10:30:00Z", + "data": { + "id": 123, + "name": "John Doe", + "email": "john@example.com", + "created_at": "2024-01-08T15:20:00" + } +} +``` + +### Without Metadata + +```json +{ + "id": 123, + "name": "John Doe", + "email": "john@example.com", + "created_at": "2024-01-08T15:20:00" +} +``` + +## Type Mapping + +JDBC SQL types are automatically mapped to JSON: + +| SQL Type | JSON Type | Notes | +| ---------- | ----------- | ------- | +| 
BIT, BOOLEAN | boolean | - | +| TINYINT, SMALLINT, INTEGER | number | Integer | +| BIGINT | number | Long integer (values above 2^53 may lose precision in JSON consumers that parse numbers as f64) | +| FLOAT, REAL | number | Float | +| DOUBLE | number | Double | +| NUMERIC, DECIMAL | string | Emitted as a string to preserve arbitrary precision (e.g. money) | +| CHAR, VARCHAR, TEXT | string | - | +| DATE, TIME, TIMESTAMP | string | Driver string form | +| BINARY, VARBINARY, LONGVARBINARY | string | Base64 encoded | +| NULL | null | - | + +## Runtime notes & limitations + +- **Embedded JVM, one per process.** JNI permits a single `JavaVM` per OS + process. All JDBC *source* instances in the connectors runtime share one JVM + (the first instance's `jvm_options`/classpath win). A JDBC source and a JDBC + sink are separate shared libraries and **cannot both create a JVM in the same + runtime process** — run them in separate connectors-runtime processes. +- **Blocking I/O.** JDBC calls go through JNI and are synchronous; each `poll()` + runs blocking work on the runtime worker thread. Size the runtime and + `poll_interval`/`batch_size` accordingly. +- **Connection recovery.** In direct (non-pooled) mode the connection is + validated with `Connection.isValid` each poll and transparently re-established + if it has dropped. In pooled mode each poll borrows and returns a connection + from HikariCP, which validates connections itself. 
+ +## Troubleshooting + +### Connection Failures + +**Error**: "Failed to create JDBC connection" + +**Solution**: + +- Verify JDBC URL format for your database +- Check username/password +- Ensure database server is accessible +- Verify firewall rules + +### Driver Not Found + +**Error**: "Failed to load driver class" or "Failed to find driver class" + +**Solution**: + +- Verify `driver_jar_path` points to correct JAR file +- Check `driver_class` name matches your JDBC driver +- Ensure JAR file has read permissions + +### JVM Issues + +**Error**: "Failed to create JVM" + +**Solution**: + +- Ensure Java is installed: `java -version` +- Increase JVM memory: + + ```toml + jvm_options = ["-Xmx1g", "-Xms512m"] + ``` + +### No Data Being Fetched + +**Check**: + +- Verify query returns results when run directly in database +- Check `initial_offset` value +- Review connector logs for errors +- Ensure `tracking_column` exactly matches a column name in the query result (the comparison is case-sensitive; e.g. Oracle and H2 report unquoted columns in upper case) + +## Performance Tuning + +### Optimize Batch Size + +```toml +# Small batches for low latency +batch_size = 100 +poll_interval = "5s" + +# Large batches for throughput +batch_size = 10000 +poll_interval = "1m" +``` + +### JVM Memory Tuning + +```toml +jvm_options = [ + "-Xmx1g", # Maximum heap size + "-Xms512m", # Initial heap size + "-XX:+UseG1GC" # Use G1 garbage collector +] +``` + +### Query Optimization + +- Add indexes on tracking columns +- Use efficient WHERE clauses +- Avoid SELECT * in production (specify columns) +- Consider database-specific optimizations + +## Connection String Formats + +### MySQL + +```toml +# Option 1: Separate credentials +jdbc_url = "jdbc:mysql://localhost:3306/mydb" +username = "user" +password = "pass" + +# Option 2: Embedded in URL +jdbc_url = "jdbc:mysql://user:pass@localhost:3306/mydb" +``` + +### PostgreSQL + +```toml +# Option 1: Separate credentials +jdbc_url = "jdbc:postgresql://localhost:5432/mydb" +username = "user" +password = "pass" + +# Option 2: Embedded in URL +jdbc_url = 
"jdbc:postgresql://localhost:5432/mydb?user=myuser&password=mypass" +``` + +### Oracle + +```toml +# Option 1: Separate credentials +jdbc_url = "jdbc:oracle:thin:@localhost:1521:XE" +username = "system" +password = "oracle" + +# Option 2: Embedded in URL (Oracle uses @ for host) +jdbc_url = "jdbc:oracle:thin:system/oracle@localhost:1521:XE" +``` + +### SQL Server + +```toml +# Option 1: Separate credentials +jdbc_url = "jdbc:sqlserver://localhost:1433;databaseName=mydb" +username = "sa" +password = "YourPassword123" + +# Option 2: Embedded in URL +jdbc_url = "jdbc:sqlserver://localhost:1433;databaseName=mydb;user=sa;password=YourPassword123" +``` + +### H2 (In-Memory) + +```toml +# No credentials needed for in-memory +jdbc_url = "jdbc:h2:mem:testdb" + +# Or with file-based +jdbc_url = "jdbc:h2:file:/data/mydb;USER=sa;PASSWORD=sa" +``` + +## Mode Comparison + +### Incremental Mode (Universal) + +**Works with ALL databases** - requires only a tracking column: + +```toml +mode = "incremental" +tracking_column = "updated_at" # or "id", "created_at", etc. +# `{last_offset}` is the only placeholder; write the tracking column name literally in the SQL +query = "SELECT * FROM table WHERE updated_at > {last_offset} ORDER BY updated_at" +``` + +**Benefits:** + +- Prevents duplicate reads +- Tracks offset automatically +- Efficient for large tables +- Works with timestamps, IDs, or any orderable column + +**Database Examples:** + +- MySQL: `WHERE updated_at > {last_offset}` +- Oracle: `WHERE id > {last_offset}` (do not use `ROWNUM`: `ROWNUM > n` never matches any row) +- SQL Server: `WHERE updated_at > {last_offset}` +- PostgreSQL: `WHERE id > {last_offset}` + +### Bulk Mode (Universal) + +**Works with ALL databases** - no special requirements: + +```toml +mode = "bulk" +query = "SELECT * FROM table" # Any valid SELECT query +``` + +**Benefits:** + +- No tracking column needed +- Works with any SELECT query +- Good for snapshots +- Supports complex queries with JOINs, aggregations, etc.
+ +**Use Cases:** + +- Initial data load +- Periodic full snapshots +- Complex analytical queries +- Tables without tracking columns diff --git a/core/connectors/sources/jdbc_source/config.toml b/core/connectors/sources/jdbc_source/config.toml new file mode 100644 index 0000000000..27bf7237e6 --- /dev/null +++ b/core/connectors/sources/jdbc_source/config.toml @@ -0,0 +1,46 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +type = "source" +key = "jdbc" +enabled = true +version = 0 +name = "JDBC source" +path = "../../target/release/libiggy_connector_jdbc_source" +verbose = false + +[[streams]] +stream = "user_events" +topic = "users" +schema = "json" +batch_length = 100 + +[plugin_config] +jdbc_url = "jdbc:postgresql://localhost:5432/database" +driver_class = "org.postgresql.Driver" +driver_jar_path = "/tmp/jdbc-drivers/postgresql-42.7.1.jar" +username = "postgres" +password = "postgres" +query = "SELECT * FROM users WHERE id > {last_offset} ORDER BY id" +poll_interval = "1s" +batch_size = 1000 +tracking_column = "id" +initial_offset = "0" +mode = "incremental" +snake_case_columns = false +include_metadata = true +enable_connection_pool = false diff --git a/core/connectors/sources/jdbc_source/src/lib.rs b/core/connectors/sources/jdbc_source/src/lib.rs new file mode 100644 index 0000000000..a3cee99533 --- /dev/null +++ b/core/connectors/sources/jdbc_source/src/lib.rs @@ -0,0 +1,2176 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use async_trait::async_trait; +use chrono::{DateTime, Utc}; +use iggy_connector_sdk::{ + ConnectorState, Error, ProducedMessage, ProducedMessages, Schema, Source, source_connector, +}; +use jni::objects::{GlobalRef, JByteArray, JObject, JString, JThrowable, JValue}; +use jni::{JNIEnv, JavaVM}; +use regex::Regex; +use secrecy::{ExposeSecret, SecretString}; +use serde::{Deserialize, Serialize}; +use std::sync::{Arc, Mutex}; +use std::time::Duration; +use tracing::info; +use uuid::Uuid; + +mod secret_string_serde { + use secrecy::SecretString; + use serde::{Deserialize, Deserializer, Serialize, Serializer}; + pub fn deserialize<'de, D: Deserializer<'de>>(d: D) -> Result { + let s = String::deserialize(d)?; + Ok(SecretString::from(s)) + } + #[allow(unused_variables)] + pub fn serialize(val: &SecretString, s: S) -> Result { + "".serialize(s) + } +} + +mod opt_secret_string_serde { + use secrecy::SecretString; + use serde::{Deserialize, Deserializer, Serialize, Serializer}; + pub fn deserialize<'de, D: Deserializer<'de>>(d: D) -> Result, D::Error> { + let s: Option = Option::deserialize(d)?; + Ok(s.map(SecretString::from)) + } + #[allow(unused_variables)] + pub fn serialize(val: &Option, s: S) -> Result { + Option::::None.serialize(s) + } +} + +/// Cached compiled regex patterns for password sanitization +static RE_USER_PASS_AT: std::sync::LazyLock = + std::sync::LazyLock::new(|| Regex::new(r"://([^:]+):([^@?;/]+)@").unwrap()); +static RE_PASSWORD_PARAM: std::sync::LazyLock = + std::sync::LazyLock::new(|| Regex::new(r"(?i)(password|pwd|pass)=([^;&\s]+)").unwrap()); +static RE_ORACLE_PASS: std::sync::LazyLock = + std::sync::LazyLock::new(|| Regex::new(r"thin:([^/]+)/([^@]+)@").unwrap()); + +const CONNECTOR_NAME: &str = "JDBC source"; + +/// Source mode for the JDBC connector +#[derive(Debug, Clone, Deserialize, Serialize, PartialEq)] +#[serde(rename_all = "lowercase")] +pub enum Mode { + /// Full table scan on every poll + Bulk, + /// Track last offset and only fetch 
new rows + Incremental, +} + +/// Configuration for JDBC source connector +#[derive(Clone, Deserialize, Serialize)] +pub struct JdbcSourceConfig { + /// JDBC connection URL (e.g., "jdbc:mysql://localhost:3306/mydb") + /// Can include credentials: "jdbc:mysql://localhost:3306/mydb?user=root&password=secret" + #[serde(with = "secret_string_serde")] + pub jdbc_url: SecretString, + + /// JDBC driver class name (e.g., "com.mysql.cj.jdbc.Driver") + pub driver_class: String, + + /// Path to JDBC driver JAR file + pub driver_jar_path: String, + + /// Database username (optional if included in jdbc_url) + #[serde(default)] + pub username: Option, + + /// Database password (optional if included in jdbc_url) + #[serde(default, with = "opt_secret_string_serde")] + pub password: Option, + + /// SQL query to execute for fetching data + /// Can use {last_offset} placeholder for incremental reads + pub query: String, + + /// Polling interval (e.g., "30s", "5m", "1h") + #[serde(with = "humantime_serde")] + pub poll_interval: Duration, + + /// Batch size - maximum rows to fetch per poll + #[serde(default = "default_batch_size")] + pub batch_size: u32, + + /// Tracking column for incremental reads (e.g., "id", "updated_at") + #[serde(default)] + pub tracking_column: Option, + + /// Initial offset value for the first poll + #[serde(default)] + pub initial_offset: Option, + + /// Source mode: "bulk" (full table scan) or "incremental" (track last offset) + #[serde(default = "default_mode")] + pub mode: Mode, + + /// Convert column names to snake_case + #[serde(default)] + pub snake_case_columns: bool, + + /// Include metadata in output (table name, operation type, timestamp) + #[serde(default = "default_true")] + pub include_metadata: bool, + + /// JVM options (e.g., ["-Xmx512m", "-Xms128m"]) + #[serde(default)] + pub jvm_options: Vec, + + /// Enable connection pooling via HikariCP + #[serde(default)] + pub enable_connection_pool: bool, + + /// Maximum pool size (default: 10) + 
#[serde(default = "default_pool_size")] + pub max_pool_size: u32, + + /// Minimum idle connections (default: 2) + #[serde(default = "default_min_idle")] + pub min_idle: u32, + + /// Connection timeout in milliseconds (default: 30000) + #[serde(default = "default_connection_timeout")] + pub connection_timeout_ms: u64, +} + +fn default_pool_size() -> u32 { + 10 +} + +fn default_min_idle() -> u32 { + 2 +} + +fn default_connection_timeout() -> u64 { + 30000 +} + +fn default_batch_size() -> u32 { + 1000 +} + +fn default_mode() -> Mode { + Mode::Incremental +} + +fn default_true() -> bool { + true +} + +impl std::fmt::Debug for JdbcSourceConfig { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("JdbcSourceConfig") + .field( + "jdbc_url", + &sanitize_jdbc_url(self.jdbc_url.expose_secret()), + ) + .field("driver_class", &self.driver_class) + .field("driver_jar_path", &self.driver_jar_path) + .field("username", &self.username) + .field("password", &self.password.as_ref().map(|_| "***")) + .field("query", &self.query) + .field("poll_interval", &self.poll_interval) + .field("batch_size", &self.batch_size) + .field("tracking_column", &self.tracking_column) + .field("initial_offset", &self.initial_offset) + .field("mode", &self.mode) + .field("snake_case_columns", &self.snake_case_columns) + .field("include_metadata", &self.include_metadata) + .field("enable_connection_pool", &self.enable_connection_pool) + .field("max_pool_size", &self.max_pool_size) + .field("min_idle", &self.min_idle) + .field("connection_timeout_ms", &self.connection_timeout_ms) + .finish() + } +} + +/// Internal state tracking for the JDBC source +#[derive(Debug, Clone, Serialize, Deserialize)] +struct State { + /// Last tracked offset value (for incremental mode) + last_offset: Option, + + /// Total rows processed + processed_rows: u64, + + /// Last poll timestamp + last_poll_time: DateTime, +} + +impl Default for State { + fn default() -> Self { + Self { + 
last_offset: None, + processed_rows: 0, + last_poll_time: Utc::now(), + } + } +} + +/// Database record structure for output messages +#[derive(Debug, Serialize, Deserialize)] +pub struct DatabaseRecord { + pub table_name: Option, + pub operation_type: String, + pub timestamp: DateTime, + pub data: serde_json::Value, +} + +/// JDBC Source Connector +#[derive(Debug)] +pub struct JdbcSource { + id: u32, + config: JdbcSourceConfig, + jvm: Option>, + // Behind a Mutex so `poll()` (&self) can transparently re-establish a dead + // direct connection without `&mut self`. + connection: Mutex>, + connection_pool: Option, // HikariDataSource if pooling enabled + state: Arc>, +} + +/// Sanitize JDBC URL by masking passwords for logging +fn sanitize_jdbc_url(url: &str) -> String { + // Pattern 1: user:password@host format (MySQL, PostgreSQL) + let url = RE_USER_PASS_AT.replace_all(url, "://$1:***@"); + + // Pattern 2: password=value format (PostgreSQL, SQL Server, H2) + let url = RE_PASSWORD_PARAM.replace_all(&url, "$1=***"); + + // Pattern 3: Oracle user/password@host format + let url = RE_ORACLE_PASS.replace_all(&url, "thin:$1/***@"); + + url.to_string() +} + +impl JdbcSource { + /// Create a new JDBC source connector + pub fn new(id: u32, config: JdbcSourceConfig, connector_state: Option) -> Self { + // Restore state from persistent storage if available + let state = connector_state + .and_then(|cs| cs.deserialize::(CONNECTOR_NAME, id)) + .unwrap_or_else(|| { + let mut default_state = State::default(); + // Use initial_offset from config if provided + if let Some(ref initial_offset) = config.initial_offset { + default_state.last_offset = Some(initial_offset.clone()); + } + default_state + }); + + Self { + id, + config, + jvm: None, + connection: Mutex::new(None), + connection_pool: None, + state: Arc::new(Mutex::new(state)), + } + } + + /// Obtain the process-wide JVM, creating it on first use. 
JNI permits only a + /// single JVM per OS process, so this is shared across all JDBC connector + /// instances (see [`get_or_create_jvm`]). + fn initialize_jvm(&mut self) -> Result<(), Error> { + info!("Initializing JVM for JDBC source connector [{}]", self.id); + let jvm = get_or_create_jvm(&self.config.driver_jar_path, &self.config.jvm_options)?; + self.jvm = Some(jvm); + Ok(()) + } + + /// Load JDBC driver and create connection (or connection pool) + fn create_connection(&mut self) -> Result<(), Error> { + let jvm = self + .jvm + .as_ref() + .ok_or_else(|| Error::InitError("JVM not initialized".to_string()))?; + + let mut env = jvm + .attach_current_thread() + .map_err(|e| Error::InitError(format!("Failed to attach thread to JVM: {}", e)))?; + + info!("Loading JDBC driver: {}", self.config.driver_class); + + // Load driver using Class.forName() which triggers static initialization + info!( + "Loading driver class via Class.forName: {}", + self.config.driver_class + ); + + let class_class = env + .find_class("java/lang/Class") + .map_err(|e| Error::InitError(format!("Failed to find Class: {}", e)))?; + + let driver_class_name = env + .new_string(&self.config.driver_class) + .map_err(|e| Error::InitError(format!("Failed to create class name string: {}", e)))?; + + // Call Class.forName(className) to load and initialize the driver + env.call_static_method( + class_class, + "forName", + "(Ljava/lang/String;)Ljava/lang/Class;", + &[JValue::Object(&driver_class_name.into())], + ) + .map_err(|e| { + Error::InitError(format!( + "Failed to load driver class '{}': {}", + self.config.driver_class, e + )) + })?; + + info!("JDBC driver loaded and registered successfully"); + + if self.config.enable_connection_pool { + info!( + "Setting up HikariCP connection pool to: {}", + sanitize_jdbc_url(self.config.jdbc_url.expose_secret()) + ); + let pool = self.create_connection_pool_internal(&mut env)?; + self.connection_pool = Some(pool); + } else { + info!( + "Creating direct JDBC 
connection to: {}", + sanitize_jdbc_url(self.config.jdbc_url.expose_secret()) + ); + let conn = self.create_direct_connection_internal(&mut env)?; + *self.connection.lock().expect("connection mutex poisoned") = Some(conn); + } + + Ok(()) + } + + /// Create a direct JDBC connection via DriverManager + fn create_direct_connection_internal(&self, env: &mut JNIEnv) -> Result { + // Set the thread context class loader to help DriverManager find the driver + let current_thread_class = env + .find_class("java/lang/Thread") + .map_err(|e| Error::InitError(format!("Failed to find Thread class: {}", e)))?; + + let current_thread = env + .call_static_method( + current_thread_class, + "currentThread", + "()Ljava/lang/Thread;", + &[], + ) + .map_err(|e| Error::InitError(format!("Failed to get current thread: {}", e)))? + .l() + .map_err(|e| Error::InitError(format!("Failed to extract thread object: {}", e)))?; + + // Get the class loader that loaded the driver + let driver_class = env + .find_class(self.config.driver_class.replace('.', "/")) + .map_err(|e| Error::InitError(format!("Failed to find driver class: {}", e)))?; + + let driver_class_loader = env + .call_method( + &driver_class, + "getClassLoader", + "()Ljava/lang/ClassLoader;", + &[], + ) + .map_err(|e| Error::InitError(format!("Failed to get driver class loader: {}", e)))? 
+ .l() + .map_err(|e| Error::InitError(format!("Failed to extract class loader: {}", e)))?; + + // Set the context class loader + env.call_method( + ¤t_thread, + "setContextClassLoader", + "(Ljava/lang/ClassLoader;)V", + &[JValue::Object(&driver_class_loader)], + ) + .map_err(|e| Error::InitError(format!("Failed to set context class loader: {}", e)))?; + + info!( + "Set thread context class loader for driver: {}", + self.config.driver_class + ); + + // Get connection from DriverManager + let driver_manager = env + .find_class("java/sql/DriverManager") + .map_err(|e| Error::InitError(format!("Failed to find DriverManager: {}", e)))?; + + let jdbc_url = env + .new_string(self.config.jdbc_url.expose_secret()) + .map_err(|e| Error::InitError(format!("Failed to create JDBC URL string: {}", e)))?; + + // If username/password are provided separately, use 3-arg getConnection + let connection = if let (Some(username), Some(password)) = + (&self.config.username, &self.config.password) + { + info!("Using separate username/password authentication"); + let username_jstring = env.new_string(username).map_err(|e| { + Error::InitError(format!("Failed to create username string: {}", e)) + })?; + let password_jstring = env.new_string(password.expose_secret()).map_err(|e| { + Error::InitError(format!("Failed to create password string: {}", e)) + })?; + + env.call_static_method( + driver_manager, + "getConnection", + "(Ljava/lang/String;Ljava/lang/String;Ljava/lang/String;)Ljava/sql/Connection;", + &[ + JValue::Object(&jdbc_url.into()), + JValue::Object(&username_jstring.into()), + JValue::Object(&password_jstring.into()), + ], + ) + .map_err(|e| { + Error::InitError(format!( + "Failed to create JDBC connection with credentials: {}", + e + )) + })? 
+ } else { + info!("Using connection string with embedded credentials"); + env.call_static_method( + driver_manager, + "getConnection", + "(Ljava/lang/String;)Ljava/sql/Connection;", + &[JValue::Object(&jdbc_url.into())], + ) + .map_err(|e| { + Error::InitError(format!("Failed to create JDBC connection from URL: {}", e)) + })? + }; + + let connection_obj = connection + .l() + .map_err(|e| Error::InitError(format!("Failed to get connection object: {}", e)))?; + + let global_ref = env + .new_global_ref(connection_obj) + .map_err(|e| Error::InitError(format!("Failed to create global reference: {}", e)))?; + + info!("Direct database connection established successfully"); + Ok(global_ref) + } + + /// Create HikariCP connection pool + fn create_connection_pool_internal(&self, env: &mut JNIEnv) -> Result { + info!( + "Initializing HikariCP with max_pool_size={}, min_idle={}", + self.config.max_pool_size, self.config.min_idle + ); + + let hikari_config_class = env.find_class("com/zaxxer/hikari/HikariConfig").map_err( + |e| { + Error::InitError(format!( + "Failed to find HikariConfig class. 
Ensure HikariCP JAR is in classpath: {}", + e + )) + }, + )?; + + let hikari_config = env + .new_object(hikari_config_class, "()V", &[]) + .map_err(|e| Error::InitError(format!("Failed to create HikariConfig: {}", e)))?; + + let jdbc_url_jstring = env + .new_string(self.config.jdbc_url.expose_secret()) + .map_err(|e| Error::InitError(format!("Failed to create JDBC URL: {}", e)))?; + env.call_method( + &hikari_config, + "setJdbcUrl", + "(Ljava/lang/String;)V", + &[JValue::Object(&jdbc_url_jstring.into())], + ) + .map_err(|e| Error::InitError(format!("Failed to set JDBC URL: {}", e)))?; + + if let Some(username) = &self.config.username { + let username_jstring = env + .new_string(username) + .map_err(|e| Error::InitError(format!("Failed to create username: {}", e)))?; + env.call_method( + &hikari_config, + "setUsername", + "(Ljava/lang/String;)V", + &[JValue::Object(&username_jstring.into())], + ) + .map_err(|e| Error::InitError(format!("Failed to set username: {}", e)))?; + } + + if let Some(password) = &self.config.password { + let password_jstring = env + .new_string(password.expose_secret()) + .map_err(|e| Error::InitError(format!("Failed to create password: {}", e)))?; + env.call_method( + &hikari_config, + "setPassword", + "(Ljava/lang/String;)V", + &[JValue::Object(&password_jstring.into())], + ) + .map_err(|e| Error::InitError(format!("Failed to set password: {}", e)))?; + } + + let driver_class_jstring = env + .new_string(&self.config.driver_class) + .map_err(|e| Error::InitError(format!("Failed to create driver class name: {}", e)))?; + env.call_method( + &hikari_config, + "setDriverClassName", + "(Ljava/lang/String;)V", + &[JValue::Object(&driver_class_jstring.into())], + ) + .map_err(|e| Error::InitError(format!("Failed to set driver class: {}", e)))?; + + env.call_method( + &hikari_config, + "setMaximumPoolSize", + "(I)V", + &[JValue::Int( + self.config.max_pool_size.min(i32::MAX as u32) as i32 + )], + ) + .map_err(|e| Error::InitError(format!("Failed to 
set max pool size: {}", e)))?; + + env.call_method( + &hikari_config, + "setMinimumIdle", + "(I)V", + &[JValue::Int(self.config.min_idle.min(i32::MAX as u32) as i32)], + ) + .map_err(|e| Error::InitError(format!("Failed to set min idle: {}", e)))?; + + env.call_method( + &hikari_config, + "setConnectionTimeout", + "(J)V", + &[JValue::Long(self.config.connection_timeout_ms as i64)], + ) + .map_err(|e| Error::InitError(format!("Failed to set connection timeout: {}", e)))?; + + let hikari_datasource_class = env + .find_class("com/zaxxer/hikari/HikariDataSource") + .map_err(|e| { + Error::InitError(format!("Failed to find HikariDataSource class: {}", e)) + })?; + + let datasource = env + .new_object( + hikari_datasource_class, + "(Lcom/zaxxer/hikari/HikariConfig;)V", + &[JValue::Object(&hikari_config)], + ) + .map_err(|e| Error::InitError(format!("Failed to create HikariDataSource: {}", e)))?; + + let global_ref = env.new_global_ref(datasource).map_err(|e| { + Error::InitError(format!("Failed to create global reference for pool: {}", e)) + })?; + + info!("HikariCP connection pool created successfully"); + Ok(global_ref) + } + + /// Acquire a connection. Returns the connection plus whether it was borrowed + /// from the pool (and therefore must be `close()`d to return it to the pool). + /// In direct mode a dead connection is transparently re-established. + fn get_connection<'local>( + &self, + env: &mut JNIEnv<'local>, + ) -> Result<(JObject<'local>, bool), Error> { + if let Some(pool) = &self.connection_pool { + let connection = env + .call_method( + pool.as_obj(), + "getConnection", + "()Ljava/sql/Connection;", + &[], + ) + .map_err(|e| { + Error::Connection(format!("Failed to get connection from pool: {}", e)) + })? + .l() + .map_err(|e| { + Error::Connection(format!("Failed to extract connection object: {}", e)) + })?; + return Ok((connection, true)); + } + + // Direct connection: validate and re-establish if it has dropped. 
+ let needs_reconnect = { + let guard = self.connection.lock().expect("connection mutex poisoned"); + match guard.as_ref() { + Some(conn) => !self.connection_is_valid(env, conn.as_obj()), + None => true, + } + }; + if needs_reconnect { + info!("Direct JDBC connection is not valid; re-establishing"); + let new_conn = self.create_direct_connection_internal(env)?; + *self.connection.lock().expect("connection mutex poisoned") = Some(new_conn); + } + + let guard = self.connection.lock().expect("connection mutex poisoned"); + let conn = guard + .as_ref() + .ok_or_else(|| Error::Connection("No connection available".to_string()))?; + let local_ref = env + .new_local_ref(conn.as_obj()) + .map_err(|e| Error::Connection(format!("Failed to create local ref: {}", e)))?; + Ok((local_ref, false)) + } + + /// Best-effort `Connection.isValid(timeout)` check. Returns false on any + /// JNI error so the caller re-establishes the connection. + fn connection_is_valid(&self, env: &mut JNIEnv, conn: &JObject) -> bool { + let timeout_secs = (self.config.connection_timeout_ms / 1000).clamp(1, 30) as i32; + env.call_method(conn, "isValid", "(I)Z", &[JValue::Int(timeout_secs)]) + .and_then(|v| v.z()) + .unwrap_or(false) + } + + /// Execute query and fetch results. + /// + /// The mutex is held only briefly: once to read the current offset for + /// query building, and once after the JNI work to write the updated state. + fn execute_query(&self, env: &mut JNIEnv) -> Result, Error> { + let (connection, pooled) = self.get_connection(env)?; + + // Read current state snapshot (short lock) + let query = { + let state = self.state.lock().expect("state mutex poisoned"); + self.build_query(&state) + }; + info!("Executing query: {}", query); + + // Execute statement and fetch all rows (no lock held). 
A pooled + // connection must be returned to the pool afterwards (close() on a + // Hikari connection returns it rather than destroying it), otherwise + // the pool is exhausted after `max_pool_size` polls. + let result = self.execute_statement_and_fetch_rows(env, &connection, &query); + if pooled { + let _ = env.call_method(&connection, "close", "()V", &[]); + } + let (messages, row_count, max_offset) = result?; + + // Update state with results (short lock) + { + let mut state = self.state.lock().expect("state mutex poisoned"); + if let Some(offset) = max_offset { + state.last_offset = Some(offset); + } + state.processed_rows += row_count; + state.last_poll_time = Utc::now(); + info!( + "Fetched {} rows, total processed: {}", + row_count, state.processed_rows + ); + } + + Ok(messages) + } + + /// Prepare a JDBC statement, execute it, and read all result rows into messages. + fn execute_statement_and_fetch_rows( + &self, + env: &mut JNIEnv, + connection: &JObject, + query: &str, + ) -> Result<(Vec, u64, Option), Error> { + let query_jstring = env + .new_string(query) + .map_err(|e| Error::Connection(format!("Failed to create query string: {}", e)))?; + + let statement = match env + .call_method( + connection, + "prepareStatement", + "(Ljava/lang/String;)Ljava/sql/PreparedStatement;", + &[JValue::Object(&query_jstring.into())], + ) + .and_then(|v| v.l()) + { + Ok(s) => s, + Err(_) => return Err(classify_query_failure(env, "prepare statement")), + }; + + // Use setMaxRows for database-agnostic row limiting instead of SQL LIMIT clause. + // This works across all JDBC drivers (MySQL, Oracle, SQL Server, H2, etc.) 
+ env.call_method( + &statement, + "setMaxRows", + "(I)V", + &[JValue::Int( + self.config.batch_size.min(i32::MAX as u32) as i32 + )], + ) + .map_err(|e| Error::Connection(format!("Failed to set max rows: {}", e)))?; + + let result_set = match env + .call_method(&statement, "executeQuery", "()Ljava/sql/ResultSet;", &[]) + .and_then(|v| v.l()) + { + Ok(rs) => rs, + Err(_) => { + let _ = env.call_method(&statement, "close", "()V", &[]); + return Err(classify_query_failure(env, "execute query")); + } + }; + + let columns = self.read_column_metadata(env, &result_set)?; + let (messages, row_count, max_offset) = self.read_rows(env, &result_set, &columns)?; + + // Close statement (best-effort) + let _ = env.call_method(&statement, "close", "()V", &[]); + + Ok((messages, row_count, max_offset)) + } + + /// Read column names and types from the ResultSet metadata. + fn read_column_metadata( + &self, + env: &mut JNIEnv, + result_set: &JObject, + ) -> Result, Error> { + let metadata = env + .call_method( + result_set, + "getMetaData", + "()Ljava/sql/ResultSetMetaData;", + &[], + ) + .map_err(|e| Error::Connection(format!("Failed to get metadata: {}", e)))? + .l() + .map_err(|e| Error::Connection(format!("Failed to get metadata object: {}", e)))?; + + let column_count = env + .call_method(&metadata, "getColumnCount", "()I", &[]) + .map_err(|e| Error::Connection(format!("Failed to get column count: {}", e)))? + .i() + .map_err(|e| Error::Connection(format!("Failed to extract column count: {}", e)))?; + + info!("Query returned {} columns", column_count); + + let mut columns = Vec::with_capacity(column_count as usize); + for i in 1..=column_count { + let col_name = self.get_column_name(env, &metadata, i)?; + let col_type = self.get_column_type(env, &metadata, i)?; + columns.push((col_name, col_type)); + } + + Ok(columns) + } + + /// Iterate over result set rows and convert each to a ProducedMessage. 
+ fn read_rows( + &self, + env: &mut JNIEnv, + result_set: &JObject, + columns: &[(String, i32)], + ) -> Result<(Vec, u64, Option), Error> { + let mut messages = Vec::new(); + let mut row_count: u64 = 0; + let mut max_offset: Option = None; + + loop { + let has_next = env + .call_method(result_set, "next", "()Z", &[]) + .map_err(|e| Error::Connection(format!("Failed to fetch next row: {}", e)))? + .z() + .map_err(|e| Error::Connection(format!("Failed to extract boolean: {}", e)))?; + + if !has_next { + break; + } + + // Read each row inside its own JNI local-reference frame so the per + // -column local refs (getObject/getString/getBytes results) are + // reclaimed every iteration; otherwise a large result set would + // overflow the JNI local reference table and abort the JVM. + env.push_local_frame(32) + .map_err(|e| Error::Connection(format!("Failed to push local frame: {}", e)))?; + let row_result = self.read_single_row(env, result_set, columns); + // SAFETY: `read_single_row` returns only owned Rust data (a JSON map + // and an optional String); no JNI local reference escapes the frame. + let _ = unsafe { env.pop_local_frame(&JObject::null()) }; + let (row_data, offset) = row_result?; + + if let Some(offset) = offset { + max_offset = Some(offset); + } + + let message = self.build_message(row_data)?; + messages.push(message); + row_count += 1; + } + + Ok((messages, row_count, max_offset)) + } + + /// Extract data from a single result set row, returning the row map and optional offset. 
+ fn read_single_row( + &self, + env: &mut JNIEnv, + result_set: &JObject, + columns: &[(String, i32)], + ) -> Result<(serde_json::Map, Option), Error> { + let mut row_data = serde_json::Map::new(); + let mut offset = None; + + for (idx, (col_name, col_type)) in columns.iter().enumerate() { + let col_idx = (idx + 1) as i32; + let value = self.extract_column_value(env, result_set, col_idx, col_type)?; + + let final_col_name = if self.config.snake_case_columns { + to_snake_case(col_name) + } else { + col_name.clone() + }; + + row_data.insert(final_col_name.clone(), value); + + // Track offset if this is the tracking column + if let Some(ref tracking_col) = self.config.tracking_column + && col_name == tracking_col + { + offset = Some(self.extract_offset_value(&row_data, &final_col_name)); + } + } + + Ok((row_data, offset)) + } + + /// Build a ProducedMessage from row data, optionally wrapping in DatabaseRecord metadata. + fn build_message( + &self, + row_data: serde_json::Map, + ) -> Result { + let payload = if self.config.include_metadata { + let record = DatabaseRecord { + table_name: None, + operation_type: "SELECT".to_string(), + timestamp: Utc::now(), + data: serde_json::Value::Object(row_data), + }; + serde_json::to_vec(&record) + .map_err(|e| Error::Serialization(format!("Failed to serialize record: {}", e)))? + } else { + serde_json::to_vec(&serde_json::Value::Object(row_data)) + .map_err(|e| Error::Serialization(format!("Failed to serialize row data: {}", e)))? + }; + + Ok(ProducedMessage { + id: Some(Uuid::new_v4().as_u128()), + payload, + headers: None, + checksum: None, + timestamp: Some(Utc::now().timestamp_millis() as u64), + origin_timestamp: Some(Utc::now().timestamp_millis() as u64), + }) + } + + /// Build query with offset placeholder replacement. + /// Row limiting is handled via JDBC setMaxRows rather than SQL LIMIT + /// to ensure cross-database compatibility. 
+ fn build_query(&self, state: &State) -> String { + let mut query = self.config.query.clone(); + + if self.config.mode == Mode::Incremental { + if let Some(ref offset) = state.last_offset { + query = query.replace("{last_offset}", "e_sql_literal(offset)); + } else if let Some(ref initial) = self.config.initial_offset { + query = query.replace("{last_offset}", "e_sql_literal(initial)); + } else { + // Remove WHERE clause if no offset available + query = query.replace("WHERE {tracking_column} > {last_offset}", ""); + } + } + + query + } + + /// Get column name from ResultSetMetaData + fn get_column_name( + &self, + env: &mut JNIEnv, + metadata: &JObject, + column_index: i32, + ) -> Result { + let col_name_obj = env + .call_method( + metadata, + "getColumnName", + "(I)Ljava/lang/String;", + &[JValue::Int(column_index)], + ) + .map_err(|e| Error::Connection(format!("Failed to get column name: {}", e)))? + .l() + .map_err(|e| Error::Connection(format!("Failed to get column name object: {}", e)))?; + + let col_name: String = env + .get_string(&JString::from(col_name_obj)) + .map_err(|e| Error::Connection(format!("Failed to convert column name: {}", e)))? + .into(); + + Ok(col_name) + } + + /// Get column type from ResultSetMetaData + fn get_column_type( + &self, + env: &mut JNIEnv, + metadata: &JObject, + column_index: i32, + ) -> Result { + let col_type = env + .call_method( + metadata, + "getColumnType", + "(I)I", + &[JValue::Int(column_index)], + ) + .map_err(|e| Error::Connection(format!("Failed to get column type: {}", e)))? 
+ .i() + .map_err(|e| Error::Connection(format!("Failed to extract column type: {}", e)))?; + + Ok(col_type) + } + + /// Extract column value based on JDBC type + fn extract_column_value( + &self, + env: &mut JNIEnv, + result_set: &JObject, + column_index: i32, + sql_type: &i32, + ) -> Result { + use java::sql::Types; + + // Check if null first + let obj = env + .call_method( + result_set, + "getObject", + "(I)Ljava/lang/Object;", + &[JValue::Int(column_index)], + ) + .map_err(|e| Error::Connection(format!("Failed to get object: {}", e)))? + .l() + .map_err(|e| Error::Connection(format!("Failed to get object reference: {}", e)))?; + + if obj.is_null() { + return Ok(serde_json::Value::Null); + } + + match *sql_type { + Types::BIT | Types::BOOLEAN => { + let value = env + .call_method( + result_set, + "getBoolean", + "(I)Z", + &[JValue::Int(column_index)], + ) + .map_err(|e| Error::Connection(format!("Failed to get boolean: {}", e)))? + .z() + .map_err(|e| Error::Connection(format!("Failed to extract boolean: {}", e)))?; + Ok(serde_json::Value::Bool(value)) + } + Types::TINYINT | Types::SMALLINT | Types::INTEGER => { + let value = env + .call_method(result_set, "getInt", "(I)I", &[JValue::Int(column_index)]) + .map_err(|e| Error::Connection(format!("Failed to get int: {}", e)))? + .i() + .map_err(|e| Error::Connection(format!("Failed to extract int: {}", e)))?; + Ok(serde_json::json!(value)) + } + Types::BIGINT => { + let value = env + .call_method(result_set, "getLong", "(I)J", &[JValue::Int(column_index)]) + .map_err(|e| Error::Connection(format!("Failed to get long: {}", e)))? + .j() + .map_err(|e| Error::Connection(format!("Failed to extract long: {}", e)))?; + Ok(serde_json::json!(value)) + } + Types::FLOAT | Types::REAL => { + let value = env + .call_method(result_set, "getFloat", "(I)F", &[JValue::Int(column_index)]) + .map_err(|e| Error::Connection(format!("Failed to get float: {}", e)))? 
+ .f() + .map_err(|e| Error::Connection(format!("Failed to extract float: {}", e)))?; + Ok(serde_json::json!(value)) + } + Types::DOUBLE => { + let value = env + .call_method( + result_set, + "getDouble", + "(I)D", + &[JValue::Int(column_index)], + ) + .map_err(|e| Error::Connection(format!("Failed to get double: {}", e)))? + .d() + .map_err(|e| Error::Connection(format!("Failed to extract double: {}", e)))?; + Ok(serde_json::json!(value)) + } + // NUMERIC/DECIMAL can carry more precision than an f64 can represent + // (e.g. money/large decimals), so emit them as strings to avoid + // silent precision loss. + Types::NUMERIC | Types::DECIMAL => { + self.get_column_as_string(env, result_set, column_index) + } + // Binary columns are base64-encoded so arbitrary bytes survive the + // round-trip through JSON. + Types::BINARY | Types::VARBINARY | Types::LONGVARBINARY => { + let bytes_obj = env + .call_method( + result_set, + "getBytes", + "(I)[B", + &[JValue::Int(column_index)], + ) + .map_err(|e| Error::Connection(format!("Failed to get bytes: {}", e)))? + .l() + .map_err(|e| Error::Connection(format!("Failed to get bytes object: {}", e)))?; + if bytes_obj.is_null() { + return Ok(serde_json::Value::Null); + } + let buf = env + .convert_byte_array(JByteArray::from(bytes_obj)) + .map_err(|e| Error::Connection(format!("Failed to convert bytes: {}", e)))?; + use base64::Engine; + Ok(serde_json::Value::String( + base64::engine::general_purpose::STANDARD.encode(&buf), + )) + } + Types::TIMESTAMP | Types::DATE | Types::TIME => { + let value = env + .call_method( + result_set, + "getString", + "(I)Ljava/lang/String;", + &[JValue::Int(column_index)], + ) + .map_err(|e| Error::Connection(format!("Failed to get timestamp: {}", e)))? + .l() + .map_err(|e| { + Error::Connection(format!("Failed to get timestamp object: {}", e)) + })?; + let str_value: String = env + .get_string(&JString::from(value)) + .map_err(|e| Error::Connection(format!("Failed to convert timestamp: {}", e)))? 
+ .into(); + Ok(serde_json::Value::String(str_value)) + } + // Default: getString for all other types (CHAR, VARCHAR, etc.) + _ => self.get_column_as_string(env, result_set, column_index), + } + } + + /// Read a column via `ResultSet.getString`, returning JSON `null` when the + /// value is SQL NULL. + fn get_column_as_string( + &self, + env: &mut JNIEnv, + result_set: &JObject, + column_index: i32, + ) -> Result { + let value = env + .call_method( + result_set, + "getString", + "(I)Ljava/lang/String;", + &[JValue::Int(column_index)], + ) + .map_err(|e| Error::Connection(format!("Failed to get string: {}", e)))? + .l() + .map_err(|e| Error::Connection(format!("Failed to get string object: {}", e)))?; + + if value.is_null() { + Ok(serde_json::Value::Null) + } else { + let str_value: String = env + .get_string(&JString::from(value)) + .map_err(|e| Error::Connection(format!("Failed to convert string: {}", e)))? + .into(); + Ok(serde_json::Value::String(str_value)) + } + } + + /// Extract offset value as string + fn extract_offset_value( + &self, + row_data: &serde_json::Map, + col_name: &str, + ) -> String { + row_data + .get(col_name) + .map(|v| match v { + serde_json::Value::Number(n) => n.to_string(), + serde_json::Value::String(s) => s.clone(), + _ => v.to_string(), + }) + .unwrap_or_default() + } +} + +#[async_trait] +impl Source for JdbcSource { + async fn open(&mut self) -> Result<(), Error> { + info!("Opening JDBC source connector [{}]", self.id); + info!( + "Configuration: JDBC URL={}, Driver={}, Mode={:?}", + sanitize_jdbc_url(self.config.jdbc_url.expose_secret()), + self.config.driver_class, + self.config.mode + ); + + // Initialize JVM + self.initialize_jvm()?; + + // Create database connection + self.create_connection()?; + + info!("JDBC source connector [{}] opened successfully", self.id); + Ok(()) + } + + async fn poll(&self) -> Result { + // Sleep for poll interval + tokio::time::sleep(self.config.poll_interval).await; + + let jvm = self + .jvm + 
.as_ref() + .ok_or_else(|| Error::InitError("JVM not initialized".to_string()))?; + + let mut env = jvm + .attach_current_thread() + .map_err(|e| Error::InitError(format!("Failed to attach thread: {}", e)))?; + + // Execute query and fetch results + let messages = self.execute_query(&mut env)?; + + // Persist state so offsets survive connector restarts + let connector_state = { + let state = self.state.lock().expect("state mutex poisoned"); + ConnectorState::serialize(&*state, CONNECTOR_NAME, self.id) + }; + + Ok(ProducedMessages { + schema: Schema::Json, + messages, + state: connector_state, + }) + } + + async fn close(&mut self) -> Result<(), Error> { + info!("Closing JDBC source connector [{}]", self.id); + + if let Some(jvm) = &self.jvm + && let Ok(mut env) = jvm.attach_current_thread() + { + // Close connection pool if exists + if let Some(pool) = &self.connection_pool { + let _ = env.call_method(pool.as_obj(), "close", "()V", &[]); + info!("Connection pool closed"); + } + + // Close direct connection if exists + if let Some(connection) = self + .connection + .lock() + .expect("connection mutex poisoned") + .as_ref() + { + let _ = env.call_method(connection.as_obj(), "close", "()V", &[]); + info!("Database connection closed"); + } + } + + let state = self.state.lock().expect("state mutex poisoned"); + info!( + "JDBC source connector [{}] closed. Total rows processed: {}", + self.id, state.processed_rows + ); + + Ok(()) + } +} + +/// Convert string to snake_case +fn to_snake_case(s: &str) -> String { + let mut result = String::new(); + let mut prev_is_upper = false; + + for (i, ch) in s.chars().enumerate() { + if ch.is_uppercase() { + if i > 0 && !prev_is_upper { + result.push('_'); + } + result.push(ch.to_lowercase().next().unwrap()); + prev_is_upper = true; + } else { + result.push(ch); + prev_is_upper = false; + } + } + + result +} + +/// Process-wide JVM. 
JNI allows only one `JavaVM` per OS process, so every JDBC +/// connector instance in this dynamic library shares this one. +static GLOBAL_JVM: Mutex>> = Mutex::new(None); + +/// Return the process JVM, creating it on first use within this dynamic +/// library. The first caller's `jvm_options`/classpath win; later callers (e.g. +/// a second JDBC connector of the same type) reuse the existing VM instead of +/// failing with `JNI_EEXIST`. +/// +/// Limitation: a JDBC *source* and a JDBC *sink* are separate dynamic libraries +/// and do not share this static, so configuring both in the *same* connectors +/// runtime process is not supported (the second to start cannot create a second +/// JVM). Run them in separate runtime processes. +fn get_or_create_jvm(driver_jar_path: &str, jvm_options: &[String]) -> Result, Error> { + let mut guard = GLOBAL_JVM.lock().expect("jvm mutex poisoned"); + if let Some(jvm) = guard.as_ref() { + info!("Reusing existing process JVM"); + return Ok(jvm.clone()); + } + + let classpath_option = format!("-Djava.class.path={driver_jar_path}"); + let mut args_builder = jni::InitArgsBuilder::new() + .version(jni::JNIVersion::V8) + .option(&classpath_option); + for option in jvm_options { + args_builder = args_builder.option(option); + } + let jvm_args = args_builder + .build() + .map_err(|e| Error::InitError(format!("Failed to build JVM arguments: {e:?}")))?; + let jvm = JavaVM::new(jvm_args) + .map_err(|e| Error::InitError(format!("Failed to create JVM: {e:?}")))?; + + info!("JVM initialized successfully (classpath: {driver_jar_path})"); + let arc = Arc::new(jvm); + *guard = Some(arc.clone()); + Ok(arc) +} + +/// Quote a value as a SQL string literal, escaping embedded single quotes by +/// doubling them. Used to substitute the incremental `{last_offset}` value, +/// which originates from a (DB-controlled) tracking-column value. 
+fn quote_sql_literal(value: &str) -> String { + format!("'{}'", value.replace('\'', "''")) +} + +/// Classify a JDBC `SQLState` class (first 2 chars) as transient vs permanent. +/// `08` connection, `40` rollback/serialization, `53` resources, `57` operator +/// intervention, `58` system error are transient; everything else (and an +/// unknown/absent state) is permanent. +fn is_transient_sql_state(sql_state: Option<&str>) -> bool { + match sql_state { + Some(s) if s.len() >= 2 => matches!(&s[..2], "08" | "40" | "53" | "57" | "58"), + _ => false, + } +} + +/// Inspect and CLEAR the pending Java exception after a failed query JNI call, +/// returning a classified `Error`: transient SQL states map to +/// `Error::Connection` (so the runtime re-polls and the connection is +/// re-validated), permanent ones to `Error::InvalidRecordValue`. Clearing is +/// required so the next JNI call on this thread is not aborted. +fn classify_query_failure(env: &mut JNIEnv, action: &str) -> Error { + let (sql_state, message) = take_pending_sql_exception(env); + let transient = is_transient_sql_state(sql_state.as_deref()); + let state = sql_state.as_deref().unwrap_or("?"); + let msg = format!("Failed to {action} (SQLState {state}): {message}"); + if transient { + Error::Connection(msg) + } else { + Error::InvalidRecordValue(msg) + } +} + +/// Take the pending Java exception (clearing it) and return its `SQLState` (if a +/// `java.sql.SQLException`) and message. 
+fn take_pending_sql_exception(env: &mut JNIEnv) -> (Option, String) { + let throwable = match env.exception_occurred() { + Ok(t) if !t.is_null() => t, + _ => return (None, "unknown error".to_string()), + }; + let _ = env.exception_clear(); + + let message = throwable_string_method(env, &throwable, "getMessage") + .unwrap_or_else(|| "unknown error".to_string()); + let sql_state = if env + .is_instance_of(&throwable, "java/sql/SQLException") + .unwrap_or(false) + { + throwable_string_method(env, &throwable, "getSQLState") + } else { + None + }; + (sql_state, message) +} + +/// Call a no-arg `String`-returning method on a throwable; None on JNI error/null. +fn throwable_string_method( + env: &mut JNIEnv, + throwable: &JThrowable, + method: &str, +) -> Option { + let obj = env + .call_method(throwable, method, "()Ljava/lang/String;", &[]) + .ok()? + .l() + .ok()?; + if obj.is_null() { + return None; + } + env.get_string(&JString::from(obj)).ok().map(|s| s.into()) +} + +/// JDBC SQL Types constants +mod java { + pub mod sql { + #[allow(dead_code)] + pub struct Types; + + #[allow(dead_code)] + impl Types { + pub const BIT: i32 = -7; + pub const TINYINT: i32 = -6; + pub const SMALLINT: i32 = 5; + pub const INTEGER: i32 = 4; + pub const BIGINT: i32 = -5; + pub const FLOAT: i32 = 6; + pub const REAL: i32 = 7; + pub const DOUBLE: i32 = 8; + pub const NUMERIC: i32 = 2; + pub const DECIMAL: i32 = 3; + pub const CHAR: i32 = 1; + pub const VARCHAR: i32 = 12; + pub const LONGVARCHAR: i32 = -1; + pub const DATE: i32 = 91; + pub const TIME: i32 = 92; + pub const TIMESTAMP: i32 = 93; + pub const BINARY: i32 = -2; + pub const VARBINARY: i32 = -3; + pub const LONGVARBINARY: i32 = -4; + pub const NULL: i32 = 0; + pub const BOOLEAN: i32 = 16; + } + } +} + +// Export the connector via SDK macro +source_connector!(JdbcSource); + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_sanitize_jdbc_url_mysql_format() { + let url = 
"jdbc:mysql://root:SuperSecret123@localhost:3306/mydb"; + let sanitized = sanitize_jdbc_url(url); + assert_eq!(sanitized, "jdbc:mysql://root:***@localhost:3306/mydb"); + assert!(!sanitized.contains("SuperSecret123")); + } + + #[test] + fn test_sanitize_jdbc_url_postgresql_query_params() { + let url = "jdbc:postgresql://localhost:5432/mydb?user=admin&password=P@ssw0rd&ssl=true"; + let sanitized = sanitize_jdbc_url(url); + assert_eq!( + sanitized, + "jdbc:postgresql://localhost:5432/mydb?user=admin&password=***&ssl=true" + ); + assert!(!sanitized.contains("P@ssw0rd")); + } + + #[test] + fn test_sanitize_jdbc_url_oracle_format() { + let url = "jdbc:oracle:thin:system/oracle123@localhost:1521:XE"; + let sanitized = sanitize_jdbc_url(url); + assert_eq!(sanitized, "jdbc:oracle:thin:system/***@localhost:1521:XE"); + assert!(!sanitized.contains("oracle123")); + } + + #[test] + fn test_sanitize_jdbc_url_sqlserver_format() { + let url = "jdbc:sqlserver://localhost:1433;user=sa;password=MySecretPass;database=Sales"; + let sanitized = sanitize_jdbc_url(url); + assert_eq!( + sanitized, + "jdbc:sqlserver://localhost:1433;user=sa;password=***;database=Sales" + ); + assert!(!sanitized.contains("MySecretPass")); + } + + #[test] + fn test_sanitize_jdbc_url_h2_format() { + let url = "jdbc:h2:mem:testdb;USER=sa;PASSWORD=secret"; + let sanitized = sanitize_jdbc_url(url); + assert_eq!(sanitized, "jdbc:h2:mem:testdb;USER=sa;PASSWORD=***"); + assert!(!sanitized.contains("secret")); + } + + #[test] + fn test_sanitize_jdbc_url_case_insensitive() { + let url1 = "jdbc:postgresql://localhost?password=secret"; + let url2 = "jdbc:postgresql://localhost?PASSWORD=secret"; + let url3 = "jdbc:postgresql://localhost?pwd=secret"; + let url4 = "jdbc:postgresql://localhost?PWD=secret"; + + for url in [url1, url2, url3, url4] { + let sanitized = sanitize_jdbc_url(url); + assert!(!sanitized.contains("secret"), "Failed for URL: {}", url); + assert!(sanitized.contains("***")); + } + } + + #[test] + fn 
test_sanitize_jdbc_url_no_password() { + let url = "jdbc:h2:mem:testdb"; + let sanitized = sanitize_jdbc_url(url); + assert_eq!(sanitized, url); + } + + #[test] + fn test_sanitize_jdbc_url_multiple_passwords() { + let url = "jdbc:postgresql://localhost?password=secret1&pwd=secret2"; + let sanitized = sanitize_jdbc_url(url); + assert!(!sanitized.contains("secret1")); + assert!(!sanitized.contains("secret2")); + assert_eq!( + sanitized, + "jdbc:postgresql://localhost?password=***&pwd=***" + ); + } + + #[test] + fn test_build_query_incremental_with_offset() { + let config = JdbcSourceConfig { + jdbc_url: SecretString::from("jdbc:h2:mem:test"), + driver_class: "org.h2.Driver".to_string(), + driver_jar_path: "/tmp/h2.jar".to_string(), + username: None, + password: None, + query: "SELECT * FROM users WHERE id > {last_offset} ORDER BY id".to_string(), + poll_interval: Duration::from_secs(10), + batch_size: 100, + tracking_column: Some("id".to_string()), + initial_offset: Some("0".to_string()), + mode: Mode::Incremental, + snake_case_columns: false, + include_metadata: true, + jvm_options: vec![], + enable_connection_pool: false, + max_pool_size: 10, + min_idle: 2, + connection_timeout_ms: 30000, + }; + let source = JdbcSource::new(1, config, None); + + // With initial offset (no last_offset yet) + let state = State { + last_offset: None, + processed_rows: 0, + last_poll_time: Utc::now(), + }; + let query = source.build_query(&state); + assert_eq!(query, "SELECT * FROM users WHERE id > '0' ORDER BY id"); + + // With tracked offset + let state = State { + last_offset: Some("42".to_string()), + processed_rows: 42, + last_poll_time: Utc::now(), + }; + let query = source.build_query(&state); + assert_eq!(query, "SELECT * FROM users WHERE id > '42' ORDER BY id"); + } + + #[test] + fn test_build_query_bulk_mode_no_limit_appended() { + let config = JdbcSourceConfig { + jdbc_url: SecretString::from("jdbc:h2:mem:test"), + driver_class: "org.h2.Driver".to_string(), + 
driver_jar_path: "/tmp/h2.jar".to_string(), + username: None, + password: None, + query: "SELECT * FROM products".to_string(), + poll_interval: Duration::from_secs(60), + batch_size: 5000, + tracking_column: None, + initial_offset: None, + mode: Mode::Bulk, + snake_case_columns: false, + include_metadata: false, + jvm_options: vec![], + enable_connection_pool: false, + max_pool_size: 10, + min_idle: 2, + connection_timeout_ms: 30000, + }; + let source = JdbcSource::new(1, config, None); + let state = State::default(); + let query = source.build_query(&state); + // build_query should NOT append LIMIT; row limiting is done via setMaxRows + assert_eq!(query, "SELECT * FROM products"); + assert!(!query.to_uppercase().contains("LIMIT")); + } + + #[test] + fn test_state_restoration_from_connector_state() { + let original_state = State { + last_offset: Some("2024-06-15 12:00:00".to_string()), + processed_rows: 1500, + last_poll_time: Utc::now(), + }; + let connector_state = ConnectorState::serialize(&original_state, CONNECTOR_NAME, 1) + .expect("Failed to serialize state"); + + let config = JdbcSourceConfig { + jdbc_url: SecretString::from("jdbc:h2:mem:test"), + driver_class: "org.h2.Driver".to_string(), + driver_jar_path: "/tmp/h2.jar".to_string(), + username: None, + password: None, + query: "SELECT * FROM orders WHERE updated_at > {last_offset}".to_string(), + poll_interval: Duration::from_secs(30), + batch_size: 1000, + tracking_column: Some("updated_at".to_string()), + initial_offset: Some("2024-01-01 00:00:00".to_string()), + mode: Mode::Incremental, + snake_case_columns: false, + include_metadata: true, + jvm_options: vec![], + enable_connection_pool: false, + max_pool_size: 10, + min_idle: 2, + connection_timeout_ms: 30000, + }; + let source = JdbcSource::new(1, config, Some(connector_state)); + let state = source.state.lock().unwrap(); + assert_eq!(state.last_offset, Some("2024-06-15 12:00:00".to_string())); + assert_eq!(state.processed_rows, 1500); + } + + 
#[test] + fn test_quote_sql_literal_escapes_single_quotes() { + assert_eq!(quote_sql_literal("42"), "'42'"); + assert_eq!( + quote_sql_literal("2024-01-01 00:00:00"), + "'2024-01-01 00:00:00'" + ); + assert_eq!(quote_sql_literal("o'brien"), "'o''brien'"); + assert_eq!( + quote_sql_literal("x'; DROP TABLE t; --"), + "'x''; DROP TABLE t; --'" + ); + } + + #[test] + fn test_is_transient_sql_state() { + for s in [ + "08001", "08006", "40001", "40P01", "53300", "57P01", "58030", + ] { + assert!(is_transient_sql_state(Some(s)), "{s} should be transient"); + } + for s in ["22001", "23505", "42601", "42P01", "99999"] { + assert!(!is_transient_sql_state(Some(s)), "{s} should be permanent"); + } + assert!(!is_transient_sql_state(None)); + assert!(!is_transient_sql_state(Some(""))); + } + + #[test] + fn test_to_snake_case() { + assert_eq!(to_snake_case("OrderDate"), "order_date"); + assert_eq!(to_snake_case("updatedAt"), "updated_at"); + assert_eq!(to_snake_case("ID"), "id"); // consecutive uppers stay together + assert_eq!(to_snake_case("already_snake"), "already_snake"); + assert_eq!(to_snake_case("simple"), "simple"); + } + + // ========================================================================= + // Config deserialization tests + // ========================================================================= + + #[test] + fn test_config_deserialization_minimal_toml() { + let toml_str = r#" + jdbc_url = "jdbc:h2:mem:test" + driver_class = "org.h2.Driver" + driver_jar_path = "/tmp/h2.jar" + query = "SELECT * FROM users" + poll_interval = "30s" + "#; + let config: JdbcSourceConfig = + toml::from_str(toml_str).expect("Failed to parse minimal TOML config"); + assert_eq!(config.driver_class, "org.h2.Driver"); + assert_eq!(config.query, "SELECT * FROM users"); + assert_eq!(config.poll_interval, Duration::from_secs(30)); + // Verify defaults are applied + assert_eq!(config.mode, Mode::Incremental); + assert_eq!(config.batch_size, 1000); + assert!(config.include_metadata); + 
assert!(!config.snake_case_columns); + assert!(!config.enable_connection_pool); + assert_eq!(config.max_pool_size, 10); + assert_eq!(config.min_idle, 2); + assert_eq!(config.connection_timeout_ms, 30000); + assert!(config.username.is_none()); + assert!(config.password.is_none()); + assert!(config.tracking_column.is_none()); + assert!(config.initial_offset.is_none()); + assert!(config.jvm_options.is_empty()); + } + + #[test] + fn test_config_deserialization_full_toml() { + let toml_str = r#" + jdbc_url = "jdbc:mysql://localhost:3306/mydb" + driver_class = "com.mysql.cj.jdbc.Driver" + driver_jar_path = "/opt/drivers/mysql.jar" + username = "admin" + password = "s3cret" + query = "SELECT * FROM orders WHERE id > {last_offset} ORDER BY id" + poll_interval = "5m" + batch_size = 500 + tracking_column = "id" + initial_offset = "0" + mode = "incremental" + snake_case_columns = true + include_metadata = false + jvm_options = ["-Xmx512m", "-Xms128m"] + enable_connection_pool = true + max_pool_size = 20 + min_idle = 5 + connection_timeout_ms = 60000 + "#; + let config: JdbcSourceConfig = + toml::from_str(toml_str).expect("Failed to parse full TOML config"); + assert_eq!(config.driver_class, "com.mysql.cj.jdbc.Driver"); + assert_eq!(config.username.as_deref(), Some("admin")); + assert!(config.password.is_some()); + assert_eq!(config.batch_size, 500); + assert_eq!(config.tracking_column.as_deref(), Some("id")); + assert_eq!(config.initial_offset.as_deref(), Some("0")); + assert_eq!(config.mode, Mode::Incremental); + assert!(config.snake_case_columns); + assert!(!config.include_metadata); + assert_eq!(config.jvm_options, vec!["-Xmx512m", "-Xms128m"]); + assert!(config.enable_connection_pool); + assert_eq!(config.max_pool_size, 20); + assert_eq!(config.min_idle, 5); + assert_eq!(config.connection_timeout_ms, 60000); + assert_eq!(config.poll_interval, Duration::from_secs(300)); + } + + #[test] + fn test_config_deserialization_bulk_mode() { + let toml_str = r#" + jdbc_url = 
"jdbc:h2:mem:test" + driver_class = "org.h2.Driver" + driver_jar_path = "/tmp/h2.jar" + query = "SELECT * FROM products" + poll_interval = "1h" + mode = "bulk" + "#; + let config: JdbcSourceConfig = + toml::from_str(toml_str).expect("Failed to parse bulk mode config"); + assert_eq!(config.mode, Mode::Bulk); + assert_eq!(config.poll_interval, Duration::from_secs(3600)); + } + + #[test] + fn test_config_deserialization_invalid_mode_fails() { + let toml_str = r#" + jdbc_url = "jdbc:h2:mem:test" + driver_class = "org.h2.Driver" + driver_jar_path = "/tmp/h2.jar" + query = "SELECT 1" + poll_interval = "1s" + mode = "invalid_mode" + "#; + let result = toml::from_str::(toml_str); + assert!( + result.is_err(), + "Expected error for invalid mode, but got: {:?}", + result + ); + } + + // ========================================================================= + // State restoration tests + // ========================================================================= + + #[test] + fn test_state_restoration_with_malformed_bytes_falls_back_to_default() { + let connector_state = ConnectorState(vec![0xFF, 0xFE, 0xFD, 0x00]); + + let config = JdbcSourceConfig { + jdbc_url: SecretString::from("jdbc:h2:mem:test"), + driver_class: "org.h2.Driver".to_string(), + driver_jar_path: "/tmp/h2.jar".to_string(), + username: None, + password: None, + query: "SELECT 1".to_string(), + poll_interval: Duration::from_secs(10), + batch_size: 100, + tracking_column: None, + initial_offset: None, + mode: Mode::Bulk, + snake_case_columns: false, + include_metadata: true, + jvm_options: vec![], + enable_connection_pool: false, + max_pool_size: 10, + min_idle: 2, + connection_timeout_ms: 30000, + }; + let source = JdbcSource::new(1, config, Some(connector_state)); + let state = source.state.lock().unwrap(); + // Should fall back to default state + assert!(state.last_offset.is_none()); + assert_eq!(state.processed_rows, 0); + } + + #[test] + fn 
test_state_restoration_with_empty_bytes_falls_back_to_default() { + let connector_state = ConnectorState(vec![]); + + let config = JdbcSourceConfig { + jdbc_url: SecretString::from("jdbc:h2:mem:test"), + driver_class: "org.h2.Driver".to_string(), + driver_jar_path: "/tmp/h2.jar".to_string(), + username: None, + password: None, + query: "SELECT 1".to_string(), + poll_interval: Duration::from_secs(10), + batch_size: 100, + tracking_column: None, + initial_offset: None, + mode: Mode::Bulk, + snake_case_columns: false, + include_metadata: true, + jvm_options: vec![], + enable_connection_pool: false, + max_pool_size: 10, + min_idle: 2, + connection_timeout_ms: 30000, + }; + let source = JdbcSource::new(1, config, Some(connector_state)); + let state = source.state.lock().unwrap(); + assert!(state.last_offset.is_none()); + assert_eq!(state.processed_rows, 0); + } + + #[test] + fn test_state_restoration_none_uses_initial_offset() { + let config = JdbcSourceConfig { + jdbc_url: SecretString::from("jdbc:h2:mem:test"), + driver_class: "org.h2.Driver".to_string(), + driver_jar_path: "/tmp/h2.jar".to_string(), + username: None, + password: None, + query: "SELECT * FROM orders WHERE id > {last_offset}".to_string(), + poll_interval: Duration::from_secs(10), + batch_size: 100, + tracking_column: Some("id".to_string()), + initial_offset: Some("100".to_string()), + mode: Mode::Incremental, + snake_case_columns: false, + include_metadata: true, + jvm_options: vec![], + enable_connection_pool: false, + max_pool_size: 10, + min_idle: 2, + connection_timeout_ms: 30000, + }; + let source = JdbcSource::new(1, config, None); + let state = source.state.lock().unwrap(); + assert_eq!(state.last_offset, Some("100".to_string())); + assert_eq!(state.processed_rows, 0); + } + + #[test] + fn test_state_restoration_none_without_initial_offset() { + let config = JdbcSourceConfig { + jdbc_url: SecretString::from("jdbc:h2:mem:test"), + driver_class: "org.h2.Driver".to_string(), + driver_jar_path: 
"/tmp/h2.jar".to_string(), + username: None, + password: None, + query: "SELECT * FROM products".to_string(), + poll_interval: Duration::from_secs(60), + batch_size: 100, + tracking_column: None, + initial_offset: None, + mode: Mode::Bulk, + snake_case_columns: false, + include_metadata: true, + jvm_options: vec![], + enable_connection_pool: false, + max_pool_size: 10, + min_idle: 2, + connection_timeout_ms: 30000, + }; + let source = JdbcSource::new(1, config, None); + let state = source.state.lock().unwrap(); + assert!(state.last_offset.is_none()); + assert_eq!(state.processed_rows, 0); + } + + // ========================================================================= + // extract_offset_value tests + // ========================================================================= + + #[test] + fn test_extract_offset_value_with_integer() { + let config = JdbcSourceConfig { + jdbc_url: SecretString::from("jdbc:h2:mem:test"), + driver_class: "org.h2.Driver".to_string(), + driver_jar_path: "/tmp/h2.jar".to_string(), + username: None, + password: None, + query: "SELECT 1".to_string(), + poll_interval: Duration::from_secs(10), + batch_size: 100, + tracking_column: None, + initial_offset: None, + mode: Mode::Bulk, + snake_case_columns: false, + include_metadata: true, + jvm_options: vec![], + enable_connection_pool: false, + max_pool_size: 10, + min_idle: 2, + connection_timeout_ms: 30000, + }; + let source = JdbcSource::new(1, config, None); + + let mut row = serde_json::Map::new(); + row.insert("id".to_string(), serde_json::json!(42)); + assert_eq!(source.extract_offset_value(&row, "id"), "42"); + } + + #[test] + fn test_extract_offset_value_with_string() { + let config = JdbcSourceConfig { + jdbc_url: SecretString::from("jdbc:h2:mem:test"), + driver_class: "org.h2.Driver".to_string(), + driver_jar_path: "/tmp/h2.jar".to_string(), + username: None, + password: None, + query: "SELECT 1".to_string(), + poll_interval: Duration::from_secs(10), + batch_size: 100, + 
tracking_column: None, + initial_offset: None, + mode: Mode::Bulk, + snake_case_columns: false, + include_metadata: true, + jvm_options: vec![], + enable_connection_pool: false, + max_pool_size: 10, + min_idle: 2, + connection_timeout_ms: 30000, + }; + let source = JdbcSource::new(1, config, None); + + let mut row = serde_json::Map::new(); + row.insert( + "updated_at".to_string(), + serde_json::json!("2024-06-15 12:00:00"), + ); + assert_eq!( + source.extract_offset_value(&row, "updated_at"), + "2024-06-15 12:00:00" + ); + } + + #[test] + fn test_extract_offset_value_with_float() { + let config = JdbcSourceConfig { + jdbc_url: SecretString::from("jdbc:h2:mem:test"), + driver_class: "org.h2.Driver".to_string(), + driver_jar_path: "/tmp/h2.jar".to_string(), + username: None, + password: None, + query: "SELECT 1".to_string(), + poll_interval: Duration::from_secs(10), + batch_size: 100, + tracking_column: None, + initial_offset: None, + mode: Mode::Bulk, + snake_case_columns: false, + include_metadata: true, + jvm_options: vec![], + enable_connection_pool: false, + max_pool_size: 10, + min_idle: 2, + connection_timeout_ms: 30000, + }; + let source = JdbcSource::new(1, config, None); + + let mut row = serde_json::Map::new(); + row.insert("version".to_string(), serde_json::json!(3.5)); + assert_eq!(source.extract_offset_value(&row, "version"), "3.5"); + } + + #[test] + fn test_extract_offset_value_with_null() { + let config = JdbcSourceConfig { + jdbc_url: SecretString::from("jdbc:h2:mem:test"), + driver_class: "org.h2.Driver".to_string(), + driver_jar_path: "/tmp/h2.jar".to_string(), + username: None, + password: None, + query: "SELECT 1".to_string(), + poll_interval: Duration::from_secs(10), + batch_size: 100, + tracking_column: None, + initial_offset: None, + mode: Mode::Bulk, + snake_case_columns: false, + include_metadata: true, + jvm_options: vec![], + enable_connection_pool: false, + max_pool_size: 10, + min_idle: 2, + connection_timeout_ms: 30000, + }; + let 
source = JdbcSource::new(1, config, None); + + let mut row = serde_json::Map::new(); + row.insert("id".to_string(), serde_json::Value::Null); + assert_eq!(source.extract_offset_value(&row, "id"), "null"); + } + + #[test] + fn test_extract_offset_value_missing_column() { + let config = JdbcSourceConfig { + jdbc_url: SecretString::from("jdbc:h2:mem:test"), + driver_class: "org.h2.Driver".to_string(), + driver_jar_path: "/tmp/h2.jar".to_string(), + username: None, + password: None, + query: "SELECT 1".to_string(), + poll_interval: Duration::from_secs(10), + batch_size: 100, + tracking_column: None, + initial_offset: None, + mode: Mode::Bulk, + snake_case_columns: false, + include_metadata: true, + jvm_options: vec![], + enable_connection_pool: false, + max_pool_size: 10, + min_idle: 2, + connection_timeout_ms: 30000, + }; + let source = JdbcSource::new(1, config, None); + + let row = serde_json::Map::new(); + assert_eq!(source.extract_offset_value(&row, "nonexistent"), ""); + } + + // ========================================================================= + // build_query edge case tests + // ========================================================================= + + #[test] + fn test_build_query_incremental_no_offset_no_initial_removes_where_clause() { + let config = JdbcSourceConfig { + jdbc_url: SecretString::from("jdbc:h2:mem:test"), + driver_class: "org.h2.Driver".to_string(), + driver_jar_path: "/tmp/h2.jar".to_string(), + username: None, + password: None, + query: "SELECT * FROM users WHERE {tracking_column} > {last_offset} ORDER BY id" + .to_string(), + poll_interval: Duration::from_secs(10), + batch_size: 100, + tracking_column: Some("id".to_string()), + initial_offset: None, + mode: Mode::Incremental, + snake_case_columns: false, + include_metadata: true, + jvm_options: vec![], + enable_connection_pool: false, + max_pool_size: 10, + min_idle: 2, + connection_timeout_ms: 30000, + }; + let source = JdbcSource::new(1, config, None); + let state = State { + 
last_offset: None, + processed_rows: 0, + last_poll_time: Utc::now(), + }; + let query = source.build_query(&state); + // The WHERE clause placeholder should be removed + assert!( + !query.contains("{last_offset}"), + "Query should not contain unresolved placeholder: {}", + query + ); + } + + #[test] + fn test_build_query_bulk_mode_ignores_offset() { + let config = JdbcSourceConfig { + jdbc_url: SecretString::from("jdbc:h2:mem:test"), + driver_class: "org.h2.Driver".to_string(), + driver_jar_path: "/tmp/h2.jar".to_string(), + username: None, + password: None, + query: "SELECT * FROM users WHERE id > {last_offset}".to_string(), + poll_interval: Duration::from_secs(10), + batch_size: 100, + tracking_column: Some("id".to_string()), + initial_offset: Some("0".to_string()), + mode: Mode::Bulk, + snake_case_columns: false, + include_metadata: true, + jvm_options: vec![], + enable_connection_pool: false, + max_pool_size: 10, + min_idle: 2, + connection_timeout_ms: 30000, + }; + let source = JdbcSource::new(1, config, None); + let state = State { + last_offset: Some("42".to_string()), + processed_rows: 42, + last_poll_time: Utc::now(), + }; + let query = source.build_query(&state); + // In bulk mode, offset placeholders are NOT replaced + assert!( + query.contains("{last_offset}"), + "Bulk mode should not replace offset placeholders: {}", + query + ); + } + + // ========================================================================= + // Mode enum tests + // ========================================================================= + + #[test] + fn test_mode_serialization_roundtrip() { + let incremental = Mode::Incremental; + let serialized = serde_json::to_string(&incremental).unwrap(); + assert_eq!(serialized, r#""incremental""#); + let deserialized: Mode = serde_json::from_str(&serialized).unwrap(); + assert_eq!(deserialized, Mode::Incremental); + + let bulk = Mode::Bulk; + let serialized = serde_json::to_string(&bulk).unwrap(); + assert_eq!(serialized, r#""bulk""#); + 
let deserialized: Mode = serde_json::from_str(&serialized).unwrap(); + assert_eq!(deserialized, Mode::Bulk); + } + + #[test] + fn test_mode_deserialization_rejects_unknown() { + let result = serde_json::from_str::(r#""streaming""#); + assert!( + result.is_err(), + "Unknown mode 'streaming' should fail deserialization" + ); + } + + // ========================================================================= + // Debug impl tests (ensures secrets are not leaked) + // ========================================================================= + + #[test] + fn test_config_debug_does_not_leak_password() { + let config = JdbcSourceConfig { + jdbc_url: SecretString::from("jdbc:mysql://root:SuperSecret@localhost/db"), + driver_class: "com.mysql.cj.jdbc.Driver".to_string(), + driver_jar_path: "/tmp/mysql.jar".to_string(), + username: Some("admin".to_string()), + password: Some(SecretString::from("MyP@ssw0rd")), + query: "SELECT 1".to_string(), + poll_interval: Duration::from_secs(10), + batch_size: 100, + tracking_column: None, + initial_offset: None, + mode: Mode::Bulk, + snake_case_columns: false, + include_metadata: true, + jvm_options: vec![], + enable_connection_pool: false, + max_pool_size: 10, + min_idle: 2, + connection_timeout_ms: 30000, + }; + + let debug_output = format!("{:?}", config); + assert!( + !debug_output.contains("SuperSecret"), + "Debug output should not contain JDBC URL password: {}", + debug_output + ); + assert!( + !debug_output.contains("MyP@ssw0rd"), + "Debug output should not contain password field: {}", + debug_output + ); + assert!( + debug_output.contains("***"), + "Debug output should contain masked password: {}", + debug_output + ); + } + + #[test] + fn test_config_debug_without_password() { + let config = JdbcSourceConfig { + jdbc_url: SecretString::from("jdbc:h2:mem:test"), + driver_class: "org.h2.Driver".to_string(), + driver_jar_path: "/tmp/h2.jar".to_string(), + username: None, + password: None, + query: "SELECT 1".to_string(), + 
poll_interval: Duration::from_secs(10), + batch_size: 100, + tracking_column: None, + initial_offset: None, + mode: Mode::Bulk, + snake_case_columns: false, + include_metadata: true, + jvm_options: vec![], + enable_connection_pool: false, + max_pool_size: 10, + min_idle: 2, + connection_timeout_ms: 30000, + }; + + let debug_output = format!("{:?}", config); + // Should not panic and should contain the struct name + assert!(debug_output.contains("JdbcSourceConfig")); + } +} diff --git a/core/integration/tests/connectors/jdbc/config_postgres.toml b/core/integration/tests/connectors/jdbc/config_postgres.toml new file mode 100644 index 0000000000..19097f5988 --- /dev/null +++ b/core/integration/tests/connectors/jdbc/config_postgres.toml @@ -0,0 +1,22 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +# JDBC Source Connector Runtime Configuration for Postgres + +[connectors] +config_type = "local" +config_dir = "tests/connectors/jdbc/connectors_config_postgres" diff --git a/core/integration/tests/connectors/jdbc/config_sink_postgres.toml b/core/integration/tests/connectors/jdbc/config_sink_postgres.toml new file mode 100644 index 0000000000..f4a61aa484 --- /dev/null +++ b/core/integration/tests/connectors/jdbc/config_sink_postgres.toml @@ -0,0 +1,22 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# JDBC Sink Connector Runtime Configuration for Postgres + +[connectors] +config_type = "local" +config_dir = "tests/connectors/jdbc/connectors_config_sink_postgres" diff --git a/core/integration/tests/connectors/jdbc/connectors_config_postgres/jdbc_pg.toml b/core/integration/tests/connectors/jdbc/connectors_config_postgres/jdbc_pg.toml new file mode 100644 index 0000000000..e8ba3d57e9 --- /dev/null +++ b/core/integration/tests/connectors/jdbc/connectors_config_postgres/jdbc_pg.toml @@ -0,0 +1,50 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# JDBC Source Connector for PostgreSQL + +type = "source" +key = "jdbc_pg" +enabled = true +version = 0 +name = "JDBC PostgreSQL Source" +path = "../../target/debug/libiggy_connector_jdbc_source" + +[[streams]] +stream = "test_stream" +topic = "test_topic" +schema = "json" +partition_id = 1 + +[plugin_config] +# All required fields - values will be overridden by environment variables +# Use placeholder values that match the expected types +jdbc_url = "" +driver_class = "" +driver_jar_path = "" +username = "" +password = "" +query = "" +poll_interval = "1s" +batch_size = 100 +mode = "bulk" +# Used only by incremental-mode tests; ignored in bulk mode. +tracking_column = "id" +initial_offset = "0" +enable_connection_pool = false +snake_case_columns = false +include_metadata = true diff --git a/core/integration/tests/connectors/jdbc/connectors_config_sink_postgres/jdbc_sink_pg.toml b/core/integration/tests/connectors/jdbc/connectors_config_sink_postgres/jdbc_sink_pg.toml new file mode 100644 index 0000000000..f04e0042cc --- /dev/null +++ b/core/integration/tests/connectors/jdbc/connectors_config_sink_postgres/jdbc_sink_pg.toml @@ -0,0 +1,46 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# JDBC Sink Connector for PostgreSQL + +type = "sink" +key = "jdbc_sink_pg" +enabled = true +version = 0 +name = "JDBC PostgreSQL Sink" +path = "../../target/debug/libiggy_connector_jdbc_sink" + +[[streams]] +stream = "test_stream" +topics = ["test_topic"] +schema = "json" + +[plugin_config] +# All values are overridden at test time via environment variables. +jdbc_url = "" +driver_class = "" +driver_jar_path = "" +username = "" +password = "" +target_table = "" +auto_create_table = true +payload_format = "json" +batch_size = 100 +include_metadata = true +include_checksum = true +include_origin_timestamp = true +enable_connection_pool = false diff --git a/core/integration/tests/connectors/jdbc/mod.rs b/core/integration/tests/connectors/jdbc/mod.rs new file mode 100644 index 0000000000..60064394a9 --- /dev/null +++ b/core/integration/tests/connectors/jdbc/mod.rs @@ -0,0 +1,22 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// JDBC connector tests. +// Source PostgreSQL tests: test_with_postgres.rs +// Sink PostgreSQL tests: test_sink_with_postgres.rs +mod test_sink_with_postgres; +mod test_with_postgres; diff --git a/core/integration/tests/connectors/jdbc/test_sink_with_postgres.rs b/core/integration/tests/connectors/jdbc/test_sink_with_postgres.rs new file mode 100644 index 0000000000..68470b8f25 --- /dev/null +++ b/core/integration/tests/connectors/jdbc/test_sink_with_postgres.rs @@ -0,0 +1,283 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use crate::connectors::{ConnectorsRuntime, IggySetup, setup_runtime}; +use bytes::Bytes; +use iggy::prelude::IggyMessage; +use serial_test::serial; +use sqlx::Row; +use sqlx::postgres::PgPoolOptions; +use std::collections::HashMap; +use std::time::Duration; +use testcontainers_modules::postgres::Postgres; +use testcontainers_modules::testcontainers::ContainerAsync; +use testcontainers_modules::testcontainers::runners::AsyncRunner; +use tokio::time::sleep; +use tracing::info; + +const POSTGRES_USER: &str = "postgres"; +const POSTGRES_PASSWORD: &str = "postgres"; +const POSTGRES_DB: &str = "postgres"; +const SINK_TABLE: &str = "iggy_sink_events"; + +/// Maximum number of attempts before giving up waiting for sink rows. +const POLL_ATTEMPTS: usize = 30; +/// Delay between attempts. +const POLL_INTERVAL: Duration = Duration::from_millis(500); + +/// Start a Postgres container and return the container, the JDBC URL (for the +/// connector), the sqlx URL (for verification) and the driver JAR path. +async fn setup_postgres_container() -> Result< + (ContainerAsync, String, String, String), + Box, +> { + info!("Starting Postgres container for JDBC sink testing..."); + + let postgres = Postgres::default().start().await?; + let host = postgres.get_host().await?; + let port = postgres.get_host_port_ipv4(5432).await?; + + let jdbc_url = format!("jdbc:postgresql://{host}:{port}/{POSTGRES_DB}"); + let sqlx_url = + format!("postgres://{POSTGRES_USER}:{POSTGRES_PASSWORD}@{host}:{port}/{POSTGRES_DB}"); + let driver_jar = get_postgres_driver_jar().await?; + + info!("Postgres container started at {host}:{port}"); + Ok((postgres, jdbc_url, sqlx_url, driver_jar)) +} + +/// Get the PostgreSQL JDBC driver, downloading (and caching) it if necessary. 
+async fn get_postgres_driver_jar() -> Result> { + let target_dir = std::env::var("CARGO_TARGET_DIR").unwrap_or_else(|_| "target".to_string()); + let jdbc_test_dir = format!("{target_dir}/test-jdbc-drivers"); + let jar_path = format!("{jdbc_test_dir}/postgresql-42.7.1.jar"); + + std::fs::create_dir_all(&jdbc_test_dir)?; + + if std::path::Path::new(&jar_path).exists() { + info!("PostgreSQL JDBC driver found at {jar_path}"); + return Ok(std::fs::canonicalize(&jar_path)? + .to_string_lossy() + .to_string()); + } + + info!("Downloading PostgreSQL JDBC driver..."); + let download_url = "https://jdbc.postgresql.org/download/postgresql-42.7.1.jar"; + let response = reqwest::get(download_url).await?; + if !response.status().is_success() { + return Err(format!("Failed to download driver: HTTP {}", response.status()).into()); + } + let bytes = response.bytes().await?; + std::fs::write(&jar_path, bytes)?; + + info!("PostgreSQL JDBC driver downloaded to {jar_path}"); + Ok(std::fs::canonicalize(&jar_path)? + .to_string_lossy() + .to_string()) +} + +/// Build the environment variable overrides for the JDBC Postgres sink connector. +/// The stream/topic the sink consumes from is fixed in the connector TOML; only +/// the dynamic connection settings need to be injected here. 
+fn build_jdbc_sink_env(jdbc_url: &str, driver_jar: &str) -> HashMap { + let prefix = "IGGY_CONNECTORS_SINK_JDBC_SINK_PG"; + let mut envs = HashMap::new(); + + let mut set = |suffix: &str, value: &str| { + envs.insert(format!("{prefix}_{suffix}"), value.to_owned()); + }; + + set("PLUGIN_CONFIG_JDBC_URL", jdbc_url); + set("PLUGIN_CONFIG_DRIVER_CLASS", "org.postgresql.Driver"); + set("PLUGIN_CONFIG_DRIVER_JAR_PATH", driver_jar); + set("PLUGIN_CONFIG_USERNAME", POSTGRES_USER); + set("PLUGIN_CONFIG_PASSWORD", POSTGRES_PASSWORD); + set("PLUGIN_CONFIG_TARGET_TABLE", SINK_TABLE); + set("PLUGIN_CONFIG_AUTO_CREATE_TABLE", "true"); + set("PLUGIN_CONFIG_PAYLOAD_FORMAT", "json"); + set("PLUGIN_CONFIG_BATCH_SIZE", "100"); + set("PLUGIN_CONFIG_INCLUDE_METADATA", "true"); + + envs +} + +/// Start the connectors runtime with the JDBC Postgres sink configured. +async fn setup_jdbc_postgres_sink( + jdbc_url: &str, + driver_jar: &str, +) -> Result< + (ConnectorsRuntime, crate::connectors::ConnectorsIggyClient), + Box, +> { + let iggy_setup = IggySetup::default(); + let envs = build_jdbc_sink_env(jdbc_url, driver_jar); + + let mut runtime = setup_runtime(); + runtime + .init("jdbc/config_sink_postgres.toml", Some(envs), iggy_setup) + .await; + + let client = runtime.create_client().await; + Ok((runtime, client)) +} + +/// Build JSON `IggyMessage`s with sequential ids from the given payloads. +fn build_json_messages(payloads: &[serde_json::Value]) -> Vec { + payloads + .iter() + .enumerate() + .map(|(i, payload)| { + let bytes = serde_json::to_vec(payload).expect("Failed to serialize payload"); + IggyMessage::builder() + .id((i + 1) as u128) + .payload(Bytes::from(bytes)) + .build() + .expect("Failed to build message") + }) + .collect() +} + +/// Poll the sink table until `expected` rows appear (or attempts are exhausted), +/// returning (iggy_offset, iggy_stream, iggy_topic, payload-as-json) per row. 
+async fn wait_for_rows( + sqlx_url: &str, + expected: usize, +) -> Vec<(i64, String, String, serde_json::Value)> { + let pool = PgPoolOptions::new() + .max_connections(2) + .connect(sqlx_url) + .await + .expect("Failed to connect to Postgres for verification"); + + let query = format!( + "SELECT iggy_offset, iggy_stream, iggy_topic, payload FROM {SINK_TABLE} ORDER BY iggy_offset" + ); + + for _ in 0..POLL_ATTEMPTS { + // The table may not exist yet until the sink's open() runs. + // Query string is built only from a compile-time constant table name. + if let Ok(rows) = sqlx::query(sqlx::AssertSqlSafe(query.clone())) + .fetch_all(&pool) + .await + && rows.len() >= expected + { + return rows + .iter() + .map(|row| { + let offset: i64 = row.get("iggy_offset"); + let stream: String = row.get("iggy_stream"); + let topic: String = row.get("iggy_topic"); + let payload: String = row.get("payload"); + let value: serde_json::Value = + serde_json::from_str(&payload).expect("Stored payload is not valid JSON"); + (offset, stream, topic, value) + }) + .collect(); + } + sleep(POLL_INTERVAL).await; + } + + Vec::new() +} + +/// Test: JSON messages produced to Iggy are written as rows in Postgres by the +/// JDBC sink, with payload and metadata columns populated. 
+#[tokio::test] +#[serial] +async fn test_jdbc_postgres_sink_writes_rows() { + let (_container, jdbc_url, sqlx_url, driver_jar) = match setup_postgres_container().await { + Ok(result) => result, + Err(e) => { + eprintln!("Skipping test: Failed to setup Postgres: {e}"); + return; + } + }; + + let (_runtime, client) = setup_jdbc_postgres_sink(&jdbc_url, &driver_jar) + .await + .expect("Failed to setup sink runtime"); + + let payloads = vec![ + serde_json::json!({"id": 1, "name": "alice", "active": true}), + serde_json::json!({"id": 2, "name": "bob", "active": false}), + serde_json::json!({"id": 3, "name": "carol", "active": true}), + ]; + let mut messages = build_json_messages(&payloads); + + client + .send_messages(&mut messages) + .await + .expect("Failed to send messages to Iggy"); + + info!("Waiting for the JDBC sink to write rows to Postgres..."); + let rows = wait_for_rows(&sqlx_url, payloads.len()).await; + + assert_eq!( + rows.len(), + payloads.len(), + "Expected {} rows written to Postgres by the JDBC sink, got {}", + payloads.len(), + rows.len() + ); + + for (i, (offset, stream, topic, payload)) in rows.iter().enumerate() { + assert_eq!(*offset, i as i64, "Offset mismatch at row {i}"); + assert_eq!(stream, DEFAULT_STREAM, "Stream mismatch at row {i}"); + assert_eq!(topic, DEFAULT_TOPIC, "Topic mismatch at row {i}"); + assert_eq!(payload, &payloads[i], "Payload mismatch at row {i}"); + } +} + +/// Test: a single message round-trips through the sink with the correct value. 
+#[tokio::test] +#[serial] +async fn test_jdbc_postgres_sink_single_message() { + let (_container, jdbc_url, sqlx_url, driver_jar) = match setup_postgres_container().await { + Ok(result) => result, + Err(e) => { + eprintln!("Skipping test: Failed to setup Postgres: {e}"); + return; + } + }; + + let (_runtime, client) = setup_jdbc_postgres_sink(&jdbc_url, &driver_jar) + .await + .expect("Failed to setup sink runtime"); + + let payloads = vec![serde_json::json!({"answer": 42})]; + let mut messages = build_json_messages(&payloads); + client + .send_messages(&mut messages) + .await + .expect("Failed to send message to Iggy"); + + let rows = wait_for_rows(&sqlx_url, 1).await; + assert_eq!( + rows.len(), + 1, + "Expected exactly 1 row written by the JDBC sink" + ); + assert_eq!( + rows[0].3.get("answer").and_then(|v| v.as_i64()), + Some(42), + "Expected answer=42 in stored payload" + ); +} + +// Matches IggySetup::default() (DEFAULT_TEST_STREAM / DEFAULT_TEST_TOPIC). +const DEFAULT_STREAM: &str = "test_stream"; +const DEFAULT_TOPIC: &str = "test_topic"; diff --git a/core/integration/tests/connectors/jdbc/test_with_postgres.rs b/core/integration/tests/connectors/jdbc/test_with_postgres.rs new file mode 100644 index 0000000000..56d2ef649d --- /dev/null +++ b/core/integration/tests/connectors/jdbc/test_with_postgres.rs @@ -0,0 +1,513 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::connectors::{ConnectorsRuntime, IggySetup, setup_runtime}; +use serial_test::serial; +use sqlx::postgres::PgPoolOptions; +use std::collections::HashMap; +use std::time::Duration; +use testcontainers_modules::postgres::Postgres; +use testcontainers_modules::testcontainers::ContainerAsync; +use testcontainers_modules::testcontainers::runners::AsyncRunner; +use tokio::time::sleep; +use tracing::info; + +const POSTGRES_USER: &str = "postgres"; +const POSTGRES_PASSWORD: &str = "postgres"; +const POSTGRES_DB: &str = "postgres"; + +/// Maximum number of poll attempts before giving up +const POLL_ATTEMPTS: usize = 30; +/// Delay between poll attempts +const POLL_INTERVAL: Duration = Duration::from_millis(500); + +/// Setup Postgres container with test data +async fn setup_postgres_container() +-> Result<(ContainerAsync, String, String), Box> { + info!("Starting Postgres container for JDBC testing..."); + + let postgres = Postgres::default().start().await?; + + let host = postgres.get_host().await?; + let port = postgres.get_host_port_ipv4(5432).await?; + let jdbc_url: String = format!("jdbc:postgresql://{}:{}/{}", host, port, POSTGRES_DB); + + let postgres_jar: String = get_postgres_driver_jar().await?; + + info!("Postgres container started at {}:{}", host, port); + Ok((postgres, jdbc_url, postgres_jar)) +} + +/// Get PostgreSQL JDBC driver, downloading if necessary +async fn get_postgres_driver_jar() -> Result> { + let target_dir = std::env::var("CARGO_TARGET_DIR").unwrap_or_else(|_| "target".to_string()); + let jdbc_test_dir = 
format!("{}/test-jdbc-drivers", target_dir); + let jar_path = format!("{}/postgresql-42.7.1.jar", jdbc_test_dir); + + std::fs::create_dir_all(&jdbc_test_dir)?; + + if std::path::Path::new(&jar_path).exists() { + info!("PostgreSQL JDBC driver found at {}", jar_path); + let absolute_path = std::fs::canonicalize(&jar_path)? + .to_string_lossy() + .to_string(); + return Ok(absolute_path); + } + + info!("Downloading PostgreSQL JDBC driver..."); + let download_url = "https://jdbc.postgresql.org/download/postgresql-42.7.1.jar"; + + let response = reqwest::get(download_url).await?; + if !response.status().is_success() { + return Err(format!("Failed to download driver: HTTP {}", response.status()).into()); + } + + let bytes = response.bytes().await?; + std::fs::write(&jar_path, bytes)?; + + info!("PostgreSQL JDBC driver downloaded to {}", jar_path); + let absolute_path = std::fs::canonicalize(&jar_path)? + .to_string_lossy() + .to_string(); + Ok(absolute_path) +} + +/// Build the environment variables for a JDBC Postgres source connector. 
+fn build_jdbc_env( + jdbc_url: &str, + postgres_jar: &str, + query: &str, + mode: &str, + iggy_setup: &IggySetup, +) -> HashMap { + let mut envs = HashMap::new(); + + envs.insert( + "IGGY_CONNECTORS_SOURCE_JDBC_PG_PLUGIN_CONFIG_JDBC_URL".to_owned(), + jdbc_url.to_owned(), + ); + envs.insert( + "IGGY_CONNECTORS_SOURCE_JDBC_PG_PLUGIN_CONFIG_DRIVER_CLASS".to_owned(), + "org.postgresql.Driver".to_owned(), + ); + envs.insert( + "IGGY_CONNECTORS_SOURCE_JDBC_PG_PLUGIN_CONFIG_DRIVER_JAR_PATH".to_owned(), + postgres_jar.to_owned(), + ); + envs.insert( + "IGGY_CONNECTORS_SOURCE_JDBC_PG_PLUGIN_CONFIG_USERNAME".to_owned(), + POSTGRES_USER.to_owned(), + ); + envs.insert( + "IGGY_CONNECTORS_SOURCE_JDBC_PG_PLUGIN_CONFIG_PASSWORD".to_owned(), + POSTGRES_PASSWORD.to_owned(), + ); + envs.insert( + "IGGY_CONNECTORS_SOURCE_JDBC_PG_PLUGIN_CONFIG_QUERY".to_owned(), + query.to_owned(), + ); + envs.insert( + "IGGY_CONNECTORS_SOURCE_JDBC_PG_PLUGIN_CONFIG_POLL_INTERVAL".to_owned(), + "1s".to_owned(), + ); + envs.insert( + "IGGY_CONNECTORS_SOURCE_JDBC_PG_PLUGIN_CONFIG_BATCH_SIZE".to_owned(), + "100".to_owned(), + ); + envs.insert( + "IGGY_CONNECTORS_SOURCE_JDBC_PG_PLUGIN_CONFIG_MODE".to_owned(), + mode.to_owned(), + ); + envs.insert( + "IGGY_CONNECTORS_SOURCE_JDBC_PG_PLUGIN_CONFIG_ENABLE_CONNECTION_POOL".to_owned(), + "false".to_owned(), + ); + envs.insert( + "IGGY_CONNECTORS_SOURCE_JDBC_PG_PLUGIN_CONFIG_SNAKE_CASE_COLUMNS".to_owned(), + "false".to_owned(), + ); + envs.insert( + "IGGY_CONNECTORS_SOURCE_JDBC_PG_PLUGIN_CONFIG_INCLUDE_METADATA".to_owned(), + "true".to_owned(), + ); + + // Stream configuration + envs.insert( + "IGGY_CONNECTORS_SOURCE_JDBC_PG_STREAMS_0_STREAM".to_owned(), + iggy_setup.stream.to_owned(), + ); + envs.insert( + "IGGY_CONNECTORS_SOURCE_JDBC_PG_STREAMS_0_TOPIC".to_owned(), + iggy_setup.topic.to_owned(), + ); + envs.insert( + "IGGY_CONNECTORS_SOURCE_JDBC_PG_STREAMS_0_SCHEMA".to_owned(), + "json".to_owned(), + ); + + envs +} + +/// Poll messages from Iggy with retry 
logic, returning deserialized JSON values. +async fn poll_messages_with_retry( + client: &crate::connectors::ConnectorsIggyClient, + expected_count: usize, +) -> Vec { + let mut received: Vec = Vec::new(); + for attempt in 0..POLL_ATTEMPTS { + let polled_messages = client + .get_messages() + .await + .expect("Failed to poll messages"); + + for msg in &polled_messages.messages { + if let Ok(value) = serde_json::from_slice::(&msg.payload) { + received.push(value); + } + } + + if received.len() >= expected_count { + info!( + "Received {} messages after {} attempts", + received.len(), + attempt + 1 + ); + return received; + } + + sleep(POLL_INTERVAL).await; + } + + received +} + +/// Setup connector runtime with JDBC source for Postgres +async fn setup_jdbc_postgres_source( + jdbc_url: &str, + postgres_jar: &str, + query: &str, + mode: &str, +) -> Result< + (ConnectorsRuntime, crate::connectors::ConnectorsIggyClient), + Box, +> { + let iggy_setup = IggySetup::default(); + let envs = build_jdbc_env(jdbc_url, postgres_jar, query, mode, &iggy_setup); + + let mut runtime = setup_runtime(); + runtime + .init("jdbc/config_postgres.toml", Some(envs), iggy_setup) + .await; + + let client = runtime.create_client().await; + Ok((runtime, client)) +} + +/// Test: basic bulk mode query produces messages with correct structure +#[tokio::test] +#[serial] +async fn test_jdbc_postgres_source_basic() { + let (_postgres_container, jdbc_url, postgres_jar) = match setup_postgres_container().await { + Ok(result) => result, + Err(e) => { + eprintln!("Skipping test: Failed to setup Postgres: {}", e); + return; + } + }; + + let query = "SELECT 1 as id, 'test' as name"; + let (_runtime, client) = setup_jdbc_postgres_source(&jdbc_url, &postgres_jar, query, "bulk") + .await + .expect("Failed to setup runtime"); + + info!("Waiting for JDBC connector to poll from Postgres..."); + let messages = poll_messages_with_retry(&client, 1).await; + + assert!( + !messages.is_empty(), + "Expected at least 1 
message from JDBC Postgres source" + ); + + // Verify message structure: should have metadata wrapping (include_metadata=true) + let first = &messages[0]; + assert!( + first.get("data").is_some(), + "Expected 'data' field in message (include_metadata=true), got: {}", + first + ); + assert_eq!( + first.get("operation_type").and_then(|v| v.as_str()), + Some("SELECT"), + "Expected operation_type=SELECT" + ); + + // Verify the actual data content + let data = first.get("data").unwrap(); + assert_eq!( + data.get("id").and_then(|v| v.as_i64()), + Some(1), + "Expected id=1 in data" + ); + assert_eq!( + data.get("name").and_then(|v| v.as_str()), + Some("test"), + "Expected name='test' in data" + ); +} + +/// Test: bulk mode with multiple rows from an actual table +#[tokio::test] +#[serial] +async fn test_jdbc_postgres_source_multiple_rows() { + let (postgres_container, jdbc_url, postgres_jar) = match setup_postgres_container().await { + Ok(result) => result, + Err(e) => { + eprintln!("Skipping test: Failed to setup Postgres: {}", e); + return; + } + }; + + // Use a multi-row SELECT to simulate table data without needing DDL + let query = r#" + SELECT * FROM (VALUES + (1, 'alice', true), + (2, 'bob', false), + (3, 'carol', true) + ) AS t(id, name, active) + "#; + + let (_runtime, client) = setup_jdbc_postgres_source(&jdbc_url, &postgres_jar, query, "bulk") + .await + .expect("Failed to setup runtime"); + + info!("Waiting for JDBC connector to poll multiple rows..."); + let messages = poll_messages_with_retry(&client, 3).await; + + assert!( + messages.len() >= 3, + "Expected at least 3 messages, got {}", + messages.len() + ); + + // Verify each row has the expected structure + for msg in &messages[..3] { + let data = msg.get("data").expect("Missing 'data' field"); + assert!(data.get("id").is_some(), "Missing 'id' column in row data"); + assert!( + data.get("name").is_some(), + "Missing 'name' column in row data" + ); + assert!( + data.get("active").is_some(), + "Missing 
'active' column in row data" + ); + } + + // Verify specific values for the first row + let first_data = messages[0].get("data").unwrap(); + assert_eq!(first_data.get("id").and_then(|v| v.as_i64()), Some(1)); + assert_eq!( + first_data.get("name").and_then(|v| v.as_str()), + Some("alice") + ); + + // Keep container alive until assertions complete + drop(postgres_container); +} + +/// Test: message contains timestamp field when metadata is enabled +#[tokio::test] +#[serial] +async fn test_jdbc_postgres_source_metadata_fields() { + let (_postgres_container, jdbc_url, postgres_jar) = match setup_postgres_container().await { + Ok(result) => result, + Err(e) => { + eprintln!("Skipping test: Failed to setup Postgres: {}", e); + return; + } + }; + + let query = "SELECT 42 as value"; + let (_runtime, client) = setup_jdbc_postgres_source(&jdbc_url, &postgres_jar, query, "bulk") + .await + .expect("Failed to setup runtime"); + + let messages = poll_messages_with_retry(&client, 1).await; + assert!(!messages.is_empty(), "Expected at least 1 message"); + + let msg = &messages[0]; + + // Verify all metadata fields are present + assert!( + msg.get("timestamp").is_some(), + "Missing 'timestamp' metadata field" + ); + assert!( + msg.get("operation_type").is_some(), + "Missing 'operation_type' metadata field" + ); + assert!(msg.get("data").is_some(), "Missing 'data' metadata field"); + + // table_name should be null for SELECT queries without a specific table + // (this is expected behavior for computed queries) + assert!( + msg.get("table_name").is_some(), + "Missing 'table_name' metadata field" + ); +} + +/// Derive a sqlx (`postgres://`) URL from the connector's JDBC URL so the test +/// can seed the table the source reads from. 
+fn pg_sqlx_url(jdbc_url: &str) -> String { + let host_and_db = jdbc_url + .strip_prefix("jdbc:postgresql://") + .unwrap_or(jdbc_url); + format!("postgres://{POSTGRES_USER}:{POSTGRES_PASSWORD}@{host_and_db}") +} + +/// Collect the `data.id` integer from each polled (metadata-wrapped) message. +fn collect_ids(messages: &[serde_json::Value]) -> Vec { + messages + .iter() + .filter_map(|m| { + m.get("data") + .and_then(|d| d.get("id")) + .and_then(|v| v.as_i64()) + }) + .collect() +} + +/// Test: incremental mode advances its tracking offset across polls; newly +/// inserted rows are delivered exactly once and previously read rows are not +/// re-delivered. +#[tokio::test] +#[serial] +async fn test_jdbc_postgres_source_incremental_advances_offset() { + let (_container, jdbc_url, postgres_jar) = match setup_postgres_container().await { + Ok(result) => result, + Err(e) => { + eprintln!("Skipping test: Failed to setup Postgres: {e}"); + return; + } + }; + + // Seed a real table BEFORE the source starts polling. + let pool = PgPoolOptions::new() + .max_connections(2) + .connect(&pg_sqlx_url(&jdbc_url)) + .await + .expect("Failed to connect to Postgres for seeding"); + sqlx::query("CREATE TABLE inc_test (id INT PRIMARY KEY, name TEXT)") + .execute(&pool) + .await + .expect("Failed to create table"); + sqlx::query("INSERT INTO inc_test (id, name) VALUES (1, 'a'), (2, 'b'), (3, 'c')") + .execute(&pool) + .await + .expect("Failed to insert initial rows"); + + let query = "SELECT id, name FROM inc_test WHERE id > {last_offset} ORDER BY id"; + let (_runtime, client) = + setup_jdbc_postgres_source(&jdbc_url, &postgres_jar, query, "incremental") + .await + .expect("Failed to setup runtime"); + + // First batch: ids 1..3. 
+ let first = poll_messages_with_retry(&client, 3).await; + let mut first_ids = collect_ids(&first); + first_ids.sort_unstable(); + assert_eq!(first_ids, vec![1, 2, 3], "Expected ids 1,2,3 on first poll"); + + // Insert more rows; only these (id > last_offset) should arrive next. + sqlx::query("INSERT INTO inc_test (id, name) VALUES (4, 'd'), (5, 'e')") + .execute(&pool) + .await + .expect("Failed to insert additional rows"); + + let second = poll_messages_with_retry(&client, 2).await; + let mut second_ids = collect_ids(&second); + second_ids.sort_unstable(); + assert_eq!( + second_ids, + vec![4, 5], + "Expected only the new ids 4,5 (offset must have advanced past 3), got {second_ids:?}" + ); +} + +/// Test: a single poll over many rows succeeds. This exercises the JNI +/// local-reference frame management in `read_rows`: a few-hundred-row result set +/// creates hundreds of per-column local references in one native call, which +/// would overflow the JNI local reference table (and abort the JVM) if each row +/// were not read inside its own local frame. 
+#[tokio::test] +#[serial] +async fn test_jdbc_postgres_source_large_result_set() { + let (_container, jdbc_url, postgres_jar) = match setup_postgres_container().await { + Ok(result) => result, + Err(e) => { + eprintln!("Skipping test: Failed to setup Postgres: {e}"); + return; + } + }; + + let pool = PgPoolOptions::new() + .max_connections(2) + .connect(&pg_sqlx_url(&jdbc_url)) + .await + .expect("Failed to connect to Postgres for seeding"); + sqlx::query("CREATE TABLE big_test (id INT PRIMARY KEY, name TEXT, val NUMERIC(12,2))") + .execute(&pool) + .await + .expect("Failed to create table"); + sqlx::query( + "INSERT INTO big_test (id, name, val) \ + SELECT g, 'row_' || g, (g * 1.5)::numeric(12,2) FROM generate_series(1, 300) g", + ) + .execute(&pool) + .await + .expect("Failed to insert rows"); + + // batch_size well above the row count so the whole table is read in a single + // poll (one read_rows call → hundreds of local refs). + let iggy_setup = IggySetup::default(); + let query = "SELECT id, name, val FROM big_test ORDER BY id"; + let mut envs = build_jdbc_env(&jdbc_url, &postgres_jar, query, "bulk", &iggy_setup); + envs.insert( + "IGGY_CONNECTORS_SOURCE_JDBC_PG_PLUGIN_CONFIG_BATCH_SIZE".to_owned(), + "5000".to_owned(), + ); + + let mut runtime = setup_runtime(); + runtime + .init("jdbc/config_postgres.toml", Some(envs), iggy_setup) + .await; + let client = runtime.create_client().await; + + // The runtime would have crashed on the oversized poll without per-row local + // frames; receiving a healthy batch of well-formed messages proves it did not. 
+ let messages = poll_messages_with_retry(&client, 150).await; + assert!( + messages.len() >= 150, + "Expected the source to stream a large result set without crashing; got {} messages", + messages.len() + ); + for msg in &messages[..150] { + let data = msg.get("data").expect("Missing 'data' field"); + assert!(data.get("id").and_then(|v| v.as_i64()).is_some()); + assert!(data.get("name").and_then(|v| v.as_str()).is_some()); + } +} diff --git a/core/integration/tests/connectors/mod.rs b/core/integration/tests/connectors/mod.rs index fc624f897f..a877b15e11 100644 --- a/core/integration/tests/connectors/mod.rs +++ b/core/integration/tests/connectors/mod.rs @@ -24,6 +24,7 @@ mod http; mod http_config_provider; mod iceberg; mod influxdb; +mod jdbc; mod mongodb; mod postgres; mod quickwit; @@ -32,8 +33,44 @@ mod random_source_liveness; mod runtime; mod stdout; -use iggy_common::IggyTimestamp; +use iggy::prelude::{IggyClient, IggyMessage, Partitioning}; +use iggy_common::Client; +use iggy_common::{ + CompressionAlgorithm, IggyExpiry, IggyTimestamp, MaxTopicSize, MessageClient, PolledMessages, + StreamClient, TopicClient, +}; +use integration::harness::{ConnectorsRuntimeConfig, IpAddrKind, TestHarness, TestServerConfig}; use serde::{Deserialize, Serialize}; +use std::collections::HashMap; + +const DEFAULT_TEST_STREAM: &str = "test_stream"; +const DEFAULT_TEST_TOPIC: &str = "test_topic"; + +fn setup_runtime() -> ConnectorsRuntime { + ConnectorsRuntime { + harness: TestHarness::builder() + .server( + TestServerConfig::builder() + .ip_kind(IpAddrKind::V4) + .quic_enabled(false) + .http_enabled(false) + .websocket_enabled(false) + .extra_envs(HashMap::from([ + // The harness pre-reserves a fixed TCP port (see PortReserver), + // so the server binds a non-zero port. 
That relies on the server + // writing current_config.toml on bind regardless of how the port + // was chosen (see tcp_listener.rs); the harness reads that file to + // discover the bound address before it considers startup complete. + ("IGGY_TCP_ADDRESS".to_owned(), "127.0.0.1:0".to_owned()), + ])) + .build(), + ) + .build() + .unwrap(), + stream: "".to_owned(), + topic: "".to_owned(), + } +} const ONE_DAY_MICROS: u64 = 24 * 60 * 60 * 1_000_000; @@ -60,3 +97,154 @@ pub fn create_test_messages(count: usize) -> Vec { }) .collect() } + +#[derive(Debug)] +struct ConnectorsRuntime { + stream: String, + topic: String, + harness: TestHarness, +} + +#[derive(Debug)] +struct ConnectorsIggyClient { + stream: String, + topic: String, + client: IggyClient, +} + +impl ConnectorsIggyClient { + /// Send messages to the configured stream/topic (used by sink connector tests). + #[allow(dead_code)] + async fn send_messages( + &self, + messages: &mut [IggyMessage], + ) -> Result<(), iggy_common::IggyError> { + self.client + .send_messages( + &self.stream.clone().try_into().unwrap(), + &self.topic.clone().try_into().unwrap(), + &Partitioning::balanced(), + messages, + ) + .await + } + + async fn get_messages(&self) -> Result { + self.client + .poll_messages( + &self.stream.clone().try_into().unwrap(), + &self.topic.clone().try_into().unwrap(), + None, + &iggy_common::Consumer::new("test_consumer".try_into().unwrap()), + &iggy_common::PollingStrategy::next(), + 10, + true, + ) + .await + } +} + +#[derive(Debug)] +pub struct IggySetup { + pub stream: String, + pub topic: String, +} + +impl Default for IggySetup { + fn default() -> Self { + Self { + stream: DEFAULT_TEST_STREAM.to_owned(), + topic: DEFAULT_TEST_TOPIC.to_owned(), + } + } +} + +impl ConnectorsRuntime { + pub async fn init( + &mut self, + config_path: &str, + envs: Option>, + iggy_setup: IggySetup, + ) { + let config_path = format!("tests/connectors/{config_path}"); + let mut all_envs = HashMap::new(); + all_envs.insert( + 
"IGGY_CONNECTORS_CONFIG_PATH".to_owned(), + config_path.to_owned(), + ); + + if let Some(envs) = envs { + for (k, v) in envs { + all_envs.insert(k, v); + } + } + + // Start the iggy server + self.harness + .start() + .await + .expect("Failed to start test harness"); + + let client = self.create_iggy_client().await; + client + .create_stream(&iggy_setup.stream) + .await + .expect("Failed to create stream"); + let stream_id = iggy_setup + .stream + .clone() + .try_into() + .expect("Invalid stream name in Iggy setup"); + client + .create_topic( + &stream_id, + &iggy_setup.topic, + 1, + CompressionAlgorithm::None, + None, + IggyExpiry::ServerDefault, + MaxTopicSize::ServerDefault, + ) + .await + .expect("Failed to create topic"); + client.shutdown().await.expect("Failed to shutdown client"); + + let connectors_config = ConnectorsRuntimeConfig::builder() + .extra_envs(all_envs) + .build(); + + self.harness + .server_mut() + .set_connectors_runtime_config(connectors_config); + self.harness + .server_mut() + .start_dependents() + .await + .expect("Failed to start connectors runtime"); + + self.stream = iggy_setup.stream; + self.topic = iggy_setup.topic; + } + + pub async fn create_client(&self) -> ConnectorsIggyClient { + ConnectorsIggyClient { + stream: self.stream.clone(), + topic: self.topic.clone(), + client: self.create_iggy_client().await, + } + } + + async fn create_iggy_client(&self) -> IggyClient { + self.harness + .tcp_root_client() + .await + .expect("Failed to create root TCP client") + } + + pub fn connectors_api_address(&self) -> Option { + self.harness + .server() + .connectors_runtime() + .map(|cr| cr.http_address().to_string()) + } +} diff --git a/core/server/src/tcp/tcp_listener.rs b/core/server/src/tcp/tcp_listener.rs index 3f53e4e640..c805f28d10 100644 --- a/core/server/src/tcp/tcp_listener.rs +++ b/core/server/src/tcp/tcp_listener.rs @@ -73,11 +73,11 @@ pub async fn start( // Store bound address locally shard.tcp_bound_address.set(Some(actual_addr)); 
- if addr.port() == 0 { - // Notify config writer on shard 0 - let _ = shard.config_writer_notify.try_send(()); + // Always notify config writer so it can write the current_config.toml + let _ = shard.config_writer_notify.try_send(()); - // Broadcast to other shards for SO_REUSEPORT binding + if addr.port() == 0 { + // Broadcast to other shards for SO_REUSEPORT binding (only needed for dynamic ports) let event = ShardEvent::AddressBound { protocol: TransportProtocol::Tcp, address: actual_addr,