From bf850b361c9a29ac81b6d9901f486026cb5e059e Mon Sep 17 00:00:00 2001 From: Rajdeep Singh Date: Thu, 16 Apr 2026 17:02:24 +0530 Subject: [PATCH 1/7] Fixes #24348: Strip URL scheme from hostPort to prevent ValueError --- .../ingestion/models/custom_pydantic.py | 29 +++++-- ingestion/src/metadata/utils/db_utils.py | 55 +++++++++++- .../tests/unit/test_build_connection_url.py | 28 ++++++ ingestion/tests/unit/test_db_utils.py | 85 ++++++++++++++++++- 4 files changed, 187 insertions(+), 10 deletions(-) diff --git a/ingestion/src/metadata/ingestion/models/custom_pydantic.py b/ingestion/src/metadata/ingestion/models/custom_pydantic.py index 512dcd3580ef..e7ac7cf6f7c6 100644 --- a/ingestion/src/metadata/ingestion/models/custom_pydantic.py +++ b/ingestion/src/metadata/ingestion/models/custom_pydantic.py @@ -42,17 +42,30 @@ class BaseModel(PydanticBaseModel): def model_post_init(self, context: Any, /): """ - This function is used to parse the FilterPattern fields for the Connection classes. - This is needed because dict is defined in the JSON schema for the FilterPattern field, - but a FilterPattern object is required in the generated code. + Post-init hook for Connection classes: + - Sanitises ``hostPort`` by stripping accidental URL scheme prefixes. + - Converts raw ``dict`` values into ``FilterPattern`` objects. """ # pylint: disable=import-outside-toplevel + if not self.__class__.__name__.endswith("Connection"): + return + if not hasattr(self, "__pydantic_fields__"): + return + + if "hostPort" in self.__pydantic_fields__: + raw = getattr(self, "hostPort", None) + if isinstance(raw, str) and "://" in raw: + try: + from metadata.utils.db_utils import clean_host_port + + object.__setattr__(self, "hostPort", clean_host_port(raw)) + except Exception: + logger.warning( + "Failed to clean hostPort '%s'; leaving as-is", + raw[:50], + ) + try: - if not self.__class__.__name__.endswith("Connection"): - # Only parse FilterPattern for Connection classes - return - if not hasattr(self, "__pydantic_fields__"): - return for field in self.__pydantic_fields__: if field.endswith("FilterPattern"): from metadata.generated.schema.type.filterPattern import ( diff --git a/ingestion/src/metadata/utils/db_utils.py b/ingestion/src/metadata/utils/db_utils.py index 8f31470f37ad..5d81bc8879d7 100644 --- a/ingestion/src/metadata/utils/db_utils.py +++ b/ingestion/src/metadata/utils/db_utils.py @@ -12,9 +12,11 @@ """ Helpers module for db sources """ + import time import traceback from typing import Iterable, List, Union +from urllib.parse import urlparse from metadata.generated.schema.api.lineage.addLineage import AddLineageRequest from metadata.generated.schema.entity.data.table import Table @@ -43,12 +45,63 @@ PUBLIC_SCHEMA = "public" +def clean_host_port(host_port: str) -> str: + """ + Strip URL scheme prefixes from a hostPort string. + + Users sometimes enter a full URL (e.g. 'http://localhost:3306') + instead of just 'localhost:3306'. This strips the scheme to avoid + ValueError when parsing host and port. + """ + host_port = host_port.strip() + if "://" not in host_port: + return host_port.rstrip("/") + + parsed = urlparse(host_port) + hostname = parsed.hostname or "" + safe_label = ( + f"{parsed.scheme}://{hostname}" + if parsed.scheme and hostname + else "URL with scheme" + ) + logger.warning( + "The hostPort '%s' contains a URL scheme. " + "Expected format is 'hostname[:port]' (e.g. 'localhost:3306'). " + "Stripping the scheme prefix.", + safe_label, + ) + try: + port = parsed.port + except ValueError as exc: + raise ValueError( + f"Invalid hostPort '{safe_label}'. Expected format is " + "'hostname[:port]' (e.g. 'localhost:3306')." + ) from exc + + if not hostname: + # urlparse couldn't extract hostname (e.g. jdbc:postgresql://host:5432) + # Fall back to stripping everything before the last :// + raw = host_port.rsplit("://", 1)[-1] + raw = raw.split("/", 1)[0] + raw = raw.split("?", 1)[0] + raw = raw.split("#", 1)[0] + if "@" in raw: + raw = raw.rsplit("@", 1)[-1] + return raw + + host = f"[{hostname}]" if ":" in hostname else hostname + return f"{host}:{port}" if port is not None else host + + def get_host_from_host_port(uri: str) -> str: """ if uri is like "localhost:9000" then return the host "localhost" """ - return uri.split(":")[0] + cleaned = clean_host_port(uri) + if cleaned.startswith("["): + return cleaned.split("]")[0] + "]" + return cleaned.split(":")[0] # pylint: disable=too-many-locals diff --git a/ingestion/tests/unit/test_build_connection_url.py b/ingestion/tests/unit/test_build_connection_url.py index 3ce1e58f9399..19179516d7f6 100644 --- a/ingestion/tests/unit/test_build_connection_url.py +++ b/ingestion/tests/unit/test_build_connection_url.py @@ -97,6 +97,34 @@ def test_get_connection_url_mysql(self): "mysql+pymysql://openmetadata_user:mocked_token@localhost:3306/openmetadata_db", ) + def test_get_connection_url_mysql_with_url_scheme(self): + """hostPort with http:// prefix should be cleaned automatically""" + connection = MysqlConnectionConfig( + username="openmetadata_user", + authType=BasicAuth(password="openmetadata_password"), + hostPort="http://localhost:3306", + databaseSchema="openmetadata_db", + ) + engine_connection = MySQLConnection(connection).client + self.assertEqual( + engine_connection.url.render_as_string(hide_password=False), + "mysql+pymysql://openmetadata_user:openmetadata_password@localhost:3306/openmetadata_db", + ) + + def test_get_connection_url_postgres_with_url_scheme(self): + """hostPort with https:// prefix should be cleaned automatically""" + connection = PostgresConnectionConfig( + username="openmetadata_user", + authType=BasicAuth(password="openmetadata_password"), + hostPort="https://localhost:5432", + database="openmetadata_db", + ) + engine_connection = PostgresConnection(connection).client + self.assertEqual( + engine_connection.url.render_as_string(hide_password=False), + "postgresql+psycopg2://openmetadata_user:openmetadata_password@localhost:5432/openmetadata_db", + ) + def test_get_connection_url_postgres(self): connection = PostgresConnectionConfig( username="openmetadata_user", diff --git a/ingestion/tests/unit/test_db_utils.py b/ingestion/tests/unit/test_db_utils.py index f0eb2d61ad1f..507b66470a0e 100644 --- a/ingestion/tests/unit/test_db_utils.py +++ b/ingestion/tests/unit/test_db_utils.py @@ -12,6 +12,7 @@ """ Unit tests for db_utils module """ + import uuid from copy import deepcopy from unittest import TestCase @@ -36,7 +37,11 @@ from metadata.ingestion.lineage.models import Dialect from metadata.ingestion.lineage.sql_lineage import search_cache from metadata.ingestion.source.models import TableView -from metadata.utils.db_utils import get_host_from_host_port, get_view_lineage +from metadata.utils.db_utils import ( + clean_host_port, + get_host_from_host_port, + get_view_lineage, +) # Mock LineageTable class to simulate collate_sqllineage.core.models.Table @@ -118,6 +123,84 @@ def test_get_host_from_host_port(self): self.assertEqual(get_host_from_host_port("localhost"), "localhost") self.assertEqual(get_host_from_host_port("example.com"), "example.com") + # Test with URL scheme prefixes + self.assertEqual(get_host_from_host_port("http://localhost:3306"), "localhost") + self.assertEqual( + get_host_from_host_port("https://example.com:5432"), "example.com" + ) + self.assertEqual(get_host_from_host_port("http://localhost"), "localhost") + + # Test with IPv6 addresses + self.assertEqual(get_host_from_host_port("http://[::1]:3306"), "[::1]") + self.assertEqual(get_host_from_host_port("[::1]:3306"), "[::1]") + + def test_clean_host_port(self): + """Test clean_host_port strips URL scheme prefixes""" + # Already-clean values pass through unchanged + self.assertEqual(clean_host_port("localhost:3306"), "localhost:3306") + self.assertEqual(clean_host_port("127.0.0.1:5432"), "127.0.0.1:5432") + self.assertEqual(clean_host_port("example.com"), "example.com") + + # HTTP prefix is stripped + self.assertEqual(clean_host_port("http://localhost:3306"), "localhost:3306") + self.assertEqual(clean_host_port("http://example.com:8080"), "example.com:8080") + + # HTTPS prefix is stripped + self.assertEqual(clean_host_port("https://localhost:5432"), "localhost:5432") + self.assertEqual( + clean_host_port("https://mydb.example.com:3306"), "mydb.example.com:3306" + ) + + # Trailing slash is stripped + self.assertEqual(clean_host_port("http://localhost:3306/"), "localhost:3306") + + # Host only with scheme + self.assertEqual(clean_host_port("http://localhost"), "localhost") + self.assertEqual(clean_host_port("https://example.com"), "example.com") + + # URL with path is handled — path/query/fragment are discarded + self.assertEqual(clean_host_port("http://localhost:3306/db"), "localhost:3306") + self.assertEqual( + clean_host_port("https://example.com:5432/mydb?ssl=true"), + "example.com:5432", + ) + + # Whitespace is stripped + self.assertEqual(clean_host_port(" localhost:3306 "), "localhost:3306") + self.assertEqual(clean_host_port(" http://localhost:3306 "), "localhost:3306") + + # JDBC-style URLs fall back to raw extraction + self.assertEqual(clean_host_port("jdbc:postgresql://host:5432"), "host:5432") + self.assertEqual(clean_host_port("jdbc:postgresql://host:5432/db"), "host:5432") + self.assertEqual( + clean_host_port("jdbc:postgresql://host:5432?ssl=true"), "host:5432" + ) + self.assertEqual( + clean_host_port("jdbc:postgresql://host:5432/db?ssl=true#ref"), + "host:5432", + ) + + # IPv6 addresses — brackets are preserved + self.assertEqual(clean_host_port("http://[::1]:3306"), "[::1]:3306") + self.assertEqual(clean_host_port("https://[::1]:5432"), "[::1]:5432") + self.assertEqual(clean_host_port("http://[::1]"), "[::1]") + self.assertEqual( + clean_host_port("http://[2001:db8::1]:3306"), "[2001:db8::1]:3306" + ) + + # Plain IPv6 without scheme passes through unchanged + self.assertEqual(clean_host_port("[::1]:3306"), "[::1]:3306") + + # JDBC with userinfo — credentials are stripped + self.assertEqual( + clean_host_port("jdbc:postgresql://user:pass@host:5432/db"), + "host:5432", + ) + + # Invalid port raises ValueError + with self.assertRaises(ValueError): + clean_host_port("http://localhost:abc") + @patch("metadata.utils.db_utils.ConnectionTypeDialectMapper") @patch("metadata.utils.db_utils.fqn") def test_get_view_lineage_success_with_lineage_parser( From 267c821637f54cc69bfbcd2afeb8d536a4dde013 Mon Sep 17 00:00:00 2001 From: Rajdeep Singh Date: Fri, 17 Apr 2026 07:12:43 +0530 Subject: [PATCH 2/7] Address gitar-bot review: let ValueError propagate from clean_host_port The broad 'except Exception' in model_post_init() was swallowing the intentional ValueError that clean_host_port raises for invalid ports (e.g. 'http://localhost:abc'), leaving a malformed hostPort on the connection object and turning a clear user error into an obscure downstream failure. The local import of clean_host_port is from a sibling module that always exists, so the try/except was unnecessary defence. Removed it so pydantic surfaces validation errors where they belong. --- .../metadata/ingestion/models/custom_pydantic.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/ingestion/src/metadata/ingestion/models/custom_pydantic.py b/ingestion/src/metadata/ingestion/models/custom_pydantic.py index e7ac7cf6f7c6..33ac440b9a1b 100644 --- a/ingestion/src/metadata/ingestion/models/custom_pydantic.py +++ b/ingestion/src/metadata/ingestion/models/custom_pydantic.py @@ -55,15 +55,12 @@ def model_post_init(self, context: Any, /): if "hostPort" in self.__pydantic_fields__: raw = getattr(self, "hostPort", None) if isinstance(raw, str) and "://" in raw: - try: - from metadata.utils.db_utils import clean_host_port - - object.__setattr__(self, "hostPort", clean_host_port(raw)) - except Exception: - logger.warning( - "Failed to clean hostPort '%s'; leaving as-is", - raw[:50], - ) + from metadata.utils.db_utils import clean_host_port + + # Let ValueError propagate: if clean_host_port cannot parse + # the input (e.g. non-numeric port), the user must fix their + # config rather than silently getting a broken hostPort. + object.__setattr__(self, "hostPort", clean_host_port(raw)) try: for field in self.__pydantic_fields__: From 162313519638a23d67c511b7ae7327f973a1c173 Mon Sep 17 00:00:00 2001 From: Rajdeep Singh Date: Fri, 17 Apr 2026 07:44:20 +0530 Subject: [PATCH 3/7] fix(ingestion): keep custom_pydantic dependency-free to avoid circular import The model_post_init hook added in the previous commit imported `clean_host_port` from `metadata.utils.db_utils` at runtime. That module has a heavy dependency graph (OpenMetadata client, LineageParser, generated schemas), so every *Connection class construction would drag it in and cause circular-import failures during integration test bootstrap. The file docstring explicitly requires BaseModel to be 'self-sufficient with only pydantic at import time', so the hostPort sanitisation is now handled by a stdlib-only helper (_strip_hostport_scheme) colocated with BaseModel. `clean_host_port` remains the public API in db_utils and delegates to the same helper, preserving behaviour for all existing callers and unit tests. --- .../ingestion/models/custom_pydantic.py | 60 +++++++++++++++++-- ingestion/src/metadata/utils/db_utils.py | 47 +++------------ 2 files changed, 62 insertions(+), 45 deletions(-) diff --git a/ingestion/src/metadata/ingestion/models/custom_pydantic.py b/ingestion/src/metadata/ingestion/models/custom_pydantic.py index 33ac440b9a1b..fb4564264b6a 100644 --- a/ingestion/src/metadata/ingestion/models/custom_pydantic.py +++ b/ingestion/src/metadata/ingestion/models/custom_pydantic.py @@ -18,6 +18,7 @@ import json import logging from typing import Any, Callable, Dict, Literal, Optional, Union +from urllib.parse import urlparse from pydantic import BaseModel as PydanticBaseModel from pydantic import WrapSerializer, model_validator @@ -34,6 +35,55 @@ JSON_ENCODERS = "json_encoders" +def _strip_hostport_scheme(raw: str) -> str: + """ + Strip an accidental URL scheme from a hostPort string. + + Self-contained helper that depends only on the standard library so + ``model_post_init`` never drags heavy ``metadata.*`` imports into the + bootstrap path of every generated Connection class. + + Raises ValueError if the scheme carries a non-numeric port so the user + gets a clear error instead of a silently broken hostPort. + """ + value = raw.strip() + if "://" not in value: + return value + + parsed = urlparse(value) + hostname = parsed.hostname or "" + safe_label = ( + f"{parsed.scheme}://{hostname}" + if parsed.scheme and hostname + else "URL with scheme" + ) + logger.warning( + "The hostPort '%s' contains a URL scheme. Expected format is " + "'hostname[:port]' (e.g. 'localhost:3306'). Stripping the scheme prefix.", + safe_label, + ) + try: + port = parsed.port + except ValueError as exc: + raise ValueError( + f"Invalid hostPort '{safe_label}'. Expected format is " + "'hostname[:port]' (e.g. 'localhost:3306')." + ) from exc + + if not hostname: + # urlparse couldn't extract a hostname (e.g. 'jdbc:postgresql://host:5432/db') + # Fall back to stripping scheme and any trailing path/query/fragment/userinfo. + tail = value.rsplit("://", 1)[-1] + for sep in ("/", "?", "#"): + tail = tail.split(sep, 1)[0] + if "@" in tail: + tail = tail.rsplit("@", 1)[-1] + return tail + + host = f"[{hostname}]" if ":" in hostname else hostname + return f"{host}:{port}" if port is not None else host + + class BaseModel(PydanticBaseModel): """ Base model for OpenMetadata generated models. @@ -55,12 +105,10 @@ def model_post_init(self, context: Any, /): if "hostPort" in self.__pydantic_fields__: raw = getattr(self, "hostPort", None) if isinstance(raw, str) and "://" in raw: - from metadata.utils.db_utils import clean_host_port - - # Let ValueError propagate: if clean_host_port cannot parse - # the input (e.g. non-numeric port), the user must fix their - # config rather than silently getting a broken hostPort. - object.__setattr__(self, "hostPort", clean_host_port(raw)) + # Let ValueError propagate: if the hostPort cannot be parsed + # (e.g. non-numeric port), the user must fix their config + # rather than silently getting a broken hostPort. + object.__setattr__(self, "hostPort", _strip_hostport_scheme(raw)) try: for field in self.__pydantic_fields__: diff --git a/ingestion/src/metadata/utils/db_utils.py b/ingestion/src/metadata/utils/db_utils.py index 5d81bc8879d7..34ef0ad73737 100644 --- a/ingestion/src/metadata/utils/db_utils.py +++ b/ingestion/src/metadata/utils/db_utils.py @@ -16,7 +16,6 @@ import time import traceback from typing import Iterable, List, Union -from urllib.parse import urlparse from metadata.generated.schema.api.lineage.addLineage import AddLineageRequest from metadata.generated.schema.entity.data.table import Table @@ -34,6 +33,7 @@ get_lineage_by_query, get_lineage_via_table_entity, ) +from metadata.ingestion.models.custom_pydantic import _strip_hostport_scheme from metadata.ingestion.ometa.ometa_api import OpenMetadata from metadata.ingestion.source.models import TableView from metadata.utils import fqn @@ -52,45 +52,14 @@ def clean_host_port(host_port: str) -> str: Users sometimes enter a full URL (e.g. 'http://localhost:3306') instead of just 'localhost:3306'. This strips the scheme to avoid ValueError when parsing host and port. + + Delegates to the stdlib-only helper colocated with ``BaseModel`` so the + behaviour stays in lockstep with Pydantic's ``model_post_init`` hook. """ - host_port = host_port.strip() - if "://" not in host_port: - return host_port.rstrip("/") - - parsed = urlparse(host_port) - hostname = parsed.hostname or "" - safe_label = ( - f"{parsed.scheme}://{hostname}" - if parsed.scheme and hostname - else "URL with scheme" - ) - logger.warning( - "The hostPort '%s' contains a URL scheme. " - "Expected format is 'hostname[:port]' (e.g. 'localhost:3306'). " - "Stripping the scheme prefix.", - safe_label, - ) - try: - port = parsed.port - except ValueError as exc: - raise ValueError( - f"Invalid hostPort '{safe_label}'. Expected format is " - "'hostname[:port]' (e.g. 'localhost:3306')." - ) from exc - - if not hostname: - # urlparse couldn't extract hostname (e.g. jdbc:postgresql://host:5432) - # Fall back to stripping everything before the last :// - raw = host_port.rsplit("://", 1)[-1] - raw = raw.split("/", 1)[0] - raw = raw.split("?", 1)[0] - raw = raw.split("#", 1)[0] - if "@" in raw: - raw = raw.rsplit("@", 1)[-1] - return raw - - host = f"[{hostname}]" if ":" in hostname else hostname - return f"{host}:{port}" if port is not None else host + value = host_port.strip() + if "://" not in value: + return value.rstrip("/") + return _strip_hostport_scheme(value) def get_host_from_host_port(uri: str) -> str: From 2a9502050007f9aed420f0181318d6fbecf35db3 Mon Sep 17 00:00:00 2001 From: Rajdeep Singh Date: Mon, 20 Apr 2026 16:44:31 +0530 Subject: [PATCH 4/7] fix(ingestion): make strip_hostport_scheme public and validate JDBC port - Renames _strip_hostport_scheme to public strip_hostport_scheme so cross-module imports (db_utils.py) no longer depend on a private symbol; private alias kept for back-compat. - Validates the port in the JDBC-style fallback branch so inputs like 'jdbc:postgresql://host:abc/db' raise ValueError with a clear message, matching the docstring contract instead of silently passing a broken hostPort downstream. - Extends test_clean_host_port with the new fallback raise cases. --- .../ingestion/models/custom_pydantic.py | 28 +++++++++++++++++-- ingestion/src/metadata/utils/db_utils.py | 4 +-- ingestion/tests/unit/test_db_utils.py | 6 ++++ 3 files changed, 33 insertions(+), 5 deletions(-) diff --git a/ingestion/src/metadata/ingestion/models/custom_pydantic.py b/ingestion/src/metadata/ingestion/models/custom_pydantic.py index fb4564264b6a..97031662c348 100644 --- a/ingestion/src/metadata/ingestion/models/custom_pydantic.py +++ b/ingestion/src/metadata/ingestion/models/custom_pydantic.py @@ -35,7 +35,7 @@ JSON_ENCODERS = "json_encoders" -def _strip_hostport_scheme(raw: str) -> str: +def strip_hostport_scheme(raw: str) -> str: """ Strip an accidental URL scheme from a hostPort string. @@ -44,7 +44,9 @@ def _strip_hostport_scheme(raw: str) -> str: bootstrap path of every generated Connection class. Raises ValueError if the scheme carries a non-numeric port so the user - gets a clear error instead of a silently broken hostPort. + gets a clear error instead of a silently broken hostPort. This applies to + both standard URLs handled by ``urlparse`` and to JDBC-style URLs handled + by the fallback branch (e.g. ``jdbc:postgresql://host:abc/db``). """ value = raw.strip() if "://" not in value: @@ -78,12 +80,32 @@ def _strip_hostport_scheme(raw: str) -> str: tail = tail.split(sep, 1)[0] if "@" in tail: tail = tail.rsplit("@", 1)[-1] + + # Validate the port in the fallback path so the same ValueError + # contract holds for JDBC-style URLs (e.g. 'jdbc:postgresql://host:abc'). + fallback_port: Optional[str] = None + if tail.startswith("["): + closing = tail.find("]") + if closing != -1 and len(tail) > closing + 1 and tail[closing + 1] == ":": + fallback_port = tail[closing + 2 :] + elif ":" in tail: + fallback_port = tail.rsplit(":", 1)[1] + if fallback_port and not fallback_port.isdigit(): + raise ValueError( + f"Invalid hostPort '{safe_label}'. Expected format is " + "'hostname[:port]' (e.g. 'localhost:3306')." + ) return tail host = f"[{hostname}]" if ":" in hostname else hostname return f"{host}:{port}" if port is not None else host +# Backwards-compatible private alias retained for any internal callers that +# pinned to the original underscored symbol while the helper was private. +_strip_hostport_scheme = strip_hostport_scheme + + class BaseModel(PydanticBaseModel): """ Base model for OpenMetadata generated models. @@ -108,7 +130,7 @@ def model_post_init(self, context: Any, /): # Let ValueError propagate: if the hostPort cannot be parsed # (e.g. non-numeric port), the user must fix their config # rather than silently getting a broken hostPort. - object.__setattr__(self, "hostPort", _strip_hostport_scheme(raw)) + object.__setattr__(self, "hostPort", strip_hostport_scheme(raw)) try: for field in self.__pydantic_fields__: diff --git a/ingestion/src/metadata/utils/db_utils.py b/ingestion/src/metadata/utils/db_utils.py index 34ef0ad73737..5ba6a557212a 100644 --- a/ingestion/src/metadata/utils/db_utils.py +++ b/ingestion/src/metadata/utils/db_utils.py @@ -33,7 +33,7 @@ get_lineage_by_query, get_lineage_via_table_entity, ) -from metadata.ingestion.models.custom_pydantic import _strip_hostport_scheme +from metadata.ingestion.models.custom_pydantic import strip_hostport_scheme from metadata.ingestion.ometa.ometa_api import OpenMetadata from metadata.ingestion.source.models import TableView from metadata.utils import fqn @@ -59,7 +59,7 @@ def clean_host_port(host_port: str) -> str: value = host_port.strip() if "://" not in value: return value.rstrip("/") - return _strip_hostport_scheme(value) + return strip_hostport_scheme(value) def get_host_from_host_port(uri: str) -> str: diff --git a/ingestion/tests/unit/test_db_utils.py b/ingestion/tests/unit/test_db_utils.py index 507b66470a0e..5484b5a47cfd 100644 --- a/ingestion/tests/unit/test_db_utils.py +++ b/ingestion/tests/unit/test_db_utils.py @@ -201,6 +201,12 @@ def test_clean_host_port(self): with self.assertRaises(ValueError): clean_host_port("http://localhost:abc") + # Non-numeric port in JDBC-style fallback also raises ValueError + with self.assertRaises(ValueError): + clean_host_port("jdbc:postgresql://host:abc/db") + with self.assertRaises(ValueError): + clean_host_port("jdbc:postgresql://[::1]:abc") + @patch("metadata.utils.db_utils.ConnectionTypeDialectMapper") @patch("metadata.utils.db_utils.fqn") def test_get_view_lineage_success_with_lineage_parser( From 96428c1c5885a05e42a5d24c9f0c70d040b67503 Mon Sep 17 00:00:00 2001 From: Rajdeep Singh Date: Sun, 26 Apr 2026 19:43:37 +0530 Subject: [PATCH 5/7] =?UTF-8?q?fix(ingestion):=20move=20hostPort=20scheme?= =?UTF-8?q?=20stripping=20to=20get=5Fconnection=5Furl=5Fcommon=20in=20buil?= =?UTF-8?q?ders.py=20=20The=20model=5Fpost=5Finit=20hook=20in=20custom=5Fp?= =?UTF-8?q?ydantic.py=20was=20too=20broad=20=E2=80=94=20it=20stripped=20UR?= =?UTF-8?q?L=20schemes=20from=20ALL=20*Connection=20classes=20including=20?= =?UTF-8?q?AirflowConnection=20and=20OpenMetadataConnection=20where=20http?= =?UTF-8?q?://localhost:8080=20is=20a=20legitimate=20value.=20This=20broke?= =?UTF-8?q?=20all=20py-tests=20integration=20tests.=20=20Fix:=20remove=20s?= =?UTF-8?q?tripping=20from=20model=5Fpost=5Finit;=20apply=20strip=5Fhostpo?= =?UTF-8?q?rt=5Fscheme()=20only=20in=20get=5Fconnection=5Furl=5Fcommon()?= =?UTF-8?q?=20in=20builders.py,=20which=20is=20called=20exclusively=20by?= =?UTF-8?q?=20database=20source=20connectors.=20=20Also=20address=20two=20?= =?UTF-8?q?Copilot=20review=20issues=20in=20strip=5Fhostport=5Fscheme:=20-?= =?UTF-8?q?=20safe=5Flabel=20now=20uses=20a=20sanitised=20tail=20for=20JDB?= =?UTF-8?q?C/empty-hostname=20inputs=20=20=20instead=20of=20the=20generic?= =?UTF-8?q?=20'URL=20with=20scheme'=20string=20-=20Empty=20tail=20after=20?= =?UTF-8?q?scheme=20strip=20raises=20ValueError=20instead=20of=20silently?= =?UTF-8?q?=20=20=20returning=20an=20empty=20hostPort=20-=20Fallback=20por?= =?UTF-8?q?t=20validation=20now=20uses=20urlparse('//tail')=20for=20consis?= =?UTF-8?q?tent=20=20=20IPv6=20handling,=20matching=20the=20Copilot=20sugg?= =?UTF-8?q?ested=20fix?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../ingestion/connections/builders.py | 3 +- .../ingestion/models/custom_pydantic.py | 89 ++++++++++--------- ingestion/tests/unit/test_db_utils.py | 4 + 3 files changed, 55 insertions(+), 41 deletions(-) diff --git a/ingestion/src/metadata/ingestion/connections/builders.py b/ingestion/src/metadata/ingestion/connections/builders.py index 449aec3e939e..7d01d26bd41b 100644 --- a/ingestion/src/metadata/ingestion/connections/builders.py +++ b/ingestion/src/metadata/ingestion/connections/builders.py @@ -33,6 +33,7 @@ from metadata.ingestion.connections.headers import inject_query_header_by_conn from metadata.ingestion.connections.query_logger import attach_query_tracker from metadata.ingestion.connections.secrets import connection_with_options_secrets +from metadata.ingestion.models.custom_pydantic import strip_hostport_scheme from metadata.utils.constants import BUILDER_PASSWORD_ATTR from metadata.utils.logger import cli_logger @@ -188,7 +189,7 @@ def get_connection_url_common(connection) -> str: url = _add_password(url, connection) url += "@" - url += connection.hostPort + url += strip_hostport_scheme(connection.hostPort) if hasattr(connection, "database"): url += f"/{connection.database}" if connection.database else "" diff --git a/ingestion/src/metadata/ingestion/models/custom_pydantic.py b/ingestion/src/metadata/ingestion/models/custom_pydantic.py index 97031662c348..21db145424a0 100644 --- a/ingestion/src/metadata/ingestion/models/custom_pydantic.py +++ b/ingestion/src/metadata/ingestion/models/custom_pydantic.py @@ -39,14 +39,15 @@ def strip_hostport_scheme(raw: str) -> str: """ Strip an accidental URL scheme from a hostPort string. - Self-contained helper that depends only on the standard library so - ``model_post_init`` never drags heavy ``metadata.*`` imports into the - bootstrap path of every generated Connection class. - - Raises ValueError if the scheme carries a non-numeric port so the user - gets a clear error instead of a silently broken hostPort. This applies to - both standard URLs handled by ``urlparse`` and to JDBC-style URLs handled - by the fallback branch (e.g. ``jdbc:postgresql://host:abc/db``). + Self-contained helper that depends only on the standard library so it + can be imported from ``builders.py`` and ``db_utils.py`` without circular + imports. + + Raises ValueError if the value cannot be resolved to a valid + ``hostname[:port]`` string — e.g. when the port is non-numeric or when + the hostname is empty after stripping the scheme. This applies to both + standard URLs handled by ``urlparse`` and to JDBC-style URLs handled by + the fallback branch (e.g. ``jdbc:postgresql://host:abc/db``). """ value = raw.strip() if "://" not in value: @@ -54,11 +55,19 @@ def strip_hostport_scheme(raw: str) -> str: parsed = urlparse(value) hostname = parsed.hostname or "" - safe_label = ( - f"{parsed.scheme}://{hostname}" - if parsed.scheme and hostname - else "URL with scheme" - ) + + # Build a sanitised label for log/error messages that identifies the + # problematic input without leaking credentials or a full path. + if parsed.scheme and hostname: + safe_label = f"{parsed.scheme}://{hostname}" + else: + # urlparse couldn't extract a hostname (e.g. jdbc:postgresql://…). + # Construct the label from the raw tail so messages remain actionable. + tail_for_label = value.rsplit("://", 1)[-1].split("/", 1)[0] + if "@" in tail_for_label: + tail_for_label = tail_for_label.rsplit("@", 1)[-1] + safe_label = tail_for_label or value + logger.warning( "The hostPort '%s' contains a URL scheme. Expected format is " "'hostname[:port]' (e.g. 'localhost:3306'). Stripping the scheme prefix.", @@ -73,24 +82,32 @@ def strip_hostport_scheme(raw: str) -> str: ) from exc if not hostname: - # urlparse couldn't extract a hostname (e.g. 'jdbc:postgresql://host:5432/db') - # Fall back to stripping scheme and any trailing path/query/fragment/userinfo. + # urlparse couldn't extract a hostname (e.g. 'jdbc:postgresql://host:5432/db'). + # Fall back to stripping the scheme and any trailing path/query/fragment/userinfo. tail = value.rsplit("://", 1)[-1] for sep in ("/", "?", "#"): tail = tail.split(sep, 1)[0] if "@" in tail: tail = tail.rsplit("@", 1)[-1] + if not tail: + raise ValueError( + f"Invalid hostPort '{safe_label}'. Expected format is " + "'hostname[:port]' (e.g. 'localhost:3306')." + ) + # Validate the port in the fallback path so the same ValueError - # contract holds for JDBC-style URLs (e.g. 'jdbc:postgresql://host:abc'). - fallback_port: Optional[str] = None - if tail.startswith("["): - closing = tail.find("]") - if closing != -1 and len(tail) > closing + 1 and tail[closing + 1] == ":": - fallback_port = tail[closing + 2 :] - elif ":" in tail: - fallback_port = tail.rsplit(":", 1)[1] - if fallback_port and not fallback_port.isdigit(): + # contract holds for JDBC-style URLs and other malformed inputs. + fallback_parsed = urlparse(f"//{tail}") + try: + fallback_hostname = fallback_parsed.hostname or "" + _ = fallback_parsed.port # raises ValueError for non-numeric ports + except ValueError as exc: + raise ValueError( + f"Invalid hostPort '{safe_label}'. Expected format is " + "'hostname[:port]' (e.g. 'localhost:3306')." + ) from exc + if not fallback_hostname: raise ValueError( f"Invalid hostPort '{safe_label}'. Expected format is " "'hostname[:port]' (e.g. 'localhost:3306')." @@ -114,25 +131,17 @@ class BaseModel(PydanticBaseModel): def model_post_init(self, context: Any, /): """ - Post-init hook for Connection classes: - - Sanitises ``hostPort`` by stripping accidental URL scheme prefixes. - - Converts raw ``dict`` values into ``FilterPattern`` objects. + This function is used to parse the FilterPattern fields for the Connection classes. + This is needed because dict is defined in the JSON schema for the FilterPattern field, + but a FilterPattern object is required in the generated code. """ # pylint: disable=import-outside-toplevel - if not self.__class__.__name__.endswith("Connection"): - return - if not hasattr(self, "__pydantic_fields__"): - return - - if "hostPort" in self.__pydantic_fields__: - raw = getattr(self, "hostPort", None) - if isinstance(raw, str) and "://" in raw: - # Let ValueError propagate: if the hostPort cannot be parsed - # (e.g. non-numeric port), the user must fix their config - # rather than silently getting a broken hostPort. - object.__setattr__(self, "hostPort", strip_hostport_scheme(raw)) - try: + if not self.__class__.__name__.endswith("Connection"): + # Only parse FilterPattern for Connection classes + return + if not hasattr(self, "__pydantic_fields__"): + return for field in self.__pydantic_fields__: if field.endswith("FilterPattern"): from metadata.generated.schema.type.filterPattern import ( diff --git a/ingestion/tests/unit/test_db_utils.py b/ingestion/tests/unit/test_db_utils.py index 5484b5a47cfd..d4124b153698 100644 --- a/ingestion/tests/unit/test_db_utils.py +++ b/ingestion/tests/unit/test_db_utils.py @@ -201,6 +201,10 @@ def test_clean_host_port(self): with self.assertRaises(ValueError): clean_host_port("http://localhost:abc") + # Empty host after scheme strip raises ValueError + with self.assertRaises(ValueError): + clean_host_port("http://") + # Non-numeric port in JDBC-style fallback also raises ValueError with self.assertRaises(ValueError): clean_host_port("jdbc:postgresql://host:abc/db") From d813bd6aca2d7026bc61928eee87717d8b94a35c Mon Sep 17 00:00:00 2001 From: Rajdeep Singh Date: Mon, 27 Apr 2026 02:16:10 +0530 Subject: [PATCH 6/7] fix(ingestion): protect all connectors from scheme-prefixed hostPort by stripping in model_post_init Addresses gitar-bot review feedback on PR #27191: moving strip_hostport_scheme to get_connection_url_common left several connectors that parse hostPort manually still vulnerable to the original crash (#24348): - cassandra/connection.py: host, port = connection.hostPort.split(':') - db2/connection.py: connection.hostPort.split(':')[0] - databricks/auth.py: connection.hostPort.split(':')[0] - redshift/connection.py: connection.hostPort.split(':')[0] - microsoftfabric/connection.py: connection.hostPort.split(':')[0] - builders.py (IAM auth): connection.hostPort.split(':') Restore strip_hostport_scheme call to model_post_init so scheme stripping happens at construction time for ALL *Connection classes, protecting every downstream split site automatically. Exclude AirflowConnection and OpenMetadataConnection via _HOSTPORT_URL_ALLOWED since their hostPort field legitimately stores a full URL (e.g. http://localhost:8080 and http://localhost:8585/api). The call in get_connection_url_common is kept for defence-in-depth; it is idempotent because strip_hostport_scheme short-circuits when '://' is absent. Add three new unit tests to test_connection_builders.py covering: - URL scheme stripping at model construction - ValueError raised for invalid port at construction time - URL-allowed connections are not stripped --- .../ingestion/models/custom_pydantic.py | 34 ++++++++-- .../tests/unit/test_connection_builders.py | 65 +++++++++++++++++++ 2 files changed, 94 insertions(+), 5 deletions(-) diff --git a/ingestion/src/metadata/ingestion/models/custom_pydantic.py b/ingestion/src/metadata/ingestion/models/custom_pydantic.py index 21db145424a0..90ff9bf74c6f 100644 --- a/ingestion/src/metadata/ingestion/models/custom_pydantic.py +++ b/ingestion/src/metadata/ingestion/models/custom_pydantic.py @@ -122,6 +122,13 @@ def strip_hostport_scheme(raw: str) -> str: # pinned to the original underscored symbol while the helper was private. _strip_hostport_scheme = strip_hostport_scheme +# Connection classes whose hostPort field legitimately stores a full URL +# (e.g. "http://airflow:8080" or "http://localhost:8585/api") and must NOT +# have their scheme stripped by model_post_init. +_HOSTPORT_URL_ALLOWED: frozenset = frozenset( + {"AirflowConnection", "OpenMetadataConnection"} +) + class BaseModel(PydanticBaseModel): """ @@ -134,14 +141,31 @@ def model_post_init(self, context: Any, /): This function is used to parse the FilterPattern fields for the Connection classes. This is needed because dict is defined in the JSON schema for the FilterPattern field, but a FilterPattern object is required in the generated code. + + Additionally, for Connection classes that store a plain ``hostname[:port]`` + in ``hostPort``, any accidental URL scheme prefix (e.g. ``http://``) is + stripped here so that all downstream connector code — including connectors + that call ``connection.hostPort.split(":")`` directly — receives a clean value. """ # pylint: disable=import-outside-toplevel + if not self.__class__.__name__.endswith("Connection"): + # Only process Connection classes + return + if not hasattr(self, "__pydantic_fields__"): + return + + # Strip accidental URL schemes from hostPort at model construction time. + # This protects connectors that split hostPort manually (e.g. Cassandra, + # Databricks, Redshift, DB2, Microsoft Fabric) without going through + # get_connection_url_common. ValueError for genuinely invalid inputs + # (e.g. non-numeric port) propagates immediately with a clear message. + if ( + hasattr(self, "hostPort") + and self.__class__.__name__ not in _HOSTPORT_URL_ALLOWED + ): + self.hostPort = strip_hostport_scheme(self.hostPort) + try: - if not self.__class__.__name__.endswith("Connection"): - # Only parse FilterPattern for Connection classes - return - if not hasattr(self, "__pydantic_fields__"): - return for field in self.__pydantic_fields__: if field.endswith("FilterPattern"): from metadata.generated.schema.type.filterPattern import ( diff --git a/ingestion/tests/unit/test_connection_builders.py b/ingestion/tests/unit/test_connection_builders.py index 611503b265f5..27a3790b50ae 100644 --- a/ingestion/tests/unit/test_connection_builders.py +++ b/ingestion/tests/unit/test_connection_builders.py @@ -19,11 +19,15 @@ from metadata.generated.schema.entity.services.connections.database.mysqlConnection import ( MysqlConnection, ) +from metadata.generated.schema.entity.services.connections.metadata.openMetadataConnection import ( + OpenMetadataConnection, +) from metadata.ingestion.connections.builders import ( get_connection_args_common, get_connection_options_dict, init_empty_connection_arguments, ) +from metadata.ingestion.models.custom_pydantic import _HOSTPORT_URL_ALLOWED class ConnectionBuilderTest(TestCase): @@ -79,3 +83,64 @@ def test_init_empty_connection_arguments(self): self.assertEqual(new_args.root.get("hello"), "world") self.assertIsNone(new_args.root.get("not there")) + + def test_model_post_init_strips_url_scheme_from_hostport(self): + """ + Verify that model_post_init strips URL schemes from hostPort at + construction time so connectors that call hostPort.split(":") directly + receive a clean hostname[:port] value. + """ + # http:// prefix should be stripped + conn = MysqlConnection( + username="user", + authType=BasicAuth(password="pass"), + hostPort="http://localhost:3306", + ) + self.assertEqual(conn.hostPort, "localhost:3306") + + # https:// prefix should be stripped + conn2 = MysqlConnection( + username="user", + authType=BasicAuth(password="pass"), + hostPort="https://myhost:5432", + ) + self.assertEqual(conn2.hostPort, "myhost:5432") + + # Already clean value should pass through unchanged + conn3 = MysqlConnection( + username="user", + authType=BasicAuth(password="pass"), + hostPort="localhost:3306", + ) + self.assertEqual(conn3.hostPort, "localhost:3306") + + def test_model_post_init_raises_for_invalid_port_in_hostport(self): + """ + Verify that a non-numeric port in a scheme-prefixed hostPort raises + at model construction time (fail-fast) rather than producing a + confusing error deep in connector code. + """ + with self.assertRaises(Exception): + MysqlConnection( + username="user", + authType=BasicAuth(password="pass"), + hostPort="http://localhost:abc", + ) + + def test_url_allowed_connections_not_stripped(self): + """ + Verify that connection classes in _HOSTPORT_URL_ALLOWED (AirflowConnection + and OpenMetadataConnection) are excluded from hostPort scheme stripping, + since their hostPort field legitimately stores a full URL. + """ + # Confirm the exclusion set contains the expected class names + self.assertIn("AirflowConnection", _HOSTPORT_URL_ALLOWED) + self.assertIn("OpenMetadataConnection", _HOSTPORT_URL_ALLOWED) + + # OpenMetadataConnection.hostPort is a plain str field that expects a + # full URL like "http://localhost:8585/api" — it must NOT be stripped. + ometa = OpenMetadataConnection( + hostPort="http://localhost:8585/api", + ) + self.assertIn("http", ometa.hostPort) + self.assertIn("localhost", ometa.hostPort) From ac816796dba5a22cd4fde7594cc7ee91ed836462 Mon Sep 17 00:00:00 2001 From: Rajdeep Singh Date: Mon, 27 Apr 2026 21:25:48 +0530 Subject: [PATCH 7/7] fix(ingestion): replace class-name allowlist with module-path check for hostPort stripping - Replace _HOSTPORT_URL_ALLOWED frozenset (only 2 entries) with _DATABASE_CONNECTION_MODULE_MARKER that checks the class module path. Only classes under .services.connections.database. are subject to scheme stripping; dashboard, pipeline, metadata, and all other connector categories pass through unchanged because their hostPort legitimately holds a full URL (e.g. http://grafana:3000). - Add isinstance(host_port, str) guard in model_post_init to prevent AttributeError when hostPort is Optional and left unset (None). - Update test_connection_builders.py: replace _HOSTPORT_URL_ALLOWED import with _DATABASE_CONNECTION_MODULE_MARKER, rename and update the allowlist test to reflect the new module-path strategy, and add a regression test for the None hostPort crash (CassandraConnection with default hostPort). Fixes gitar-bot Bug 1 (None crash) and Bug 2 (incomplete class-name allowlist that failed to cover non-database URL-based connectors). --- .../ingestion/connections/builders.py | 1 + .../ingestion/models/custom_pydantic.py | 34 ++++++++++------- .../tests/unit/test_connection_builders.py | 38 ++++++++++++++----- 3 files changed, 49 insertions(+), 24 deletions(-) diff --git a/ingestion/src/metadata/ingestion/connections/builders.py b/ingestion/src/metadata/ingestion/connections/builders.py index 7d01d26bd41b..32d9d4d8b2cd 100644 --- a/ingestion/src/metadata/ingestion/connections/builders.py +++ b/ingestion/src/metadata/ingestion/connections/builders.py @@ -12,6 +12,7 @@ """ Get and test connection utilities """ + from functools import partial from typing import Any, Callable, Dict, Optional from urllib.parse import quote_plus diff --git a/ingestion/src/metadata/ingestion/models/custom_pydantic.py b/ingestion/src/metadata/ingestion/models/custom_pydantic.py index 90ff9bf74c6f..ed0551ef96f1 100644 --- a/ingestion/src/metadata/ingestion/models/custom_pydantic.py +++ b/ingestion/src/metadata/ingestion/models/custom_pydantic.py @@ -15,6 +15,7 @@ dependencies against any other metadata package. This class should be self-sufficient with only pydantic at import time. """ + import json import logging from typing import Any, Callable, Dict, Literal, Optional, Union @@ -122,12 +123,12 @@ def strip_hostport_scheme(raw: str) -> str: # pinned to the original underscored symbol while the helper was private. _strip_hostport_scheme = strip_hostport_scheme -# Connection classes whose hostPort field legitimately stores a full URL -# (e.g. "http://airflow:8080" or "http://localhost:8585/api") and must NOT -# have their scheme stripped by model_post_init. -_HOSTPORT_URL_ALLOWED: frozenset = frozenset( - {"AirflowConnection", "OpenMetadataConnection"} -) +# Module sub-path that identifies database-connector classes. +# Only *Connection classes whose module contains this sub-path have a plain +# ``hostname[:port]`` hostPort — all other connection categories +# (dashboard, pipeline, search, metadata, …) legitimately store a full URL +# in hostPort and must NOT have their scheme stripped. +_DATABASE_CONNECTION_MODULE_MARKER = ".services.connections.database." class BaseModel(PydanticBaseModel): @@ -154,16 +155,21 @@ def model_post_init(self, context: Any, /): if not hasattr(self, "__pydantic_fields__"): return - # Strip accidental URL schemes from hostPort at model construction time. - # This protects connectors that split hostPort manually (e.g. Cassandra, - # Databricks, Redshift, DB2, Microsoft Fabric) without going through - # get_connection_url_common. ValueError for genuinely invalid inputs - # (e.g. non-numeric port) propagates immediately with a clear message. + # Strip accidental URL schemes from hostPort at model construction time, + # but ONLY for database-connector classes whose hostPort is a plain + # ``hostname[:port]``. Dashboard, pipeline, search, and other connector + # categories legitimately use a full URL in hostPort (e.g. + # ``http://grafana:3000``), so we restrict stripping to classes whose + # module path contains the database-connection marker. + # The isinstance(str) guard prevents an AttributeError when hostPort is + # optional and left unset (None). + host_port = getattr(self, "hostPort", None) if ( - hasattr(self, "hostPort") - and self.__class__.__name__ not in _HOSTPORT_URL_ALLOWED + isinstance(host_port, str) + and "://" in host_port + and _DATABASE_CONNECTION_MODULE_MARKER in (self.__class__.__module__ or "") ): - self.hostPort = strip_hostport_scheme(self.hostPort) + self.hostPort = strip_hostport_scheme(host_port) try: for field in self.__pydantic_fields__: diff --git a/ingestion/tests/unit/test_connection_builders.py b/ingestion/tests/unit/test_connection_builders.py index 27a3790b50ae..66c9a2fa32cc 100644 --- a/ingestion/tests/unit/test_connection_builders.py +++ b/ingestion/tests/unit/test_connection_builders.py @@ -11,11 +11,15 @@ """ Validate connection builder utilities """ + from unittest import TestCase from metadata.generated.schema.entity.services.connections.database.common.basicAuth import ( BasicAuth, ) +from metadata.generated.schema.entity.services.connections.database.cassandraConnection import ( + CassandraConnection, +) from metadata.generated.schema.entity.services.connections.database.mysqlConnection import ( MysqlConnection, ) @@ -27,7 +31,7 @@ get_connection_options_dict, init_empty_connection_arguments, ) -from metadata.ingestion.models.custom_pydantic import _HOSTPORT_URL_ALLOWED +from metadata.ingestion.models.custom_pydantic import _DATABASE_CONNECTION_MODULE_MARKER class ConnectionBuilderTest(TestCase): @@ -127,20 +131,34 @@ def test_model_post_init_raises_for_invalid_port_in_hostport(self): hostPort="http://localhost:abc", ) - def test_url_allowed_connections_not_stripped(self): + def test_non_database_connections_not_stripped(self): """ - Verify that connection classes in _HOSTPORT_URL_ALLOWED (AirflowConnection - and OpenMetadataConnection) are excluded from hostPort scheme stripping, - since their hostPort field legitimately stores a full URL. + Verify that non-database connection classes (metadata, dashboard, pipeline, …) + are NOT subject to hostPort scheme stripping because their hostPort legitimately + stores a full URL (e.g. OpenMetadataConnection hostPort = http://localhost:8585/api). + + The guard uses the module path: only classes whose __module__ contains + _DATABASE_CONNECTION_MODULE_MARKER ('.services.connections.database.') + are stripped. All others pass through unchanged. """ - # Confirm the exclusion set contains the expected class names - self.assertIn("AirflowConnection", _HOSTPORT_URL_ALLOWED) - self.assertIn("OpenMetadataConnection", _HOSTPORT_URL_ALLOWED) + # Confirm the marker is correct + self.assertIn("database", _DATABASE_CONNECTION_MODULE_MARKER) - # OpenMetadataConnection.hostPort is a plain str field that expects a - # full URL like "http://localhost:8585/api" — it must NOT be stripped. + # OpenMetadataConnection.hostPort is a plain str that expects a full URL. + # Its module contains '.connections.metadata.' — NOT the database marker — + # so it must NOT be stripped. ometa = OpenMetadataConnection( hostPort="http://localhost:8585/api", ) self.assertIn("http", ometa.hostPort) self.assertIn("localhost", ometa.hostPort) + + def test_none_hostport_does_not_crash(self): + """ + Regression test for gitar-bot bug report: constructing a database + connection where hostPort is Optional and left as None must NOT raise + AttributeError from strip_hostport_scheme(None). + """ + # CassandraConnection.hostPort is Optional[str] = None by default. + conn = CassandraConnection() + self.assertIsNone(conn.hostPort)