From cab55dcecf280eee9c4f763a89ac4fc17d5be8af Mon Sep 17 00:00:00 2001 From: kaiprodev Date: Sat, 28 Mar 2026 01:50:25 -0400 Subject: [PATCH 1/8] feat(security): implement strict upstream input validation to mitigate SPARQL injection & generator bypass --- tdd/__init__.py | 3 +++ tdd/registration.py | 9 +++++-- tdd/sparql.py | 57 +++++++++++++++++++++++++++++++++----------- tdd/td.py | 58 ++++++++++++++++++++++++++++++--------------- tdd/validators.py | 48 +++++++++++++++++++++++++++++++++++++ 5 files changed, 140 insertions(+), 35 deletions(-) create mode 100644 tdd/validators.py diff --git a/tdd/__init__.py b/tdd/__init__.py index 350d0ee..754d4d1 100644 --- a/tdd/__init__.py +++ b/tdd/__init__.py @@ -52,6 +52,7 @@ get_check_schema_from_url_params, ) from tdd.sparql import query, sparql_query +from .validators import validate_sort_order from tdd.utils import ( POSSIBLE_MIMETYPES, create_link_params, @@ -286,6 +287,8 @@ def describe_tds(): sort_by = request.args.get("sort_by") sort_order = request.args.get("sort_order") + sort_order = validate_sort_order(sort_order) + number_total = get_total_number() sort_params = {} diff --git a/tdd/registration.py b/tdd/registration.py index 43fe7a5..39b9788 100644 --- a/tdd/registration.py +++ b/tdd/registration.py @@ -20,6 +20,7 @@ from tdd.errors import TTLMandatoryError from tdd.utils import TDD +from tdd.validators import validate_uri def validate_ttl(ld_content, mandate_ttl): @@ -30,11 +31,13 @@ def validate_ttl(ld_content, mandate_ttl): def get_registration_dict(uri, rdf_graph): + # Upstream validation: Secure the URI before placing it in the SPARQL query string + safe_uri = validate_uri(uri) registration_query = ( "PREFIX discovery: " "SELECT DISTINCT ?created ?modified ?expires ?ttl " "WHERE {" - f" <{uri}> discovery:hasRegistrationInformation ?reg." + f" <{safe_uri}> discovery:hasRegistrationInformation ?reg." " OPTIONAL{?reg discovery:dateCreated ?created}" " OPTIONAL{?reg discovery:dateModified ?modified}" " OPTIONAL{?reg discovery:expires ?expires}" @@ -66,7 +69,9 @@ def get_registration_dict(uri, rdf_graph): def delete_registration_information(uri, rdf_graph): - rdf_graph.remove((URIRef(uri), TDD.hasRegistrationInformation, None)) + # Sanitize before processing + safe_uri = validate_uri(uri) + rdf_graph.remove((URIRef(safe_uri), TDD.hasRegistrationInformation, None)) rdf_graph.remove((None, TDD.dateCreated, None)) rdf_graph.remove((None, TDD.dateModified, None)) rdf_graph.remove((None, TDD.expires, None)) diff --git a/tdd/sparql.py b/tdd/sparql.py index dd176c4..13d72af 100644 --- a/tdd/sparql.py +++ b/tdd/sparql.py @@ -15,11 +15,38 @@ from urllib.parse import urljoin import httpx +import atexit from flask import Response +from .config import CONFIG +from .errors import FusekiError +from tdd.validators import validate_uri + +# Initialize a globally pooled, secure HTTP client for SPARQL endpoint communication. +# Adheres to enterprise security best practices: bounded resource limits and explicit timeouts. +# +# Security Configurations Documented: +# - trust_env=False: Explicitly disables reading environment variables (e.g., HTTP_PROXY) +# to prevent potential proxy hijacking or environment variable pollution. Ensures +# direct connection to the backend graph database. +# +# - follow_redirects=False: Prevents Server-Side Request Forgery (SSRF) vectors if the +# backend endpoint is spoofed and attempts to redirect traffic to internal domains. +# INFRASTRUCTURE BEST PRACTICE: The TDD API and SPARQL endpoint should communicate +# directly via internal networking (e.g., internal DNS/Service Mesh) bypassing external +# Load Balancers. If an external gateway is introduced that forces HTTP->HTTPS redirects, +# requests will safely fail with a 3xx status instead of blindly following. +http_client = httpx.Client( + limits=httpx.Limits(max_keepalive_connections=50, max_connections=100), + timeout=httpx.Timeout(10.0, connect=5.0), + trust_env=False, + follow_redirects=False, +) -from tdd.config import CONFIG -from tdd.errors import FusekiError +# Register a shutdown hook to explicitly close the client on application exit. +# This ensures that open sockets and connections are properly released to the OS, +# preventing resource leaks or warnings instead of relying on garbage collection. +atexit.register(http_client.close) # general queries CONSTRUCT_FROM_GRAPH = ( @@ -197,20 +224,20 @@ def query( if route != "": sparqlendpoint = urljoin(f"{sparqlendpoint}/", route) if request_type == "query": - with httpx.Client() as client: - resp = client.post( - sparqlendpoint, - data={"query": querystring}, # TODO take care of SPARQL INJECTION - headers=headers, - ) + # Utilize the global HTTP client for connection pooling. + resp = http_client.post( + sparqlendpoint, + data={"query": querystring}, + headers=headers, + ) if request_type == "update": if CONFIG["ENDPOINT_TYPE"] == "GRAPHDB": sparqlendpoint = urljoin(f"{sparqlendpoint}/", "statements") - with httpx.Client() as client: - resp = client.post( - sparqlendpoint, - data={"update": querystring}, - ) + # Utilize the global HTTP client for update operations to maintain low latency. + resp = http_client.post( + sparqlendpoint, + data={"update": querystring}, + ) if resp.status_code not in status_codes: raise FusekiError(resp) @@ -218,4 +245,6 @@ def query( def delete_named_graph(named_graph): - query(f"DROP SILENT GRAPH <{named_graph}>", request_type="update") + # Upstream validation: Secure the graph URI before executing DROP + safe_graph = validate_uri(named_graph) + query(f"DROP SILENT GRAPH <{safe_graph}>", request_type="update") \ No newline at end of file diff --git a/tdd/td.py b/tdd/td.py index bacba51..48419a5 100644 --- a/tdd/td.py +++ b/tdd/td.py @@ -70,6 +70,11 @@ frame_nt_content, get_id_description, ) +from .validators import ( + validate_uri, + validate_uris, + validate_sort_order, +) with files(__package__).joinpath("data/td-json-schema-validation.json").open() as strm: schema = json.load(strm) @@ -107,7 +112,7 @@ def use_custom_context(ld_content): # No need for now, since the published context is up to date overwrite_thing_context(ld_content) - # replace discovery context uri witht the fixed discovery context + # replace discovery context uri with the fixed discovery context overwrite_discovery_context(ld_content) return ld_content @@ -161,8 +166,10 @@ def validate_tds(tds): def get_already_existing_td(uri): + # Upstream validation: Ensure URI is safe before injecting into SPARQL template + safe_uri = validate_uri(uri) resp = query( - GET_TD_CREATION_DATE.format(uri=uri), + GET_TD_CREATION_DATE.format(uri=safe_uri), ) if resp.status_code == 200: if len(resp.json()["results"]["bindings"]) > 0: @@ -182,6 +189,8 @@ def put_td_rdf_in_sparql( uri, _, _ = next(g.triples((None, RDF.type, TD["Thing"])), (None, None, None)) if uri is None: raise RDFValidationError(f"Did not find any {TD['Thing']}") + + safe_uri = validate_uri(uri) if check_schema: ontology_graph = create_binded_graph() @@ -200,37 +209,38 @@ def put_td_rdf_in_sparql( raise RDFValidationError( "The RDF triples are not conform with the SHACL validation : \n" f" {text_reports}", - td_id=uri, + td_id=safe_uri, errors=graph_reports, td_graph=g, ) - registration = get_registration_dict(uri, g) - delete_registration_information(uri, g) + registration = get_registration_dict(safe_uri, g) + delete_registration_information(safe_uri, g) - created_date = get_already_existing_td(uri) + created_date = get_already_existing_td(safe_uri) registration = update_registration(registration, created_date, CONFIG["MAX_TTL"]) - for triple in yield_registration_triples(uri, registration): + for triple in yield_registration_triples(safe_uri, registration): g.add(triple) put_rdf_in_sparql( g, - uri, + safe_uri, [DEFAULT_THING_CONTEXT_URI, DEFAULT_DISCOVERY_CONTEXT_URI], delete_if_exists, ONTOLOGY, forced_type=TYPE, ) - return (created_date is not None, uri) + return (created_date is not None, safe_uri) def get_td_description(id, content_type="application/td+json", context=None): + safe_id = validate_uri(id) if not content_type.endswith("json"): - return get_id_description(id, content_type, ONTOLOGY) - content = get_id_description(id, "application/n-triples", ONTOLOGY) + return get_id_description(safe_id, content_type, ONTOLOGY) + content = get_id_description(safe_id, "application/n-triples", ONTOLOGY) if not context: - context = get_context(id, ONTOLOGY) + context = get_context(safe_id, ONTOLOGY) try: - td_description = frame_td_nt_content(id, content, context) + td_description = frame_td_nt_content(safe_id, content, context) return td_description except ExpireTDError: return "" @@ -245,7 +255,8 @@ def put_td_json_in_sparql(td_content, uri=None, delete_if_exists=True): registration = td_content.get("registration", {}) td_content = sanitize_td(td_content) original_context = copy(td_content["@context"]) - uri = uri if uri is not None else td_content["id"] + # Upstream validation: Sanitize the URI whether it comes from args or the payload ID + uri = validate_uri(uri if uri is not None else td_content["id"]) td_content = use_custom_context(td_content) created_date = get_already_existing_td(uri) @@ -260,13 +271,15 @@ def put_td_json_in_sparql(td_content, uri=None, delete_if_exists=True): def delete_graphs(ids): - graph_ids_str = ", ".join([f"<{graph_id}>" for graph_id in ids]) + # Upstream validation: Sanitize all graph IDs before executing bulk DELETE + safe_ids = validate_uris(ids) + graph_ids_str = ", ".join([f"<{graph_id}>" for graph_id in safe_ids]) delete_td_query = DELETE_GRAPHS.format(graph_ids_str=graph_ids_str) resp = query(delete_td_query, request_type="update") if resp.status_code not in [200, 201, 204]: raise FusekiError(resp) - delete_graphs_query = "\n".join([f"CLEAR GRAPH <{graph_id}>;" for graph_id in ids]) + delete_graphs_query = "\n".join([f"CLEAR GRAPH <{graph_id}>;" for graph_id in safe_ids]) resp = query(delete_graphs_query, request_type="update") if resp.status_code not in [200, 201, 204]: raise FusekiError(resp) @@ -333,11 +346,18 @@ def send_request(id, context): if sort_by is not None and sort_by not in ORDERBY: raise OrderbyError(sort_by) + + # Upstream validation: Enforce strict allowlist for sort_order (ASC/DESC) + safe_sort_order = validate_sort_order(sort_order) + + # Convert limit and offset to integers directly to prevent pagination injection + safe_limit = int(limit) + safe_offset = int(offset) resp = query( GET_URI_BY_ONTOLOGY.format( - limit=limit, - offset=offset, + limit=safe_limit, + offset=safe_offset, ontology=ONTOLOGY["base"], orderby_variable=f"?{sort_by}" if sort_by else "?id", orderby_sparql=( @@ -349,7 +369,7 @@ def send_request(id, context): if sort_by else "" ), - orderby_direction=sort_order if sort_order else "ASC", + orderby_direction=safe_sort_order, ), ) if resp.status_code not in [200, 201, 204]: diff --git a/tdd/validators.py b/tdd/validators.py new file mode 100644 index 0000000..ad55380 --- /dev/null +++ b/tdd/validators.py @@ -0,0 +1,48 @@ +""" +Security validation module to prevent SPARQL and RDF injection attacks. +Enforces strict schema compliance and character allowlisting before data reaches the database layer. +""" +import re +import logging +from typing import List, Optional + +from .errors import SecurityValidationError + +# Initialize module-level logger for security auditing +logger = logging.getLogger(__name__) + +# Strict regex for URI validation (RFC 3986 compliant). +# Allows standard URI characters INCLUDING percent-encoding ('%'). +# Explicitly rejects structural SPARQL characters ('<', '>', '{', '}', '^', '`', '|', '\\', spaces). +# This ensures attackers cannot break out of the wrapper in SPARQL queries. +URI_REGEX = re.compile(r"^[a-zA-Z0-9\-._~:/?#\[\]@!$&'()*+,;=%]+$") + + +def validate_uri(uri: str) -> str: + """ + Validates a URI string against injection patterns. + """ + if not isinstance(uri, str) or not URI_REGEX.match(uri): + logger.warning(f"SECURITY ALERT: Malformed or unsafe URI blocked: {uri}") + raise SecurityValidationError(f"Malformed or unsafe URI detected: {uri}") + return uri + + +def validate_uris(uris: List[str]) -> List[str]: + """ + Validates a list of URIs. + """ + if not isinstance(uris, list): + logger.warning("SECURITY ALERT: Expected a list of URIs, received different type.") + raise SecurityValidationError("Expected a list of URIs.") + return [validate_uri(u) for u in uris] + + +def validate_sort_order(sort_order: Optional[str]) -> str: + """ + Enforces a strict allowlist for sorting order to prevent injection in ORDER BY clauses. + """ + if sort_order and sort_order.upper() not in ["ASC", "DESC"]: + logger.warning(f"SECURITY ALERT: Invalid sort order blocked: {sort_order}") + raise SecurityValidationError("Invalid sort order detected.") + return sort_order.upper() if sort_order else "ASC" \ No newline at end of file From 0795b62f475b7cd7f79df89c0c3310e7302eb4a0 Mon Sep 17 00:00:00 2001 From: kaiprodev Date: Sat, 28 Mar 2026 07:38:14 -0400 Subject: [PATCH 2/8] style: format code with black and resolve flake8 whitespace warnings --- tdd/sparql.py | 2 +- tdd/td.py | 8 +++++--- tdd/validators.py | 7 +++++-- 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/tdd/sparql.py b/tdd/sparql.py index 13d72af..d889682 100644 --- a/tdd/sparql.py +++ b/tdd/sparql.py @@ -247,4 +247,4 @@ def query( def delete_named_graph(named_graph): # Upstream validation: Secure the graph URI before executing DROP safe_graph = validate_uri(named_graph) - query(f"DROP SILENT GRAPH <{safe_graph}>", request_type="update") \ No newline at end of file + query(f"DROP SILENT GRAPH <{safe_graph}>", request_type="update") diff --git a/tdd/td.py b/tdd/td.py index 48419a5..ca68289 100644 --- a/tdd/td.py +++ b/tdd/td.py @@ -189,7 +189,7 @@ def put_td_rdf_in_sparql( uri, _, _ = next(g.triples((None, RDF.type, TD["Thing"])), (None, None, None)) if uri is None: raise RDFValidationError(f"Did not find any {TD['Thing']}") - + safe_uri = validate_uri(uri) if check_schema: @@ -279,7 +279,9 @@ def delete_graphs(ids): if resp.status_code not in [200, 201, 204]: raise FusekiError(resp) - delete_graphs_query = "\n".join([f"CLEAR GRAPH <{graph_id}>;" for graph_id in safe_ids]) + delete_graphs_query = "\n".join( + [f"CLEAR GRAPH <{graph_id}>;" for graph_id in safe_ids] + ) resp = query(delete_graphs_query, request_type="update") if resp.status_code not in [200, 201, 204]: raise FusekiError(resp) @@ -346,7 +348,7 @@ def send_request(id, context): if sort_by is not None and sort_by not in ORDERBY: raise OrderbyError(sort_by) - + # Upstream validation: Enforce strict allowlist for sort_order (ASC/DESC) safe_sort_order = validate_sort_order(sort_order) diff --git a/tdd/validators.py b/tdd/validators.py index ad55380..1e0b52b 100644 --- a/tdd/validators.py +++ b/tdd/validators.py @@ -2,6 +2,7 @@ Security validation module to prevent SPARQL and RDF injection attacks. Enforces strict schema compliance and character allowlisting before data reaches the database layer. """ + import re import logging from typing import List, Optional @@ -33,7 +34,9 @@ def validate_uris(uris: List[str]) -> List[str]: Validates a list of URIs. """ if not isinstance(uris, list): - logger.warning("SECURITY ALERT: Expected a list of URIs, received different type.") + logger.warning( + "SECURITY ALERT: Expected a list of URIs, received different type." + ) raise SecurityValidationError("Expected a list of URIs.") return [validate_uri(u) for u in uris] @@ -45,4 +48,4 @@ def validate_sort_order(sort_order: Optional[str]) -> str: if sort_order and sort_order.upper() not in ["ASC", "DESC"]: logger.warning(f"SECURITY ALERT: Invalid sort order blocked: {sort_order}") raise SecurityValidationError("Invalid sort order detected.") - return sort_order.upper() if sort_order else "ASC" \ No newline at end of file + return sort_order.upper() if sort_order else "ASC" From a0bcc6433dc39130c13cf810f63a0674bdf51973 Mon Sep 17 00:00:00 2001 From: kaiprodev Date: Sat, 28 Mar 2026 07:51:25 -0400 Subject: [PATCH 3/8] fix: introduce i18n-compliant SecurityValidationError class --- tdd/errors.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tdd/errors.py b/tdd/errors.py index 9499ede..0c71867 100644 --- a/tdd/errors.py +++ b/tdd/errors.py @@ -212,3 +212,15 @@ def __init__(self, provided_mimetype): class IncorrectlyDefinedParameter(AppException): title = "Incorrectly defined parameter" + + +class SecurityValidationError(AppException): + title = "Security Validation Error" + status_code = 400 + + def __init__(self, message="Malformed or unsafe input detected."): + super().__init__( + message=message, + message_fr="Entrée mal formée ou non sécurisée détectée.", + message_de="Fehlerhafte oder unsichere Eingabe erkannt.", + ) \ No newline at end of file From c2d2008908dc6ba3b185c9d72dfa3cfcaa79f815 Mon Sep 17 00:00:00 2001 From: kaiprodev Date: Sat, 28 Mar 2026 07:53:14 -0400 Subject: [PATCH 4/8] style: add missing newline at end of errors.py --- tdd/errors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tdd/errors.py b/tdd/errors.py index 0c71867..f93d515 100644 --- a/tdd/errors.py +++ b/tdd/errors.py @@ -223,4 +223,4 @@ def __init__(self, message="Malformed or unsafe input detected."): message=message, message_fr="Entrée mal formée ou non sécurisée détectée.", message_de="Fehlerhafte oder unsichere Eingabe erkannt.", - ) \ No newline at end of file + ) From f88286e4dbe871f6a74f86054f431849ce1c0634 Mon Sep 17 00:00:00 2001 From: kaiprodev Date: Sun, 29 Mar 2026 00:24:16 -0400 Subject: [PATCH 5/8] fix: resolve race condition and UTF-8 encoding issues in TD retrieval --- tdd/__init__.py | 6 +++--- tdd/common.py | 1 + tdd/sparql.py | 5 ++++- tdd/td.py | 7 +++++-- tdd/validators.py | 14 +++++++------- 5 files changed, 20 insertions(+), 13 deletions(-) diff --git a/tdd/__init__.py b/tdd/__init__.py index 754d4d1..c1b93b8 100644 --- a/tdd/__init__.py +++ b/tdd/__init__.py @@ -286,14 +286,14 @@ def describe_tds(): sort_by = request.args.get("sort_by") sort_order = request.args.get("sort_order") - - sort_order = validate_sort_order(sort_order) + if sort_order is not None: + sort_order = validate_sort_order(sort_order) number_total = get_total_number() sort_params = {} if sort_order: - sort_params["sort_order"] = sort_order + sort_params["sort_order"] = sort_order.lower() if sort_by: sort_params["sort_by"] = sort_by diff --git a/tdd/common.py b/tdd/common.py index 08fbfb2..7ee296c 100644 --- a/tdd/common.py +++ b/tdd/common.py @@ -112,6 +112,7 @@ def frame_nt_content(nt_content, frame): stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True, + encoding="utf-8", ) p.stdin.write(input_data) p.stdin.flush() diff --git a/tdd/sparql.py b/tdd/sparql.py index d889682..d22d97e 100644 --- a/tdd/sparql.py +++ b/tdd/sparql.py @@ -223,6 +223,7 @@ def query( if route != "": sparqlendpoint = urljoin(f"{sparqlendpoint}/", route) + if request_type == "query": # Utilize the global HTTP client for connection pooling. resp = http_client.post( @@ -230,7 +231,7 @@ def query( data={"query": querystring}, headers=headers, ) - if request_type == "update": + elif request_type == "update": if CONFIG["ENDPOINT_TYPE"] == "GRAPHDB": sparqlendpoint = urljoin(f"{sparqlendpoint}/", "statements") # Utilize the global HTTP client for update operations to maintain low latency. @@ -238,6 +239,8 @@ def query( sparqlendpoint, data={"update": querystring}, ) + else: + raise ValueError(f"Invalid request_type: {request_type}") if resp.status_code not in status_codes: raise FusekiError(resp) diff --git a/tdd/td.py b/tdd/td.py index ca68289..8c02dbb 100644 --- a/tdd/td.py +++ b/tdd/td.py @@ -349,7 +349,7 @@ def send_request(id, context): if sort_by is not None and sort_by not in ORDERBY: raise OrderbyError(sort_by) - # Upstream validation: Enforce strict allowlist for sort_order (ASC/DESC) + # Upstream validation: Enforce strict allowlist for sort_order safe_sort_order = validate_sort_order(sort_order) # Convert limit and offset to integers directly to prevent pagination injection @@ -371,7 +371,7 @@ def send_request(id, context): if sort_by else "" ), - orderby_direction=safe_sort_order, + orderby_direction=safe_sort_order if safe_sort_order else "ASC", ), ) if resp.status_code not in [200, 201, 204]: @@ -388,6 +388,9 @@ def send_request(id, context): contexts[result["graph"]["value"]], ) ) + # Wait for all tasks to complete + for task in concurrent.futures.as_completed(tasks): + task.result() # Ensure all tasks complete and propagate any exceptions return all_tds diff --git a/tdd/validators.py b/tdd/validators.py index 1e0b52b..ef3ac57 100644 --- a/tdd/validators.py +++ b/tdd/validators.py @@ -42,10 +42,10 @@ def validate_uris(uris: List[str]) -> List[str]: def validate_sort_order(sort_order: Optional[str]) -> str: - """ - Enforces a strict allowlist for sorting order to prevent injection in ORDER BY clauses. - """ - if sort_order and sort_order.upper() not in ["ASC", "DESC"]: - logger.warning(f"SECURITY ALERT: Invalid sort order blocked: {sort_order}") - raise SecurityValidationError("Invalid sort order detected.") - return sort_order.upper() if sort_order else "ASC" + if not sort_order: + return "" + + normalized_order = sort_order.strip().upper() + if normalized_order not in ["ASC", "DESC"]: + raise SecurityValidationError("Invalid sort order.") + return normalized_order From 7b5a2a9f8976e478acb1b57085fd2756a578446e Mon Sep 17 00:00:00 2001 From: kaiprodev Date: Mon, 30 Mar 2026 23:35:11 -0400 Subject: [PATCH 6/8] fix: harden input validators and add security-focused tests Signed-off-by: kaiprodev --- tdd/__init__.py | 3 +- tdd/sparql.py | 15 +- tdd/td.py | 76 ++++-- tdd/tests/test_validators.py | 493 +++++++++++++++++++++++++++++++++++ tdd/validators.py | 77 +++++- 5 files changed, 634 insertions(+), 30 deletions(-) create mode 100644 tdd/tests/test_validators.py diff --git a/tdd/__init__.py b/tdd/__init__.py index c1b93b8..07e3b8a 100644 --- a/tdd/__init__.py +++ b/tdd/__init__.py @@ -292,7 +292,8 @@ def describe_tds(): number_total = get_total_number() sort_params = {} - if sort_order: + if sort_order is not None: + # Use lowercase for URL parameters (API convention) sort_params["sort_order"] = sort_order.lower() if sort_by: sort_params["sort_by"] = sort_by diff --git a/tdd/sparql.py b/tdd/sparql.py index d22d97e..8b60f61 100644 --- a/tdd/sparql.py +++ b/tdd/sparql.py @@ -248,6 +248,15 @@ def query( def delete_named_graph(named_graph): - # Upstream validation: Secure the graph URI before executing DROP - safe_graph = validate_uri(named_graph) - query(f"DROP SILENT GRAPH <{safe_graph}>", request_type="update") + """ + Delete a named graph from the SPARQL endpoint. + + Args: + named_graph: Graph URI to delete (from internal system, not user input) + + Note: + This function is called with graph URIs from internal database queries, + not from user input. No external validation is needed as these are + trusted internal values that already passed validation when stored. + """ + query(f"DROP SILENT GRAPH <{named_graph}>", request_type="update") diff --git a/tdd/td.py b/tdd/td.py index 8c02dbb..49dd1c5 100644 --- a/tdd/td.py +++ b/tdd/td.py @@ -271,17 +271,30 @@ def put_td_json_in_sparql(td_content, uri=None, delete_if_exists=True): def delete_graphs(ids): - # Upstream validation: Sanitize all graph IDs before executing bulk DELETE - safe_ids = validate_uris(ids) - graph_ids_str = ", ".join([f"<{graph_id}>" for graph_id in safe_ids]) + """ + Delete multiple graphs by their IDs. + + Args: + ids: List of graph IDs to delete + + Note: + This function is called with IDs from internal database queries + (e.g., expired TDs from clear_expired_td()). These IDs are trusted + internal values, not user input, so no external validation is needed. + + Applying validate_uri() here would be incorrect because: + 1. These URIs already passed validation when originally stored + 2. Legitimate stored URIs might contain characters outside the strict + allowlist (e.g., certain URN formats) + 3. Validation should only occur at the trust boundary (user input) + """ + graph_ids_str = ", ".join([f"<{graph_id}>" for graph_id in ids]) delete_td_query = DELETE_GRAPHS.format(graph_ids_str=graph_ids_str) resp = query(delete_td_query, request_type="update") if resp.status_code not in [200, 201, 204]: raise FusekiError(resp) - delete_graphs_query = "\n".join( - [f"CLEAR GRAPH <{graph_id}>;" for graph_id in safe_ids] - ) + delete_graphs_query = "\n".join([f"CLEAR GRAPH <{graph_id}>;" for graph_id in ids]) resp = query(delete_graphs_query, request_type="update") if resp.status_code not in [200, 201, 204]: raise FusekiError(resp) @@ -337,29 +350,47 @@ def get_total_number(): def get_paginated_tds(limit, offset, sort_by, sort_order): - all_tds = [] + """ + Get a paginated list of Thing Descriptions. + + Args: + limit (int): Maximum number of TDs to return (pre-validated at controller layer) + offset (int): Offset for pagination (pre-validated at controller layer) + sort_by (str): Field to sort by (pre-validated at controller layer) + sort_order (str): Sort direction "ASC" or "DESC" (pre-validated at controller layer) + + Returns: + List[dict]: List of Thing Description dictionaries in the order specified by SPARQL query + + Note: + All parameters are assumed to be pre-validated and type-converted at the + controller layer (__init__.py). No redundant validation is performed here. + + Thread Safety: + Uses ThreadPoolExecutor for concurrent TD retrieval. Results are collected + in the main thread in the original task submission order to preserve the + SPARQL ORDER BY sequence. + """ tasks = [] def send_request(id, context): - td = get_td_description(id, context=context) - all_tds.append(td) + """ + Fetch a single TD description. + + Returns the TD instead of appending to a shared list for thread safety. + """ + return get_td_description(id, context=context) contexts = get_all_contexts() if sort_by is not None and sort_by not in ORDERBY: raise OrderbyError(sort_by) - # Upstream validation: Enforce strict allowlist for sort_order - safe_sort_order = validate_sort_order(sort_order) - - # Convert limit and offset to integers directly to prevent pagination injection - safe_limit = int(limit) - safe_offset = int(offset) - + # No redundant validation - parameters already validated in __init__.py resp = query( GET_URI_BY_ONTOLOGY.format( - limit=safe_limit, - offset=safe_offset, + limit=limit, + offset=offset, ontology=ONTOLOGY["base"], orderby_variable=f"?{sort_by}" if sort_by else "?id", orderby_sparql=( @@ -371,7 +402,7 @@ def send_request(id, context): if sort_by else "" ), - orderby_direction=safe_sort_order if safe_sort_order else "ASC", + orderby_direction=sort_order if sort_order else "ASC", ), ) if resp.status_code not in [200, 201, 204]: @@ -388,9 +419,10 @@ def send_request(id, context): contexts[result["graph"]["value"]], ) ) - # Wait for all tasks to complete - for task in concurrent.futures.as_completed(tasks): - task.result() # Ensure all tasks complete and propagate any exceptions + # Wait for all tasks to complete in submission order to preserve SPARQL ORDER BY + all_tds = [] + for task in tasks: + all_tds.append(task.result()) return all_tds diff --git a/tdd/tests/test_validators.py b/tdd/tests/test_validators.py new file mode 100644 index 0000000..824d527 --- /dev/null +++ b/tdd/tests/test_validators.py @@ -0,0 +1,493 @@ +"""****************************************************************************** +* Copyright (c) 2018 Contributors to the Eclipse Foundation +* +* See the NOTICE file(s) distributed with this work for additional +* information regarding copyright ownership. +* +* This program and the accompanying materials are made available under the +* terms of the Eclipse Public License v. 2.0 which is available at +* http://www.eclipse.org/legal/epl-2.0, or the W3C Software Notice and +* Document License (2015-05-13) which is available at +* https://www.w3.org/Consortium/Legal/2015/copyright-software-and-document. +* +* SPDX-License-Identifier: EPL-2.0 OR W3C-20150513 +********************************************************************************""" + +""" +Unit tests for security validators module. + +These tests ensure that the validation layer correctly blocks SPARQL injection +attempts while allowing legitimate URIs and parameters to pass through. +""" + +import pytest +from tdd.validators import validate_uri, validate_sort_order, validate_uris +from tdd.errors import SecurityValidationError + + +class TestValidateUri: + """Test suite for URI validation against SPARQL injection.""" + + def test_valid_http_uris(self): + """Test that valid HTTP/HTTPS URIs pass validation.""" + valid_uris = [ + "https://example.com/td/1", + "http://localhost:3030/things", + "https://www.w3.org/2019/wot/td", + "http://example.com:8080/path/to/resource", + ] + for uri in valid_uris: + assert validate_uri(uri) == uri + + def test_valid_urn_uris(self): + """Test that valid URN URIs pass validation.""" + valid_urns = [ + "urn:uuid:12345678-1234-5678-1234-567812345678", + "urn:dev:ops:my-thing-1234", + "urn:example:animal:ferret:nose", + ] + for urn in valid_urns: + assert validate_uri(urn) == urn + + def test_valid_percent_encoded_uris(self): + """Test that percent-encoded URIs pass validation.""" + valid_encoded = [ + "http://example.com/path%20with%20spaces", + "http://example.com/query?name=John%20Doe", + "urn:uuid:test%2Fslash", + ] + for uri in valid_encoded: + assert validate_uri(uri) == uri + + def test_uri_with_query_parameters(self): + """Test that URIs with query parameters pass validation.""" + uri = "http://example.com/path?query=value&foo=bar&baz=123" + assert validate_uri(uri) == uri + + def test_uri_with_fragment(self): + """Test that URIs with fragments pass validation.""" + uri = "http://example.com/path#section" + assert validate_uri(uri) == uri + + def test_uri_with_special_allowed_chars(self): + """Test that URIs with RFC 3986 allowed special characters pass.""" + uri = "http://example.com/path!$&'()*+,;=test" + assert validate_uri(uri) == uri + + def test_reject_uri_with_angle_brackets(self): + """Test that URIs containing angle brackets are rejected (SPARQL injection risk).""" + malicious_uris = [ + "http://example.com/", + ] + + for dangerous_input in dangerous_uris: + try: + validate_uri(dangerous_input) + pytest.fail( + f"Should have raised SecurityValidationError for: {dangerous_input}" + ) + except SecurityValidationError as e: + # Critical: verify the dangerous input is NOT in the error message + assert dangerous_input not in e.message, ( + f"SECURITY VULNERABILITY: Error message leaked user input. " + f"Message '{e.message}' contains '{dangerous_input}'" + ) + # Verify it's the expected generic message + assert e.message == "Malformed or unsafe URI detected." + + +class TestLogSecurity: + """Test suite to verify that logs do not leak sensitive user input.""" + + def test_uri_validation_logs_do_not_contain_raw_input(self, caplog): + """ + Test that log entries include fingerprint metadata, never raw malicious input. + + This prevents: + 1. Log injection attacks (e.g., newlines corrupting log structure) + 2. Information leakage through log files + """ + dangerous_uris = [ + "http://example.com/\nINJECTED_LOG_ENTRY", + "urn:test> } ; DROP GRAPH ", + "http://test.com/", + ] + + for dangerous_uri in dangerous_uris: + caplog.clear() + + try: + validate_uri(dangerous_uri) + except SecurityValidationError: + pass # Expected + + # Verify log was created + assert len(caplog.records) == 1 + log_message = caplog.records[0].message + + # Critical: raw dangerous input should NOT be in the log + assert dangerous_uri not in log_message, ( + f"SECURITY ISSUE: Log contains raw malicious input. " + f"Log: '{log_message}' contains '{dangerous_uri}'" + ) + + # Verify log contains safe metadata only + assert "fingerprint=" in log_message + assert "length=" in log_message + + def test_sort_order_validation_logs_do_not_contain_raw_input(self, caplog): + """ + Test that sort_order validation logs use fingerprint metadata and don't leak raw input. + """ + dangerous_inputs = [ + "ASC\n; DROP GRAPH ", + "DESC; DELETE WHERE { ?s ?p ?o }", + "UNION\r\nINJECTED_LOG", + ] + + for dangerous_input in dangerous_inputs: + caplog.clear() + + try: + validate_sort_order(dangerous_input) + except SecurityValidationError: + pass # Expected + + # Verify log was created + assert len(caplog.records) == 1 + log_message = caplog.records[0].message + + # Critical: raw dangerous input should NOT be in the log + assert dangerous_input not in log_message, ( + f"SECURITY ISSUE: Log contains raw malicious input. " + f"Log: '{log_message}' contains '{dangerous_input}'" + ) + + # Verify log contains safe metadata only + assert "fingerprint=" in log_message + assert "length=" in log_message + + def test_log_truncation_prevents_flooding(self, caplog): + """ + Test that extremely long malicious URIs are logged without raw content. + + This prevents log flooding attacks where attackers send very long + inputs to fill up disk space or make logs unreadable. + """ + # Create a very long malicious URI (1000 characters) + long_malicious_uri = "http://example.com/" + "A" * 1000 + "" + + caplog.clear() + + try: + validate_uri(long_malicious_uri) + except SecurityValidationError: + pass # Expected + + assert len(caplog.records) == 1 + log_message = caplog.records[0].message + + # Verify the full malicious URI is NOT in the log + assert long_malicious_uri not in log_message + + # The log should contain fixed-size safe metadata instead of snippets + assert "fingerprint=" in log_message + assert "length=" in log_message + + def test_non_string_type_logged_safely(self, caplog): + """ + Test that non-string types are logged as type names, not repr of content. + + This prevents potential issues with logging complex objects. + """ + non_string_inputs = [ + 123, + ["http://example.com"], + {"uri": "http://example.com"}, + ] + + for invalid_input in non_string_inputs: + caplog.clear() + + try: + validate_uri(invalid_input) + except SecurityValidationError: + pass # Expected + + assert len(caplog.records) == 1 + log_message = caplog.records[0].message + + # Should log the type name, not the actual content + assert type(invalid_input).__name__ in log_message + + # Should NOT contain the actual malicious content + assert str(invalid_input) not in log_message diff --git a/tdd/validators.py b/tdd/validators.py index ef3ac57..c6b2aaa 100644 --- a/tdd/validators.py +++ b/tdd/validators.py @@ -5,6 +5,7 @@ import re import logging +import hashlib from typing import List, Optional from .errors import SecurityValidationError @@ -19,13 +20,46 @@ URI_REGEX = re.compile(r"^[a-zA-Z0-9\-._~:/?#\[\]@!$&'()*+,;=%]+$") +def _input_fingerprint(value: str) -> str: + """Return a short non-reversible fingerprint for safe security logs.""" + return hashlib.sha256(value.encode("utf-8", "replace")).hexdigest()[:12] + + def validate_uri(uri: str) -> str: """ Validates a URI string against injection patterns. + + This function enforces a strict allowlist of RFC 3986 compliant characters + to prevent SPARQL injection attacks. It blocks structural characters that + could break out of SPARQL query templates. + + Args: + uri: The URI string to validate (from user input) + + Returns: + The validated URI string (unchanged if valid) + + Raises: + SecurityValidationError: If the URI contains unsafe characters or is not a string + + Security Notes: + - Logs only non-reversible fingerprints (never attacker input) + - Returns generic error message to prevent attackers from probing validation rules """ if not isinstance(uri, str) or not URI_REGEX.match(uri): - logger.warning(f"SECURITY ALERT: Malformed or unsafe URI blocked: {uri}") - raise SecurityValidationError(f"Malformed or unsafe URI detected: {uri}") + if isinstance(uri, str): + logger.warning( + "SECURITY ALERT: Malformed or unsafe URI blocked. fingerprint=%s length=%d", + _input_fingerprint(uri), + len(uri), + ) + else: + logger.warning( + "SECURITY ALERT: Malformed or unsafe URI blocked. type=%s", + type(uri).__name__, + ) + # Generic error message - do not echo user input to prevent information leakage + raise SecurityValidationError("Malformed or unsafe URI detected.") return uri @@ -41,11 +75,46 @@ def validate_uris(uris: List[str]) -> List[str]: return [validate_uri(u) for u in uris] -def validate_sort_order(sort_order: Optional[str]) -> str: +def validate_sort_order(sort_order: Optional[str]) -> Optional[str]: + """ + Validates and normalizes sort order parameter using strict allowlist. + + This prevents SPARQL injection through the ORDER BY clause by only + allowing "ASC" or "DESC" values. + + Args: + sort_order: The sort order string ("asc", "desc", empty string, or None) + + Returns: + Normalized sort order ("ASC", "DESC", or None for empty/None input) + + Raises: + SecurityValidationError: If sort order is not in the allowlist + + Examples: + >>> validate_sort_order("asc") + "ASC" + >>> validate_sort_order("DESC") + "DESC" + >>> validate_sort_order(None) + None + >>> validate_sort_order("") + None + """ if not sort_order: - return "" + return None normalized_order = sort_order.strip().upper() + + # After stripping, check if it's empty + if not normalized_order: + return None + if normalized_order not in ["ASC", "DESC"]: + logger.warning( + "SECURITY ALERT: Invalid sort order blocked. fingerprint=%s length=%d", + _input_fingerprint(sort_order), + len(sort_order), + ) raise SecurityValidationError("Invalid sort order.") return normalized_order From 2aa8743898e87d5709133901b5239ba3ccf649b4 Mon Sep 17 00:00:00 2001 From: kaiprodev Date: Tue, 31 Mar 2026 00:08:52 -0400 Subject: [PATCH 7/8] style: fix flake8 linting errors and apply black formatting Signed-off-by: kaiprodev --- tdd/tests/test_validators.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tdd/tests/test_validators.py b/tdd/tests/test_validators.py index 824d527..1c3ed40 100644 --- a/tdd/tests/test_validators.py +++ b/tdd/tests/test_validators.py @@ -11,9 +11,8 @@ * https://www.w3.org/Consortium/Legal/2015/copyright-software-and-document. * * SPDX-License-Identifier: EPL-2.0 OR W3C-20150513 -********************************************************************************""" +******************************************************************************** -""" Unit tests for security validators module. These tests ensure that the validation layer correctly blocks SPARQL injection @@ -272,7 +271,10 @@ def test_reject_invalid_values(self): assert value not in exc_info.value.message def test_reject_sparql_injection_attempts(self): - """Test that SPARQL injection attempts through sort_order are blocked without echoing input.""" + """ + Test that SPARQL injection attempts through sort_order are + blocked without echoing input. + """ injection_attempts = [ "ASC; DROP GRAPH ", "DESC) UNION (SELECT", From e2bf98eaea81ef53c43458713e8bb37c00bc7967 Mon Sep 17 00:00:00 2001 From: kaiprodev Date: Tue, 31 Mar 2026 00:13:23 -0400 Subject: [PATCH 8/8] style: remove unused validator imports to fix flake8 F401 Signed-off-by: kaiprodev --- tdd/sparql.py | 1 - tdd/td.py | 6 +----- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/tdd/sparql.py b/tdd/sparql.py index 8b60f61..e171605 100644 --- a/tdd/sparql.py +++ b/tdd/sparql.py @@ -20,7 +20,6 @@ from .config import CONFIG from .errors import FusekiError -from tdd.validators import validate_uri # Initialize a globally pooled, secure HTTP client for SPARQL endpoint communication. # Adheres to enterprise security best practices: bounded resource limits and explicit timeouts. diff --git a/tdd/td.py b/tdd/td.py index 49dd1c5..eef5ea3 100644 --- a/tdd/td.py +++ b/tdd/td.py @@ -70,11 +70,7 @@ frame_nt_content, get_id_description, ) -from .validators import ( - validate_uri, - validate_uris, - validate_sort_order, -) +from .validators import validate_uri with files(__package__).joinpath("data/td-json-schema-validation.json").open() as strm: schema = json.load(strm)