From f2ffe4402ea4e3bf7ab8c5c52d43c1886323d84c Mon Sep 17 00:00:00 2001 From: lipikaramaswamy Date: Fri, 12 Jun 2026 15:29:22 -0700 Subject: [PATCH 1/2] fix(detection): pass single chunk validation flag to exports Signed-off-by: lipikaramaswamy --- .../engine/detection/detection_workflow.py | 2 + tests/engine/test_detection_workflow.py | 46 +++++++++++++++++++ 2 files changed, 48 insertions(+) diff --git a/src/anonymizer/engine/detection/detection_workflow.py b/src/anonymizer/engine/detection/detection_workflow.py index d59246b7..b08fc4af 100644 --- a/src/anonymizer/engine/detection/detection_workflow.py +++ b/src/anonymizer/engine/detection/detection_workflow.py @@ -117,6 +117,7 @@ def detect_and_validate_entities( gliner_detection_threshold=gliner_detection_threshold, validation_max_entities_per_call=validation_max_entities_per_call, validation_excerpt_window_chars=validation_excerpt_window_chars, + validation_single_chunk_full_text=validation_single_chunk_full_text, entity_labels=entity_labels, data_summary=data_summary, ) @@ -138,6 +139,7 @@ def _build_detection_spec( gliner_detection_threshold: float, validation_max_entities_per_call: int = _DEFAULT_VALIDATION_MAX_ENTITIES_PER_CALL, validation_excerpt_window_chars: int = _DEFAULT_VALIDATION_EXCERPT_WINDOW_CHARS, + validation_single_chunk_full_text: bool = True, entity_labels: list[str] | None = None, data_summary: str | None = None, ) -> tuple[list[ModelConfig], list[ColumnConfigT]]: diff --git a/tests/engine/test_detection_workflow.py b/tests/engine/test_detection_workflow.py index 3f45f1b1..17a15e86 100644 --- a/tests/engine/test_detection_workflow.py +++ b/tests/engine/test_detection_workflow.py @@ -543,6 +543,52 @@ def test_validator_pool_kwargs_thread_through_to_generator_params( assert params.excerpt_window_chars == 42 +def test_validation_single_chunk_full_text_threads_to_generator_params( + stub_detector_model_configs: list[ModelConfig], + stub_detection_model_selection: DetectionModelSelection, +) -> None: + adapter = Mock() + adapter.run_workflow.return_value = WorkflowRunResult( + dataframe=pd.DataFrame( + { + COL_TEXT: ["Alice"], + COL_DETECTED_ENTITIES: [{"entities": [{"value": "Alice", "label": "first_name"}]}], + } + ), + failed_records=[], + ) + workflow = EntityDetectionWorkflow(adapter=adapter) + workflow.detect_and_validate_entities( + pd.DataFrame({COL_TEXT: ["Alice"]}), + model_configs=stub_detector_model_configs, + selected_models=stub_detection_model_selection, + gliner_detection_threshold=0.5, + validation_single_chunk_full_text=False, + ) + columns = adapter.run_workflow.call_args.kwargs["columns"] + params = _find_column(columns, COL_VALIDATION_DECISIONS).generator_params + assert params.single_chunk_full_text is False + + +def test_build_detection_config_uses_default_validation_single_chunk_full_text( + tmp_path, + stub_detector_model_configs: list[ModelConfig], + stub_detection_model_selection: DetectionModelSelection, +) -> None: + adapter = Mock() + workflow = EntityDetectionWorkflow(adapter=adapter) + workflow.build_detection_config( + pd.DataFrame({COL_TEXT: ["Alice"]}), + seed_path=tmp_path / "seed.parquet", + model_configs=stub_detector_model_configs, + selected_models=stub_detection_model_selection, + gliner_detection_threshold=0.5, + ) + columns = adapter.build_config.call_args.kwargs["columns"] + params = _find_column(columns, COL_VALIDATION_DECISIONS).generator_params + assert params.single_chunk_full_text is True + + def test_pool_size_greater_than_one_emits_warning( stub_detector_model_configs: list[ModelConfig], stub_detection_model_selection: DetectionModelSelection, From 5a74d0b09930e666a3276f1756219a01b8a26fcc Mon Sep 17 00:00:00 2001 From: lipikaramaswamy Date: Fri, 12 Jun 2026 16:15:58 -0700 Subject: [PATCH 2/2] fix(detection): expose single chunk validation flag Signed-off-by: lipikaramaswamy --- .../engine/detection/detection_workflow.py | 6 +++ tests/engine/test_detection_workflow.py | 42 ++++++++++++++++++- 2 files changed, 47 insertions(+), 1 deletion(-) diff --git a/src/anonymizer/engine/detection/detection_workflow.py b/src/anonymizer/engine/detection/detection_workflow.py index b08fc4af..1c762844 100644 --- a/src/anonymizer/engine/detection/detection_workflow.py +++ b/src/anonymizer/engine/detection/detection_workflow.py @@ -247,6 +247,7 @@ def build_detection_config( gliner_detection_threshold: float, validation_max_entities_per_call: int = _DEFAULT_VALIDATION_MAX_ENTITIES_PER_CALL, validation_excerpt_window_chars: int = _DEFAULT_VALIDATION_EXCERPT_WINDOW_CHARS, + validation_single_chunk_full_text: bool = True, entity_labels: list[str] | None = None, data_summary: str | None = None, ) -> DataDesignerConfigBuilder: @@ -261,6 +262,7 @@ def build_detection_config( gliner_detection_threshold=gliner_detection_threshold, validation_max_entities_per_call=validation_max_entities_per_call, validation_excerpt_window_chars=validation_excerpt_window_chars, + validation_single_chunk_full_text=validation_single_chunk_full_text, entity_labels=entity_labels, data_summary=data_summary, ) @@ -280,6 +282,7 @@ def build_detection_builder_for_seed( gliner_detection_threshold: float, validation_max_entities_per_call: int = _DEFAULT_VALIDATION_MAX_ENTITIES_PER_CALL, validation_excerpt_window_chars: int = _DEFAULT_VALIDATION_EXCERPT_WINDOW_CHARS, + validation_single_chunk_full_text: bool = True, entity_labels: list[str] | None = None, data_summary: str | None = None, job_index: int = 0, @@ -300,6 +303,7 @@ def build_detection_builder_for_seed( gliner_detection_threshold=gliner_detection_threshold, validation_max_entities_per_call=validation_max_entities_per_call, validation_excerpt_window_chars=validation_excerpt_window_chars, + validation_single_chunk_full_text=validation_single_chunk_full_text, entity_labels=entity_labels, data_summary=data_summary, ) @@ -364,6 +368,7 @@ def run( gliner_detection_threshold: float, validation_max_entities_per_call: int = _DEFAULT_VALIDATION_MAX_ENTITIES_PER_CALL, validation_excerpt_window_chars: int = _DEFAULT_VALIDATION_EXCERPT_WINDOW_CHARS, + validation_single_chunk_full_text: bool = True, entity_labels: list[str] | None = None, privacy_goal: PrivacyGoal | None = None, data_summary: str | None = None, @@ -393,6 +398,7 @@ def run( gliner_detection_threshold=gliner_detection_threshold, validation_max_entities_per_call=validation_max_entities_per_call, validation_excerpt_window_chars=validation_excerpt_window_chars, + validation_single_chunk_full_text=validation_single_chunk_full_text, entity_labels=entity_labels, data_summary=data_summary, preview_num_records=preview_num_records, diff --git a/tests/engine/test_detection_workflow.py b/tests/engine/test_detection_workflow.py index 17a15e86..27dfbe8c 100644 --- a/tests/engine/test_detection_workflow.py +++ b/tests/engine/test_detection_workflow.py @@ -558,18 +558,58 @@ def test_validation_single_chunk_full_text_threads_to_generator_params( failed_records=[], ) workflow = EntityDetectionWorkflow(adapter=adapter) - workflow.detect_and_validate_entities( + workflow.run( pd.DataFrame({COL_TEXT: ["Alice"]}), model_configs=stub_detector_model_configs, selected_models=stub_detection_model_selection, gliner_detection_threshold=0.5, validation_single_chunk_full_text=False, + tag_latent_entities=False, ) columns = adapter.run_workflow.call_args.kwargs["columns"] params = _find_column(columns, COL_VALIDATION_DECISIONS).generator_params assert params.single_chunk_full_text is False +def test_build_detection_config_threads_validation_single_chunk_full_text( + tmp_path, + stub_detector_model_configs: list[ModelConfig], + stub_detection_model_selection: DetectionModelSelection, +) -> None: + adapter = Mock() + workflow = EntityDetectionWorkflow(adapter=adapter) + workflow.build_detection_config( + pd.DataFrame({COL_TEXT: ["Alice"]}), + seed_path=tmp_path / "seed.parquet", + model_configs=stub_detector_model_configs, + selected_models=stub_detection_model_selection, + gliner_detection_threshold=0.5, + validation_single_chunk_full_text=False, + ) + columns = adapter.build_config.call_args.kwargs["columns"] + params = _find_column(columns, COL_VALIDATION_DECISIONS).generator_params + assert params.single_chunk_full_text is False + + +def test_build_detection_builder_for_seed_threads_validation_single_chunk_full_text( + tmp_path, + stub_detector_model_configs: list[ModelConfig], + stub_detection_model_selection: DetectionModelSelection, +) -> None: + adapter = Mock() + workflow = EntityDetectionWorkflow(adapter=adapter) + workflow.build_detection_builder_for_seed( + seed_path=tmp_path / "seed.parquet", + model_configs=stub_detector_model_configs, + selected_models=stub_detection_model_selection, + gliner_detection_threshold=0.5, + validation_single_chunk_full_text=False, + ) + columns = adapter.build_config_for_seed.call_args.kwargs["columns"] + params = _find_column(columns, COL_VALIDATION_DECISIONS).generator_params + assert params.single_chunk_full_text is False + + def test_build_detection_config_uses_default_validation_single_chunk_full_text( tmp_path, stub_detector_model_configs: list[ModelConfig],