Skip to content
46 changes: 45 additions & 1 deletion src/anonymizer/engine/detection/detection_workflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@
EntitiesByValueSchema,
EntitiesSchema,
LatentEntitiesSchema,
LatentEntitySchema,
)

logger = logging.getLogger("anonymizer.detection")
Expand Down Expand Up @@ -242,7 +243,10 @@ def identify_latent_entities(
workflow_name="latent-entity-detection",
preview_num_records=preview_num_records,
)
return EntityDetectionResult(dataframe=latent_result.dataframe, failed_records=latent_result.failed_records)
return EntityDetectionResult(
dataframe=_pad_empty_latent_column(latent_result.dataframe),
failed_records=latent_result.failed_records,
)

def run(
self,
Expand Down Expand Up @@ -679,3 +683,43 @@ def _format_privacy_goal(privacy_goal: PrivacyGoal | None) -> str:
if privacy_goal is None:
return "Not provided"
return privacy_goal.to_prompt_string()


def _pad_empty_latent_column(df: pd.DataFrame) -> pd.DataFrame:
"""Inject a sentinel into any empty ``_latent_entities`` cell.

Downstream workflows write the DataFrame to parquet via DataDesigner,
which uses pyarrow. pyarrow raises ``Cannot write struct type with no
child field`` when every cell has ``latent_entities: []`` — it can't
infer the nested struct schema from only empty lists.
``LatentEntitiesSchema._ensure_parquet_writable`` covers this when
pydantic validation runs, but DD does not always route through
``model_validate`` (e.g. partial-failure fallback), so we pad again
at the DataFrame level.
"""
if COL_LATENT_ENTITIES not in df.columns:
return df
sentinel = [LatentEntitySchema().model_dump()]

def _fix(cell):
    """Return ``cell``, padded with the sentinel when it holds no latent entities.

    Each cell keeps its existing shape so dict and list shapes are never
    mixed by this helper:

    * a dict cell stays a dict; only its ``"latent_entities"`` value is
      replaced when that value is missing or empty;
    * an empty bare sequence cell becomes the bare sentinel list;
    * a missing scalar (``None``, NaN, ``pd.NA``) is normalized to the
      dict shape ``{"latent_entities": sentinel}``.

    Emptiness is decided with ``len()`` instead of truthiness, and
    ``pd.isna`` is only consulted for scalars. Array-like cells (for
    example ``numpy.ndarray``) would otherwise make the ``if`` raise
    "truth value of an array is ambiguous".
    """
    # None / NaN (e.g. a row absent from a partial-failure fallback merge,
    # reintroduced by a pandas reindex) normalizes to the canonical struct
    # so the column stays parquet-writable. pd.isna() is element-wise on
    # array-likes, so only scalars are passed to it.
    if pd.api.types.is_scalar(cell) and pd.isna(cell):
        return {"latent_entities": sentinel}

    if isinstance(cell, dict):
        entities = cell.get("latent_entities")
        try:
            lacks_entities = len(entities) == 0
        except TypeError:
            # Unsized value (e.g. None): fall back to plain falsiness.
            lacks_entities = not entities
        if lacks_entities:
            return {**cell, "latent_entities": sentinel}
        return cell

    # Text is never an entity container; pass it through untouched.
    if isinstance(cell, (str, bytes)):
        return cell

    # Bare-sequence shape from alternate paths: pad only when empty.
    try:
        is_empty = len(cell) == 0
    except TypeError:
        return cell
    return sentinel if is_empty else cell
Comment on lines +704 to +721

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 None/NaN cells returned unchanged will still fail PyArrow

The _fix helper covers dict with empty latent_entities and empty list, but silently passes through any other value — including None or float('nan') that pandas places in a column during partial-failure fallback. PyArrow will still raise a type error on those cells. An early None/NaN guard returning sentinel closes this gap.

Suggested change
def _fix(cell):
if isinstance(cell, dict):
if not cell.get("latent_entities"):
return {**cell, "latent_entities": sentinel}
return cell
if isinstance(cell, list) and not cell:
return sentinel
return cell
def _fix(cell):
if cell is None or (isinstance(cell, float) and pd.isna(cell)):
return sentinel
if isinstance(cell, dict):
if not cell.get("latent_entities"):
return {**cell, "latent_entities": sentinel}
return cell
if isinstance(cell, list) and not cell:
return sentinel
return cell


df = df.copy()
df[COL_LATENT_ENTITIES] = df[COL_LATENT_ENTITIES].map(_fix)
return df
4 changes: 2 additions & 2 deletions src/anonymizer/engine/replace/llm_replace_workflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -229,7 +229,7 @@ def _get_replacement_mapping_prompt(*, entities_column: str, instructions: str |
- "{{ entity.value }}" ({{ entity.labels_str }})
{%- endfor %}

Examples: {{ <<ENTITY_EXAMPLES_COLUMN>> }}
Per-label type references (generate a NEW realistic value of that kind; do not reuse these example values, and never copy this reference text literally): {{ <<ENTITY_EXAMPLES_COLUMN>> }}

Rules:
1. Related entities must stay consistent:
Expand Down Expand Up @@ -275,5 +275,5 @@ def _get_replacement_mapping_prompt(*, entities_column: str, instructions: str |


_EXAMPLE_LOOKUP: dict[str, str] = {
label: f"(e.g. {', '.join(examples)})" for label, examples in ENTITY_LABEL_EXAMPLES.items()
label: f"such as {', '.join(examples)}" for label, examples in ENTITY_LABEL_EXAMPLES.items()
}
Loading
Loading