Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
f648c0e
hits_clusterizer included in components.py
camacortespar Nov 13, 2025
ed5248e
Hits clusterizer step included in sophronia.py
camacortespar Nov 13, 2025
8ff46af
Update on 14/11
camacortespar Nov 14, 2025
7b5f1c5
New implementation of hits_clusterizer, factory version
camacortespar Nov 25, 2025
c9c4344
New version of cluster_hits in Sophronia. Main function in hits_funct…
camacortespar Jan 14, 2026
168a3a0
Update on how to call cluster_hits in Sophronia
camacortespar Jan 14, 2026
3650638
Pytests for cluster_tagger function
camacortespar Feb 23, 2026
0e64dad
Pytest for hits clusterizer feature. Also, reference file for exact r…
camacortespar Feb 23, 2026
f22bb75
New reference file including cluster label for hits
camacortespar Mar 3, 2026
e062786
New hits reference file, git problem solved
camacortespar Mar 3, 2026
4e43dc8
scikit-learn added
camacortespar Mar 4, 2026
c24a331
Update conda environment tag
camacortespar Mar 4, 2026
5c41eab
Update beersheba reference files
camacortespar Mar 4, 2026
58bd2a3
PR: first round of comments addressed
camacortespar Apr 22, 2026
0e4703d
Remove @settings
camacortespar Apr 23, 2026
2eadf57
PR: second round of comments addressed
camacortespar Apr 24, 2026
74dcedc
Add @settings
camacortespar May 5, 2026
8dda654
Removing unused import
camacortespar May 6, 2026
fe3d020
DBSCAN eps value set to 1.8 to retain only neighbouring hits as a clu…
camacortespar May 12, 2026
9af71ac
Update reference files for tests
camacortespar May 12, 2026
6009d78
Removing deadline for tests that use hypothesis
camacortespar May 12, 2026
995705d
Updating esmeralda reference file for tests
camacortespar May 12, 2026
0772fbf
Updating reference for esmeralda test
camacortespar May 12, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions 228Th_10evt_hits.h5
Git LFS file not shown
31 changes: 31 additions & 0 deletions invisible_cities/cities/components.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@
from .. reco .corrections import get_df_to_z_converter
from .. reco .xy_algorithms import corona
from .. reco .xy_algorithms import barycenter
from .. reco .hits_functions import cluster_tagger
from .. filters.s1s2_filter import S12Selector
from .. filters.s1s2_filter import S12SelectorOutput
from .. filters.s1s2_filter import pmap_filter
Expand Down Expand Up @@ -1717,6 +1718,36 @@ def correct(hits : pd.DataFrame) -> pd.DataFrame:

return correct

@check_annotations
def hits_clusterizer( min_samples : int
                    , scale_xy    : float
                    , scale_z     : float
                    ) -> Callable:
    """
    Build a hit-clustering component based on DBSCAN.

    Parameters
    ----------
    min_samples : int
        Minimum number of samples required to form a dense region (cluster).
        This includes the point itself.
    scale_xy : float
        Scaling factor to apply to the (x, y) coordinates before clustering.
    scale_z : float
        Scaling factor to apply to the z coordinate before clustering.

    Returns
    -------
    Callable
        A function that takes a DataFrame of hits and returns the same
        DataFrame with an added 'cluster' column holding the labels
        assigned by DBSCAN (-1 for noise).
    """
    def clusterize(hits : pd.DataFrame) -> pd.DataFrame:
        return cluster_tagger( hits
                             , min_samples = min_samples
                             , scale_xy    = scale_xy
                             , scale_z     = scale_z )
    return clusterize


def identity(x : Any) -> Any:
    """Return the input unchanged; used as a no-op pipeline stage."""
    return x
16 changes: 15 additions & 1 deletion invisible_cities/cities/sophronia.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@
from . components import collect
from . components import build_pointlike_event as pointlike_event_builder
from . components import hits_corrector
from . components import hits_clusterizer
from . components import identity

from typing import Optional
Expand Down Expand Up @@ -93,6 +94,7 @@ def sophronia( files_in : OneOrManyFiles
, sipm_charge_type : SiPMCharge
, same_peak : bool
, corrections : Optional[dict] = None
, clustering_params : Optional[dict] = None
):
"""
drift_v : float
Expand Down Expand Up @@ -137,6 +139,15 @@ def sophronia( files_in : OneOrManyFiles
Normalization strategy
norm_value : float, optional
Normalization value in case of `norm_strat = NormStrategy.custom`

clustering_params : dict
min_samples : int
Minimum number of samples required to form a dense region (cluster).
This includes the point itself.
scale_xy : float
Scaling factor to apply to the (x, y) coordinates before clustering.
scale_z : float
Scaling factor to apply to the z coordinate before clustering.
"""
global_reco = compute_xy_position( detector_db
, run_number
Expand Down Expand Up @@ -177,6 +188,9 @@ def sophronia( files_in : OneOrManyFiles

correct_hits = df.map( hits_corrector(**corrections) if corrections is not None else identity
, item = "hits")

cluster_hits = df.map( hits_clusterizer(**clustering_params) if clustering_params is not None else identity
, item = "hits")

build_pointlike_event = df.map( pointlike_event_builder( detector_db
, run_number
Expand All @@ -202,7 +216,7 @@ def sophronia( files_in : OneOrManyFiles
, args = "event_number enough_valid_hits".split())

hits_branch = ( make_hits, enough_valid_hits, df.branch(write_hits_filter)
, hits_select.filter, merge_nn_hits, correct_hits, write_hits)
, hits_select.filter, merge_nn_hits, correct_hits, cluster_hits, write_hits)
kdst_branch = build_pointlike_event, write_pointlike_event
collect_evt_numbers = "event_number", event_number_collector.sink

Expand Down
47 changes: 47 additions & 0 deletions invisible_cities/cities/sophronia_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@

from pytest import mark

from .. io import dst_io as dio
from .. core.testing_utils import assert_dataframes_equal
from .. core.testing_utils import assert_tables_equality
from .. core.testing_utils import ignore_warning
from .. core.system_of_units import pes
Expand Down Expand Up @@ -147,3 +149,48 @@ def test_sophronia_keeps_hitless_events(config_tmpdir, sophronia_config):
with tb.open_file(path_out) as output_file:
assert len(output_file.root.Run.events) == 1
assert "RECO" not in output_file.root


@ignore_warning.no_config_group
def test_sophronia_clustering_integration(config_tmpdir, sophronia_config):
    """
    Runs Sophronia twice (once disabled, once enabled) to verify:
    1. Backward compatibility: No 'cluster' column when disabled.
    2. Feature activation: 'cluster' column exists when enabled.
    3. Data consistency: Enabling clustering does NOT change any other data.
    """
    out_plain     = os.path.join(config_tmpdir, 'test_sophronia_no_cluster.h5')
    out_clustered = os.path.join(config_tmpdir, 'test_sophronia_with_cluster.h5')

    # Run the city once per configuration; only the output file and the
    # clustering parameters differ between the two runs.
    def run_city(file_out, clustering_params):
        config = dict(sophronia_config)
        config.update(dict( file_out          = file_out
                          , event_range       = 1
                          , clustering_params = clustering_params))
        sophronia(**config)

    run_city(out_plain    , None)
    run_city(out_clustered, dict( min_samples = 5
                                , scale_xy    = 15.55
                                , scale_z     = 4.0))

    # Load both outputs
    df_plain     = dio.load_dst(out_plain    , "RECO", "Events")
    df_clustered = dio.load_dst(out_clustered, "RECO", "Events")

    # ----- Assertions
    assert not df_plain.empty
    assert not df_clustered.empty
    assert 'cluster' not in df_plain.columns, "'cluster' column should not exist when clustering is disabled."
    assert 'cluster' in df_clustered.columns, "'cluster' column should exist when clustering is enabled."

    # Everything except the new 'cluster' column must be unchanged
    assert_dataframes_equal(df_plain, df_clustered.drop(columns=['cluster']))
6 changes: 6 additions & 0 deletions invisible_cities/config/sophronia.conf
Original file line number Diff line number Diff line change
Expand Up @@ -63,3 +63,9 @@ corrections = dict(
apply_temp = True,
norm_strat = kr,
apply_z = False)

clustering_params = dict(
min_samples = 5,
scale_xy = 15.55,
scale_z = 4.0
)
4 changes: 4 additions & 0 deletions invisible_cities/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -438,6 +438,10 @@ def sophronia_config(Th228_pmaps, next100_mc_krmap):
filename = next100_mc_krmap,
apply_temp = False,
norm_strat = NormStrategy.kr)
, clustering_params = dict(
min_samples = 5,
scale_xy = 15.55,
scale_z = 4.0)
)
return config

Expand Down
4 changes: 2 additions & 2 deletions invisible_cities/database/test_data/228Th_10evt_deco.h5
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
4 changes: 2 additions & 2 deletions invisible_cities/database/test_data/228Th_10evt_hits.h5
Git LFS file not shown
4 changes: 2 additions & 2 deletions invisible_cities/database/test_data/228Th_10evt_tracks.h5
Git LFS file not shown
85 changes: 82 additions & 3 deletions invisible_cities/reco/hits_functions.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,12 @@
import numpy as np
import pandas as pd

from .. types.ic_types import NN
from itertools import compress
from copy import deepcopy
from typing import List
from sklearn.cluster import DBSCAN

from .. types.ic_types import NN

EPSILON = np.finfo(np.float64).eps

Expand Down Expand Up @@ -64,8 +69,6 @@ def sipms_above_threshold(xys: np.ndarray, qs: np.ndarray, thr:float, energy: fl
return xs, ys, qs, es




def merge_NN_hits(hits: pd.DataFrame, same_peak: bool = True) -> pd.DataFrame:
"""
Finds NN hits (defined as hits with Q=NN) and removes them without energy
Expand Down Expand Up @@ -238,3 +241,79 @@ def threshold_hits(hits: pd.DataFrame, th: float) -> pd.DataFrame:
if th <= 0: return hits
return (hits.groupby("Z", as_index=False)
.apply(apply_threshold, th=th))

def tag_hits_in_event(event_hits : pd.DataFrame
                     , *
                     , min_samples : int
                     , scale_xy    : float
                     , scale_z     : float
                     ) -> pd.DataFrame:
    """
    Applies DBSCAN clustering to a DataFrame containing hits from a single event.
    Hits coordinates are scaled to account for the anisotropy of the detector geometry.
    A 'cluster' column is added with the resulting labels.

    Parameters
    ----------
    event_hits : pd.DataFrame
        DataFrame with hits from a single event. Must contain 'X', 'Y', 'Z' columns.
    min_samples : int
        Minimum number of samples required to form a dense region (cluster).
        This includes the point itself.
    scale_xy : float
        Scaling factor to apply to the XY coordinates before clustering.
    scale_z : float
        Scaling factor to apply to the Z coordinate before clustering.

    Returns
    -------
    pd.DataFrame
        The input hits with a 'cluster' column added (-1 for noise).
    """
    # Force a float copy: without an explicit dtype, to_numpy() returns an
    # integer array when the coordinate columns happen to be integral, and
    # the in-place division below would then raise a casting error.
    coords = event_hits[['X', 'Y', 'Z']].to_numpy(dtype=float)
    # A proper scaling leads to hits being separated
    # by a distance of 1 in the DBSCAN metric space
    coords[:, :2] /= scale_xy
    coords[:,  2] /= scale_z

    # eps parameter is fixed to a value a bit higher than √3
    # to retain diagonal neighbours in the same cluster
    labels = DBSCAN(eps=1.8, min_samples=min_samples).fit_predict(coords)

    # Use assign instead of in-place column insertion: it returns a new
    # frame, avoiding mutation of the caller's data and the
    # SettingWithCopyWarning that can arise inside groupby.apply.
    return event_hits.assign(cluster=labels)

def cluster_tagger(df_hits : pd.DataFrame
                  , *
                  , min_samples : int
                  , scale_xy    : float
                  , scale_z     : float
                  ) -> pd.DataFrame:
    """
    Groups the input DataFrame by 'event' and applies `tag_hits_in_event`
    to each event's group of hits.

    Parameters
    ----------
    df_hits : pd.DataFrame
        DataFrame with hit information. Must contain 'X', 'Y', 'Z', and 'event'.
    min_samples, scale_xy, scale_z :
        See `tag_hits_in_event`

    Returns
    -------
    pd.DataFrame
        The input DataFrame with an added 'cluster' column indicating the
        cluster label for each hit (-1 for noise).
    """
    # Nothing to cluster: just attach an empty integer 'cluster' column.
    if df_hits.empty:
        return df_hits.assign(cluster=pd.Series(dtype=int))

    groups = df_hits.groupby('event', as_index=False, group_keys=False)
    tagged = groups.apply( tag_hits_in_event
                         , min_samples = min_samples
                         , scale_xy    = scale_xy
                         , scale_z     = scale_z )

    # Restore the original row index of the input frame.
    return tagged.set_index(df_hits.index)

Loading
Loading