From 8b0531c44d38898d704d554ea1a80331a894c4a0 Mon Sep 17 00:00:00 2001 From: Mathieu Dupont <108517594+mathieudpnt@users.noreply.github.com> Date: Wed, 27 May 2026 11:42:39 +0200 Subject: [PATCH] welcome aplose2raven --- src/post_processing/utils/formatting_utils.py | 179 ++++++++++++++++++ src/post_processing/utils/plot_utils.py | 67 ++++--- tests/test_formatting_utils.py | 79 ++++++++ 3 files changed, 294 insertions(+), 31 deletions(-) create mode 100644 src/post_processing/utils/formatting_utils.py create mode 100644 tests/test_formatting_utils.py diff --git a/src/post_processing/utils/formatting_utils.py b/src/post_processing/utils/formatting_utils.py new file mode 100644 index 0000000..3f21c1c --- /dev/null +++ b/src/post_processing/utils/formatting_utils.py @@ -0,0 +1,179 @@ +from __future__ import annotations + +import numpy as np +from pandas import DataFrame, Timedelta, Timestamp + + +def aplose2raven( + aplose_result: DataFrame, + list_audio_begin_time: list[Timestamp], + audio_durations: list[Timedelta], +) -> DataFrame: + r"""Format an APLOSE result ``DataFrame`` to a Raven result ``DataFrame``. + + The list of audio files and durations considered for the Raven campaign should be + provided to account for the deviations between the advertised and actual + file durations. + + Parameters + ---------- + aplose_result: Dataframe, + APLOSE formatted result ``DataFrame``. + + list_audio_begin_time: list[Timestamp] + list of tz-aware timestamps from considered audio files begin time. + + audio_durations: list[Timedelta] + list of all considered audio file durations. + + Returns + ------- + Raven formatted ``DataFrame``. 
+ + Example of use + -------------- + >>> from pathlib import Path + >>> from pandas import read_csv + >>> from osekit.core.audio_dataset import AudioDataset + >>> from osekit.utils.formatting import aplose2raven + + >>> dataset_folder = Path(r"path\to\audio\folder") + >>> dataset = AudioDataset.from_folder(dataset_folder, + >>> strptime_format="strptime_format", + >>> timezone='utc', + >>> ) + + >>> begin_list = sorted([f.begin for f in list(dataset.files)]) + >>> duration_list = sorted([f.duration for f in list(dataset.files)]) + + >>> csv = Path(r"path\to\result\csv") + >>> df = read_csv(csv, + >>> parse_dates=["start_datetime", "end_datetime"] + >>> ).sort_values("start_datetime") + >>> .reset_index(drop=True) + + >>> df_raven = aplose2raven(df, begin_list, duration_list) + >>> raven_result.to_csv('path/to/result/file.txt', sep='\t', index=False) + + """ + # index of the corresponding audio file for each detection + index_detection = ( + np.searchsorted( + list_audio_begin_time, + aplose_result["start_datetime"], + side="right", + ) + - 1 + ) + + """ + The following time adjustment is necessary because Raven does not account + for the duty cycle, nor for any potential offset between the end of one + file and the start of the next. To ensure that detection timestamps in + APLOSE format align with the spectrograms displayed by Raven, a correction + of the number of seconds is required, since the software only uses the + elapsed time from the beginning of the first file to generate the bounding boxes. + """ + + # Add the begin time of the audio file corresponding to each detection + aplose_result["wav_timestamp"] = [list_audio_begin_time[i] for i in index_detection] + + # Compute the time gaps between consecutive audio file begin time + audio_begin_timegap = list(np.diff(list_audio_begin_time).tolist()) + + # Adjustment values: difference between each file's duration + # and the gap until the next file. 
+ # (Required to account for potential gaps/overlaps between files) + adjustment_values = [Timedelta(0)] + adjustment_values.extend( + [ + t1 - t2 + for (t1, t2) in zip(audio_durations[:-1], audio_begin_timegap, strict=False) + ], + ) + + # Cumulative adjustment in seconds, to realign all detection timestamps consistently + cumsum_adjust = list(np.cumsum(adjustment_values)) + + detection_begin_datetime_adjusted = [] + detection_end_datetime_adjusted = [] + for i in range(len(aplose_result)): + detection_begin_time = aplose_result["start_datetime"].iloc[i] + detection_end_time = aplose_result["end_datetime"].iloc[i] + audio_begin_time = aplose_result["wav_timestamp"].iloc[i] + ind = index_detection[i] + """ + For duty cycled data, if aplose_result detections were reshaped (eg to 60s duration), + the start or end of the detection might virtually be located in a OFF duty cycle phase. + This would cause issue in Raven, because the OFF part are not represented, + and the detection start will be located on the previous audio file. + The 2 following 'if' conditions apply the appropriate correction + to make the Raven box (1)starts or (2) ends. + at the appropriate timing in Raven (ie at the begining or end of an audio file). 
+ """ + + audio_begin_time_adjusted = audio_begin_time + audio_durations[ind] + + if ind < len(audio_begin_timegap): + next_audio_begin_time_adjusted = audio_begin_time + audio_begin_timegap[ind] + else: + next_audio_begin_time_adjusted += audio_durations[ind] + + if ( + audio_begin_time_adjusted + < detection_begin_time + < next_audio_begin_time_adjusted + ): + correction_duration = list_audio_begin_time[ind + 1] - detection_begin_time + detection_begin_datetime_adjusted.append( + detection_begin_time + cumsum_adjust[ind + 1] + correction_duration, + ) + detection_end_datetime_adjusted.append( + detection_end_time + cumsum_adjust[ind + 1], + ) + elif ( + audio_begin_time_adjusted + < detection_end_time + < next_audio_begin_time_adjusted + ): + detection_begin_datetime_adjusted.append( + detection_begin_time + cumsum_adjust[ind], + ) + correction_duration = (detection_end_time - detection_begin_time) - ( + (audio_begin_time + audio_durations[ind]) - detection_begin_time + ) + detection_end_datetime_adjusted.append( + detection_end_time + cumsum_adjust[ind] - correction_duration, + ) + + else: + # Else, apply normal Raven time correction + detection_begin_datetime_adjusted.append( + detection_begin_time + cumsum_adjust[ind], + ) + detection_end_datetime_adjusted.append( + detection_end_time + cumsum_adjust[ind], + ) + + # Convert the datetimes to seconds from the start of first audio (raven format) + begin_time_adjusted = [ + (d - list_audio_begin_time[0]).total_seconds() + for d in detection_begin_datetime_adjusted + ] + end_time_adjusted = [ + (d - list_audio_begin_time[0]).total_seconds() + for d in detection_end_datetime_adjusted + ] + + # Build corrected Raven selection table + raven_result = DataFrame() + raven_result["Selection"] = list(range(1, len(aplose_result) + 1)) + raven_result["View"] = [1] * len(aplose_result) + raven_result["Channel"] = [1] * len(aplose_result) + raven_result["Begin Time (s)"] = begin_time_adjusted + raven_result["End Time (s)"] = 
end_time_adjusted + raven_result["Low Freq (Hz)"] = aplose_result["start_frequency"] + raven_result["High Freq (Hz)"] = aplose_result["end_frequency"] + raven_result["Begin Date Time Real"] = aplose_result["start_datetime"] + + return raven_result diff --git a/src/post_processing/utils/plot_utils.py b/src/post_processing/utils/plot_utils.py index 8f84334..b61b159 100644 --- a/src/post_processing/utils/plot_utils.py +++ b/src/post_processing/utils/plot_utils.py @@ -31,7 +31,8 @@ get_labels_and_annotators, get_sun_times, get_time_range_and_bin_size, - timedelta_to_str, round_begin_end_timestamps, + timedelta_to_str, + round_begin_end_timestamps, ) from post_processing.utils.filtering_utils import ( filter_by_annotator, @@ -55,7 +56,12 @@ def histo( ax: plt.Axes, bin_size: Timedelta | BaseOffset, time_bin: Timedelta, - **kwargs: bool | str | list[str] | tuple[float, float] | list[Timestamp] | RecordingPeriod, # noqa: E501 + **kwargs: bool + | str + | list[str] + | tuple[float, float] + | list[Timestamp] + | RecordingPeriod, # noqa: E501 ) -> None: """Seasonality plot. @@ -89,9 +95,11 @@ def histo( annotators = list(annotators) if len(df) <= 1: - msg = (f"DataFrame with annotators '{', '.join(annotators)}'" - f" / labels '{', '.join(labels)}'" - f" do not contains enough detections.") + msg = ( + f"DataFrame with annotators '{', '.join(annotators)}'" + f" / labels '{', '.join(labels)}'" + f" do not contains enough detections." + ) logging.warning(msg) return @@ -283,12 +291,13 @@ def scatter( ) -def heatmap(df: DataFrame, - ax: Axes, - bin_size: Timedelta | BaseOffset, - time_range: DatetimeIndex, - **kwargs: bool | tuple[float, float], - ) -> None: +def heatmap( + df: DataFrame, + ax: Axes, + bin_size: Timedelta | BaseOffset, + time_range: DatetimeIndex, + **kwargs: bool | tuple[float, float], +) -> None: """Heatmap of detections for a given annotator and label. 
Parameters @@ -567,9 +576,7 @@ def timeline( labels, _ = get_labels_and_annotators(df) - color = ( - color or [c for _, c in zip(range(len(labels)), cycle(default_colors))] - ) + color = color or [c for _, c in zip(range(len(labels)), cycle(default_colors))] for i, label in enumerate(labels): time_det = df[(df["annotation"] == label)]["start_datetime"].to_list() @@ -579,7 +586,7 @@ def timeline( ax.grid(color="k", linestyle="-", linewidth=0.2) ax.set_yticks(np.arange(0, len(labels), 1)) - ax.set_yticklabels(labels[::-1]) + ax.set_yticklabels(labels) ax.set_xlabel("Date") ax.set_xlim( df["start_datetime"].min().floor("1d"), @@ -619,11 +626,12 @@ def set_y_axis_to_percentage(ax: plt.Axes, max_val: float) -> None: ax.set_ylabel(f"{current_label} (%)") -def set_dynamic_ylim(ax: plt.Axes, - df: DataFrame, - padding: float = 0.05, - nticks: int = 4, - ) -> None: +def set_dynamic_ylim( + ax: plt.Axes, + df: DataFrame, + padding: float = 0.05, + nticks: int = 4, +) -> None: """Set y-axis limits and ticks dynamically based on DataFrame values.""" max_val = np.nanmax(df.to_numpy()) upper_lim = int(ceil((1 + padding) * max_val)) @@ -635,10 +643,7 @@ def set_dynamic_ylim(ax: plt.Axes, def set_plot_title(ax: plt.Axes, annotators: list[str], labels: list[str]) -> None: """Set plot title.""" - title = ( - f"annotator: {', '.join(set(annotators))}\n" - f"label: {', '.join(set(labels))}" - ) + title = f"annotator: {', '.join(set(annotators))}\nlabel: {', '.join(set(labels))}" ax.set_title(title) @@ -723,13 +728,13 @@ def shade_no_effort( def _draw_effort_spans( - ax: plt.Axes, - effort_index: DatetimeIndex, - width_days: float, - *, - facecolor: str, - alpha: float, - label: str, + ax: plt.Axes, + effort_index: DatetimeIndex, + width_days: float, + *, + facecolor: str, + alpha: float, + label: str, ) -> None: """Draw vertical lines for effort plot.""" for ts in effort_index: diff --git a/tests/test_formatting_utils.py b/tests/test_formatting_utils.py new file mode 100644 index 
import pytest
from pandas import DataFrame, Timedelta, Timestamp, date_range

from post_processing.utils.formatting_utils import aplose2raven


@pytest.fixture
def aplose_dataframe() -> DataFrame:
    """Return four APLOSE-formatted "boat" detections with a default index."""
    n_rows = 4
    start_datetimes = [
        Timestamp("2020-05-29T11:30:00.000+00:00"),
        Timestamp("2020-05-29T11:31:00.000+00:00"),
        Timestamp("2020-05-29T11:31:05.900+00:00"),
        Timestamp("2020-05-29T11:32:50.000+00:00"),
    ]
    end_datetimes = [
        Timestamp("2020-05-29T11:30:30.000+00:00"),
        Timestamp("2020-05-29T11:31:30.000+00:00"),
        Timestamp("2020-05-29T11:31:08.100+00:00"),
        Timestamp("2020-05-29T11:33:20.000+00:00"),
    ]
    columns = {
        "dataset": ["dataset_test"] * n_rows,
        "filename": ["file1.wav", "file2.wav", "file3.wav", "file4.wav"],
        "start_time": [0, 0, 5.9, 0],
        "end_time": [30, 30, 8.1, 30],
        "start_frequency": [0, 0, 18500.0, 0],
        "end_frequency": [96000, 96000, 53000.0, 96000],
        "annotation": ["boat"] * n_rows,
        "annotator": ["bbjuni"] * n_rows,
        "start_datetime": start_datetimes,
        "end_datetime": end_datetimes,
        "is_box": [0, 0, 1, 0],
    }
    frame = DataFrame(columns)
    return frame.reset_index(drop=True)


@pytest.fixture
def audio_timestamps() -> list:
    """Return the begin times of six audio files, one starting every minute."""
    begin_times = date_range(
        start="2020-05-29T11:30:00.000+00:00",
        end="2020-05-29T11:35:00.000+00:00",
        freq="1min",
    )
    return list(begin_times)


@pytest.fixture
def audio_durations(audio_timestamps: list[Timestamp]) -> list:
    """Return one 30 s duration per audio file."""
    return [Timedelta("30s") for _ in audio_timestamps]


@pytest.mark.unit
def test_aplose2raven(
    aplose_dataframe: DataFrame,
    audio_timestamps: list[Timestamp],
    audio_durations: list[Timedelta],
) -> None:
    """Check the Raven table built from the fixture detections and audio files."""
    expected_columns = {
        "Selection": [1, 2, 3, 4],
        "View": [1, 1, 1, 1],
        "Channel": [1, 1, 1, 1],
        "Begin Time (s)": [0.0, 30.0, 35.9, 90.0],
        "End Time (s)": [30.0, 60.0, 38.1, 110.0],
        "Low Freq (Hz)": [0.0, 0.0, 18500.0, 0.0],
        "High Freq (Hz)": [96000.0, 96000.0, 53000.0, 96000.0],
        "Begin Date Time Real": aplose_dataframe["start_datetime"],
    }
    expected_raven_dataframe = DataFrame(expected_columns)

    raven_dataframe = aplose2raven(
        aplose_result=aplose_dataframe,
        list_audio_begin_time=audio_timestamps,
        audio_durations=audio_durations,
    )

    assert expected_raven_dataframe.equals(raven_dataframe)