From 8b0531c44d38898d704d554ea1a80331a894c4a0 Mon Sep 17 00:00:00 2001
From: Mathieu Dupont <108517594+mathieudpnt@users.noreply.github.com>
Date: Wed, 27 May 2026 11:42:39 +0200
Subject: [PATCH] welcome aplose2raven

---
 src/post_processing/utils/formatting_utils.py | 179 ++++++++++++++++++
 src/post_processing/utils/plot_utils.py       |  67 ++++---
 tests/test_formatting_utils.py                |  79 ++++++++
 3 files changed, 294 insertions(+), 31 deletions(-)
 create mode 100644 src/post_processing/utils/formatting_utils.py
 create mode 100644 tests/test_formatting_utils.py

diff --git a/src/post_processing/utils/formatting_utils.py b/src/post_processing/utils/formatting_utils.py
new file mode 100644
index 0000000..3f21c1c
--- /dev/null
+++ b/src/post_processing/utils/formatting_utils.py
@@ -0,0 +1,179 @@
+from __future__ import annotations
+
+import numpy as np
+from pandas import DataFrame, Timedelta, Timestamp
+
+
+def aplose2raven(
+    aplose_result: DataFrame,
+    list_audio_begin_time: list[Timestamp],
+    audio_durations: list[Timedelta],
+) -> DataFrame:
+    r"""Format an APLOSE result ``DataFrame`` to a Raven result ``DataFrame``.
+
+    Raven does not account for the duty cycle, nor for any gap or overlap between
+    the end of one file and the start of the next: it only uses the time elapsed
+    from the beginning of the first file to draw the selection boxes. Detection
+    datetimes are therefore shifted by the cumulated difference between the
+    duration of each preceding file and the gap to the file that follows it, then
+    expressed in seconds from the begin time of the first file.
+
+    The list of audio files and durations considered for the Raven campaign should be
+    provided to account for the deviations between the advertised and actual
+    file durations. ``aplose_result`` is left unmodified.
+
+    Parameters
+    ----------
+    aplose_result: DataFrame
+        APLOSE formatted result ``DataFrame``. The columns ``start_datetime``,
+        ``end_datetime``, ``start_frequency`` and ``end_frequency`` are read.
+
+    list_audio_begin_time: list[Timestamp]
+        Begin times of the considered audio files, sorted in ascending order and
+        comparable with the detection datetimes (e.g. all tz-aware).
+
+    audio_durations: list[Timedelta]
+        Durations of the considered audio files, in the same order as
+        ``list_audio_begin_time`` (one duration per begin time).
+
+    Returns
+    -------
+    DataFrame
+        Raven selection table, one row per detection, with the columns
+        ``Selection``, ``View``, ``Channel``, ``Begin Time (s)``, ``End Time (s)``,
+        ``Low Freq (Hz)``, ``High Freq (Hz)`` and ``Begin Date Time Real``.
+
+    Raises
+    ------
+    ValueError
+        If both audio lists do not have the same length, or if a detection
+        starts before the first audio file.
+
+    Example of use
+    --------------
+    >>> from pathlib import Path
+    >>> from pandas import read_csv
+    >>> from osekit.core.audio_dataset import AudioDataset
+    >>> from post_processing.utils.formatting_utils import aplose2raven
+
+    >>> dataset_folder = Path(r"path\to\audio\folder")
+    >>> dataset = AudioDataset.from_folder(
+    ...     dataset_folder,
+    ...     strptime_format="strptime_format",
+    ...     timezone="utc",
+    ... )
+
+    >>> files = sorted(dataset.files, key=lambda f: f.begin)
+    >>> begin_list = [f.begin for f in files]
+    >>> duration_list = [f.duration for f in files]
+
+    >>> csv = Path(r"path\to\result\csv")
+    >>> df = (
+    ...     read_csv(csv, parse_dates=["start_datetime", "end_datetime"])
+    ...     .sort_values("start_datetime")
+    ...     .reset_index(drop=True)
+    ... )
+
+    >>> df_raven = aplose2raven(df, begin_list, duration_list)
+    >>> df_raven.to_csv("path/to/result/file.txt", sep="\t", index=False)
+
+    """
+    begins = list(list_audio_begin_time)
+    durations = list(audio_durations)
+    if len(begins) != len(durations):
+        msg = (
+            f"Expected one duration per audio file, got {len(begins)} begin times "
+            f"and {len(durations)} durations."
+        )
+        raise ValueError(msg)
+
+    # Index of the last audio file beginning at or before each detection start.
+    index_detection = (
+        np.searchsorted(
+            begins,
+            aplose_result["start_datetime"],
+            side="right",
+        )
+        - 1
+    )
+    if (index_detection < 0).any():
+        # A negative index would silently wrap around to the last audio file.
+        msg = "Some detections start before the begin time of the first audio file."
+        raise ValueError(msg)
+
+    # Time gaps between consecutive audio file begin times.
+    audio_begin_timegap = [
+        following - current
+        for current, following in zip(begins, begins[1:], strict=False)
+    ]
+
+    # Cumulative adjustment (``Timedelta``) applied to the detections of each file:
+    # running sum, over the preceding files, of the file duration minus the gap
+    # until the next file (accounts for gaps/overlaps between files).
+    cumsum_adjust = [Timedelta(0)]
+    for duration, gap in zip(durations[:-1], audio_begin_timegap, strict=False):
+        cumsum_adjust.append(cumsum_adjust[-1] + (duration - gap))
+
+    # For duty cycled data, if aplose_result detections were reshaped (eg to 60s
+    # duration), the start or end of a detection might virtually be located in an
+    # OFF duty cycle phase. This would cause issues in Raven, because the OFF parts
+    # are not represented: the box is therefore made to (1) start at the beginning
+    # of the next audio file or (2) end at the end of the current audio file.
+    detection_begin_datetime_adjusted = []
+    detection_end_datetime_adjusted = []
+    for detection_begin_time, detection_end_time, ind in zip(
+        aplose_result["start_datetime"],
+        aplose_result["end_datetime"],
+        index_detection.tolist(),
+        strict=True,
+    ):
+        # The OFF phase spans from the end of the current file to the begin time
+        # of the next one. The last file has no next file, hence an empty OFF phase
+        # (so neither of the two corrections below applies to it).
+        audio_end_time = begins[ind] + durations[ind]
+        if ind < len(audio_begin_timegap):
+            next_audio_begin_time = begins[ind + 1]
+        else:
+            next_audio_begin_time = audio_end_time
+
+        if audio_end_time < detection_begin_time < next_audio_begin_time:
+            # (1) Detection starting in an OFF phase: it belongs to the next file.
+            # NOTE(review): the end time is not clamped, so a detection lying
+            # entirely in the OFF phase ends before it begins -- confirm intent.
+            begin_adjusted = next_audio_begin_time + cumsum_adjust[ind + 1]
+            end_adjusted = detection_end_time + cumsum_adjust[ind + 1]
+        elif audio_end_time < detection_end_time < next_audio_begin_time:
+            # (2) Detection ending in an OFF phase: cut it at the end of the file.
+            begin_adjusted = detection_begin_time + cumsum_adjust[ind]
+            end_adjusted = audio_end_time + cumsum_adjust[ind]
+        else:
+            # Else, apply normal Raven time correction
+            begin_adjusted = detection_begin_time + cumsum_adjust[ind]
+            end_adjusted = detection_end_time + cumsum_adjust[ind]
+        detection_begin_datetime_adjusted.append(begin_adjusted)
+        detection_end_datetime_adjusted.append(end_adjusted)
+
+    # Convert the datetimes to seconds from the start of first audio (raven format)
+    begin_time_adjusted = [
+        (d - begins[0]).total_seconds() for d in detection_begin_datetime_adjusted
+    ]
+    end_time_adjusted = [
+        (d - begins[0]).total_seconds() for d in detection_end_datetime_adjusted
+    ]
+
+    # Build corrected Raven selection table. Columns copied from ``aplose_result``
+    # are re-indexed from 0 so that they align with the new table whatever the
+    # index of the input ``DataFrame`` is.
+    source = aplose_result.reset_index(drop=True)
+    n_detections = len(source)
+    raven_result = DataFrame()
+    raven_result["Selection"] = list(range(1, n_detections + 1))
+    raven_result["View"] = [1] * n_detections
+    raven_result["Channel"] = [1] * n_detections
+    raven_result["Begin Time (s)"] = begin_time_adjusted
+    raven_result["End Time (s)"] = end_time_adjusted
+    raven_result["Low Freq (Hz)"] = source["start_frequency"]
+    raven_result["High Freq (Hz)"] = source["end_frequency"]
+    raven_result["Begin Date Time Real"] = source["start_datetime"]
+
+    return raven_result
diff --git a/src/post_processing/utils/plot_utils.py b/src/post_processing/utils/plot_utils.py
index 8f84334..b61b159 100644
--- a/src/post_processing/utils/plot_utils.py
+++ b/src/post_processing/utils/plot_utils.py
@@ -31,7 +31,8 @@
     get_labels_and_annotators,
     get_sun_times,
     get_time_range_and_bin_size,
-    timedelta_to_str, round_begin_end_timestamps,
+    timedelta_to_str,
+    round_begin_end_timestamps,
 )
 from post_processing.utils.filtering_utils import (
     filter_by_annotator,
@@ -55,7 +56,12 @@ def histo(
     ax: plt.Axes,
     bin_size: Timedelta | BaseOffset,
     time_bin: Timedelta,
-    **kwargs: bool | str | list[str] | tuple[float, float] | list[Timestamp] | RecordingPeriod,  # noqa: E501
+    **kwargs: bool
+    | str
+    | list[str]
+    | tuple[float, float]
+    | list[Timestamp]
+    | RecordingPeriod,  # noqa: E501
 ) -> None:
     """Seasonality plot.
 
@@ -89,9 +95,11 @@ def histo(
     annotators = list(annotators)
 
     if len(df) <= 1:
-        msg = (f"DataFrame with annotators '{', '.join(annotators)}'"
-               f" / labels '{', '.join(labels)}'"
-               f" do not contains enough detections.")
+        msg = (
+            f"DataFrame with annotators '{', '.join(annotators)}'"
+            f" / labels '{', '.join(labels)}'"
+            f" do not contains enough detections."
+        )
         logging.warning(msg)
         return
 
@@ -283,12 +291,13 @@ def scatter(
         )
 
 
-def heatmap(df: DataFrame,
-            ax: Axes,
-            bin_size: Timedelta | BaseOffset,
-            time_range: DatetimeIndex,
-            **kwargs: bool | tuple[float, float],
-            ) -> None:
+def heatmap(
+    df: DataFrame,
+    ax: Axes,
+    bin_size: Timedelta | BaseOffset,
+    time_range: DatetimeIndex,
+    **kwargs: bool | tuple[float, float],
+) -> None:
     """Heatmap of detections for a given annotator and label.
 
     Parameters
@@ -567,9 +576,7 @@ def timeline(
 
     labels, _ = get_labels_and_annotators(df)
 
-    color = (
-        color or [c for _, c in zip(range(len(labels)), cycle(default_colors))]
-    )
+    color = color or [c for _, c in zip(range(len(labels)), cycle(default_colors))]
 
     for i, label in enumerate(labels):
         time_det = df[(df["annotation"] == label)]["start_datetime"].to_list()
@@ -579,7 +586,7 @@ def timeline(
 
     ax.grid(color="k", linestyle="-", linewidth=0.2)
     ax.set_yticks(np.arange(0, len(labels), 1))
-    ax.set_yticklabels(labels[::-1])
+    ax.set_yticklabels(labels)
     ax.set_xlabel("Date")
     ax.set_xlim(
         df["start_datetime"].min().floor("1d"),
@@ -619,11 +626,12 @@ def set_y_axis_to_percentage(ax: plt.Axes, max_val: float) -> None:
         ax.set_ylabel(f"{current_label} (%)")
 
 
-def set_dynamic_ylim(ax: plt.Axes,
-                     df: DataFrame,
-                     padding: float = 0.05,
-                     nticks: int = 4,
-                     ) -> None:
+def set_dynamic_ylim(
+    ax: plt.Axes,
+    df: DataFrame,
+    padding: float = 0.05,
+    nticks: int = 4,
+) -> None:
     """Set y-axis limits and ticks dynamically based on DataFrame values."""
     max_val = np.nanmax(df.to_numpy())
     upper_lim = int(ceil((1 + padding) * max_val))
@@ -635,10 +643,7 @@ def set_dynamic_ylim(ax: plt.Axes,
 
 def set_plot_title(ax: plt.Axes, annotators: list[str], labels: list[str]) -> None:
     """Set plot title."""
-    title = (
-        f"annotator: {', '.join(set(annotators))}\n"
-        f"label: {', '.join(set(labels))}"
-    )
+    title = f"annotator: {', '.join(set(annotators))}\nlabel: {', '.join(set(labels))}"
     ax.set_title(title)
 
 
@@ -723,13 +728,13 @@ def shade_no_effort(
 
 
 def _draw_effort_spans(
-        ax: plt.Axes,
-        effort_index: DatetimeIndex,
-        width_days: float,
-        *,
-        facecolor: str,
-        alpha: float,
-        label: str,
+    ax: plt.Axes,
+    effort_index: DatetimeIndex,
+    width_days: float,
+    *,
+    facecolor: str,
+    alpha: float,
+    label: str,
 ) -> None:
     """Draw vertical lines for effort plot."""
     for ts in effort_index:
diff --git a/tests/test_formatting_utils.py b/tests/test_formatting_utils.py
new file mode 100644
index 0000000..43bb1ce
--- /dev/null
+++ b/tests/test_formatting_utils.py
@@ -0,0 +1,79 @@
+import pytest
+from pandas import DataFrame, Timedelta, Timestamp, date_range
+
+from post_processing.utils.formatting_utils import aplose2raven
+
+
+@pytest.fixture
+def aplose_dataframe() -> DataFrame:
+    """Return four APLOSE "boat" detections made by annotator "bbjuni"."""
+    start_datetimes = [
+        Timestamp("2020-05-29T11:30:00.000+00:00"),
+        Timestamp("2020-05-29T11:31:00.000+00:00"),
+        Timestamp("2020-05-29T11:31:05.900+00:00"),
+        Timestamp("2020-05-29T11:32:50.000+00:00"),
+    ]
+    end_datetimes = [
+        Timestamp("2020-05-29T11:30:30.000+00:00"),
+        Timestamp("2020-05-29T11:31:30.000+00:00"),
+        Timestamp("2020-05-29T11:31:08.100+00:00"),
+        Timestamp("2020-05-29T11:33:20.000+00:00"),
+    ]
+    columns = {
+        "dataset": ["dataset_test"] * 4,
+        "filename": ["file1.wav", "file2.wav", "file3.wav", "file4.wav"],
+        "start_time": [0, 0, 5.9, 0],
+        "end_time": [30, 30, 8.1, 30],
+        "start_frequency": [0, 0, 18500.0, 0],
+        "end_frequency": [96000, 96000, 53000.0, 96000],
+        "annotation": ["boat"] * 4,
+        "annotator": ["bbjuni"] * 4,
+        "start_datetime": start_datetimes,
+        "end_datetime": end_datetimes,
+        "is_box": [0, 0, 1, 0],
+    }
+    return DataFrame(columns).reset_index(drop=True)
+
+
+@pytest.fixture
+def audio_timestamps() -> list:
+    """Return six tz-aware audio file begin times, one per minute."""
+    first_begin = "2020-05-29T11:30:00.000+00:00"
+    last_begin = "2020-05-29T11:35:00.000+00:00"
+
+    # date_range includes both bounds: 11:30, 11:31, ..., 11:35.
+    begins = date_range(start=first_begin, end=last_begin, freq="1min")
+    return list(begins)
+
+
+@pytest.fixture
+def audio_durations(audio_timestamps: list[Timestamp]) -> list:
+    return [Timedelta("30s") for _ in audio_timestamps]  # one 30 s duration per file
+
+
+@pytest.mark.unit
+def test_aplose2raven(
+    aplose_dataframe: DataFrame,
+    audio_timestamps: list[Timestamp],
+    audio_durations: list[Timedelta],
+) -> None:
+    """Check the Raven table built from duty-cycled files (30 s every minute)."""
+    from pandas.testing import assert_frame_equal
+
+    expected = DataFrame(
+        {
+            "Selection": [1, 2, 3, 4],
+            "View": [1, 1, 1, 1],
+            "Channel": [1, 1, 1, 1],
+            # Each file before a detection's file removes its 30 s OFF phase.
+            "Begin Time (s)": [0.0, 30.0, 35.9, 90.0],
+            "End Time (s)": [30.0, 60.0, 38.1, 110.0],
+            "Low Freq (Hz)": [0.0, 0.0, 18500.0, 0.0],
+            "High Freq (Hz)": [96000.0, 96000.0, 53000.0, 96000.0],
+            "Begin Date Time Real": aplose_dataframe["start_datetime"],
+        },
+    )
+
+    result = aplose2raven(aplose_dataframe, audio_timestamps, audio_durations)
+    # assert_frame_equal reports the differing column/values and tolerates float noise.
+    assert_frame_equal(result, expected)