Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
179 changes: 179 additions & 0 deletions src/post_processing/utils/formatting_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,179 @@
from __future__ import annotations

import numpy as np
from pandas import DataFrame, Timedelta, Timestamp


def aplose2raven(
aplose_result: DataFrame,
list_audio_begin_time: list[Timestamp],
audio_durations: list[Timedelta],
) -> DataFrame:
r"""Format an APLOSE result ``DataFrame`` to a Raven result ``DataFrame``.

The list of audio files and durations considered for the Raven campaign should be
provided to account for the deviations between the advertised and actual
file durations.

Parameters
----------
aplose_result: Dataframe,
APLOSE formatted result ``DataFrame``.

list_audio_begin_time: list[Timestamp]
list of tz-aware timestamps from considered audio files begin time.

audio_durations: list[Timedelta]
list of all considered audio file durations.

Returns
-------
Raven formatted ``DataFrame``.

Example of use
--------------
>>> from pathlib import Path
>>> from pandas import read_csv
>>> from osekit.core.audio_dataset import AudioDataset
>>> from osekit.utils.formatting import aplose2raven

>>> dataset_folder = Path(r"path\to\audio\folder")
>>> dataset = AudioDataset.from_folder(dataset_folder,
>>> strptime_format="strptime_format",
>>> timezone='utc',
>>> )

>>> begin_list = sorted([f.begin for f in list(dataset.files)])
>>> duration_list = sorted([f.duration for f in list(dataset.files)])

>>> csv = Path(r"path\to\result\csv")
>>> df = read_csv(csv,
>>> parse_dates=["start_datetime", "end_datetime"]
>>> ).sort_values("start_datetime")
>>> .reset_index(drop=True)

>>> df_raven = aplose2raven(df, begin_list, duration_list)
>>> raven_result.to_csv('path/to/result/file.txt', sep='\t', index=False)

"""
# index of the corresponding audio file for each detection
index_detection = (
np.searchsorted(
list_audio_begin_time,
aplose_result["start_datetime"],
side="right",
)
- 1
)

"""
The following time adjustment is necessary because Raven does not account
for the duty cycle, nor for any potential offset between the end of one
file and the start of the next. To ensure that detection timestamps in
APLOSE format align with the spectrograms displayed by Raven, a correction
of the number of seconds is required, since the software only uses the
elapsed time from the beginning of the first file to generate the bounding boxes.
"""

# Add the begin time of the audio file corresponding to each detection
aplose_result["wav_timestamp"] = [list_audio_begin_time[i] for i in index_detection]

# Compute the time gaps between consecutive audio file begin time
audio_begin_timegap = list(np.diff(list_audio_begin_time).tolist())

# Adjustment values: difference between each file's duration
# and the gap until the next file.
# (Required to account for potential gaps/overlaps between files)
adjustment_values = [Timedelta(0)]
adjustment_values.extend(
[
t1 - t2
for (t1, t2) in zip(audio_durations[:-1], audio_begin_timegap, strict=False)
],
)

# Cumulative adjustment in seconds, to realign all detection timestamps consistently
cumsum_adjust = list(np.cumsum(adjustment_values))

detection_begin_datetime_adjusted = []
detection_end_datetime_adjusted = []
for i in range(len(aplose_result)):
detection_begin_time = aplose_result["start_datetime"].iloc[i]
detection_end_time = aplose_result["end_datetime"].iloc[i]
audio_begin_time = aplose_result["wav_timestamp"].iloc[i]
ind = index_detection[i]
"""
For duty cycled data, if aplose_result detections were reshaped (eg to 60s duration),
the start or end of the detection might virtually be located in a OFF duty cycle phase.
This would cause issue in Raven, because the OFF part are not represented,
and the detection start will be located on the previous audio file.
The 2 following 'if' conditions apply the appropriate correction
to make the Raven box (1)starts or (2) ends.
at the appropriate timing in Raven (ie at the begining or end of an audio file).
"""

audio_begin_time_adjusted = audio_begin_time + audio_durations[ind]

if ind < len(audio_begin_timegap):
next_audio_begin_time_adjusted = audio_begin_time + audio_begin_timegap[ind]
else:
next_audio_begin_time_adjusted += audio_durations[ind]

if (
audio_begin_time_adjusted
< detection_begin_time
< next_audio_begin_time_adjusted
):
correction_duration = list_audio_begin_time[ind + 1] - detection_begin_time
detection_begin_datetime_adjusted.append(
detection_begin_time + cumsum_adjust[ind + 1] + correction_duration,
)
detection_end_datetime_adjusted.append(
detection_end_time + cumsum_adjust[ind + 1],
)
elif (
audio_begin_time_adjusted
< detection_end_time
< next_audio_begin_time_adjusted
):
detection_begin_datetime_adjusted.append(
detection_begin_time + cumsum_adjust[ind],
)
correction_duration = (detection_end_time - detection_begin_time) - (
(audio_begin_time + audio_durations[ind]) - detection_begin_time
)
detection_end_datetime_adjusted.append(
detection_end_time + cumsum_adjust[ind] - correction_duration,
)

else:
# Else, apply normal Raven time correction
detection_begin_datetime_adjusted.append(
detection_begin_time + cumsum_adjust[ind],
)
detection_end_datetime_adjusted.append(
detection_end_time + cumsum_adjust[ind],
)

# Convert the datetimes to seconds from the start of first audio (raven format)
begin_time_adjusted = [
(d - list_audio_begin_time[0]).total_seconds()
for d in detection_begin_datetime_adjusted
]
end_time_adjusted = [
(d - list_audio_begin_time[0]).total_seconds()
for d in detection_end_datetime_adjusted
]

# Build corrected Raven selection table
raven_result = DataFrame()
raven_result["Selection"] = list(range(1, len(aplose_result) + 1))
raven_result["View"] = [1] * len(aplose_result)
raven_result["Channel"] = [1] * len(aplose_result)
raven_result["Begin Time (s)"] = begin_time_adjusted
raven_result["End Time (s)"] = end_time_adjusted
raven_result["Low Freq (Hz)"] = aplose_result["start_frequency"]
raven_result["High Freq (Hz)"] = aplose_result["end_frequency"]
raven_result["Begin Date Time Real"] = aplose_result["start_datetime"]

return raven_result
67 changes: 36 additions & 31 deletions src/post_processing/utils/plot_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,8 @@
get_labels_and_annotators,
get_sun_times,
get_time_range_and_bin_size,
timedelta_to_str, round_begin_end_timestamps,
timedelta_to_str,
round_begin_end_timestamps,
)
from post_processing.utils.filtering_utils import (
filter_by_annotator,
Expand All @@ -55,7 +56,12 @@ def histo(
ax: plt.Axes,
bin_size: Timedelta | BaseOffset,
time_bin: Timedelta,
**kwargs: bool | str | list[str] | tuple[float, float] | list[Timestamp] | RecordingPeriod, # noqa: E501
**kwargs: bool
| str
| list[str]
| tuple[float, float]
| list[Timestamp]
| RecordingPeriod, # noqa: E501
) -> None:
"""Seasonality plot.

Expand Down Expand Up @@ -89,9 +95,11 @@ def histo(
annotators = list(annotators)

if len(df) <= 1:
msg = (f"DataFrame with annotators '{', '.join(annotators)}'"
f" / labels '{', '.join(labels)}'"
f" do not contains enough detections.")
msg = (
f"DataFrame with annotators '{', '.join(annotators)}'"
f" / labels '{', '.join(labels)}'"
f" do not contains enough detections."
)
logging.warning(msg)
return

Expand Down Expand Up @@ -283,12 +291,13 @@ def scatter(
)


def heatmap(df: DataFrame,
ax: Axes,
bin_size: Timedelta | BaseOffset,
time_range: DatetimeIndex,
**kwargs: bool | tuple[float, float],
) -> None:
def heatmap(
df: DataFrame,
ax: Axes,
bin_size: Timedelta | BaseOffset,
time_range: DatetimeIndex,
**kwargs: bool | tuple[float, float],
) -> None:
"""Heatmap of detections for a given annotator and label.

Parameters
Expand Down Expand Up @@ -567,9 +576,7 @@ def timeline(

labels, _ = get_labels_and_annotators(df)

color = (
color or [c for _, c in zip(range(len(labels)), cycle(default_colors))]
)
color = color or [c for _, c in zip(range(len(labels)), cycle(default_colors))]

for i, label in enumerate(labels):
time_det = df[(df["annotation"] == label)]["start_datetime"].to_list()
Expand All @@ -579,7 +586,7 @@ def timeline(

ax.grid(color="k", linestyle="-", linewidth=0.2)
ax.set_yticks(np.arange(0, len(labels), 1))
ax.set_yticklabels(labels[::-1])
ax.set_yticklabels(labels)
ax.set_xlabel("Date")
ax.set_xlim(
df["start_datetime"].min().floor("1d"),
Expand Down Expand Up @@ -619,11 +626,12 @@ def set_y_axis_to_percentage(ax: plt.Axes, max_val: float) -> None:
ax.set_ylabel(f"{current_label} (%)")


def set_dynamic_ylim(ax: plt.Axes,
df: DataFrame,
padding: float = 0.05,
nticks: int = 4,
) -> None:
def set_dynamic_ylim(
ax: plt.Axes,
df: DataFrame,
padding: float = 0.05,
nticks: int = 4,
) -> None:
"""Set y-axis limits and ticks dynamically based on DataFrame values."""
max_val = np.nanmax(df.to_numpy())
upper_lim = int(ceil((1 + padding) * max_val))
Expand All @@ -635,10 +643,7 @@ def set_dynamic_ylim(ax: plt.Axes,

def set_plot_title(ax: plt.Axes, annotators: list[str], labels: list[str]) -> None:
    """Set the title of ``ax`` from the given annotators and labels.

    Duplicated names are written only once. They are kept in their order of
    first appearance so that the resulting title is the same on every run
    (iterating over a ``set`` of strings gives no such guarantee).
    """
    unique_annotators = dict.fromkeys(annotators)
    unique_labels = dict.fromkeys(labels)
    title = (
        f"annotator: {', '.join(unique_annotators)}\n"
        f"label: {', '.join(unique_labels)}"
    )
    ax.set_title(title)


Expand Down Expand Up @@ -723,13 +728,13 @@ def shade_no_effort(


def _draw_effort_spans(
ax: plt.Axes,
effort_index: DatetimeIndex,
width_days: float,
*,
facecolor: str,
alpha: float,
label: str,
ax: plt.Axes,
effort_index: DatetimeIndex,
width_days: float,
*,
facecolor: str,
alpha: float,
label: str,
) -> None:
"""Draw vertical lines for effort plot."""
for ts in effort_index:
Expand Down
79 changes: 79 additions & 0 deletions tests/test_formatting_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
import pytest
from pandas import DataFrame, Timedelta, Timestamp, date_range

from post_processing.utils.formatting_utils import aplose2raven


@pytest.fixture
def aplose_dataframe() -> DataFrame:
    """Return a four-detection APLOSE result table used as conversion input."""
    n_rows = 4

    detection_starts = [
        Timestamp("2020-05-29T11:30:00.000+00:00"),
        Timestamp("2020-05-29T11:31:00.000+00:00"),
        Timestamp("2020-05-29T11:31:05.900+00:00"),
        Timestamp("2020-05-29T11:32:50.000+00:00"),
    ]
    detection_ends = [
        Timestamp("2020-05-29T11:30:30.000+00:00"),
        Timestamp("2020-05-29T11:31:30.000+00:00"),
        Timestamp("2020-05-29T11:31:08.100+00:00"),
        Timestamp("2020-05-29T11:33:20.000+00:00"),
    ]

    # Three whole-file annotations and one box (third row).
    columns = {
        "dataset": ["dataset_test"] * n_rows,
        "filename": ["file1.wav", "file2.wav", "file3.wav", "file4.wav"],
        "start_time": [0, 0, 5.9, 0],
        "end_time": [30, 30, 8.1, 30],
        "start_frequency": [0, 0, 18500.0, 0],
        "end_frequency": [96000, 96000, 53000.0, 96000],
        "annotation": ["boat"] * n_rows,
        "annotator": ["bbjuni"] * n_rows,
        "start_datetime": detection_starts,
        "end_datetime": detection_ends,
        "is_box": [0, 0, 1, 0],
    }

    frame = DataFrame(columns)
    return frame.reset_index(drop=True)


@pytest.fixture
def audio_timestamps() -> list:
    """Return the begin times of the audio files: one per minute, 11:30 to 11:35 UTC."""
    begin_times = date_range(
        start="2020-05-29T11:30:00.000+00:00",
        end="2020-05-29T11:35:00.000+00:00",
        freq="1min",
    )
    return list(begin_times)


@pytest.fixture
def audio_durations(audio_timestamps: list[Timestamp]) -> list:
    """Return one 30-second duration per audio file begin time."""
    return [Timedelta("30s") for _ in audio_timestamps]


@pytest.mark.unit
def test_aplose2raven(
    aplose_dataframe: DataFrame,
    audio_timestamps: list[Timestamp],
    audio_durations: list[Timedelta],
) -> None:
    """Check the Raven table built from the APLOSE fixture against known values."""
    # Expected times are seconds from the first file start, with the
    # 30 s recording gaps between the one-minute-spaced files removed.
    expected_columns = {
        "Selection": [1, 2, 3, 4],
        "View": [1, 1, 1, 1],
        "Channel": [1, 1, 1, 1],
        "Begin Time (s)": [0.0, 30.0, 35.9, 90.0],
        "End Time (s)": [30.0, 60.0, 38.1, 110.0],
        "Low Freq (Hz)": [0.0, 0.0, 18500.0, 0.0],
        "High Freq (Hz)": [96000.0, 96000.0, 53000.0, 96000.0],
        "Begin Date Time Real": aplose_dataframe["start_datetime"],
    }

    actual = aplose2raven(
        aplose_result=aplose_dataframe,
        list_audio_begin_time=audio_timestamps,
        audio_durations=audio_durations,
    )
    expected = DataFrame(expected_columns)

    assert expected.equals(actual)