diff --git a/.DS_Store b/.DS_Store deleted file mode 100644 index 04324b8..0000000 Binary files a/.DS_Store and /dev/null differ diff --git a/.gitignore b/.gitignore index 8a30d25..e4db45c 100644 --- a/.gitignore +++ b/.gitignore @@ -10,6 +10,9 @@ *.userosscache *.sln.docstates +# macOS system files +.DS_Store + # User-specific files (MonoDevelop/Xamarin Studio) *.userprefs diff --git a/README.md b/README.md index 5cc3df2..3e9f29f 100644 --- a/README.md +++ b/README.md @@ -11,11 +11,10 @@ I. INTRODUCTION
II. GETTING STARTED
III. RUNNING THE PROGRAM
IV. OUTPUTS OF THE PROGRAM
-V. INTERACTIVE DEMOS
-VI. TESTING SUITE
-VII. SUBROUTINES
-VIII. JAX
-IX. REFERENCES
+V. TESTING SUITE
+VI. SUBROUTINES
+VII. JAX
+VIII. REFERENCES
## I. INTRODUCTION This code implements a model of time-varying auditory loudness in Python. @@ -56,16 +55,18 @@ According to Moore et al. (2018), the model includes "three stages with differen * Attack time (Ta/Tal) = how quickly the system responds to increases in level * Release time (Tr/Trl) = how quickly it responds to decreases in level +* aa/aal = attack coefficient +* ar/arl = release coefficient #### Original Constants -The original model used: +Moore's 2016 model used: Short-term: Ta = 22 ms (aa = 0.045), Tr = 50 ms (ar = 0.02) Long-term: Tal = 99 ms (aal = 0.01), Trl = 2000 ms (arl = 0.0005) -These were chosen to "give reasonable predictions of the way that loudness varies with duration" and "give reasonably accurate predictions of the overall loudness of sounds that are AM at low rates." +These were chosen to "give reasonable predictions of the way that loudness varies with duration" and "give reasonably accurate predictions of the overall loudness of sounds that are AM at low rates." (Moore, 2018) #### Modified Constants @@ -75,7 +76,9 @@ Short-term: Ta = 22 ms (aa = 0.045), Tr = 30 ms (ar = 0.033) Long-term: Tal = 99 ms (aal = 0.01), Trl = 751 ms (arl = 0.00133) -The paper doesn't provide explicit scientific justification for using two specific time constants. The closest it comes is describing their functional purposes - short-term for individual words/notes and long-term for sentences/phrases. The time constants appear to be empirically determined rather than derived from fundamental auditory principles. Moore et al. (2018) focused on refining these values through experimental data fitting rather than explaining their theoretical basis. +For more information on how and why these constants were modified, read section IV from the Moore et al. 2018 paper. + +Short-term is described by the paper as for individual words/notes and long-term for sentences/phrases. The time constants appear to be empirically determined rather than derived from fundamental auditory principles. Moore et al. (2018) focused on refining these values through experimental data fitting rather than explaining their theoretical basis. ## II. GETTING STARTED @@ -110,32 +113,22 @@ pip install jax jaxlib ## III. RUNNING THE PROGRAM -The main function for loudness calculation is main_tv2018, located in the tvl2018 module. +The main function for loudness calculation is compute_loudness, located in the tvl2018 module. -The function main_tv2018 takes five parameters and two optional parameters for results. +The function compute_loudness takes four parameters **FUNCTION SIGNATURE:** ```python -def main_tv2018( - filename_or_sound: Union[str, np.ndarray], +def compute_loudness( + sound: Union[np.ndarray, np.ndarray], db_max: float, - filter_filename: str, + filter: Union[np.ndarray, np.ndarray], rate: int = None, - debug_plot: bool = False, - debug_plot_filename: Optional[str] = None, - debug_summary_filename: Optional[str] = None ): ``` -**`filename_or_sound`**: The input sound, which can be: - -* A path to an audio file (e.g., 'audio.wav'). - -* A NumPy array containing audio data (must specify rate if using this option). - -* A string specifying a synthesized signal in the format 'synthesize_khz_ms' (e.g., 'synthesize_1khz_100ms') with sample rate set to 32000 hz. - +**`sound`**: Input sound data as a 2D-array **`db_max`**: The root-mean-square sound pressure level (SPL) of a full-scale sinusoid (i.e., a sinusoid whose peak amplitude is 1). This allows calibration of absolute level. Typical values: @@ -144,21 +137,22 @@ Typical values: * **60–80 dB SPL**: Noisy environments. * **Default**: 50 dB SPL. -**`filter_filename`**: The filename of the filter that specifies the transfer function through the outer and middle ear. -* `ff_32000.mat` for free-field presentation, +**`filter`**: The array specifies the three standard transfer functions through the outer and middle ear. +* `ff_32000` for free-field presentation, -* `df_32000.mat` for diffuse-field presentation, +* `df_32000` for diffuse-field presentation, -* `ed_32000.mat` for middle-ear only (when the signal is picked up at the eardrum, or headphones with a “flat” frequency response at the eardrum are used). +* `ed_32000` for middle-ear only (when the signal is picked up at the eardrum, or headphones with a “flat” frequency response at the eardrum are used). -**`rate`**: The sampling rate of the signal, can be specified. If providing your own array data for the signal, be sure to specify rate. If reading from a file or synthesizing a signal, the rate is determined automatically +**`rate`**: The sampling rate of the signal, can be specified. If providing your own array data for the signal, be sure to specify rate. If reading from a file or synthesizing a signal, the rate is determined automatically. -**`debug_plot`**: Boolean value, If true, generates and saves a plot of loudness over time. -**`debug_plot_filename`**: Where to store the loudness plot, if **`debug_plot`** is True. - -**`debug_summary_filename`**: Where to store a textual summary of the loudness. +

+ +

+This graph shows a visualization of each transfer function, ff for free-field, df for diffuse-field, and ed for eardrum. +For more information about each transfer function, go to [transfer_functions.py](transfer_functions.py) ## IV. OUTPUTS OF THE PROGRAM The function returns three main results: @@ -168,65 +162,46 @@ The function returns three main results: Each is provided as an array with 1 ms intervals starting from t = 0 ms. -**Optional Outputs:** - -**Plot:** If debug_plot is True, a plot showing instantaneous, short-term, and long-term loudness over time is saved to debug_plot_filename. - -**Text Summary:** If debug_summary_filename is provided, a detailed text file containing loudness metrics is saved. - **EXAMPLE INPUT** ```python -from tvl2018 import main_tv2018 +from tvl2018 import compute_loudness + +frequency = 1000 # Hz - frequency of the tone +duration = 0.1 # seconds - length of the tone +rate = 32000 # Hz - sample rate +db_max = 50 # dB SPL - reference level -filename_or_sound = 'synthesize_1khz_100ms' # this can be replaced with a user-provided audio file of similar length -db_max = 50 -filter_filename = 'transfer functions/ff_32000.mat' +# Synthesize the sound +sound = tvl.synthesize_sound(frequency, duration, rate) -loudness, short_term_loudness, long_term_loudness = main_tv2018( - filename_or_sound, +# Calculate loudness +loudness, short_term, long_term = tvl.compute_loudness( + sound, db_max, - filter_filename, - debug_plot=True, - debug_plot_filename='results/loudness_plot_synthesize_1khz_100ms_50dB.png', - debug_summary_filename='results/loudness_summary_synthesize_1khz_100ms_50dB.txt' + transfer_functions.ff_32000, + rate ) - -print(f"\nPlot saved to: results/loudness_plot_synthesize_1khz_100ms_50dB.png") -print(f"Summary saved to: results/loudness_summary_synthesize_1khz_100ms_50dB.txt") - ``` - -Running the code above calculates loudness for the synthesized 1khz 100ms audio data. The signal is a 100-ms segment of a 1000-Hz tone with a level 10 dB below the full-scale level. If a full-scale sinusoid has a level of 50 dB SPL (as specified by the “50” in the example above), the signal in the example wav file would have a level of 40 dB SPL and the outputs show the loudness of a 1-kHz pure tone with a duration of 100 ms and a level of 40 dB SPL. To calculate the loudness of a 1-kHz pure tone with a duration of 100 ms and a level of X dB SPL, specify the full-scale level as X+10. - -**EXAMPLE OUTPUTS:**
-With the arguments above the main_tv2018 function creates two files: a textual summary and a summary plot: - -[Download the generated text file here.](results/synthesize_1khz_100ms_50dB_calibration_level_TVL_2018.txt) - -![Loudness Plot](results/synthesize_1khz_100ms_50dB_loudness_plot.png) - -## V. INTERACTIVE DEMOS +## INTERACTIVE DEMOS [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1JQcklNVzuwJVy3fBco64IlO87RQ1WeH5?usp=sharing) -These demos provide interactive demonstrations of loudness perception using the TVL2018 model. Each demo highlights different aspects of auditory perception: +These demos provide interactive demonstrations of loudness perception using the TVL2018 model. Each demo highlights different aspects of the program. It provides plots and statistics on analyzed audio data. * Demo 1: Basic Loudness Analysis - Understanding fundamental loudness measurements * Demo 2: Real-World Sound Loudness Analysis - Analyzing loudness in real-world audio files -* Demo 3: Phase Optimization - How phase relationships affect perceived loudness +* Demo 3: Parameter Effects - The impact of frequency, duration, and level -* Demo 4: Parameter Effects - The impact of frequency, duration, and level - -## VI. TESTING SUITE +## V. TESTING SUITE This test suite validates the implementation of the TVL2018 loudness model by covering a general overall test, precision tests, and individual utility functions. ### Basic Tests -- **`test_basic_example`**: Tests the `main_tv2018` function with a 100ms synthesized 1 kHz tone at 50 dB SPL and 32k sample rate with free field transform, checking short-term and long-term loudness calculations. You can change inputs here to get different plots and summary files. +- **`test_basic_example`**: Tests the `compute_loudness` function with a 100ms synthesized 1 kHz tone at 50 dB SPL and 32k sample rate with free field transform, checking short-term and long-term loudness calculations. You can change inputs here to get different plots and summary files. - **`test_peak_constrained_power_optimization`**: Validates and demonstrates that phase adjustments can increase power/loudness while maintaining peak amplitude constraints. Tests different phase configurations (cosine phase baseline, all-pass filter, random phases) to verify improvements in RMS and loudness while keeping peak amplitude constant. @@ -274,14 +249,16 @@ Ensure all dependencies are installed and the `tvl2018` module is accessible. Th -## VII. SUBROUTINES +## VI. SUBROUTINES You will find many useful subroutines in the main directory and subdirectory ‘functions’. They may be used to calculate excitation patterns, perform a Fast Fourier Transform (FFT), convert sone to phon or Hz to Cam (the units of the ERBN-number scale), calculate the equivalent rectangular bandwidth of the auditory filter, calculate binaural inhibition, and implement automatic gain circuits, among other things. -## VIII. JAX +## VII. JAX + +The Numpy code was translated to JAX and runs. Unfortunately it does not compile as there are several portions of the implementation that are [not pure](https://docs.jax.dev/en/latest/notebooks/Common_Gotchas_in_JAX.html). We want to fix these details, but also welcome contributions from the community. -A JAX version of this model is also available. To use this code, import +To use this code in JAX, import ```python import tvl2018_jax as tvl ``` @@ -294,7 +271,7 @@ python tvl2018_jax_test.py -## IX. REFERENCES +## VIII. REFERENCES Glasberg, B. R., and Moore, B. C. J. (2006). "Prediction of absolute thresholds and equal-loudness contours using a modified loudness model," J. Acoust. Soc. Am. 120, 585-588 diff --git a/results/synthesize_1khz_100ms_50dB_calibration_level_TVL_2018.txt b/results/synthesize_1khz_100ms_50dB_calibration_level_TVL_2018.txt deleted file mode 100644 index ae6a7ff..0000000 --- a/results/synthesize_1khz_100ms_50dB_calibration_level_TVL_2018.txt +++ /dev/null @@ -1,121 +0,0 @@ -results/synthesize_1khz_100ms_50dB_calibration_level_TVL_2018.txt - -Calibration level: 50 dB SPL (RMS level of a full-scale sinusoid) -Filename of FIR filter: transfer functions/ff_32000.mat - -Maximum of long-term loudness: 0.74 sone - 36.23 phon -Maximum of short-term loudness: 1.41 sone - 44.53 phon - -Loudness over time -1st column: time in milliseconds -2nd column: short-term loudness in sone -3rd column: short-term loudness level in phon -4th column: long-term loudness in sone -5th column: long-term loudness level in phon - - time short-t. loudness long-t. loudness - ms sone phon sone phon - 0 0.09 17.3 0.00 1.1 - 1 0.17 21.8 0.00 3.3 - 2 0.24 24.6 0.01 4.7 - 3 0.30 26.6 0.01 5.8 - 4 0.35 28.2 0.01 6.8 - 5 0.40 29.5 0.02 7.9 - 6 0.44 30.5 0.02 8.8 - 7 0.48 31.5 0.02 9.8 - 8 0.52 32.3 0.03 10.7 - 9 0.56 33.1 0.03 11.5 - 10 0.59 33.8 0.04 12.3 - 11 0.63 34.4 0.05 13.0 - 12 0.66 35.0 0.05 13.7 - 13 0.69 35.5 0.06 14.4 - 14 0.72 36.0 0.07 15.0 - 15 0.75 36.5 0.07 15.6 - 16 0.78 36.9 0.08 16.2 - 17 0.80 37.3 0.09 16.8 - 18 0.83 37.6 0.09 17.3 - 19 0.85 38.0 0.10 17.9 - 20 0.87 38.3 0.11 18.4 - 21 0.89 38.6 0.12 18.9 - 22 0.91 38.9 0.12 19.3 - 23 0.93 39.1 0.13 19.8 - 24 0.95 39.4 0.14 20.2 - 25 0.97 39.6 0.15 20.7 - 26 0.98 39.8 0.16 21.1 - 27 1.00 40.0 0.17 21.5 - 28 1.02 40.2 0.17 21.9 - 29 1.03 40.4 0.18 22.3 - 30 1.04 40.6 0.19 22.7 - 31 1.06 40.7 0.20 9.6 - 32 1.07 40.9 0.21 23.4 - 33 1.08 41.0 0.22 23.7 - 34 1.09 41.2 0.23 24.1 - 35 1.11 41.3 0.24 24.4 - 36 1.12 41.4 0.24 24.8 - 37 1.13 41.5 0.25 22.6 - 38 1.14 41.6 0.26 25.4 - 39 1.14 41.8 0.27 25.7 - 40 1.15 41.8 0.28 26.0 - 41 1.16 41.9 0.29 26.3 - 42 1.17 42.0 0.30 26.6 - 43 1.18 42.1 0.31 26.8 - 44 1.18 42.2 0.31 27.1 - 45 1.19 42.3 0.32 27.4 - 46 1.20 42.3 0.33 27.6 - 47 1.20 42.4 0.34 27.9 - 48 1.21 42.5 0.35 28.1 - 49 1.22 42.5 0.36 28.4 - 50 1.22 42.6 0.37 28.6 - 51 1.23 42.7 0.38 28.8 - 52 1.23 42.7 0.38 29.1 - 53 1.24 42.8 0.39 29.3 - 54 1.24 42.8 0.40 29.5 - 55 1.24 42.9 0.41 29.7 - 56 1.25 42.9 0.42 29.9 - 57 1.25 42.9 0.43 30.1 - 58 1.26 43.0 0.43 30.3 - 59 1.26 43.0 0.44 30.5 - 60 1.26 43.0 0.45 30.7 - 61 1.27 43.1 0.46 30.9 - 62 1.27 43.1 0.47 31.1 - 63 1.27 43.1 0.48 31.3 - 64 1.27 43.2 0.48 31.5 - 65 1.28 43.2 0.49 31.6 - 66 1.28 43.2 0.50 31.8 - 67 1.28 43.3 0.51 32.0 - 68 1.28 43.3 0.51 32.2 - 69 1.29 43.3 0.52 32.3 - 70 1.29 43.3 0.53 32.5 - 71 1.29 43.3 0.54 32.6 - 72 1.29 43.4 0.55 32.8 - 73 1.29 43.4 0.55 32.9 - 74 1.30 43.4 0.56 33.1 - 75 1.30 43.4 0.57 33.2 - 76 1.30 43.4 0.57 33.4 - 77 1.30 43.4 0.58 33.5 - 78 1.30 43.5 0.59 33.7 - 79 1.30 43.5 0.60 33.8 - 80 1.31 43.5 0.60 33.9 - 81 1.31 43.5 0.61 34.1 - 82 1.31 43.5 0.62 34.2 - 83 1.31 43.5 0.62 34.3 - 84 1.31 43.5 0.63 34.4 - 85 1.31 43.5 0.64 34.6 - 86 1.31 43.6 0.64 34.7 - 87 1.31 43.6 0.65 34.8 - 88 1.31 43.6 0.66 34.9 - 89 1.31 43.6 0.66 35.0 - 90 1.32 43.6 0.67 35.2 - 91 1.32 43.6 0.68 35.3 - 92 1.32 43.6 0.68 35.4 - 93 1.32 43.6 0.69 35.5 - 94 1.32 43.6 0.70 35.6 - 95 1.32 43.7 0.70 35.7 - 96 1.33 43.7 0.71 35.8 - 97 1.34 43.8 0.72 35.9 - 98 1.35 44.0 0.72 36.0 - 99 1.38 44.2 0.73 36.1 - 100 1.41 44.5 0.74 36.2 -max 1.41 44.5 0.74 36.2 diff --git a/results/synthesize_1khz_100ms_50dB_loudness_plot.png b/results/synthesize_1khz_100ms_50dB_loudness_plot.png deleted file mode 100644 index 18bfaa7..0000000 Binary files a/results/synthesize_1khz_100ms_50dB_loudness_plot.png and /dev/null differ diff --git a/results/test_basic_example_loudness_plot.png b/results/test_basic_example_loudness_plot.png index a0ece74..33e2c7d 100644 Binary files a/results/test_basic_example_loudness_plot.png and b/results/test_basic_example_loudness_plot.png differ diff --git a/results/test_interpolation_linear.png b/results/test_interpolation_linear.png index 77ca530..eef25e3 100644 Binary files a/results/test_interpolation_linear.png and b/results/test_interpolation_linear.png differ diff --git a/results/test_interpolation_pchip.png b/results/test_interpolation_pchip.png index 80bdd9f..c963518 100644 Binary files a/results/test_interpolation_pchip.png and b/results/test_interpolation_pchip.png differ diff --git a/samples/six_second_speech.wav b/samples/six_second_speech.wav new file mode 100644 index 0000000..f47c871 Binary files /dev/null and b/samples/six_second_speech.wav differ diff --git a/samples/two_second_speech.wav b/samples/two_second_speech.wav new file mode 100644 index 0000000..2387c24 Binary files /dev/null and b/samples/two_second_speech.wav differ diff --git a/transfer_functions.py b/transfer_functions.py index 7b26efe..7168c25 100644 --- a/transfer_functions.py +++ b/transfer_functions.py @@ -6,7 +6,7 @@ The ISO532-2 paper has this to say about the three transfer functions included here: -7.2.2 Free field and diffuse field transfer functions for sound picked up by a +"7.2.2 Free field and diffuse field transfer functions for sound picked up by a single microphone These transfer functions are applicable when the sound is picked up via a @@ -18,7 +18,7 @@ The first, applicable to free field (ff) listening with frontal incidence of the sound source, is specified in column 2 of Table 1. The second, applicable to listening in a diffuse field, is specified in column 3 of Table 1. The transfer -functions represent the mean for adult humans[19-21]. +functions represent the mean for adult humans. The diffuse field (df) transfer function can also be used for sounds presented via earphones that are designed to have a diffuse field response (see 7.2.3). @@ -28,7 +28,21 @@ absolute threshold values given in ISO 389-7:2005. It is acknowledged that these transfer functions do not comply with those -specified in ISO 11904-1:2002. +specified in ISO 11904-1:2002." + +The paper also has this to say about ed_32000, or the middle-ear transfer function: + +The transmission of sound through the middle ear from the tympanic membrane to the oval window +(the cochlea) is taken into account by a middle ear transfer function specified in column 4 of Table 1. +The shape of the function represents the difference between the sound pressure level in the cochlea +and the sound pressure level at the tympanic membrane. The whole function is scaled so that an input +signal consisting of a 1 000 Hz sinusoid presented in a free field with frontal incidence at a sound +pressure level of 0 dB leads to a sound pressure level at the cochlea of 0 dB. The interpolation procedure +described in 7.2.6 is used to determine values at intermediate frequencies. The values for frequencies +above 12 500 Hz are based on extrapolation and have not been validated, so they are shown in italics +in Table 1. They are included for the user who wishes to predict the loudness of sounds with frequency +components above 12 500 Hz. The end result of this stage is a specification of the spectrum of the +pressure variation applied to the cochlea. """ # Diffuse field transfer function @@ -1061,7 +1075,7 @@ # Created from LoudnessModel/transfer_functions/ed_32000.mat -# ??? field transfer function +# Middle-ear transfer function ed_32000 = np.array([ -3.2505878004054887e-08, -3.287291791016491e-08, -3.8671111178845556e-08, -3.38288794349589e-08, -5.245768177408164e-08, -4.40615908419177e-08, -3.981347632028548e-08, -5.037654786235391e-08, diff --git a/tvl2018.py b/tvl2018.py index b6432ae..0710ac4 100644 --- a/tvl2018.py +++ b/tvl2018.py @@ -1,8 +1,9 @@ import os - -import matplotlib.pyplot as plt import numpy as np -from scipy.io import loadmat, wavfile + +from multiprocessing import Pool +from functools import partial +from scipy.io import wavfile from scipy.signal import convolve, resample from scipy.interpolate import PchipInterpolator, interp1d from typing import List, Optional, Tuple, Union @@ -596,8 +597,7 @@ def excitation_threshold_tvl(f: Union[List[float], np.ndarray]) -> np.ndarray: def excitation_to_specific_loudness_binaural_025( - excitation_levels: Union[List[float], np.ndarray], - plot_flag: bool = False) -> np.ndarray: + excitation_levels: Union[List[float], np.ndarray]) -> np.ndarray: """ Calculate specific loudness from excitation patterns at 0.25-ERB steps, accounting for binaural loudness. @@ -632,17 +632,7 @@ def excitation_to_specific_loudness_binaural_025( # Combine results based on conditions out = np.where(excitation > threshold, np.where( excitation < 1e10, n_1, n_3), n_2) - # Original MATLAB code includes a commented out plot, recreated here: - if plot_flag: - plt.figure() - plt.semilogy(excitation_levels, out, label='Specific Loudness') - plt.xlim([0, 110]) - plt.ylim([0.005, 10]) - plt.xlabel('Excitation Levels') - plt.ylabel('Specific Loudness (Sones)') - plt.title('Excitation vs. Specific Loudness') - plt.grid(True, which="both", ls="--") - plt.show() + return out @@ -980,6 +970,32 @@ def monaural_specific_loudness_to_binaural_loudness_025( return loudness, loudness_left, loudness_right +def process_segment(segment, rate, db_max, w_hann, v_limiting_indices): + """Process a single segment - to be used with multiprocessing.""" + # Calculate relevant frequencies and levels + f_left_relevant, l_left_relevant, f_right_relevant, l_right_relevant = \ + signal_segment_to_spectrum(segment, rate, db_max, w_hann, v_limiting_indices) + + # Process left channel + if l_left_relevant.size == 0: + specific_loudness_left = np.zeros(150) + else: + excitation_levels_left = \ + spectrum_to_excitation_pattern_025(f_left_relevant, l_left_relevant) + specific_loudness_left = \ + excitation_to_specific_loudness_binaural_025(excitation_levels_left) + + # Process right channel + if l_right_relevant.size == 0: + specific_loudness_right = np.zeros(150) + else: + excitation_levels_right = \ + spectrum_to_excitation_pattern_025(f_right_relevant, l_right_relevant) + specific_loudness_right = \ + excitation_to_specific_loudness_binaural_025(excitation_levels_right) + + return specific_loudness_left, specific_loudness_right + def filtered_signal_to_monaural_instantaneous_specific_loudness( signal: np.ndarray, rate: int, db_max: float ) -> Tuple[np.ndarray, np.ndarray]: @@ -1001,18 +1017,20 @@ def filtered_signal_to_monaural_instantaneous_specific_loudness( n_samples_per_segment = int(rate / 1000 * n_segment_duration) # 32 n_segments_in_signal = int( np.floor((len(signal) - npts) / rate * 1000 / n_segment_duration)) + # Hann windows for 6 FFTs; 1st column 64 ms, 6th column 2 ms w_hann = np.zeros((npts, 6)) for i in range(6): half_window_size = npts // (2 ** i) pad_size = (int((1 - 1 / 2**(i)) / 2 * npts)) - w_hann[:, i] = np.concatenate([ - np.zeros(pad_size), - np.hanning(half_window_size), - np.zeros(pad_size) - ]) - - # Indices which shall be used from the FFTs, 20-80 Hz from the first... + if half_window_size > 0: + w_hann[:, i] = np.concatenate([ + np.zeros(pad_size), + np.hanning(half_window_size), + np.zeros(npts - pad_size - half_window_size) + ]) + + # Limiting frequencies and indices v_limiting_f = [20, 80, 500, 1250, 2540, 4050, 15000] v_limiting_indices = [int(f / (rate / npts)) + 1 for f in v_limiting_f] @@ -1024,41 +1042,28 @@ def filtered_signal_to_monaural_instantaneous_specific_loudness( instantaneous_specific_loudness_right = np.zeros( (n_segments_in_signal + 1, loudness_length)) + # Prepare segments + segments = [] for i_segment in range(n_segments_in_signal + 1): - """ - # Display progress every 50 segments - if iSegment % 50 == 0: - print(f"{iSegment} ms of {nSegmentsInSignal} ms analyzed") - """ - segment_start = i_segment * n_samples_per_segment segment_end = segment_start + npts - segment = signal[segment_start:segment_end, :] - - # Calculate relevant frequencies and levels - f_left_relevant, l_left_relevant, f_right_relevant, l_right_relevant = \ - signal_segment_to_spectrum(segment, rate, db_max, w_hann, v_limiting_indices) - - # If left has no relevant spectral components - if l_left_relevant.size == 0: - specific_loudness_left = np.zeros(150) - else: - excitation_levels_left = \ - spectrum_to_excitation_pattern_025(f_left_relevant, l_left_relevant) - specific_loudness_left = \ - excitation_to_specific_loudness_binaural_025(excitation_levels_left) - - # If right has no relevant spectral components - if l_right_relevant.size == 0: - specific_loudness_right = np.zeros(150) - else: - excitation_levels_right = \ - spectrum_to_excitation_pattern_025(f_right_relevant, l_right_relevant) - specific_loudness_right = \ - excitation_to_specific_loudness_binaural_025(excitation_levels_right) - - instantaneous_specific_loudness_left[i_segment, :] = specific_loudness_left - instantaneous_specific_loudness_right[i_segment, :] = specific_loudness_right + segments.append(signal[segment_start:segment_end]) + + # Process segments in parallel + process_func = partial(process_segment, + rate=rate, + db_max=db_max, + w_hann=w_hann, + v_limiting_indices=v_limiting_indices) + + # Use number of CPU cores for parallel processing + with Pool() as pool: + results = pool.map(process_func, segments) + + # Store results + for i, (left, right) in enumerate(results): + instantaneous_specific_loudness_left[i] = left + instantaneous_specific_loudness_right[i] = right return instantaneous_specific_loudness_left, instantaneous_specific_loudness_right @@ -1086,35 +1091,30 @@ def shortterm_loudness_to_longterm_loudness(Nst: np.ndarray) -> np.ndarray: return Nlt -def sound_field_to_cochlea(s: np.ndarray, filter_filename: str) -> np.ndarray: +def sound_field_to_cochlea(s: np.ndarray, filter: np.ndarray) -> np.ndarray: """ Applies a filter to the sound field data to simulate cochlear processing. Args: s: The input sound field data, a 2D array with two columns for stereo signals. - filter_filename: The filename of the filter coefficients (MAT-file). + filter: Filter coefficients, 1-d array. Returns: out: The filtered output signal. """ - # Load the filter coefficients from the MAT file - filter_data = loadmat(f'{filter_filename}') - # Assuming vecCoefficients is a 1D array - vec_coefficients = filter_data['vecCoefficients'].flatten() - # Apply the filter to the left channel - out_left = convolve(s[:, 0], vec_coefficients, mode='full') + out_left = convolve(s[:, 0], filter, mode='full') # Check if there's a right channel and apply the filter if s.shape[1] > 1: - out_right = convolve(s[:, 1], vec_coefficients, mode='full') + out_right = convolve(s[:, 1], filter, mode='full') else: out_right = out_left # Use vstack to maintain stereo format out = np.vstack((out_left[1024:-1024], out_right[1024:-1024])).T - # Without filter, included in original code: + # Without filter, included in original MATLAB code: # out = np.vstack((np.zeros((1024, 2)), s, np.zeros((1024, 2)))) return out @@ -1145,58 +1145,28 @@ def synthesize_sound(frequency: float, duration: float, rate: int) -> np.ndarray return stereo_sound -def main_tv2018(filename_or_sound: Union[str, np.ndarray], +def compute_loudness(sound: Union[np.ndarray, np.ndarray], db_max: float, - filter_filename: str, - rate: int = None, - debug_plot: bool = False, - debug_plot_filename: Optional[str] = None, - debug_summary_filename: Optional[str] = None): + filter: Union[np.ndarray, np.ndarray], + rate) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: """ Calculate loudness according to Moore, Glasberg & Schlittenlacher (2016). Args: - filename_or_sound: The path to filename of the sound file, the audio data - itself, or synthesized sound frequency (ex. synthesize_1000khz100ms) + sound: Input sound data as a 2D-array db_max: The root-mean-square sound pressure level of a full-scale sinusoid, i.e. a sinusoid whose peak amplitude is 1 in Matlab. - filter_filename: The filename of the FIR filter. - rate: Sampling frequency. If not provided, it will be determined from the file. - debug_plot: Whether to show a summary plot - debug_plot_filename: If plotting, where to store the summary plot. - debug_summary_filename: If non-null, where to put the text summary + filter: Filter coefficients array for the transfer function representing + free-field (ff), diffuse-field (df), or eardrum (ed) response + rate: Sampling frequency. Recommended 32000. + Returns: Calculated loudness metrics: short-term, long-term, highest loudness value """ - if isinstance(filename_or_sound, str): - if filename_or_sound.startswith("synthesize_"): - try: - parts = filename_or_sound.split("_") - frequency = int(parts[1].replace("khz", "")) * 1000 - duration = float(parts[2].replace("ms", "")) / 1000 - if not rate: - rate = 32000 # Default rate if not provided - data = synthesize_sound(frequency, duration, rate=rate) - except ValueError: - raise ValueError("Invalid argument: filename_or_sound must be a string, " - "numpy array, or one of the provided synthesized " - "sounds (e.g., 'synthesize_{}khz_{}ms').") - else: - # Assume it's a path to audio file and not a request for synthesized sound - if not os.path.exists(filename_or_sound): - raise FileNotFoundError("No file found. filename_or_sound must be a string, " - "numpy array, or one of the provided synthesized sounds " - "(e.g., 'synthesize_{}khz_{}ms').") - data, rate = read_and_resample(filename_or_sound) - elif isinstance(filename_or_sound, np.ndarray): - if not rate: - raise ValueError("Rate must be specified when providing sound data directly.") - data = filename_or_sound - # Filter the sound field to cochlea - data = sound_field_to_cochlea(data, filter_filename) + data = sound_field_to_cochlea(sound, filter) # Calculate instantaneous specific loudness instantaneous_specific_loudness_left, instantaneous_specific_loudness_right = \ @@ -1230,58 +1200,4 @@ def main_tv2018(filename_or_sound: Union[str, np.ndarray], short_term_loudness = \ short_term_loudness_left.flatten() + short_term_loudness_right.flatten() - # Plotting the results - if debug_plot: - plt.figure() - plt.plot(range(len(short_term_loudness)), short_term_loudness, - 'b-', label='Short-term loudness') - plt.plot(range(len(long_term_loudness)), long_term_loudness, - 'r-', label='Long-term loudness') - plt.xlabel('Time (ms)') - plt.ylabel('Loudness (sone)') - plt.title(f'Loudness Analysis: {filename_or_sound}\nReference Level: {db_max} dB SPL') - plt.legend() - plt.grid(True) - plt.show - # save plot to results folder - if debug_plot_filename: - plt.savefig(debug_plot_filename) - print(f"\nPlot saved to: {debug_plot_filename}") - - - # Writing results to text file - if debug_summary_filename: - with open(debug_summary_filename, 'w') as fid: - fid.write(f"{debug_summary_filename}\n\n") - fid.write(f"Calibration level: {db_max} " - f"dB SPL (RMS level of a full-scale sinusoid)\n") - fid.write(f"Filename of FIR filter: {filter_filename}\n\n") - fid.write(f"Maximum of long-term loudness: " - f"{np.max(long_term_loudness):9.2f} sone\n") - fid.write(f" " - f"{np.max(sone_to_phon_tv2015(long_term_loudness)):9.2f} phon\n") - fid.write(f"Maximum of short-term loudness: " - f"{np.max(short_term_loudness):9.2f} sone\n") - fid.write(f" " - f"{np.max(sone_to_phon_tv2015(short_term_loudness)):9.2f} phon\n\n") - fid.write("Loudness over time\n") - fid.write("1st column: time in milliseconds\n") - fid.write("2nd column: short-term loudness in sone\n") - fid.write("3rd column: short-term loudness level in phon\n") - fid.write("4th column: long-term loudness in sone\n") - fid.write("5th column: long-term loudness level in phon\n\n") - fid.write(" time short-t. loudness long-t. loudness\n") - fid.write(" ms sone phon sone phon\n") - for i in range(len(long_term_loudness)): - fid.write(f"{i:7.0f} {short_term_loudness[i]:9.2f} " - f"{sone_to_phon_tv2015(short_term_loudness[i]):9.1f} " - f"{long_term_loudness[i]:9.2f} " - f"{sone_to_phon_tv2015(long_term_loudness[i]):9.1f}\n") - fid.write(f"max {np.max(short_term_loudness):9.2f} " - f"{np.max(sone_to_phon_tv2015(short_term_loudness)):9.1f} " - f"{np.max(long_term_loudness):9.2f} " - f"{np.max(sone_to_phon_tv2015(long_term_loudness)):9.1f}\n") - print(f"Summary saved to: {debug_summary_filename}") - - return loudness, short_term_loudness, long_term_loudness diff --git a/tvl2018_jax.py b/tvl2018_jax.py index 14a128d..a761383 100644 --- a/tvl2018_jax.py +++ b/tvl2018_jax.py @@ -1121,7 +1121,7 @@ def sound_field_to_cochlea(s: jnp.ndarray, filter: jnp.ndarray) -> jnp.ndarray: # Use vstack to maintain stereo format out = jnp.vstack((out_left[1024:-1024], out_right[1024:-1024])).T - # Without filter, included in original code: + # Without filter, included in original MATLAB code: # out = jnp.vstack((jnp.zeros((1024, 2)), s, jnp.zeros((1024, 2)))) return out @@ -1152,7 +1152,7 @@ def synthesize_sound(frequency: float, duration: float, rate: int) -> jnp.ndarra return stereo_sound -def main_tv2018(sound: Union[np.ndarray, jnp.ndarray], +def compute_loudness(sound: Union[np.ndarray, jnp.ndarray], db_max: float, filter: Union[np.ndarray, jnp.ndarray], rate) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: @@ -1160,13 +1160,14 @@ def main_tv2018(sound: Union[np.ndarray, jnp.ndarray], Calculate loudness according to Moore, Glasberg & Schlittenlacher (2016). Args: - filename_or_sound: The path to filename of the sound file, the audio data - itself, or synthesized sound frequency (ex. synthesize_1000khz100ms) + sound: Input sound data as a 2D-array db_max: The root-mean-square sound pressure level of a full-scale sinusoid, i.e. a sinusoid whose peak amplitude is 1 in Matlab. - filter_filename: The filename of the FIR filter. - rate: Sampling frequency. If not provided, it will be determined from the file. + filter: Filter coefficients array for the transfer function representing + free-field (ff), diffuse-field (df), or eardrum (ed) response + rate: Sampling frequency. Recommended 32000. + Returns: Calculated loudness metrics: short-term, long-term, highest loudness value @@ -1211,7 +1212,4 @@ def main_tv2018(sound: Union[np.ndarray, jnp.ndarray], short_term_loudness = \ short_term_loudness_left_adjusted.flatten() + short_term_loudness_right_adjusted.flatten() - short_term_loudness_np = np.asarray(short_term_loudness) - long_term_loudness_np = np.asarray(long_term_loudness) - return loudness, short_term_loudness, long_term_loudness diff --git a/tvl2018_jax_test.py b/tvl2018_jax_test.py index eb40f06..a3b3bb2 100644 --- a/tvl2018_jax_test.py +++ b/tvl2018_jax_test.py @@ -17,18 +17,17 @@ class LoudnessModelTests(absltest.TestCase): """Test suite for the TVL2018 loudness model implementation.""" def test_basic_example(self): - """Test the main_tv2018 function with a synthesized 1 kHz tone.""" + """Test the compute_loudness function with a synthesized 1 kHz tone.""" # Ensure results directory exists os.makedirs('results', exist_ok=True) + # Here, you can modify the input values into compute_loudness frequency = 1000 # Hz duration = 0.1 # seconds rate = 32000 # Hz sound = tvl.synthesize_sound(frequency, duration, rate) - - # Here, you can modify the input values into tvl.main_tv2018 db_max = 50 # Example SPL value - _, short_term_loudness, long_term_loudness = tvl.main_tv2018( + _, short_term_loudness, long_term_loudness = tvl.compute_loudness( sound, db_max, transfer_functions.ff_32000, @@ -67,7 +66,7 @@ def setUpClass(cls): cls.sample_rate) cls.filter = transfer_functions.ff_32000 - cls.loudness, cls.short_term_loudness, cls.long_term_loudness = tvl.main_tv2018( + cls.loudness, cls.short_term_loudness, cls.long_term_loudness = tvl.compute_loudness( cls.sound, cls.db_max, cls.filter, @@ -154,7 +153,7 @@ def process_signal(mono_signal): signal = mono_signal * (peak_constraint / jnp.max(jnp.abs(mono_signal))) stereo = jnp.column_stack((signal, signal)) rms = jnp.sqrt(jnp.mean(signal ** 2)) - loudness, _, _ = tvl.main_tv2018(stereo, db_max, filter, rate=rate) + loudness, _, _ = tvl.compute_loudness(stereo, db_max, filter, rate=rate) return rms, loudness def create_signal(magnitudes, phases): @@ -258,7 +257,7 @@ def test_long_term_loudness(self): # Ensure results directory exists os.makedirs('results', exist_ok=True) - # Call main_tv2018 + # Call compute_loudness long_term_loudness = self.long_term_loudness # Assert first five long-term loudness values diff --git a/tvl2018_test.py b/tvl2018_test.py index bce0e6a..6e73c72 100644 --- a/tvl2018_test.py +++ b/tvl2018_test.py @@ -4,6 +4,7 @@ import numpy as np from absl.testing import absltest +import transfer_functions import tvl2018 as tvl import scipy.signal @@ -11,26 +12,22 @@ class LoudnessModelTests(absltest.TestCase): """Test suite for the TVL2018 loudness model implementation.""" def test_basic_example(self): - """Test the main_tv2018 function with a synthesized 1 kHz tone.""" + """Test the compute_loudness function with a synthesized 1 kHz tone.""" # Ensure results directory exists os.makedirs('results', exist_ok=True) - debug_plot_filename = os.path.join( - 'results', 'synthesize_1khz_100ms_50dB_loudness_plot.png') - debug_summary_filename = os.path.join( - 'results', 'synthesize_1khz_100ms_50dB_calibration_level_TVL_2018.txt') - - # Here, you can modify the input values into tvl.main_tv2018 + # Here, you can modify the input values into tvl.compute_loudness + frequency = 1000 # Hz + duration = 0.1 # seconds + rate = 32000 # Hz + sound = tvl.synthesize_sound(frequency, duration, rate) db_max = 50 # Example SPL value - filename_or_sound = 'synthesize_1khz_100ms' - filter_filename = 'transfer functions/ff_32000.mat' - _, short_term_loudness, long_term_loudness = tvl.main_tv2018( - filename_or_sound, + + _, short_term_loudness, long_term_loudness = tvl.compute_loudness( + sound, db_max, - filter_filename, - debug_plot=True, - debug_plot_filename=debug_plot_filename, - debug_summary_filename=debug_summary_filename + transfer_functions.ff_32000, + rate=rate, ) # Weak sanity tests @@ -63,15 +60,15 @@ def setUpClass(cls): cls.duration = 0.1 cls.sample_rate = 32000 # Hz cls.db_max = 50 # Example SPL value - cls.filename_or_sound = 'synthesize_1khz_100ms' - cls.filter_filename = 'transfer functions/ff_32000.mat' + cls.sound = tvl.synthesize_sound(cls.frequency, cls.duration, + cls.sample_rate) + cls.filter = transfer_functions.ff_32000 - cls.loudness, cls.short_term_loudness, cls.long_term_loudness = tvl.main_tv2018( - cls.filename_or_sound, + cls.loudness, cls.short_term_loudness, cls.long_term_loudness = tvl.compute_loudness( + cls.sound, cls.db_max, - cls.filter_filename, - debug_plot=False, # Disable plotting during tests - debug_summary_filename=None # Disable summary file + cls.filter, + cls.sample_rate, ) # Generate Hann windows for testing cls.npts = int(cls.sample_rate / 1000 * 64) # 2048 @@ -144,14 +141,14 @@ def test_peak_constrained_power(self): n_harmonics = 10 peak_constraint = 0.8 db_max = 50 - filter_filename = 'transfer functions/ff_32000.mat' + filter = transfer_functions.ff_32000 def process_signal(mono_signal): """Process mono signal: normalize, make stereo, calculate metrics.""" signal = mono_signal * (peak_constraint / np.max(np.abs(mono_signal))) stereo = np.column_stack((signal, signal)) rms = np.sqrt(np.mean(signal ** 2)) - loudness, _, _ = tvl.main_tv2018(stereo, db_max, filter_filename, rate=rate) + loudness, _, _ = tvl.compute_loudness(stereo, db_max, filter, rate=rate) return rms, loudness def create_signal(magnitudes, phases): @@ -207,13 +204,10 @@ def test_overall_loudness(self): # 'results', 'test_overall_loudness_plot.png') # debug_summary_filename = os.path.join( # 'results', 'test_overall_loudness_summary.txt') - # loudness, _, _ = tvl.main_tv2018( - # self.filename_or_sound, + # loudness, _, _ = tvl.compute_loudness( + # self.sound, # self.db_max, - # self.filter_filename, - # debug_plot=True, - # debug_plot_filename=debug_plot_filename, - # debug_summary_filename=debug_summary_filename + # self.filter, # ) # Assert maximum loudness @@ -229,21 +223,6 @@ def test_short_term_loudness(self): # Ensure results directory exists os.makedirs('results', exist_ok=True) short_term_loudness = self.short_term_loudness - - # Uncomment this code for debug files and plot if needed - # debug_plot_filename = os.path.join( - # 'results', 'test_short_term_loudness_plot.png') - # debug_summary_filename = os.path.join( - # 'results', 'test_short_term_loudness_summary.txt') - # loudness, short_term_loudness, long_term_loudness = tvl.main_tv2018( - # self.filename_or_sound, - # self.db_max, - # self.filter_filename, - # debug_plot=True, - # debug_plot_filename=debug_plot_filename, - # debug_summary_filename=debug_summary_filename - # ) - # Assert first five short-term loudness values np.testing.assert_allclose( short_term_loudness[:5], @@ -257,24 +236,9 @@ def test_long_term_loudness(self): # Ensure results directory exists os.makedirs('results', exist_ok=True) - # Call main_tv2018 + # Call compute_loudness long_term_loudness = self.long_term_loudness - # Uncomment this code for debug files and plot if needed - # debug_plot_filename = os.path.join( - # 'results', 'test_long_term_loudness_plot.png') - # debug_summary_filename = os.path.join( - # 'results', 'test_long_term_loudness_summary.txt') - # _, _, long_term_loudness = tvl.main_tv2018( - # self.filename_or_sound, - # self.db_max, - # self.filter_filename, - # debug_plot=True, - # debug_plot_filename=debug_plot_filename, - # debug_summary_filename=debug_summary_filename - # ) - - # Assert first five long-term loudness values np.testing.assert_allclose( long_term_loudness[:5], @@ -286,10 +250,10 @@ def test_long_term_loudness(self): def test_signal_segment_to_spectrum(self): """Test the signal segment to spectrum conversion against expected values.""" # Use the filtered signal from test_basic_example - filter_filename = self.filter_filename # Synthesize and filter the sound - sound = tvl.synthesize_sound(self.frequency, self.duration, rate=self.sample_rate) - cochlea_filtered = tvl.sound_field_to_cochlea(sound, filter_filename) + sound = tvl.synthesize_sound(self.frequency, self.duration, + rate=self.sample_rate) + cochlea_filtered = tvl.sound_field_to_cochlea(sound, self.filter) # Process the first segment segment = cochlea_filtered[:2048, :] # First segment f_left_relevant, l_left_relevant, f_right_relevant, l_right_relevant = tvl.signal_segment_to_spectrum( @@ -425,18 +389,18 @@ def test_excitation_to_specific_loudness_binaural_025_selected(self): def test_filtered_signal_to_monaural_instantaneous_specific_loudness_selected(self): """Test selected point against expected values.""" - # Example parameters (consistent with expected value generation) + # Example parameters (consistent with expected value generation) frequency = self.frequency # Hz duration = self.duration # seconds rate = self.sample_rate db_max = self.db_max - filter_filename = self.filter_filename + filter = self.filter # Synthesize sound sound = tvl.synthesize_sound(frequency, duration, rate) # Filter sound - cochlea_filtered = tvl.sound_field_to_cochlea(sound, filter_filename) + cochlea_filtered = tvl.sound_field_to_cochlea(sound, filter) # Call the function under test ist_loudness_left, ist_loudness_right =\