    Whistle features analyses

    In [1]:
    #%%## IMPORTS #####
    import os
    import json
    import numpy as np
    import pandas as pd
    from tqdm import tqdm
    from time import time
    from datetime import datetime
    
    import seaborn as sns
    import matplotlib.pyplot as plt
    
    from scipy import stats
    import statsmodels.api as sm
    from statsmodels.formula.api import glm, ols
    from dtaidistance import dtw, dtw_ndim
    from dtaidistance import dtw_visualisation as dtwvis
    from sklearn.decomposition import PCA
    from sklearn.preprocessing import StandardScaler
    
    from utils.misc import (NumpyEncoder, import_contours_to_curves,
                                 load_spectrogram, plot_contours_on_spectrogram,
                                 refine_contours2pixels, qqplot_perso, custom_draftmans_plot)
    In [2]:
    output_dir = "Whistles_kit/features"
    output_img = "Whistles_kit/features/pictures"
    #%%## DATA IMPORT #####
    filtered_data = pd.read_csv(
        os.path.join(output_dir, "whistle_features_filtered.csv"), 
        parse_dates=["datetime"])
    better_filtered_data = filtered_data.copy()
    better_filtered_data["abs_angular_speeds"] = np.abs(better_filtered_data["angular_speeds"])
    better_filtered_data["min_f"] = better_filtered_data["min_f"]/1000
    better_filtered_data["mean_f"] = better_filtered_data["mean_f"]/1000
    better_filtered_data["max_f"] = better_filtered_data["max_f"]/1000
    
    print("\nSummary of results (after filtering):")
    print(f"\tNumber of whistles: {len(better_filtered_data)}")
    print(f"\t\t{len(better_filtered_data)-np.isnan(better_filtered_data['angular_speeds']).sum()} with angular speed available")
    print(f"\t\t{np.isnan(better_filtered_data['angular_speeds']).sum()} without angular speed")
    print(f"\tMin frequency: {better_filtered_data['min_f'].min():.2f}")
    print(f"\tMax frequency: {better_filtered_data['max_f'].max():.2f}")
    print(f"\tDuration (in seconds): {np.mean(better_filtered_data['duration']):.2f} [Mean]")
    print(f"\tSNR (in dB): {np.mean(better_filtered_data['SNR']):.2f} [Mean]")
    print(f"\t# harmonics: {np.mean(better_filtered_data['n_harmonics']):.4f} [Mean]")
    Out [2]:
    
    Summary of results (after filtering):
    	Number of whistles: 411
    		188 with angular speed available
    		223 without angular speed
    	Min frequency: 4.75
    	Max frequency: 26.16
    	Duration (in seconds): 0.50 [Mean]
    	SNR (in dB): 15.32 [Mean]
    	# harmonics: 0.0438 [Mean]
    
    In [3]:
    ax = sns.histplot(
        data=better_filtered_data,
        x="abs_angular_speeds", kde=True,
        color="#6ba4b6ff",
    )
    plt.xlabel("Orientation (in °)")
    plt.title("Orientation of on-axis dolphins at whistling time")
    ax.get_figure().savefig(os.path.join(output_img, "orientation_distrib.pdf"))
    plt.show()
    plt.close("all")
    Out [3]: [figure: histogram of orientation of on-axis dolphins at whistling time]
    In [4]:
    #%%## Draftsman's plot #####
    fig, axs = custom_draftmans_plot(
        better_filtered_data[['duration', 'SNR', 'abs_angular_speeds', 'min_f', 'mean_f', 'max_f']].copy(),
        old_names = ['duration', 'SNR', 'abs_angular_speeds', 'min_f', 'mean_f', 'max_f'],
        new_names = [
            'Duration (s)', 'SNR (dB)', 'Orientation\n(deg)', 
            'Min. freq.\n(kHz)', 'Mean freq.\n(kHz)', 'Max. freq.\n(kHz)'])
    for i in range(6):
        for j in range(6):
            axs[i,j].tick_params(axis='both', which='major', labelsize=12)
            axs[i,j].xaxis.get_label().set_fontsize(14)
            axs[i,j].yaxis.get_label().set_fontsize(14)
            axs[i,j].yaxis.set_label_coords(-0.25,0.5)
    plt.show()
    fig.savefig(os.path.join(output_img, "draftman_plot.pdf"))
    Out [4]: [figure: draftsman's plot of duration, SNR, orientation and min/mean/max frequency]

    Annex: draftsman's plot analyses

    Examining the pairs with correlation coefficients > 0.25 or < -0.25
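
    As a quick way to list these pairs programmatically, a minimal sketch (assuming Pearson correlations, which may differ from what custom_draftmans_plot displays, and reusing better_filtered_data from above):

    # Hedged sketch: list feature pairs whose Pearson correlation exceeds |0.25|.
    feats = ['duration', 'SNR', 'abs_angular_speeds', 'min_f', 'mean_f', 'max_f']
    corr = better_filtered_data[feats].corr()
    for i, a in enumerate(feats):
        for b in feats[i+1:]:
            if abs(corr.loc[a, b]) > 0.25:
                print(f"{a} vs {b}: r = {corr.loc[a, b]:.2f}")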

    SNR-duration

    Intuitively, for the same emitted duration, a louder whistle should be captured by the hydrophone for a longer period of time, because it is less likely to be interrupted (masked) by noise. Let's check this against our data.
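
    The model fitted below was chosen among several GLM families (the code comment notes the best AIC for Gamma and the best BIC for NegativeBinomial). A minimal sketch of how such a comparison could be run, assuming the same formula and data (this is not the original selection code):

    # Hedged sketch: compare candidate GLM families for SNR ~ duration by AIC/BIC.
    for family in (sm.families.Gaussian(),
                   sm.families.Gamma(link=sm.families.links.Log()),
                   sm.families.NegativeBinomial()):
        fit = glm("SNR ~ duration + 1", data=better_filtered_data, family=family).fit()
        print(f"{family.__class__.__name__:<17} AIC={fit.aic:.1f}  BIC={fit.bic:.1f}")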

    Duration-freq

    From the graphs, there also seems to be a relation between duration and min frequency (negative) and max frequency (positive): the longer a whistle is, the lower its minimum frequency and the higher its maximum frequency tend to be, i.e. longer whistles span a wider frequency range. Why this is the case remains an open question.
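
    These trends can also be quantified without distributional assumptions, e.g. with rank correlations; a small sketch reusing the scipy.stats import from above:

    # Hedged sketch: non-parametric check of the duration-frequency relations noted above.
    for col in ("min_f", "max_f"):
        tau, p = stats.kendalltau(better_filtered_data["duration"], better_filtered_data[col])
        print(f"duration vs {col}: Kendall tau = {tau:.2f}, p = {p:.2g}")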

    Frequencies

    Obviously, min/max/mean frequencies are correlated.

    SNR-duration

    In [5]:
    fig = qqplot_perso((better_filtered_data["duration"]))
    fig.set_size_inches(4, 3)
    plt.show()
    fig = qqplot_perso((better_filtered_data["SNR"]))
    fig.set_size_inches(4, 3)
    plt.show()
    
    Out [5]: [figures: Q-Q plots of duration and SNR]
    In [6]:
    # better_filtered_data[["log_SNR", "log_duration"]] = np.log(better_filtered_data[["SNR", "duration"]])
    better_filtered_data.reset_index(inplace=True, drop=True)
    mod1 = glm(
        "SNR ~ duration + 1",
        data=better_filtered_data,
        family=sm.families.NegativeBinomial()).fit()
    # best AIC for Gamma, best BIC for NegativeBinomial
    print(mod1.summary2())
    Out [6]:
                   Results: Generalized linear model
    ===============================================================
    Model:              GLM              AIC:            3091.3517 
    Link Function:      Log              BIC:            -2445.2816
    Dependent Variable: SNR              Log-Likelihood: -1543.7   
    Date:               2024-07-10 14:52 LL-Null:        -1545.7   
    No. Observations:   411              Deviance:       16.323    
    Df Model:           1                Pearson chi2:   17.2      
    Df Residuals:       409              Scale:          1.0000    
    Method:             IRLS                                       
    ----------------------------------------------------------------
                   Coef.   Std.Err.     z     P>|z|   [0.025  0.975]
    ----------------------------------------------------------------
    Intercept      2.5495    0.1004  25.3824  0.0000  2.3526  2.7463
    duration       0.3464    0.1716   2.0190  0.0435  0.0101  0.6826
    ===============================================================
    
    
    Out [6]:
    /home/loic/anaconda3/envs/VIRTUBA/lib/python3.9/site-packages/statsmodels/genmod/families/family.py:1367: ValueWarning: Negative binomial dispersion parameter alpha not set. Using default value alpha=1.0.
      warnings.warn("Negative binomial dispersion parameter alpha not "
    
    In [7]:
    # the model shows an effect; now check its assumptions
    fig = qqplot_perso(mod1.resid_response)
    print(fig.axes[0].texts[0].get_text())
    plt.close(fig)
    # normality was not expected anyway
    
    # What about independence of the residuals?
    r, p = stats.kendalltau(mod1.predict(), mod1.resid_response)
    print(f"\nkendall-tau: r={r:.2f}, p={p:.2g}")
    if p<0.05:
        print("H0 rejected (Presence of correlation)")
    else:
        print("H0 cannot be rejected (Absence of correlation)")
    Out [7]:
    shapiro-wilk: s=0.96, p=2.7e-09
    H0 rejected (distrib not normal)
    
    kendall-tau: r=-0.02, p=0.5
    H0 cannot be rejected (Absence of correlation)
    
    In [8]:
    # Show a plot of the results
    fig = sns.jointplot(
        x=better_filtered_data["duration"], 
        y=better_filtered_data["SNR"]
    )
    
    predictions = mod1.get_prediction(
        better_filtered_data["duration"]).summary_frame()
    sns.lineplot(
        x=better_filtered_data["duration"],
        y=predictions["mean"],
        label="Prediction mean",
        color="tab:red", lw=2,
        ax=fig.figure.axes[0]
    )
    fig.figure.axes[0].fill_between(
        better_filtered_data["duration"].loc[np.argsort(better_filtered_data["duration"])], 
        (predictions["mean"]+predictions["mean_se"]).loc[np.argsort(better_filtered_data["duration"])], 
        (predictions["mean"]-predictions["mean_se"]).loc[np.argsort(better_filtered_data["duration"])], 
        color='tab:red', alpha=.2, lw=0, label="mean_se")
    fig.figure.axes[0].fill_between(
        better_filtered_data["duration"].loc[np.argsort(better_filtered_data["duration"])], 
        (predictions["mean_ci_upper"]).loc[np.argsort(better_filtered_data["duration"])], 
        (predictions["mean_ci_lower"]).loc[np.argsort(better_filtered_data["duration"])], 
        color='tab:green', alpha=.2, lw=0, label="mean_ci")
    fig.figure.axes[0].tick_params(axis='both', which='major', labelsize=12)
    fig.figure.axes[1].set_title("Model estimates VS data")
    fig.figure.tight_layout()
    plt.legend(fontsize=11)
    plt.xlabel("Duration (s)", fontsize=15)
    plt.ylabel("SNR (dB)", fontsize=15)
    plt.show()
    fig.savefig(os.path.join(output_img, "duration_SNR.pdf"))
    Out [8]: [figure: "Model estimates VS data" jointplot of SNR vs duration with prediction mean and confidence bands]

    Duration-frequency

    We'll use max_f only, since it is largely redundant with min_f.

    In [9]:
    fig = qqplot_perso((better_filtered_data["duration"]))
    fig.set_size_inches(4, 3)
    plt.show()
    fig = qqplot_perso(np.log(better_filtered_data["max_f"]))
    fig.set_size_inches(4, 3)
    plt.show()
    Out [9]: [figures: Q-Q plots of duration and log(max_f)]
    In [10]:
    mod2 = glm(
        "max_f ~ duration + 1",
        data=better_filtered_data,
        family=sm.families.NegativeBinomial()).fit()
    # best AIC for Gamma, best BIC for NegativeBinomial
    print(mod2.summary2())
    Out [10]:
                   Results: Generalized linear model
    ===============================================================
    Model:              GLM              AIC:            3023.6197 
    Link Function:      Log              BIC:            -2448.2413
    Dependent Variable: max_f            Log-Likelihood: -1509.8   
    Date:               2024-07-10 14:52 LL-Null:        -1511.2   
    No. Observations:   411              Deviance:       13.363    
    Df Model:           1                Pearson chi2:   13.5      
    Df Residuals:       409              Scale:          1.0000    
    Method:             IRLS                                       
    ----------------------------------------------------------------
                  Coef.   Std.Err.     z     P>|z|    [0.025  0.975]
    ----------------------------------------------------------------
    Intercept     2.4949    0.1007  24.7740  0.0000   2.2975  2.6922
    duration      0.2856    0.1721   1.6598  0.0969  -0.0516  0.6228
    ===============================================================
    
    
    Out [10]:
    /home/loic/anaconda3/envs/VIRTUBA/lib/python3.9/site-packages/statsmodels/genmod/families/family.py:1367: ValueWarning: Negative binomial dispersion parameter alpha not set. Using default value alpha=1.0.
      warnings.warn("Negative binomial dispersion parameter alpha not "
    

    No statistically significant effect of duration according to this model. This comes as no surprise, since the correlation coefficients computed for the draftsman's plot are small.

    In [11]:
    better_filtered_data[["log_SNR", "log_duration"]] = np.log(better_filtered_data[["SNR", "duration"]])
    better_filtered_data.reset_index(inplace=True, drop=True)
    mod1 = glm(
        "SNR ~ duration + 1",
        data=better_filtered_data,
        family=sm.families.NegativeBinomial()).fit()
    # best AIC for Gamma, best BIC for NegativeBinomial
    print(mod1.summary2())
    Out [11]:
                   Results: Generalized linear model
    ===============================================================
    Model:              GLM              AIC:            3091.3517 
    Link Function:      Log              BIC:            -2445.2816
    Dependent Variable: SNR              Log-Likelihood: -1543.7   
    Date:               2024-07-10 14:52 LL-Null:        -1545.7   
    No. Observations:   411              Deviance:       16.323    
    Df Model:           1                Pearson chi2:   17.2      
    Df Residuals:       409              Scale:          1.0000    
    Method:             IRLS                                       
    ----------------------------------------------------------------
                   Coef.   Std.Err.     z     P>|z|   [0.025  0.975]
    ----------------------------------------------------------------
    Intercept      2.5495    0.1004  25.3824  0.0000  2.3526  2.7463
    duration       0.3464    0.1716   2.0190  0.0435  0.0101  0.6826
    ===============================================================
    
    
    Out [11]:
    /home/loic/anaconda3/envs/VIRTUBA/lib/python3.9/site-packages/statsmodels/genmod/families/family.py:1367: ValueWarning: Negative binomial dispersion parameter alpha not set. Using default value alpha=1.0.
      warnings.warn("Negative binomial dispersion parameter alpha not "
    

    Whistles VS Orientation

    In [12]:
    # add categories
    better_filtered_data["Orientation"] = [
        "unknown" if boolean
        else "on-axis"
        for boolean in np.isnan(better_filtered_data["angular_speeds"])
    ]
    
    better_filtered_data.head()
    Out [12]:
    [DataFrame head, first 5 rows. Columns: datetime, duration, SNR, signal_dB, angular_speeds, max_f, min_f, mean_f, ID, n_harmonics, abs_angular_speeds, log_SNR, log_duration, Orientation. The five rows shown are whistles from 2021-07-17 15:26:02 with angular_speeds = NaN, hence Orientation = "unknown".]
    In [13]:
    # local function : CDF curves
    def cdf_with_thresholds(df, metric, category, unit="", threshold=0.5, verbose=True, hue_order=None):
        histplot = sns.kdeplot(
            data=df, 
            cumulative=True, common_norm=False,
            x=metric, hue=category,
            hue_order=hue_order
        )
        for l, line in enumerate(histplot.get_lines()):
            curve = line.get_xydata()
            # x-value where the cumulative density reaches `threshold`
            x50 = np.interp(threshold, curve[:,1], curve[:,0])
            plt.text(
                .66, (l+1)/20,
                f"X = {x50:.2f}",
                color = line.get_color(),
                transform=histplot.get_figure().get_axes()[0].transAxes
            )
        plt.text(
            .66, (l+2)/20,
            f"P({metric} > X) = {threshold*100:.0f}%",
            transform=histplot.get_figure().get_axes()[0].transAxes
        )
        plt.xlabel(f"{metric} {'('+unit+')' if len(unit)>0 else ''}")
    
        if verbose:
            sample_args = tuple(df.groupby(category)[metric].apply(list).reset_index()[metric])
            if len(sample_args) == 2:
                u, p = stats.mannwhitneyu(sample_args[0], sample_args[1])
                print(f"Mann-Whitney U test: statistic={u:.2f}, pvalue={p:.2g}")
            else:
                
                h, p = stats.kruskal(*sample_args)
                print(f"Kruskal-Wallis test: statistic={h:.2f}, pvalue={p:.2g}")   
    
        return histplot
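
    For reference, a minimal usage sketch of this helper (the actual CDF calls in the cells below are commented out in favour of boxplots):

    # Hedged sketch: example call of cdf_with_thresholds as defined above.
    cdfplot = cdf_with_thresholds(
        better_filtered_data, metric="SNR", category="Orientation",
        unit="dB", threshold=0.5, hue_order=["on-axis", "unknown"])
    plt.show()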

    SNR VS orientation

    In [36]:
    # # normality ?
    # qqplot_perso(better_filtered_data["SNR"])
    # plt.show()
    # Not normal, so a non-parametric test (Kruskal-Wallis / Mann-Whitney) is appropriate.
    
    # SNR 
    # cdfplot = cdf_with_thresholds(
    #     better_filtered_data, "SNR", "Orientation", 
    #     hue_order=["on-axis","unknown"])
    # plt.xlabel("SNR (dB)")
    # plt.show()
    # cdfplot.get_figure().savefig(os.path.join(output_img, "SNR_orientation.pdf"))
    
    fig, ax = plt.subplots(1,1,figsize=(3,6))
    sns.boxplot(
        data=better_filtered_data, 
        y="SNR", fill=False,
        x="Orientation", hue="Orientation",
        hue_order=["unknown","on-axis"],
        order=["on-axis","unknown"],
        ax=ax
    )
    
    plt.ylabel("SNR (dB)", fontsize=15)
    plt.xlabel("Orientation", fontsize=15)
    ax.tick_params(which="both", labelsize=12)
    plt.ylim((9, 33))
    fig.tight_layout()
    plt.show()
    fig.savefig(os.path.join(output_img, "SNR_orientation_box.pdf"))
    Out [36]: [figure: boxplot of SNR by orientation]

    Interesting: there is a significant difference between the SNR of whistles from on-axis dolphins and from dolphins of unknown (most probably off-axis) orientation. Let's investigate further by splitting the 'on-axis' orientation into subsets. Maybe there are more low frequencies in whistles emitted on-axis?
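
    The boxplot above only shows the distributions; the claimed difference can be tested directly, e.g. with the Mann-Whitney U test also used inside cdf_with_thresholds (a sketch, assuming the same dataframe):

    # Hedged sketch: two-sample test of SNR between on-axis and unknown orientations.
    on_axis = better_filtered_data.loc[better_filtered_data["Orientation"] == "on-axis", "SNR"]
    unknown = better_filtered_data.loc[better_filtered_data["Orientation"] == "unknown", "SNR"]
    u, p = stats.mannwhitneyu(on_axis, unknown)
    print(f"Mann-Whitney U: statistic={u:.2f}, pvalue={p:.2g}")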

    Mean frequency VS orientation

    In [37]:
    # cdfplot = cdf_with_thresholds(
    #     better_filtered_data, "mean_f", "Orientation", 
    #     hue_order=["on-axis","unknown"])
    # plt.xlabel("Mean frequency (kHz)")
    # plt.show()
    # cdfplot.get_figure().savefig(os.path.join(output_img, "mean-freq_orientation.pdf"))
    
    fig, ax = plt.subplots(1,1,figsize=(3,6))
    sns.boxplot(
        data=better_filtered_data, 
        y="mean_f", fill=False,
        x="Orientation", hue="Orientation",
        hue_order=["unknown","on-axis"],
        order=["on-axis","unknown"],
        ax=ax
    )
    plt.ylabel("Mean frequency (kHz)", fontsize=15)
    plt.xlabel("Orientation", fontsize=15)
    ax.tick_params(which="both", labelsize=12)
    plt.ylim((6, 23))
    fig.tight_layout()
    plt.show()
    fig.savefig(os.path.join(output_img, "mean-freq_orientation_box.pdf"))
    Out [37]: [figure: boxplot of mean frequency by orientation]

    Frequencies

    In published papers, the distribution of whistles is often shown as a function of frequency range. Can we do the same?
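
    As a side note, the explicit binning loop in the next cell could also be written with pd.cut; a hedged sketch producing equivalent categories (labels chosen to match the loop's output):

    # Hedged sketch: pd.cut equivalent of the binning loop below (mean_f is in kHz here).
    bins = [5, 8, 12, 15, np.inf]
    labels = ["5-8 kHz", "8-12 kHz", "12-15 kHz", "15+ kHz"]
    freq_range = pd.cut(better_filtered_data["mean_f"], bins=bins, labels=labels, right=False)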

    In [16]:
    freq_categories = [
        [5000, 8000],
        [8000, 12000],
        [12000, 15000],
        [15000, np.inf],
    ]
    categories = []
    for freq in better_filtered_data.mean_f:
        freq = freq*1000
        for f_range in freq_categories:
            if (freq >= f_range[0]) and (freq < f_range[1]):
                if f_range == freq_categories[-1]:
                    categories += [str(f_range[0]//1000)+"+ kHz"]
                else:
                    categories += [
                        "-".join((np.array(f_range)//1000).astype(int).astype(str)) +
                        " kHz"
                    ]
    better_filtered_data["Freq. range"] = categories
    
    freq_range_names = [
        str(freq_range[0]//1000) + "-" + str(freq_range[1]//1000) + " kHz"
        if (freq_range[1] != np.inf) else
        str(freq_range[0]//1000) + "+ kHz"
        for freq_range in freq_categories
    ]
    
    better_filtered_data.head()
    Out [16]:
    [DataFrame head, first 5 rows, same columns as above plus "Freq. range"; the five example whistles fall into the 8-12 kHz and 12-15 kHz ranges.]
    In [17]:
    sns.histplot(
        data=better_filtered_data,
        x="Orientation", hue="Freq. range",
        hue_order=freq_range_names,
        fill=True, stat="density", multiple="dodge",
        edgecolor="white", discrete=True, shrink=0.66
    )
    plt.show()
    Out [17]: [figure: density histogram of frequency ranges by orientation]

    Sub-orientations

    In [18]:
    orientation_limits = [
        [0, 2.5],
        [2.5, 5],
        [5, np.inf],
    ]
    orientation_names = ['< 2.5°', '[2.5, 5]°', '> 5°']
    
    categories = []
    for angle in better_filtered_data.abs_angular_speeds:
        if np.isnan(angle):
            categories += ["unknown"]
        else:
            for i, angle_range in enumerate(orientation_limits):
                if (angle >= angle_range[0]) and (angle < angle_range[1]):
                    categories += [orientation_names[i]]
    
    better_filtered_data["Orientations"] = categories
    data_without_unknowns = better_filtered_data.copy()
    data_without_unknowns = data_without_unknowns.dropna(how="any")
    
    print(np.unique(data_without_unknowns["Orientations"], return_counts=True))
    Out [18]:
    (array(['< 2.5°', '> 5°', '[2.5, 5]°'], dtype=object), array([121,  33,  34]))
    
    In [33]:
    # cdfplot = cdf_with_thresholds(
    #     data_without_unknowns, "SNR", "Orientations", 
    #     hue_order=['< 2.5°', '[2.5, 5]°', '> 5°']
    #     )
    # plt.xlabel("SNR (dB)")
    # plt.show()
    # cdfplot.get_figure().savefig(os.path.join(output_img, "SNR_orientations.pdf"))
    
    fig, ax = plt.subplots(figsize=(4,6))
    sns.boxplot(
        data=data_without_unknowns, 
        y="SNR", fill=False,
        x="Orientations", hue="Orientations",
        hue_order=['< 2.5°', '[2.5, 5]°', '> 5°'],
        order=['< 2.5°', '[2.5, 5]°', '> 5°'],
        ax=ax,
        palette=["xkcd:light orange", "xkcd:orange", "xkcd:dark orange"],
    )
    ax.set_ylabel("SNR (dB)", fontsize=15)
    ax.set_xlabel("On-axis deviations", fontsize=15)
    ax.tick_params(which="both", labelsize=12)
    plt.ylim((9, 33))
    fig.tight_layout()
    plt.show()
    fig.savefig(os.path.join(output_img, "SNR_orientations_box.pdf"))
    
    Out [33]: [figure: boxplot of SNR by on-axis deviation]
    In [35]:
    # cdfplot = cdf_with_thresholds(
    #     data_without_unknowns, "mean_f", "Orientations", 
    #     hue_order=['< 2.5°', '[2.5, 5]°', '> 5°'])
    # plt.xlabel("Mean frequency (kHz)")
    # plt.show()
    # cdfplot.get_figure().savefig(os.path.join(output_img, "mean-freq_orientations.pdf"))
    
    fig, ax = plt.subplots(figsize=(4,6))
    sns.boxplot(
        data=data_without_unknowns, 
        y="mean_f", fill=False,
        x="Orientations", hue="Orientations",
        hue_order=['< 2.5°', '[2.5, 5]°', '> 5°'],
        order=['< 2.5°', '[2.5, 5]°', '> 5°'],
        ax=ax,
        palette=["xkcd:light orange", "xkcd:orange", "xkcd:dark orange"],
    )
    ax.set_ylabel("Mean frequency (kHz)", fontsize=15)
    ax.set_xlabel("On-axis deviations", fontsize=15)
    ax.tick_params(which="both", labelsize=12)
    plt.ylim((6, 23))
    fig.tight_layout()
    plt.show()
    fig.savefig(os.path.join(output_img, "mean-freq_orientations_box.pdf"))
    Out [35]: [figure: boxplot of mean frequency by on-axis deviation]

    Conclusion: we are not able to differentiate further, because the directivity of the echolocation clicks limits us. However, we can affirm that SNR is lower for whistles emitted on-axis than for whistles of unknown orientation. The explanation lies in the frequencies: unknown-orientation whistles have lower mean frequencies (see the following graphs), because the highest frequencies were absorbed during transmission in water, so everything is consistent.
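
    The frequency-dependent absorption invoked here can be illustrated with Thorp's empirical attenuation formula for seawater (a sketch for illustration only; the coefficients come from the standard Thorp approximation and the helper name is ours, not from this study):

    # Hedged sketch: Thorp's empirical attenuation formula (f in kHz, result in dB/km).
    def thorp_attenuation_db_per_km(f_khz):
        f2 = f_khz ** 2
        return 0.11 * f2 / (1 + f2) + 44 * f2 / (4100 + f2) + 2.75e-4 * f2 + 0.003
    
    for f in (5, 10, 20):
        print(f"{f} kHz: {thorp_attenuation_db_per_km(f):.2f} dB/km")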

    Harmonics VS orientation

    In [21]:
    better_filtered_data_bis = better_filtered_data.copy()
    
    fig, ax = plt.subplots(figsize=(3, 6))
    sns.barplot(
        x='Orientation', y='n_harmonics', data=better_filtered_data_bis, 
        estimator='mean', hue='Orientation', hue_order=["on-axis","unknown"],
        capsize=0.1, errorbar=('ci', 95))
    ax.set_ylabel('Mean Number of harmonics')
    plt.show()
    mwu_args = tuple(better_filtered_data_bis.groupby("Orientation")["n_harmonics"].apply(list).reset_index()["n_harmonics"])
    u, p = stats.mannwhitneyu(*mwu_args)
    print(f"MWU test: statistic={u:.2f}, pvalue={p:.2g}") 
    Out [21]: [figure: mean number of harmonics by orientation, with 95% CI]
    Out [21]:
    MWU test: statistic=20503.00, pvalue=0.28
    

    So, the graph says one thing but the statistic is very clear => no statical difference in the number of harmonics... But since ~10% of the annotated whistles had harmonics, it is likely that there was just too much noise to be more precise.