Skip to content
Snippets Groups Projects
Commit acfd2247 authored by Paul Best's avatar Paul Best
Browse files

update baseline and gridsearch

parent 5f0b534e
No related branches found
No related tags found
No related merge requests found
import hdbscan, umap
import pandas as pd, numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics
import os
# Grid-search HDBSCAN hyper-parameters over UMAP projections of several
# frontends' encodings, scoring each configuration with NMI against the
# available labels.  Results are appended to hdbscan_HP.csv after every
# configuration so an interrupted search can be resumed.
species = np.loadtxt('good_species.txt', dtype=str)
frontends = ['spec32', 'vggish', '256_logMel128', 'biosound'] #, '256_logSTFT', '256_Mel128', '128_logMel128', '64_pcenMel128', '128_pcenMel128', '256_pcenMel128', '512_pcenMel128']
# Resume from a previous run when the result file already exists.
out = pd.DataFrame(columns=['specie', 'frontend']) if not os.path.isfile('hdbscan_HP.csv') else pd.read_csv('hdbscan_HP.csv')
plt.figure()  # NOTE(review): no plot is drawn below — confirm this figure is still needed
for specie in species:
    for i, frontend in enumerate(frontends):
        df = pd.read_csv(f'{specie}/{specie}.csv')
        # Auto-encoder frontends use a long file name pattern; baseline
        # frontends (vggish / biosound / spec32) are stored under their own name.
        fn = f'{specie}/encodings/encodings_' + \
            (f'{specie}_{frontend}_sparrow_encoder_decod2_BN_nomaxPool.npy' if not frontend in ['vggish', 'biosound', 'spec32'] \
            else frontend+'.npy')
        # Skip (specie, frontend) pairs already scored, and missing encodings.
        if ((out.frontend==frontend)&(out.specie==specie)).any() or not os.path.isfile(fn):
            continue
        print(specie, frontend)
        dic = np.load(fn, allow_pickle=True).item()
        idxs = dic['idxs']
        # Keep only the rows that were actually encoded.
        df = df.loc[idxs].reset_index()
        for ncomp in [2, 4, 8, 16, 32]:
            X = umap.UMAP(n_jobs=-1, n_components=ncomp).fit_transform(dic['encodings'])
            for mcs in [5, 10, 20, 50, 100, 150, 200]:
                for ms in [None, 3, 5, 10, 20, 30]:
                    for eps in [0.0, 0.01, 0.02, 0.05, 0.1]:
                        for al in ['leaf', 'eom']:
                            if 'indiv' in df.columns:
                                # Cluster each individual separately and average
                                # the per-individual NMIs.
                                indiv_nmis = []
                                for indiv, grp in df.groupby('indiv'):
                                    clusters = hdbscan.HDBSCAN(min_cluster_size=mcs, min_samples=ms, cluster_selection_epsilon=eps, \
                                        core_dist_n_jobs=-1, cluster_selection_method=al).fit_predict(X[grp.index])
                                    df.loc[grp.index, 'cluster'] = clusters.astype(int)
                                    # Score only the labelled samples.
                                    mask = ~grp.label.isna()
                                    clusters, labels = clusters[mask], grp[mask].label
                                    indiv_nmis.append(metrics.normalized_mutual_info_score(labels, clusters))
                            else:
                                clusters = hdbscan.HDBSCAN(min_cluster_size=mcs, min_samples=ms, cluster_selection_epsilon=eps, \
                                    core_dist_n_jobs=-1, cluster_selection_method=al).fit_predict(X)
                                df['cluster'] = clusters.astype(int)
                                # Score only the labelled samples.
                                mask = ~df.label.isna()
                                clusters, labels = clusters[mask], df[mask].label
                                tnmi = metrics.normalized_mutual_info_score(labels, clusters)
                            df.drop('cluster', axis=1, inplace=True)
                            out = pd.concat([out, pd.DataFrame([ \
                                {'specie':specie, 'frontend':frontend, 'nmi': np.mean(indiv_nmis) if 'indiv' in df.columns else tnmi, \
                                'mcs':mcs, 'ms':ms, 'eps':eps, 'al':al, 'ncomp':ncomp}])], ignore_index=True)
                            # Checkpoint after every configuration.
                            out.to_csv('hdbscan_HP.csv', index=False)
from sklearn import metrics
import matplotlib.pyplot as plt
import umap, hdbscan
from tqdm import tqdm
import argparse, os
import models, utils as u
import pandas as pd, numpy as np, torch
import torchopenl3 as openl3
# Compute (or reload from cache) OpenL3 embeddings for one species, cluster a
# UMAP projection of them with HDBSCAN, and report NMI against the labels.
parser = argparse.ArgumentParser()
parser.add_argument("specie", type=str)
parser.add_argument("-cuda", type=int, default=0)
args = parser.parse_args()
df = pd.read_csv(f'{args.specie}/{args.specie}.csv')
meta = models.meta[args.specie]
batch_size = 64
# Recompute embeddings only when no cached file exists.  (The `if True :`
# debug override that made the cache branch unreachable has been removed.)
if not os.path.isfile(f'{args.specie}/encodings/encodings_openl3.npy'):
    gpu = torch.device(f'cuda:{args.cuda}')
    # NOTE(review): `frontend` is built but never used below — confirm it can be dropped.
    frontend = models.frontend['logMel_vggish'](meta['sr'], meta['nfft'], meta['sampleDur'], 64)
    loader = torch.utils.data.DataLoader(u.Dataset(df, f'{args.specie}/audio/', meta['sr'], meta['sampleDur']), batch_size=batch_size, num_workers=8, collate_fn=u.collate_fn)
    model = openl3.models.load_audio_embedding_model(input_repr="mel128", content_type="music", embedding_size=512).to(gpu)
    with torch.no_grad():
        encodings, idxs = [], []
        for x, idx in tqdm(loader, desc='test '+args.specie, leave=False):
            encoding = openl3.get_audio_embedding(x.to(gpu), meta['sr'], model=model, center=False, batch_size=batch_size, verbose=False)[0]
            idxs.extend(idx.numpy())
            # Average over time frames: one embedding vector per sample.
            encodings.extend(encoding.mean(axis=1).cpu().numpy())
    idxs, encodings = np.array(idxs), np.stack(encodings)
    X = umap.UMAP(n_jobs=-1, n_components=8).fit_transform(encodings)
    np.save(f'{args.specie}/encodings/encodings_openl3.npy', {'idxs':idxs, 'encodings':encodings, 'umap8':X})
else:
    dic = np.load(f'{args.specie}/encodings/encodings_openl3.npy', allow_pickle=True).item()
    idxs, encodings, X = dic['idxs'], dic['encodings'], dic['umap8']
clusters = hdbscan.HDBSCAN(min_cluster_size=10, min_samples=3, cluster_selection_epsilon=0.1, core_dist_n_jobs=-1, cluster_selection_method='leaf').fit_predict(X)
df.loc[idxs, 'cluster'] = clusters.astype(int)
# Evaluate on labelled samples only.
mask = ~df.loc[idxs].label.isna()
clusters, labels = clusters[mask], df.loc[idxs[mask]].label
print('NMI', metrics.normalized_mutual_info_score(labels, clusters))
exit()
# --- Everything below is unreachable until the exit() above is removed. ---
#print('Found clusters : \n', pd.Series(clusters).value_counts())
plt.figure(figsize=(20, 10))
plt.scatter(X[clusters==-1,0], X[clusters==-1,1], s=2, alpha=.2, color='Grey')
plt.scatter(X[clusters!=-1,0], X[clusters!=-1,1], s=2, c=clusters[clusters!=-1], cmap='tab20')
plt.tight_layout()
plt.savefig(f'{args.specie}/projections/vggish_projection_clusters.png')
plt.figure(figsize=(20, 10))
plt.scatter(X[~mask,0], X[~mask,1], s=2, alpha=.2, color='Grey')
for l, grp in df.groupby('label'):
    plt.scatter(X[df.loc[idxs].label==l, 0], X[df.loc[idxs].label==l, 1], s=4, label=l)
plt.legend()
plt.tight_layout()
plt.savefig(f'{args.specie}/projections/vggish_projection_labels.png')
clusters, labels = clusters[mask], df.loc[idxs[mask]].label
print('Silhouette', metrics.silhouette_score(encodings[mask], clusters))
print('NMI', metrics.normalized_mutual_info_score(labels, clusters))
print('Homogeneity', metrics.homogeneity_score(labels, clusters))
print('Completeness', metrics.completeness_score(labels, clusters))
print('V-Measure', metrics.v_measure_score(labels, clusters))
labelled = df[~df.label.isna()]
for l, grp in labelled.groupby('label'):
    # Cluster with the best precision for this label.
    best = (grp.groupby('cluster').fn.count() / labelled.groupby('cluster').fn.count()).idxmax()
    print(f'Best precision for {l} is for cluster {best} with {(df.cluster==best).sum()} points, \
with precision {((labelled.cluster==best)&(labelled.label==l)).sum()/(labelled.cluster==best).sum():.2f} and recall {((labelled.cluster==best)&(labelled.label==l)).sum()/(labelled.label==l).sum():.2f}')
from filterbank import STFT, MelFilter, Log1p
import torch, umap
import argparse, tqdm, utils as u
import pandas as pd, numpy as np
import models
# Encode every sample of one species with a fixed (untrained) 32-band
# log-Mel spectrogram frontend and save the flattened spectrograms.
parser = argparse.ArgumentParser()
parser.add_argument("specie", type=str)
args = parser.parse_args()
df = pd.read_csv(f'{args.specie}/{args.specie}.csv')
meta = models.meta[args.specie]
loader = torch.utils.data.DataLoader(u.Dataset(df, f'{args.specie}/audio/', meta['sr'], meta['sampleDur']), batch_size=128)
# Hop size chosen so that ~32 STFT frames cover one sample.
frontend = torch.nn.Sequential(
    STFT(meta['nfft'], int((meta['sampleDur']*meta['sr'] - meta['nfft'])/32) + 1),
    MelFilter(meta['sr'], meta['nfft'], 32, 0, meta['sr']//2),
    Log1p(7, trainable=False),
    torch.nn.InstanceNorm2d(1),
    torch.nn.Flatten()
).to('cuda')
encodings, idxs = [], []
with torch.no_grad():
    for x, idx in tqdm.tqdm(loader):
        encoding = frontend(x.to('cuda'))
        # Store plain numpy data, consistent with the other encoding scripts
        # (extending with raw index tensors made np.array(idxs) fragile);
        # .detach() is unnecessary inside torch.no_grad().
        idxs.extend(idx.numpy())
        encodings.extend(encoding.cpu().numpy())
idxs, encodings = np.array(idxs), np.stack(encodings)
#print(encodings.shape)
#X = umap.UMAP(n_jobs=-1).fit_transform(encodings)
np.save(f'{args.specie}/encodings/encodings_spec32.npy', {'idxs':idxs, 'encodings':encodings}) #, 'umap':X})
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment