Commit 67a61790 authored by Paul Best

fix ?

parent a0e1d311
.gitignore 100644 → 100755
File mode changed from 100644 to 100755

README.md 100644 → 100755
File mode changed from 100644 to 100755

(several further files changed mode from 100644 to 100755)
import matplotlib.pyplot as plt
import pandas as pd, numpy as np
import models
+species = np.loadtxt('good_species.txt', dtype=str)
fig, ax = plt.subplots(nrows=4, ncols=3, figsize=(10, 10))
-for i, specie in enumerate(models.meta):
+for i, specie in enumerate(species):
    df = pd.read_csv(f'{specie}/{specie}.csv')
    ax[i//3, i%3].bar(range(df.label.nunique() + 1), list(df.label.value_counts()) + [df.label.isna().sum()], log=True)
    ax[i//3, i%3].set_title(specie)
@@ -12,7 +13,7 @@ plt.savefig('annot_distrib.pdf')
a = "Specie & \# Classes & \# Annotated samples & \# Samples \\\\ \hline \n"
for specie in models.meta:
for specie in species:
df = pd.read_csv(f'{specie}/{specie}.csv')
a += f"{specie.replace('_',' ')} & {df.label.nunique()} & {(~df.label.isna()).sum()} & {len(df)} \\\\ \hline \n"
f = open('annot_distrib.tex', 'w')
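The hunk above swaps the hard-coded `models.meta` species list for the contents of `good_species.txt`. Each bar chart draws one bar per annotation class plus a final bar for unannotated rows; a minimal standalone sketch of that counting step, on invented toy data:

    # Sketch only: toy data, not the project's per-species CSVs.
    import pandas as pd

    df = pd.DataFrame({'label': ['A', 'A', 'A', 'B', 'B', None, None]})
    counts = df.label.value_counts().tolist() + [int(df.label.isna().sum())]
    print(counts)  # [3, 2, 2]: bars for 'A', 'B', and the unannotated remainder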
@@ -15,8 +15,7 @@ parser.add_argument("-encoder", type=str, default='sparrow_encoder')
parser.add_argument("-frontend", type=str, default='logMel')
args = parser.parse_args()
-modelname = f'{args.specie}_{args.bottleneck}_{args.frontend}{args.nMel}_{args.encoder}_{args.nMel}_decod2_BN_nomaxPool.stdc'
#modelname = f'{args.specie}_{args.bottleneck}_{args.frontend}_{args.encoder}_{args.nMel}.stdc'
+modelname = f'{args.specie}_{args.bottleneck}_{args.frontend}{args.nMel}_{args.encoder}_decod2_BN_nomaxPool.stdc'
meta = models.meta[args.specie]
df = pd.read_csv(f'{args.specie}/{args.specie}.csv')
print(f'Tests for model {modelname}')
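The only change in this hunk removes a duplicated `_{args.nMel}` segment from the checkpoint filename. A hypothetical before/after, with made-up argument values:

    # Illustration only: all values below are invented.
    specie, bottleneck, frontend, encoder, nMel = 'humpback', 16, 'logMel', 'sparrow_encoder', 128
    old = f'{specie}_{bottleneck}_{frontend}{nMel}_{encoder}_{nMel}_decod2_BN_nomaxPool.stdc'
    new = f'{specie}_{bottleneck}_{frontend}{nMel}_{encoder}_decod2_BN_nomaxPool.stdc'
    print(old)  # humpback_16_logMel128_sparrow_encoder_128_decod2_BN_nomaxPool.stdc
    print(new)  # humpback_16_logMel128_sparrow_encoder_decod2_BN_nomaxPool.stdc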
@@ -86,8 +86,7 @@ for epoch in range(100_000//len(loader)):
        encoding = model[:2](x.to(gpu))
        idxs.extend(idx)
        encodings.extend(encoding.cpu().detach())
-    idxs = np.array(idxs)
-    encodings = np.stack(encodings)
+    idxs, encodings = np.array(idxs), np.stack(encodings)
    print('Computing UMAP...', end='')
    try:
@@ -97,7 +96,7 @@ for epoch in range(100_000//len(loader)):
        continue
    print('\rRunning HDBSCAN...', end='')
    clusters = hdbscan.HDBSCAN(min_cluster_size=len(df)//100, min_samples=5, core_dist_n_jobs=-1, cluster_selection_method='leaf').fit_predict(X)
-    df.loc[idxs, 'cluster'] = clusters.astype(int)
+    # df.loc[idxs, 'cluster'] = clusters.astype(int)
    mask = ~df.loc[idxs].label.isna()
    clusters, labels = clusters[mask], df.loc[idxs[mask]].label
    NMIs.append(metrics.normalized_mutual_info_score(labels, clusters))
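The evaluation above keeps only annotated samples (`mask`) and scores the HDBSCAN partition against the ground-truth labels with normalized mutual information. A minimal self-contained sketch of that step, with toy values:

    # Sketch only: toy labels and clusters, not model output.
    import numpy as np
    from sklearn import metrics

    labels = np.array(['A', 'A', 'B', 'B'])  # annotations of the labelled subset
    clusters = np.array([0, 0, 1, -1])       # cluster ids (-1 = HDBSCAN noise)
    print(metrics.normalized_mutual_info_score(labels, clusters))  # ~0.8: one 'B' fell into noise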
@@ -110,48 +109,47 @@ for epoch in range(100_000//len(loader)):
    writer.add_scalar('Completeness HDBSCAN', metrics.completeness_score(labels, clusters), step)
    writer.add_scalar('V-Measure HDBSCAN', metrics.v_measure_score(labels, clusters), step)
-    print('\rComputing HDBSCAN precision and recall distributions', end='')
-    labelled = df[~df.label.isna()]
-    precs, recs = [], []
-    for l, grp in labelled.groupby('label'):
-        best = (grp.groupby('cluster').fn.count() / labelled.groupby('cluster').fn.count()).idxmax()
-        precs.append((grp.cluster==best).sum()/(labelled.cluster==best).sum())
-        recs.append((grp.cluster==best).sum()/len(grp))
-    writer.add_histogram('HDBSCAN Precisions ', np.array(precs), step)
-    writer.add_histogram('HDBSCAN Recalls ', np.array(recs), step)
-    df.drop('cluster', axis=1, inplace=True)
    continue
-    print('\rRunning elbow method for K-Means...', end='')
-    ks = (5*1.2**np.arange(20)).astype(int)
-    distorsions = [cluster.KMeans(n_clusters=k).fit(encodings).inertia_ for k in ks]
-    print('\rEstimating elbow...', end='')
-    errors = [linregress(ks[:i], distorsions[:i]).stderr + linregress(ks[i+1:], distorsions[i+1:]).stderr for i in range(2, len(ks)-2)]
-    k = ks[np.argmin(errors)]
-    writer.add_scalar('Chosen K', k, step)
-    clusters = cluster.KMeans(n_clusters=k).fit_predict(encodings)
-    df.loc[idxs, 'cluster'] = clusters.astype(int)
-    writer.add_scalar('Silhouette', metrics.silhouette_score(encodings, clusters), step)
-    clusters, labels = clusters[mask], df.loc[idxs[mask]].label
-    writer.add_scalar('NMI K-Means', metrics.normalized_mutual_info_score(labels, clusters), step)
-    try:
-        writer.add_scalar('ARI K-Means', metrics.adjusted_rand_score(labels, clusters), step)
-    except:
-        pass
-    writer.add_scalar('Homogeneity K-Means', metrics.homogeneity_score(labels, clusters), step)
-    writer.add_scalar('Completeness K-Means', metrics.completeness_score(labels, clusters), step)
-    writer.add_scalar('V-Measure K-Means', metrics.v_measure_score(labels, clusters), step)
-    print('\rComputing K-Means precision and recall distributions', end='')
-    labelled = df[~df.label.isna()]
-    precs, recs = [], []
-    for l, grp in labelled.groupby('label'):
-        best = (grp.groupby('cluster').fn.count() / labelled.groupby('cluster').fn.count()).idxmax()
-        precs.append((grp.cluster==best).sum()/(labelled.cluster==best).sum())
-        recs.append((grp.cluster==best).sum()/len(grp))
-    writer.add_histogram('K-Means Precisions ', np.array(precs), step)
-    writer.add_histogram('K-Means Recalls ', np.array(recs), step)
-    df.drop('cluster', axis=1, inplace=True)
+    # print('\rComputing HDBSCAN precision and recall distributions', end='')
+    # labelled = df[~df.label.isna()]
+    # precs, recs = [], []
+    # for l, grp in labelled.groupby('label'):
+    #     best = (grp.groupby('cluster').fn.count() / labelled.groupby('cluster').fn.count()).idxmax()
+    #     precs.append((grp.cluster==best).sum()/(labelled.cluster==best).sum())
+    #     recs.append((grp.cluster==best).sum()/len(grp))
+    # writer.add_histogram('HDBSCAN Precisions ', np.array(precs), step)
+    # writer.add_histogram('HDBSCAN Recalls ', np.array(recs), step)
+    # df.drop('cluster', axis=1, inplace=True)
+    # print('\rRunning elbow method for K-Means...', end='')
+    # ks = (5*1.2**np.arange(20)).astype(int)
+    # distorsions = [cluster.KMeans(n_clusters=k).fit(encodings).inertia_ for k in ks]
+    # print('\rEstimating elbow...', end='')
+    # errors = [linregress(ks[:i], distorsions[:i]).stderr + linregress(ks[i+1:], distorsions[i+1:]).stderr for i in range(2, len(ks)-2)]
+    # k = ks[np.argmin(errors)]
+    # writer.add_scalar('Chosen K', k, step)
+    # clusters = cluster.KMeans(n_clusters=k).fit_predict(encodings)
+    # df.loc[idxs, 'cluster'] = clusters.astype(int)
+    # writer.add_scalar('Silhouette', metrics.silhouette_score(encodings, clusters), step)
+    # clusters, labels = clusters[mask], df.loc[idxs[mask]].label
+    # writer.add_scalar('NMI K-Means', metrics.normalized_mutual_info_score(labels, clusters), step)
+    # try:
+    #     writer.add_scalar('ARI K-Means', metrics.adjusted_rand_score(labels, clusters), step)
+    # except:
+    #     pass
+    # writer.add_scalar('Homogeneity K-Means', metrics.homogeneity_score(labels, clusters), step)
+    # writer.add_scalar('Completeness K-Means', metrics.completeness_score(labels, clusters), step)
+    # writer.add_scalar('V-Measure K-Means', metrics.v_measure_score(labels, clusters), step)
+    # print('\rComputing K-Means precision and recall distributions', end='')
+    # labelled = df[~df.label.isna()]
+    # precs, recs = [], []
+    # for l, grp in labelled.groupby('label'):
+    #     best = (grp.groupby('cluster').fn.count() / labelled.groupby('cluster').fn.count()).idxmax()
+    #     precs.append((grp.cluster==best).sum()/(labelled.cluster==best).sum())
+    #     recs.append((grp.cluster==best).sum()/len(grp))
+    # writer.add_histogram('K-Means Precisions ', np.array(precs), step)
+    # writer.add_histogram('K-Means Recalls ', np.array(recs), step)
+    # df.drop('cluster', axis=1, inplace=True)
    if len(NMIs) > 10 and max(NMIs) > max(NMIs[-10:]):
        print('\rEarly stop')
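The early-stopping rule above halts once the best NMI ever observed beats every NMI from the ten most recent evaluations, i.e. when there has been no recent improvement. A toy illustration with an invented NMI history:

    # Sketch only: invented NMI history.
    NMIs = [0.20, 0.50, 0.46, 0.45, 0.44, 0.46, 0.43, 0.42, 0.44, 0.41, 0.40, 0.39]
    if len(NMIs) > 10 and max(NMIs) > max(NMIs[-10:]):
        print('Early stop')  # triggers: the 0.50 peak predates the last ten scores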
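The (now commented-out) elbow heuristic picks K for K-Means by fitting a straight line to each side of a candidate split of the inertia curve and keeping the split whose two fits have the smallest combined standard error. Note that in the original, `errors` starts at candidate i=2 while `ks[np.argmin(errors)]` indexes from 0, so the chosen k appears shifted by two positions. A standalone sketch on a synthetic inertia curve, compensating with an explicit offset and giving each fit at least three points so `linregress` has a nonzero degree of freedom:

    # Sketch only: synthetic inertia values, not real K-Means distortions.
    import numpy as np
    from scipy.stats import linregress

    rng = np.random.default_rng(0)
    ks = (5 * 1.2 ** np.arange(20)).astype(int)
    distortions = 1000 / ks + rng.normal(0, 0.1, len(ks))  # hypothetical decreasing curve
    errors = [linregress(ks[:i], distortions[:i]).stderr
              + linregress(ks[i+1:], distortions[i+1:]).stderr
              for i in range(3, len(ks) - 3)]
    k = ks[np.argmin(errors) + 3]  # + 3 maps the argmin back into ks
    print('elbow at k =', k)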