# get_train_annot_YOLO.py

import os
import pandas as pd
import librosa
import numpy as np
import matplotlib.pyplot as plt
from p_tqdm import p_map
import ipdb
import random
from datetime import date
import argparse
import matplotlib.patches as patches
from matplotlib.patches import Rectangle
from PIL import Image
from mycolorpy import colorlist as mcp

# Run date, captured once when the script starts; its day and month are embedded
# in the names of the output folders created by process().
today = date.today()

def arg_directory(path):
    """Validate a command-line value that must name an existing directory.

    Intended for use as an argparse ``type=`` callback.

    Args:
        path: The raw string received from the command line.

    Returns:
        ``path`` unchanged when it points to an existing directory.

    Raises:
        argparse.ArgumentTypeError: If ``path`` is not an existing directory.
    """
    # Guard clause: reject anything that is not a directory, then fall through.
    if not os.path.isdir(path):
        raise argparse.ArgumentTypeError(f'`{path}` is not a valid path')
    return path

# Command-line interface. Every option except --export is mandatory.
parser = argparse.ArgumentParser(
    description='TODO',
    formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)

# Annotation CSV to read.
parser.add_argument(
    '-f', '--filename_path',
    required=True,
    type=str,
    help='Path and name of the file containing the annotations',
)
# Folder holding the recordings; must already exist.
parser.add_argument(
    '-p', '--path_to_data',
    required=True,
    type=arg_directory,
    help='Path of the folder that contain the recordings',
)
# Output root; must already exist.
parser.add_argument(
    '-d', '--directory',
    required=True,
    type=arg_directory,
    help='Directory to wich spectrograms and .txt files will be stored',
)
# How the vertical position and height of each box are derived.
parser.add_argument(
    '-m', '--mode',
    required=True,
    type=str,
    choices=['uniform', 'personalized'],
    help='Choose the mode to calculate the y and height value',
)
# One spectrogram per recording, or several.
parser.add_argument(
    '-u', '--unique',
    required=True,
    type=str,
    choices=['unique', 'multiple'],
    help='unique for only one spectrogram per file, multple for multiple spectrogram',
)
# Name of the CSV column that identifies the recording of each annotation.
parser.add_argument(
    '-c', '--columns_name',
    required=True,
    type=str,
    help='Name of the column that contain the path',
)
# Optional: any value switches on the export of spectrograms with boxes drawn.
parser.add_argument(
    '--export',
    required=False,
    type=str,
    default=None,
    help='To export the position of the bounding box on the spectrogram',
)

args = parser.parse_args()

# Root folder that receives the species list, the images and the label files.
directory = args.directory

DURATION = 8  # length, in seconds, of the audio window loaded per spectrogram
NB_CLASS = 5  # number of most frequent labels kept as classes

df = pd.read_csv(args.filename_path, low_memory=False)

# Map the alternative column names onto the names used by the rest of the
# script. Columns that are absent from the CSV are simply ignored by rename().
df = df.rename(columns={
    'label': 'Code',
    'annotation_initial_time': 'start',
    'annotation_final_time': 'stop',
    'duree': 'd_annot',
    'min_frequency': 'min_freq',
    'max_frequency': 'max_freq',
    'avg_frequency': 'midl_y',
})

# Keep only the NB_CLASS labels with the most rows (ranked on the count of the
# first remaining column).
tab = df.groupby('Code').count()
tab = tab.sort_values(tab.columns[0], ascending=False)[:NB_CLASS]
# .copy() so that the column assignments below act on an independent frame
# rather than on a slice of the original one.
df = df[df.Code.isin(tab.index)].copy()

# Frequency bounds: fill the gaps when both columns exist, otherwise fall back
# to fixed defaults for every row. Plain assignment is used instead of a chained
# ``fillna(..., inplace=True)``, which does not reliably write back to ``df``.
if 'max_freq' in df.columns and 'min_freq' in df.columns:
    df['max_freq'] = df['max_freq'].fillna(9000)
    df['min_freq'] = df['min_freq'].fillna(1000)
else:
    df['max_freq'] = 9000
    df['min_freq'] = 1000
    df['midl_y'] = 5000

# Centre frequency: process() reads it in 'personalized' mode, so make sure it
# always exists and has no gaps; the midpoint of the bounds is the fallback.
band_centre = (df['min_freq'] + df['max_freq']) / 2
if 'midl_y' in df.columns:
    df['midl_y'] = df['midl_y'].fillna(band_centre)
else:
    df['midl_y'] = band_centre

df['d_annot'] = df.stop - df.start          # annotation length
df['midl'] = (df.stop + df.start) / 2       # annotation centre on the time axis
df['Path'] = df[args.columns_name]

# Drop annotations that are not shorter than the analysis window.
df = df[df.d_annot < DURATION]
df = df.reset_index()

# Remaining labels ordered by decreasing row count; the position in this
# ordering becomes the class index written to the label files.
list_espece = df.groupby('Code').count().sort_values(df.columns[0], ascending=False)

# Label -> class index table. Every row keeps index label 0, exactly as the
# former row-by-row concat produced, so label-based lookups on this frame keep
# working.
n_species = len(list_espece)
data = pd.DataFrame(
    {'espece': list(list_espece.index), 'ind': list(range(n_species))},
    index=[0] * n_species,
    columns=['espece', 'ind'],
)

liste_espece = data.espece
# os.path.join keeps the file inside ``directory`` whether or not the argument
# was given with a trailing separator.
liste_espece.to_csv(os.path.join(directory, 'liste_especes.csv'), index=False)

print('\n', data)

# One colour per class, used when the boxes are drawn for --export.
color = mcp.gen_color(cmap="Wistia", n=len(list_espece))

def _run_dirs():
    """Return the (images, labels, annotated images) folders of today's run."""
    stamp = str(today.day) + '_' + str(today.month)
    return (
        os.path.join(directory, 'images_' + stamp),
        os.path.join(directory, 'labels_' + stamp),
        os.path.join(directory, 'images_annotes_' + stamp),
    )


def _ensure_run_dirs(images_root, labels_dir):
    """Create one image folder per class, the 'all' folder and the label folder.

    ``exist_ok=True`` makes this safe to call from several workers at once and
    on a folder tree that already exists.
    """
    for especes in list_espece.index:
        os.makedirs(os.path.join(images_root, str(especes)), exist_ok=True)
    os.makedirs(os.path.join(images_root, 'all'), exist_ok=True)
    os.makedirs(labels_dir, exist_ok=True)


def _window_offset(tab):
    """Return the start time (seconds) of the audio window for the rows in ``tab``."""
    if args.unique == 'unique':
        return 0
    if tab.start.iloc[0] < 3:
        return 0
    # Place the first annotation near the middle of the window, with a random
    # jitter of +/- 1.5 s. The result is clamped because an audio offset cannot
    # be negative (the jitter can otherwise push it down to -2 s).
    rd = round(random.uniform(-1.5, 1.5), 2)
    return max(0, tab.midl.iloc[0] - 3.5 + rd)


def process(x):
    """Render the spectrogram(s) of one recording and write its YOLO labels.

    The annotations of the recording are consumed window by window. For each
    window the audio is loaded, its log-magnitude STFT is saved as a .jpg (once
    in the folder of the class of the window's last annotation, once in
    ``all``), and a space-separated label file ``id x y width height`` is
    written, all values being fractions of the image size. When ``--export`` is
    set, a copy of the image with the boxes drawn on it is saved as well.

    Args:
        x: ``(count, (path, group))`` as produced by
            ``enumerate(df.groupby('Path'))``; ``count`` is appended to the
            output file names, ``group`` holds the annotations of ``path``.

    Returns:
        None. The function only writes files.
    """
    count, (f, grp) = x
    duration = DURATION

    filename = str(f)
    if not os.path.isfile(filename):
        # The path column may hold names relative to --path_to_data. Only fall
        # back to that folder when the path does not resolve as given, so paths
        # that already worked are untouched.
        candidate = os.path.join(args.path_to_data, filename)
        if os.path.isfile(candidate):
            filename = candidate

    images_root, labels_dir, annotated_dir = _run_dirs()

    window_size = 1024
    window = np.hanning(window_size)

    while len(grp) != 0:

        # Annotations whose centre lies within 7 s of the first remaining start.
        tab = grp[grp.midl <= grp.start.iloc[0] + 7]
        if len(tab) == 0:
            tab = grp
            print(tab)

        offset = _window_offset(tab)

        y, sr = librosa.load(filename, offset=offset, duration=duration, sr=None)
        stft = librosa.core.spectrum.stft(y, n_fft=window_size, hop_length=512, window=window)

        # Log-magnitude image, flipped so that low frequencies are at the bottom.
        log_spec = np.flipud(np.log10(np.abs(stft)))
        vmin = log_spec.mean()
        vmax = log_spec.max()

        plt.imshow(log_spec, aspect="auto", interpolation=None, cmap='jet', vmin=vmin, vmax=vmax)
        plt.subplots_adjust(top=1, bottom=0, left=0, right=1)

        boxes = []
        for idxs, row in tab.iterrows():
            x_pxl = (row.midl - offset) / duration
            width_pxl = (row.stop - row.start) / duration

            if args.mode == 'uniform':
                height_pxl = 0.8
                y_pxl = 0.5
            else:
                # sr / 2 is the highest frequency shown; y is measured from the top.
                y_pxl = 1 - (row.midl_y / (sr / 2))
                height_pxl = (row.max_freq - row.min_freq) / (sr / 2)
                if height_pxl > 1:
                    height_pxl = 1
                elif height_pxl > y_pxl * 2:
                    # The box would stick out above the image: move its centre
                    # down so that its top edge sits on the image border.
                    y_pxl = y_pxl + 0.5 * (height_pxl - y_pxl * 2)

            # Positional access: does not depend on how ``data`` is indexed.
            class_id = str(data.loc[data.espece == row.Code, 'ind'].iloc[0])
            boxes.append([class_id, x_pxl, y_pxl, width_pxl, height_pxl])

        fin = pd.DataFrame(boxes, columns=['id', 'x', 'y', 'width', 'height'])
        grp = grp.drop(tab.index)

        # ``row`` is the last annotation of the window.
        # NOTE(review): the name depends only on the recording and ``count``, so
        # every window of the same recording writes to the same files and later
        # windows overwrite earlier ones -- confirm whether this is intended.
        name = str(row.Path.replace('/', '_').replace('.', '_') + '_' + str(count))
        name_file = os.path.join(labels_dir, str(name + '.txt'))

        _ensure_run_dirs(images_root, labels_dir)
        plt.savefig(os.path.join(images_root, str(row.Code), str(name + '.jpg')))
        fin.to_csv(name_file, sep=' ', header=False, index=False)
        plt.savefig(os.path.join(images_root, 'all', str(name + '.jpg')))

        if args.export is not None:
            # Scale the fractional boxes by the size of the image actually drawn.
            n_freq, n_time = log_spec.shape
            axes = plt.gca()
            for box in fin.itertuples(index=False):
                # Rectangle is anchored at one corner and extends by +width and
                # +height, so the anchor is the centre minus half the size on
                # both axes.
                axes.add_patch(Rectangle(
                    ((box.x - 0.5 * box.width) * n_time,
                     (box.y - 0.5 * box.height) * n_freq),
                    box.width * n_time,
                    box.height * n_freq,
                    linewidth=3,
                    edgecolor=color[int(box.id)],
                    facecolor='none',
                ))
            os.makedirs(annotated_dir, exist_ok=True)
            plt.savefig(os.path.join(annotated_dir, str(name + '.jpg')))

        plt.close()

# Entry point. The guard keeps worker processes that re-import this module
# (process start methods other than fork) from launching the pool again.
if __name__ == '__main__':
    # One task per recording; group once and reuse it for the progress total.
    recordings = df.groupby('Path')
    p_map(process, enumerate(recordings), num_cpus=2, total=recordings.ngroups)

    print('saved to ', directory)