diff --git a/get_train_annot.py b/get_train_annot.py
index df57dceef119594ba677778ff5efb1014e85c4e6..a0f4c267e029db59dfaac9a7c70f2efcc01404b7 100755
--- a/get_train_annot.py
+++ b/get_train_annot.py
@@ -14,7 +14,7 @@
 from tqdm import tqdm
 import pandas as pd
 
-def process(entry, arguments, species_list):
+def main(entry, arguments, species_list):
     """
-    Precess the annotation to get the .jpg spectrogram and the .txt annotation file
-    :param x (tuple): Enumerate number, [filename, group] per file
+    Process the annotation to get the .jpg spectrogram and the .txt annotation file
+    :param entry (tuple): Enumerate number, [filename, group] per file
@@ -213,11 +213,11 @@ if __name__ == '__main__':
     if args.cpu == 1:
         for i in tqdm(enumerate(df.groupby('Path')), total=len(df.groupby('Path')),
                       desc="Processing", ascii='░▒▓█'):
-            process(i, args, species)
+            main(i, args, species)
     else:
         args = [args for i in range(len(df.groupby('Path')))]
         species = [species for i in range(len(df.groupby('Path')))]
-        p_map(process, enumerate(df.groupby('Path')), args,
+        p_map(main, enumerate(df.groupby('Path')), args,
               species, num_cpus=args[0].cpu, total=len(df.groupby('Path')))
     args = args[0]
     print('saved to', args.directory)
@@ -231,11 +231,13 @@ if __name__ == '__main__':
 
     if SPLIT == 'Y':
         print('The train set will be 70%, val set 15% and test set 15%')
-        path = os.getcwd()
-        # Get the current path to find the split script
+
+        # Get the path of the current script
+        path = os.path.abspath(os.path.dirname(__file__))
         script = os.path.join(path, 'get_train_val.py')
         data_path = os.path.join(path, args.directory, 'labels')
         directory_path = os.path.join(path, args.directory)
+        # Create the output directory if it does not exist
         utils.create_directory(directory_path)
 
         try:
diff --git a/get_train_val.py b/get_train_val.py
index 0666da9a6dc99b8fbadb3c6b8dec9a1b44a9d423..86028220ffc92d3734d8b4e379323d64ca25d17c 100755
--- a/get_train_val.py
+++ b/get_train_val.py
@@ -67,17 +67,27 @@ def prepare_data(arguments):
     """
     Prepare the annotation before getting splited
     :param args (args): Argument
-    :return detection (DataFrame): DataFrame with all the annotation to split
+    :return annotations (DataFrame): DataFrame with all the annotation to split
+    :return background (list): Annotation files that contain no detection
     """
-    detections = pd.concat({f: pd.read_csv(os.path.join(arguments.path_to_data, f), sep=' ',
-                                           names=['species', 'x', 'y', 'w', 'h'])
-                            for f in tqdm(os.listdir(arguments.path_to_data),
-                                          desc="Processing", ascii='░▒▓█')},
-                           names=['file'])
-
-    detections = detections.reset_index()
-    detections.species = detections.species.astype(float)
-    return detections
+    annotations = []
+    background = []
+    # Iterate over the annotation files, not over a single joined path
+    for f in tqdm(os.listdir(arguments.path_to_data), desc="Processing",
+                  ascii='░▒▓█'):
+        file_annotation = pd.read_csv(os.path.join(arguments.path_to_data, f),
+                                      sep=' ', names=['species', 'x', 'y', 'w', 'h'])
+        if len(file_annotation) == 0:
+            background.append(f)
+        else:
+            file_annotation['file'] = f
+            annotations.extend(file_annotation.to_dict(orient='records'))
+
+    # Explicit columns keep the frame valid even when every file is background
+    annotations = pd.DataFrame(annotations,
+                               columns=['species', 'x', 'y', 'w', 'h', 'file'])
+    annotations.species = annotations.species.astype(float)
+    return annotations, background
 
 
 if __name__ == '__main__':
@@ -96,7 +106,7 @@ if __name__ == '__main__':
                         ' for test and same for validation', default=None)
     args = parser.parse_args()
 
-    df = prepare_data(args)
+    df, background = prepare_data(args)
     train, val = utils.split(df, 'train', args.ratio)
 
     saved_directory = os.path.join(args.directory, 'set')
@@ -126,7 +136,14 @@ if __name__ == '__main__':
 
     command = f'python {yolo_path} --data {data_path} --imgsz 640 --epochs 100 --weights {weights_path} --hyp {hyp_path} --cache'
     print(command,'\n')
 
-    print('\u26A0\uFE0F Be aware that it is recommended to have background images that',
-          'represents 10% of your dataset. To do so, please use the script "get_spectrogram.py"',
-          'with --background arguments. Comptue on recordings that contains multiple type of noise...')
+    if not background:
+        print('\u26A0\uFE0F Be aware that it is recommended to have background images that',
+              'represent 10% of your dataset. If you do not have background, use the script',
+              '"get_spectrogram.py" with --background arguments. Compute on recordings that',
+              'contain multiple types of noise...')
+    else:
+        utils.split_background(background, args)
+        print(f'Your dataset contains {len(background)} background images. It represents',
+              f'{(len(background) / len(df)) * 100:.1f} % of your dataset. It is recommended',
+              'to reach around 10% for a good model.')
diff --git a/utils.py b/utils.py
index 245f8b9d15882a6a33d90012e7533770e213bef6..4c1ed55f93579206fc262115f2475f95085ff3ca 100755
--- a/utils.py
+++ b/utils.py
@@ -9,6 +9,7 @@
 import json
 from datetime import date
 from pathlib import Path
+import random
 import librosa
 import pandas as pd
 import numpy as np
@@ -209,6 +210,32 @@ def split(df, method, ratio=0.7):
     return major_df, minor_df
 
 
+def split_background(file_list, arguments):
+    """
+    Randomly split the background images and save them into the different sets.
+    :param file_list (list): Filenames of the background annotation files.
+    :param arguments (args): Arguments.
+    """
+    # Drop the extension: copy_files_to_directory adds the right one back.
+    file_list = [os.path.splitext(x)[0] for x in file_list]
+    random.shuffle(file_list)
+    sets = ['train', 'test', 'val'] if arguments.test else ['train', 'val']
+    # Integer bounds: slicing with a float such as total/3 raises TypeError.
+    step = len(file_list) // len(sets)
+    source_txt = arguments.path_to_data
+    source_img = os.path.join(arguments.path_to_data, '../images/')
+    for num, subset in enumerate(sets):
+        # The last subset absorbs the remainder of the integer division.
+        start = num * step
+        end = None if num == len(sets) - 1 else (num + 1) * step
+        directory_txt = os.path.join(arguments.directory, f'labels/{subset}')
+        directory_img = os.path.join(arguments.directory, f'images/{subset}')
+        copy_files_to_directory(file_list[start:end], source_txt,
+                                directory_txt, 'txt')
+        copy_files_to_directory(file_list[start:end], source_img,
+                                directory_img, 'jpg')
+
+
 def open_file(path):
     """
     Open a file with a path without knowing if suffix is .pkl or .csv
@@ -240,7 +267,7 @@ def open_file(path):
             print("Wav files can't be load...")
             return pd.DataFrame()
     else:
-        print('Collect all files on a folder...')
+        print('Collect all files in the folder...')
         df = pd.DataFrame(glob.glob(os.path.join(path, '*'), recursive=True),
                           columns=['Path'])
     return df