get_train_val.py
"""Separates training and validation datasets in a balanced manner"""
import argparse
import os
import pandas as pd
import utils
from tqdm import tqdm


def export_split(argument, entry, path, directory):
    """
    Export the annotation sets

    :param argument (Namespace): Parsed command-line arguments
    :param entry (list): Train set = [0], validation set = [1], test set = [2] (only with --test)
    :param path (str): Path of the folder containing the label (.txt) files
    :param directory (str): Directory in which to save the data
    """
train_set = entry[0]
val_set = entry[1]
for folder in ['images', 'labels', 'images/train', 'images/val', 'images/test',
'labels/train', 'labels/val', 'labels/test']:
utils.create_directory(os.path.join(directory, folder))
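    # With --test, a held-out test subset of labels and images is exported as well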
if argument.test:
test_set = entry[2]
        test_set.file = ['.'.join(x.split('.')[:-1]) for x in test_set.file]
utils.copy_files_to_directory(test_set.file, path, os.path.join(
directory, 'labels/test'), 'txt')
utils.copy_files_to_directory(test_set.file, os.path.join(
path, '../images/all'), os.path.join(directory, 'images/test'), 'jpg')
    val_set.file = ['.'.join(x.split('.')[:-1]) for x in val_set.file]
    train_set.file = ['.'.join(x.split('.')[:-1]) for x in train_set.file]
# Copy the validation set into the folder
utils.copy_files_to_directory(val_set.file, path, os.path.join(
directory, 'labels/val'), 'txt')
utils.copy_files_to_directory(val_set.file, os.path.join(
path, '../images/all'), os.path.join(directory, 'images/val'), 'jpg')
    # Copy the training set into the folder
utils.copy_files_to_directory(train_set.file, path, os.path.join(
directory, 'labels/train'), 'txt')
utils.copy_files_to_directory(train_set.file, os.path.join(
path, '../images/all'), os.path.join(directory, 'images/train'), 'jpg')
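    # Write the YOLO dataset config (custom_data.yaml) pointing at the exported image folders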
    try:
        species_list = pd.read_csv(os.path.join(path, '../species_list.csv'))
    except FileNotFoundError:
        species_list = None
        print('No species list detected, please add it to:',
              os.path.join(directory, 'custom_data.yaml'))
    with open(os.path.join(directory, 'custom_data.yaml'), 'w', encoding='utf-8') as f:
        if argument.test == 1:
            f.write(f'test: {os.path.join(directory, "images/test")}\n')
        f.write(f'train: {os.path.join(directory, "images/train")}\n')
        f.write(f'val: {os.path.join(directory, "images/val")}\n')
        if species_list is not None:
            f.write(f'nc: {len(species_list)}\n')
            f.write(f'names: {[str(x) for x in species_list.species.tolist()]}')
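

# A generated custom_data.yaml is expected to look roughly like this sketch
# (paths and species names below are illustrative placeholders, not actual output):
#   train: /path/to/<directory>/images/train
#   val: /path/to/<directory>/images/val
#   nc: 2
#   names: ['species_a', 'species_b']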


def prepare_data(arguments):
    """
    Prepare the annotations before splitting them

    :param arguments (Namespace): Parsed command-line arguments
    :return detections (DataFrame): DataFrame with all the annotations to split
    """
detections = pd.concat({f: pd.read_csv(os.path.join(arguments.path_to_data, f), sep=' ',
names=['species', 'x', 'y', 'w', 'h'])
for f in tqdm(os.listdir(arguments.path_to_data),
desc="Processing", ascii='░▒▓█')},
names=['file'])
detections = detections.reset_index()
detections.species = detections.species.astype(float)
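    # detections now holds one row per annotation, with a 'file' column identifying the
    # source label file plus the 'species', 'x', 'y', 'w', 'h' columns read from each line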
return detections


if __name__ == '__main__':
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
description='Split the annotation into train, val and test if needed')
    parser.add_argument('path_to_data', type=utils.arg_directory,
                        help='Path of the folder that contains the .txt label files '
                             '(ending with labels/)')
    parser.add_argument('directory', type=utils.arg_directory,
                        help='Directory to which the spectrograms and .txt files will be '
                             'stored (different from path_to_data)')
parser.add_argument('-r', '--ratio', type=float,
default=0.7, help='Train Ratio (val = 1 - ratio)')
    parser.add_argument('--test', action='store_const', const=1, default=None,
                        help='Also create a test split: (1 - ratio) / 2 of the data goes '
                             'to test and the same amount to validation')
args = parser.parse_args()
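    # Split the annotations into train/val (and optionally test) subsets, then export the
    # labels, the matching images and the YOLO dataset config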
df = prepare_data(args)
train, val = utils.split(df, 'train', args.ratio)
saved_directory = os.path.join(args.directory, 'set')
utils.create_directory(saved_directory)
if args.test:
val, test = utils.split(val, 'val', 0.5)
export_split(args, [train, val, test], args.path_to_data,
saved_directory)
state, proposition = utils.get_set_info([train, val, test])
else:
export_split(args, [train, val], args.path_to_data, saved_directory)
state, proposition = utils.get_set_info([train, val])
print(f'\nYour dataset is {state} {proposition}')
print(f'Train saved in {saved_directory}\n')
    print('To train your model, use the following command:\n')
current_path = os.getcwd()
directory_path = os.path.join(current_path, saved_directory)
yolo_path = os.path.join(current_path, 'yolov5/train.py')
data_path = os.path.join(directory_path, 'custom_data.yaml')
weights_path = os.path.join(current_path, 'yolov5/weights/yolov5l.pt')
hyp_path = os.path.join(current_path, 'custom_hyp.yaml')
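    # Suggested YOLOv5 training command; the yolov5/ repository, the pretrained weights and
    # custom_hyp.yaml are assumed to be available under the current working directory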
command = f'python {yolo_path} --data {data_path} --imgsz 640 --epochs 100 --weights {weights_path} --hyp {hyp_path} --cache'
    print(command, '\n')
    print('\u26A0\uFE0F Be aware that it is recommended to have background images that',
          'represent about 10% of your dataset. To do so, use the script "get_spectrogram.py"',
          'with the --background argument on recordings that contain multiple types of noise.')