Commit 74a8dd8d authored by Stephane Chavin

correct split background

parent c1ab7237
@@ -14,7 +14,7 @@ from tqdm import tqdm
import pandas as pd
def process(entry, arguments, species_list):
def main(entry, arguments, species_list):
    """
    Process the annotation to get the .jpg spectrogram and the .txt annotation file
    :param x (tuple): Enumerate number, [filename, group] per file
@@ -213,11 +213,11 @@ if __name__ == '__main__':
    if args.cpu == 1:
        for i in tqdm(enumerate(df.groupby('Path')), total=len(df.groupby('Path')),
                      desc="Processing", ascii='░▒▓█'):
            process(i, args, species)
            main(i, args, species)
    else:
        args = [args for i in range(len(df.groupby('Path')))]
        species = [species for i in range(len(df.groupby('Path')))]
        p_map(process, enumerate(df.groupby('Path')), args,
        p_map(main, enumerate(df.groupby('Path')), args,
              species, num_cpus=args[0].cpu, total=len(df.groupby('Path')))
        args = args[0]
    print('saved to', args.directory)
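The parallel branch repeats args and species once per file group so that p_tqdm can zip them with the enumerated groups. A sequential sketch of what that call amounts to (not part of the commit; names taken from the lines above):

# Sequential equivalent of the p_map call: p_map zips its iterables like the
# built-in map, but runs main in parallel worker processes with a progress bar.
for item, arg, sp in zip(enumerate(df.groupby('Path')), args, species):
    main(item, arg, sp)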
@@ -231,11 +231,13 @@ if __name__ == '__main__':
    if SPLIT == 'Y':
        print('The train set will be 70%, val set 15% and test set 15%')
        path = os.getcwd()
        # Get the current path to find the split script
        # Get the path of the current script
        path = os.path.abspath(os.path.dirname(__file__))
        script = os.path.join(path, 'get_train_val.py')
        data_path = os.path.join(path, args.directory, 'labels')
        directory_path = os.path.join(path, args.directory)
        # Create the directory if it does not exist
        utils.create_directory(directory_path)
        try:
...
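utils.create_directory is called in the hunk above but is not part of this diff. A minimal sketch of the behaviour the call appears to rely on (hypothetical, assuming it simply wraps os.makedirs):

import os

def create_directory(path):
    # Hypothetical stand-in for utils.create_directory: create the folder
    # (and any parents) if it does not already exist.
    os.makedirs(path, exist_ok=True)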
@@ -67,17 +67,22 @@ def prepare_data(arguments):
    """
    Prepare the annotations before they are split
    :param args (args): Arguments
    :return detection (DataFrame): DataFrame with all the annotations to split
    :return annotations (DataFrame): DataFrame with all the annotations to split
    """
    detections = pd.concat({f: pd.read_csv(os.path.join(arguments.path_to_data, f), sep=' ',
                                           names=['species', 'x', 'y', 'w', 'h'])
                            for f in tqdm(os.listdir(arguments.path_to_data),
                                          desc="Processing", ascii='░▒▓█')},
                           names=['file'])
    annotations = []
    background = []
    for f in tqdm(os.listdir(arguments.path_to_data), desc="Processing",
                  ascii='░▒▓█'):
        file_path = os.path.join(arguments.path_to_data, f)
        # Empty label files mark background images; read_csv cannot parse them
        if os.path.getsize(file_path) == 0:
            background.append(f)
            continue
        file_annotation = pd.read_csv(file_path, sep=' ',
                                      names=['species', 'x', 'y', 'w', 'h'])
        if len(file_annotation) == 0:
            background.append(f)
        else:
            file_annotation['file'] = f
            annotations.extend(file_annotation.to_dict(orient='records'))
    detections = detections.reset_index()
    detections.species = detections.species.astype(float)
    return detections
    annotations = pd.DataFrame(annotations)
    annotations.species = annotations.species.astype(float)
    return annotations, background
if __name__ == '__main__':
@@ -96,7 +101,7 @@ if __name__ == '__main__':
                        ' for test and same for validation', default=None)
    args = parser.parse_args()
    df = prepare_data(args)
    df, background = prepare_data(args)
    train, val = utils.split(df, 'train', args.ratio)
    saved_directory = os.path.join(args.directory, 'set')
@@ -126,7 +131,13 @@ if __name__ == '__main__':
    command = f'python {yolo_path} --data {data_path} --imgsz 640 --epochs 100 --weights {weights_path} --hyp {hyp_path} --cache'
    print(command, '\n')
    if len(background) == 0:
        print('\u26A0\uFE0F Be aware that it is recommended to have background images that',
              'represents 10% of your dataset. To do so, please use the script "get_spectrogram.py"',
              'represent 10% of your dataset. If you do not have background, use the script "get_spectrogram.py"',
              'with the --background argument. Compute on recordings that contain multiple types of noise...')
    else:
        utils.split_background(background, args)
        print(f'Your dataset contains {len(background)} background images. It represents',
              f'{(len(background)/len(df))*100} % of your dataset. It is recommended to reach around',
              ' 10% for a good model.')
\ No newline at end of file
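For reference, the recommendation printed above can be turned into a rough count of the background images still missing. A sketch, not part of this commit, assuming df and background are the two values returned by prepare_data and that each annotated image has exactly one label file:

# Rough estimate of how many background images would bring their share to ~10%
n_annotated = df['file'].nunique()
n_background = len(background)
target = 0.10
needed = max(0, round(target / (1 - target) * n_annotated) - n_background)
print(f'{n_background} background images; about {needed} more would reach {target:.0%}')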
@@ -9,6 +9,7 @@ import json
from datetime import date
from pathlib import Path
import librosa
import random
import pandas as pd
import numpy as np
@@ -209,6 +210,41 @@ def split(df, method, ratio=0.7):
    return major_df, minor_df
def split_background(file_list, arguments):
    """
    Randomly split the background images and save them into the different sets.
    :param file_list (list): List with all the filenames of the background images.
    :param arguments (args): Arguments.
    """
    file_list = ['.'.join(x.split('.')[:-1]) for x in file_list]
    random.shuffle(file_list)
    total = len(file_list)
    if arguments.test:
        r = 0
        t = total // 3
        for s in ['train', 'test', 'val']:
            source_txt = arguments.path_to_data
            source_img = os.path.join(arguments.path_to_data, '../images/')
            directory_txt = os.path.join(arguments.directory, f'labels/{s}')
            directory_img = os.path.join(arguments.directory, f'images/{s}')
            copy_files_to_directory(file_list[r:t], source_txt, directory_txt, 'txt')
            copy_files_to_directory(file_list[r:t], source_img, directory_img, 'jpg')
            r = t
            t += t
    else:
        r = 0
        t = total // 2
        for s in ['train', 'val']:
            source_txt = arguments.path_to_data
            source_img = os.path.join(arguments.path_to_data, '../images/')
            directory_txt = os.path.join(arguments.directory, f'labels/{s}')
            directory_img = os.path.join(arguments.directory, f'images/{s}')
            copy_files_to_directory(file_list[r:t], source_txt, directory_txt, 'txt')
            copy_files_to_directory(file_list[r:t], source_img, directory_img, 'jpg')
            r = t
            t += t
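copy_files_to_directory is called by split_background but defined elsewhere in utils.py. A minimal sketch of the behaviour those calls appear to rely on (hypothetical, not the repository's implementation): each base name gets the given extension back and is copied from the source folder into the destination set folder.

import os
import shutil

def copy_files_to_directory(names, source, destination, extension):
    # Hypothetical stand-in for the helper used above: copy <name>.<extension>
    # from source into destination, creating the destination folder if needed.
    os.makedirs(destination, exist_ok=True)
    for name in names:
        file_name = f'{name}.{extension}'
        shutil.copy(os.path.join(source, file_name),
                    os.path.join(destination, file_name))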
def open_file(path):
    """
    Open a file from a path without knowing whether the suffix is .pkl or .csv
@@ -240,7 +276,7 @@ def open_file(path):
            print("Wav files can't be loaded...")
            return pd.DataFrame()
    else:
        print('Collect all files on a folder...')
        print('Collect all files in the folder...')
        df = pd.DataFrame(glob.glob(os.path.join(path, '*'),
                          recursive=True), columns=['Path'])
    return df
...