Commit 74a8dd8d authored by Stephane Chavin

correct split background

parent c1ab7237
@@ -14,7 +14,7 @@ from tqdm import tqdm
import pandas as pd
def process(entry, arguments, species_list):
def main(entry, arguments, species_list):
    """
    Process the annotation to get the .jpg spectrogram and the .txt annotation file
    :param x (tuple): Enumerate number, [filename, group] per file
@@ -213,11 +213,11 @@ if __name__ == '__main__':
    if args.cpu == 1:
        for i in tqdm(enumerate(df.groupby('Path')), total=len(df.groupby('Path')),
                      desc="Processing", ascii='░▒▓█'):
            process(i, args, species)
            main(i, args, species)
    else:
        args = [args for i in range(len(df.groupby('Path')))]
        species = [species for i in range(len(df.groupby('Path')))]
        p_map(process, enumerate(df.groupby('Path')), args,
        p_map(main, enumerate(df.groupby('Path')), args,
              species, num_cpus=args[0].cpu, total=len(df.groupby('Path')))
        args = args[0]
    print('saved to', args.directory)
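The parallel branch repeats args and species once per file group so that p_tqdm can zip them with the enumerated groups. A sequential sketch of what that call amounts to (not part of the commit; names taken from the lines above):

# Sequential equivalent of the p_map call: p_map zips its iterables like the
# built-in map, but runs main in parallel worker processes with a progress bar.
for item, arg, sp in zip(enumerate(df.groupby('Path')), args, species):
    main(item, arg, sp)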
@@ -231,11 +231,13 @@ if __name__ == '__main__':
    if SPLIT == 'Y':
        print('The train set will be 70%, val set 15% and test set 15%')
        path = os.getcwd()
        # Get the current path to find the split script
        # Get the path of the current script
        path = os.path.abspath(os.path.dirname(__file__))
        script = os.path.join(path, 'get_train_val.py')
        data_path = os.path.join(path, args.directory, 'labels')
        directory_path = os.path.join(path, args.directory)
        # Create the directory if it does not exist
        utils.create_directory(directory_path)
        try:
...
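utils.create_directory is called in the hunk above but is not part of this diff. A minimal sketch of the behaviour the call appears to rely on (hypothetical, assuming it simply wraps os.makedirs):

import os

def create_directory(path):
    # Hypothetical stand-in for utils.create_directory: create the folder
    # (and any parents) if it does not already exist.
    os.makedirs(path, exist_ok=True)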
@@ -67,17 +67,22 @@ def prepare_data(arguments):
    """
    Prepare the annotations before they are split
    :param args (args): Arguments
    :return detection (DataFrame): DataFrame with all the annotations to split
    :return annotations (DataFrame): DataFrame with all the annotations to split
    """
    detections = pd.concat({f: pd.read_csv(os.path.join(arguments.path_to_data, f), sep=' ',
                                           names=['species', 'x', 'y', 'w', 'h'])
                            for f in tqdm(os.listdir(arguments.path_to_data),
                                          desc="Processing", ascii='░▒▓█')},
                           names=['file'])
    annotations = []
    background = []
    for f in tqdm(os.listdir(arguments.path_to_data), desc="Processing",
                  ascii='░▒▓█'):
        file_path = os.path.join(arguments.path_to_data, f)
        # Empty label files mark background images; read_csv cannot parse them
        if os.path.getsize(file_path) == 0:
            background.append(f)
            continue
        file_annotation = pd.read_csv(file_path, sep=' ',
                                      names=['species', 'x', 'y', 'w', 'h'])
        if len(file_annotation) == 0:
            background.append(f)
        else:
            file_annotation['file'] = f
            annotations.extend(file_annotation.to_dict(orient='records'))
    detections = detections.reset_index()
    detections.species = detections.species.astype(float)
    return detections
    annotations = pd.DataFrame(annotations)
    annotations.species = annotations.species.astype(float)
    return annotations, background
if __name__ == '__main__':
@@ -96,7 +101,7 @@ if __name__ == '__main__':
                        ' for test and same for validation', default=None)
    args = parser.parse_args()
    df = prepare_data(args)
    df, background = prepare_data(args)
    train, val = utils.split(df, 'train', args.ratio)
    saved_directory = os.path.join(args.directory, 'set')
@@ -126,7 +131,13 @@ if __name__ == '__main__':
    command = f'python {yolo_path} --data {data_path} --imgsz 640 --epochs 100 --weights {weights_path} --hyp {hyp_path} --cache'
    print(command, '\n')
    if len(background) == 0:
        print('\u26A0\uFE0F Be aware that it is recommended to have background images that',
              'represents 10% of your dataset. To do so, please use the script "get_spectrogram.py"',
              'represent 10% of your dataset. If you do not have background, use the script "get_spectrogram.py"',
              'with the --background argument. Compute on recordings that contain multiple types of noise...')
    else:
        utils.split_background(background, args)
        print(f'Your dataset contains {len(background)} background images. It represents',
              f'{(len(background)/len(df))*100} % of your dataset. It is recommended to reach around',
              ' 10% for a good model.')
\ No newline at end of file
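For reference, the recommendation printed above can be turned into a rough count of the background images still missing. A sketch, not part of this commit, assuming df and background are the two values returned by prepare_data and that each annotated image has exactly one label file:

# Rough estimate of how many background images would bring their share to ~10%
n_annotated = df['file'].nunique()
n_background = len(background)
target = 0.10
needed = max(0, round(target / (1 - target) * n_annotated) - n_background)
print(f'{n_background} background images; about {needed} more would reach {target:.0%}')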
@@ -9,6 +9,7 @@ import json
from datetime import date
from pathlib import Path
import librosa
import random
import pandas as pd
import numpy as np
@@ -209,6 +210,41 @@ def split(df, method, ratio=0.7):
    return major_df, minor_df
def split_background(file_list, arguments):
    """
    Randomly split the background images and save them into the different sets.
    :param file_list (list): List with all the filenames of the background images.
    :param arguments (args): Arguments.
    """
    file_list = ['.'.join(x.split('.')[:-1]) for x in file_list]
    random.shuffle(file_list)
    total = len(file_list)
    if arguments.test:
        r = 0
        t = total // 3
        for s in ['train', 'test', 'val']:
            source_txt = arguments.path_to_data
            source_img = os.path.join(arguments.path_to_data, '../images/')
            directory_txt = os.path.join(arguments.directory, f'labels/{s}')
            directory_img = os.path.join(arguments.directory, f'images/{s}')
            copy_files_to_directory(file_list[r:t], source_txt, directory_txt, 'txt')
            copy_files_to_directory(file_list[r:t], source_img, directory_img, 'jpg')
            r = t
            t += t
    else:
        r = 0
        t = total // 2
        for s in ['train', 'val']:
            source_txt = arguments.path_to_data
            source_img = os.path.join(arguments.path_to_data, '../images/')
            directory_txt = os.path.join(arguments.directory, f'labels/{s}')
            directory_img = os.path.join(arguments.directory, f'images/{s}')
            copy_files_to_directory(file_list[r:t], source_txt, directory_txt, 'txt')
            copy_files_to_directory(file_list[r:t], source_img, directory_img, 'jpg')
            r = t
            t += t
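copy_files_to_directory is called by split_background but defined elsewhere in utils.py. A minimal sketch of the behaviour those calls appear to rely on (hypothetical, not the repository's implementation): each base name gets the given extension back and is copied from the source folder into the destination set folder.

import os
import shutil

def copy_files_to_directory(names, source, destination, extension):
    # Hypothetical stand-in for the helper used above: copy <name>.<extension>
    # from source into destination, creating the destination folder if needed.
    os.makedirs(destination, exist_ok=True)
    for name in names:
        file_name = f'{name}.{extension}'
        shutil.copy(os.path.join(source, file_name),
                    os.path.join(destination, file_name))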
def open_file(path):
    """
    Open a file from a path without knowing whether the suffix is .pkl or .csv
@@ -240,7 +276,7 @@ def open_file(path):
            print("Wav files can't be loaded...")
            return pd.DataFrame()
    else:
        print('Collect all files on a folder...')
        print('Collect all files in the folder...')
        df = pd.DataFrame(glob.glob(os.path.join(path, '*'),
                          recursive=True), columns=['Path'])
    return df
...