Skip to content
Snippets Groups Projects
Commit 010622d8 authored by Stephane Chavin
Browse files

improve

parent 0649cbcd
Branches
No related tags found
No related merge requests found
......@@ -12,21 +12,60 @@ def arg_directory(path):
else:
raise argparse.ArgumentTypeError(f'`{path}` is not a valid path')
def create_directory_if_not_exists(directory):
    """Create ``directory`` if it is not already present.

    Missing parent directories are created as well, and an already existing
    directory is left untouched, so the call is safe to repeat.

    Args:
        directory: Path of the directory to create.

    Raises:
        FileExistsError: If ``directory`` exists but is not a directory.
    """
    # makedirs(exist_ok=True) replaces the exists()/mkdir() pair: it has no
    # window between the check and the creation, and it also builds parents.
    os.makedirs(directory, exist_ok=True)
def copy_files_to_directory(file_list, source_dir, destination_dir, extension='.txt'):
    """Copy ``<name><extension>`` from one directory to another for each name.

    Args:
        file_list: Iterable of file names without their extension.
        source_dir: Directory the files are read from.
        destination_dir: Existing directory the files are copied into.
        extension: Suffix appended to every name. Defaults to ``'.txt'``,
            which is what the function always used before this parameter
            existed; pass another suffix to copy non-label files.

    Raises:
        FileNotFoundError: If a source file or the destination directory
            does not exist.
    """
    for file_name in file_list:
        base_name = f'{file_name}{extension}'
        # copy2 keeps the file metadata (timestamps) along with the content.
        shutil.copy2(os.path.join(source_dir, base_name),
                     os.path.join(destination_dir, base_name))
def _take_file(df, file_name, counts):
    """Detach every row of ``file_name`` from ``df`` and tally its species into ``counts``.

    Returns the detached rows and the remaining rows; ``counts`` is updated
    in place.
    """
    mask = df.file == file_name
    taken = df[mask]
    tally = taken.espece.value_counts()
    # Index-aligned update: every species present in the file is counted,
    # not only the one that triggered the pick.
    counts.loc[tally.index] += tally
    return taken, df[~mask]


def split(df, ratio):
    """Split annotations into train and test sets, file by file.

    All rows sharing the same ``file`` value always end up in the same set.
    Each species is first given one randomly drawn file in the train set and
    one in the test set (as long as files containing it remain). The rest is
    then distributed by repeatedly taking the species with the fewest
    remaining rows, drawing one of its files at random, and sending it to the
    test set when that species' train share already exceeds ``ratio``, to the
    train set otherwise.

    Args:
        df: DataFrame with at least the columns ``file`` and ``espece``.
        ratio: Target share of rows per species assigned to the train set.

    Returns:
        A ``(train_df, test_df)`` tuple of DataFrames with the columns of
        ``df``. Both are empty when ``df`` is empty.
    """
    classes = df.espece.unique()
    train_count = pd.Series(0, index=classes)
    test_count = pd.Series(0, index=classes)
    train_parts = []
    test_parts = []

    # Seeding pass: try to put every species in both sets at least once.
    for c in classes:
        for counts, parts in ((train_count, train_parts), (test_count, test_parts)):
            if counts.loc[c] > 0:
                continue
            candidates = df.loc[df.espece == c, 'file']
            if candidates.empty:
                # Every file holding this species was already assigned.
                continue
            taken, df = _take_file(df, candidates.sample(1).iloc[0], counts)
            parts.append(taken)

    # Balancing pass: serve the rarest remaining species first.
    while len(df):
        min_esp = df.groupby('espece').size().idxmin()
        f = df.loc[df.espece == min_esp, 'file'].sample(1).iloc[0]
        seen = train_count.loc[min_esp] + test_count.loc[min_esp]
        train_share = train_count.loc[min_esp] / seen if seen else 0.0
        if train_share > ratio:
            taken, df = _take_file(df, f, test_count)
            test_parts.append(taken)
        else:
            taken, df = _take_file(df, f, train_count)
            train_parts.append(taken)

    print('ratio', train_count / (test_count + train_count))
    # ``df`` is empty at this point and still carries the original columns,
    # so it doubles as the result when a set received no file at all.
    train_df = pd.concat(train_parts) if train_parts else df.copy()
    test_df = pd.concat(test_parts) if test_parts else df.copy()
    return train_df, test_df
def process_data(args):
path = args.path_to_data
direction = args.direction
NB_CLASS = 2
directory = args.directory
df = pd.concat({f: pd.read_csv(os.path.join(path, f), sep=' ', names=['espece', 'x', 'y', 'w', 'h'])
for f in tqdm(os.listdir(path))}, names=['file'])
......@@ -38,53 +77,36 @@ def process_data(args):
df = df[df.espece != 'y']
df.espece = df.espece.astype(float)
tab = df.groupby('espece').count()
tab = tab.sort_values(tab.columns[0], ascending=False)[:NB_CLASS]
tab = tab.sort_values(tab.columns[0], ascending=False)
compte = pd.DataFrame(np.zeros((len(tab) + 1, 1)), columns=['nombre'])
nb_val = int(tab.file.min() * (1 - args.ratio))
esp_min = tab[tab.file == tab.file.min()].index[0]
val = df[df.file.isin(df[df.espece == esp_min].sample(int(nb_val)).file)]
df.drop(val.index, inplace=True)
for n in tab.iterrows():
if n[0] == esp_min:
continue
else:
nb_val = len(df[df.espece == n[0]]) * (1 - args.ratio)
new_row = df[df.file.isin(df[df.espece == n[0]].sample(int(nb_val)).file)]
val = pd.concat((val, new_row))
df = df.drop(new_row.index)
compte = val.groupby('espece').count()
return df
val = pd.DataFrame(val.groupby('file').count().index, columns=['file'])
train = pd.DataFrame(df.groupby('file').count().index, columns=['file'])
def export_split(val, train, path, directory):
    """Copy the split files into a train/val folder layout and write its YAML.

    Creates ``images/{train,val}`` and ``labels/{train,val}`` under
    ``directory``, copies the files of each set into them, and writes
    ``custom_data.yaml`` with the train/val image paths. The class count and
    class names are added only when ``../liste_especes.csv`` (relative to
    ``path``) can be read.

    Args:
        val: DataFrame whose ``file`` column lists the validation file names.
        train: DataFrame whose ``file`` column lists the training file names.
        path: Folder containing the label ``.txt`` files.
        directory: Output folder receiving the layout and the YAML file.
    """
    # Strip the extension without touching the caller's frames. ``n`` is
    # passed by keyword because recent pandas no longer accepts it
    # positionally. Duplicates are dropped so each file is copied once.
    val_stems = val.file.str.rsplit('.', n=1).str[0].drop_duplicates()
    train_stems = train.file.str.rsplit('.', n=1).str[0].drop_duplicates()

    # Parents are listed before their children.
    for sub_dir in ('images', 'images/train', 'images/val',
                    'labels', 'labels/train', 'labels/val'):
        create_directory_if_not_exists(os.path.join(directory, sub_dir))

    # NOTE(review): copy_files_to_directory appends '.txt' to every name, so
    # the image copies below look for '<stem>.txt' inside images/all --
    # confirm that this matches the actual image extension.
    image_dir = os.path.join(path, '../images/all')
    for subset, stems in (('val', val_stems), ('train', train_stems)):
        copy_files_to_directory(stems, path, os.path.join(directory, f'labels/{subset}'))
        copy_files_to_directory(stems, image_dir, os.path.join(directory, f'images/{subset}'))

    try:
        liste_espece = pd.read_csv(os.path.join(path, '../liste_especes.csv'))
    except (OSError, ValueError):
        # Missing, unreadable or unparsable list: still write the paths and
        # let the user complete the YAML by hand.
        liste_espece = None
        print('No species list detected, please add it to', os.path.join(directory, 'custom_data.yaml'))

    with open(os.path.join(directory, 'custom_data.yaml'), 'w') as f:
        f.write(f'train: {os.path.join(directory, "images/train")}\n')
        f.write(f'val: {os.path.join(directory, "images/val")}\n')
        if liste_espece is not None:
            f.write(f'nc: {len(liste_espece)}\n')
            f.write(f'names: {liste_espece.espece.tolist()}')
......@@ -92,7 +114,9 @@ if __name__ == '__main__':
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter, description='TODO')
parser.add_argument('-r', '--ratio', type=float, default=0.7, help='Train Ratio (val = 1 - ratio)')
parser.add_argument('-p', '--path_to_data', type=arg_directory, help='Path of the folder that contains the .txt (ending with labels/)', required=True)
parser.add_argument('-d', '--direction', type=arg_directory, help='Directory to which spectrogram and .txt files will be stored (different from -p)', required=True)
parser.add_argument('-d', '--directory', type=arg_directory, help='Directory to which spectrogram and .txt files will be stored (different from -p)', required=True)
args = parser.parse_args()
process_data(args)
df = process_data(args)
train, val = split(df, args.ratio)
export_split(val, train, args.path_to_data, args.directory)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment