From efef6ebc388a61e002945e8b8988b630bf5b659c Mon Sep 17 00:00:00 2001
From: Stephane Chavin <stephane.chavin@lis-lab.fr>
Date: Thu, 30 Jan 2025 16:47:43 +0100
Subject: [PATCH] correct split between annotations

---
 get_train_annot.py |  2 +-
 utils.py           | 32 +++++++++++++++++++++-----------
 2 files changed, 22 insertions(+), 12 deletions(-)

diff --git a/get_train_annot.py b/get_train_annot.py
index a0f4c26..92ee2b4 100755
--- a/get_train_annot.py
+++ b/get_train_annot.py
@@ -35,7 +35,7 @@ def main(entry, arguments, species_list):
     offset_list = np.arange(
         0, file_duration, arguments.duration - arguments.overlap)
     new_pos = utils.split_annotations(
-        grp[['start', 'stop']], arguments.duration)
+        grp, arguments.duration)
     grp = pd.merge(grp, new_pos)
     while len(grp) != 0:
         # collect all the data between the offset and duration-overlap
diff --git a/utils.py b/utils.py
index 104c4ca..4120728 100755
--- a/utils.py
+++ b/utils.py
@@ -490,23 +490,33 @@ def split_annotations(df, duration=8):
             current_chunk_end = (start // duration + 1) * duration
 
             if end > current_chunk_end:
-                if (current_chunk_end - start) >= (end - start) * 0.5:
-                    # Split the annotation
-                    splited_annotations.append(
-                        {'start': start, 'stop': current_chunk_end})
-                    start = current_chunk_end
+                # Check for the first part of the annotation
+                if (current_chunk_end - start) > (end - start) * 0.2 and (current_chunk_end - start) < (end - start) * 0.80:
+                    # Split the annotation into 2 new annotations
+                    new_row = row.copy()
+                    row['stop'] = current_chunk_end
+                    new_row['start'] = current_chunk_end
+                    splited_annotations.append(pd.DataFrame(row).T)
+                    splited_annotations.append(pd.DataFrame(new_row).T)
+                    break
+                elif (current_chunk_end - start) <= (end - start) * 0.2:
+                    # If the first segment is at most 20% of the annotation,
+                    # only keep the second part
+                    row['start'] = current_chunk_end
+                    splited_annotations.append(pd.DataFrame(row).T)
+                    break
                 else:
-                    # If the remaining segment is less than half of the annotation
-                    # only keep the longest part
-                    splited_annotations.append(
-                        {'start': current_chunk_end, 'stop': end})
+                    # If the first segment is at least 80% of the annotation,
+                    # only keep the first part
+                    row['stop'] = current_chunk_end                
+                    splited_annotations.append(pd.DataFrame(row).T)
                     break
             else:
                 # This annotation fits within the current chunk
-                splited_annotations.append({'start': start, 'stop': end})
+                splited_annotations.append(pd.DataFrame(row).T)
                 break
 
-    return pd.DataFrame(splited_annotations)
+    return pd.concat(splited_annotations)
 
 
 def get_box_shape(info, im):
-- 
GitLab