Skip to content
Snippets Groups Projects

Develop

Merged Raphael Sturgis requested to merge develop into main
21 files
+ 1677
0
Compare changes
  • Side-by-side
  • Inline
Files
21
+ 217
0
import pickle
from datetime import datetime
import pandas as pd
import numpy as np
from numba import jit
from scipy.stats import stats
from skais.ais.ais_trajectory import AISTrajectory
def compute_trajectories(df, time_gap, min_size=50, size_limit=500, interpolation_time=None):
    """Cut a single-vessel dataframe into AISTrajectory segments.

    Repeatedly asks `compute_trajectory` how many leading samples belong to
    the next trajectory, wraps that prefix in an AISTrajectory, and keeps it
    only when it holds strictly more than `min_size` points.

    :param df: dataframe of AIS messages for one vessel, with a 'ts_sec' column
    :param time_gap: maximum time gap (minutes) tolerated inside one trajectory
    :param min_size: trajectories with at most this many rows are discarded
    :param size_limit: maximum number of samples per trajectory
    :param interpolation_time: forwarded to the AISTrajectory constructor
    :return: list of AISTrajectory objects
    """
    timestamps = df['ts_sec'].to_numpy()
    total = len(df.index)
    remaining = df.copy()
    trajectories = []
    position = 0
    while position < total:
        cut = compute_trajectory(timestamps[position:], time_gap, size_limit)
        segment = AISTrajectory(remaining[:cut], interpolation_time=interpolation_time)
        if len(segment.df.index) > min_size:
            trajectories.append(segment)
        remaining = remaining[cut:]
        position += cut
    return trajectories
@jit(nopython=True)
def compute_trajectory(times, time_gap, size_limit):
    """Return the number of leading samples of `times` forming one trajectory.

    Scans `times` (timestamps in seconds, assumed ascending — produced from
    'ts_sec' by the callers) and stops at the first inter-sample gap larger
    than `time_gap` minutes, at the end of the data, or once `size_limit`
    samples have been accepted, whichever comes first.
    """
    n_samples = len(times)
    previous_date = times[0]
    i = 0
    for i in range(size_limit):
        # Stop at the end of the data, or when the gap since the previous
        # sample exceeds time_gap (converted from seconds to minutes).
        if i >= n_samples or ((times[i] - previous_date) / 60 > time_gap):
            return i
        previous_date = times[i]
    # Loop exhausted: the whole size_limit-long window is one trajectory.
    # NOTE(review): if size_limit <= 0 the loop never runs and this returns
    # 1 via the `i = 0` initializer — confirm that degenerate case is intended.
    return i + 1
class AISPoints:
    """A collection of AIS position reports backed by a single pandas DataFrame.

    One row per AIS message. An 'mmsi' column identifies vessels; optional
    columns ('ts', 'sog', 'cog', 'heading', 'label', ...) enable the
    corresponding methods.
    """

    def __init__(self, df=None):
        """Wrap `df`, parsing and sorting on the 'ts' column when present.

        :param df: dataframe of AIS messages, or None for an empty holder.
        """
        # Fix: the original evaluated `'ts' in df` unconditionally, which
        # raises TypeError when df is None despite None being the default.
        if df is not None and 'ts' in df:
            df['ts'] = pd.to_datetime(df.ts)
            df = df.sort_values(['mmsi', 'ts'])
        self.df = df

    @staticmethod
    def load_from_csv(file_name):
        """Read a CSV file and wrap its content into an AISPoints instance."""
        df = pd.read_csv(file_name)
        return AISPoints(df)

    def remove_outliers(self, features, rank=4):
        """Drop rows whose absolute z-score on any of `features` exceeds `rank`.

        :param features: column names to screen, one after another.
        :param rank: maximum absolute z-score kept; must be strictly positive.
        :raises ValueError: if rank is not strictly positive.
        """
        if rank <= 0:
            raise ValueError(f"Rank is equal to {rank}, must be positive and superior to 0")
        for feature in features:
            # z-scores are recomputed after each drop, so later features are
            # screened against the already-filtered data.
            self.df = self.df.drop(self.df[(np.abs(stats.zscore(self.df[feature])) > rank)].index)

    def normalize(self, features, normalization_type="min-max"):
        """Rescale `features` in place and return the scaling parameters.

        :param features: column names to rescale.
        :param normalization_type: "min-max" (to [0, 1]) or "standardization"
            (zero mean, unit std).
        :return: tuple (normalization_type, dict of per-feature factors).
        :raises ValueError: on an unknown normalization_type.
        """
        normalization_dict = {}
        if normalization_type == "min-max":
            for f in features:
                minimum = self.df[f].min()
                maximum = self.df[f].max()
                diff = maximum - minimum
                if diff == 0:
                    # Fix: the original printf-style print("... %d", diff)
                    # printed the literal "%d" instead of the value.
                    print(f"Warning: diff = {diff}")
                    diff = 1  # avoid division by zero on constant columns
                self.df[f] = (self.df[f] - minimum) / diff
                normalization_dict[f"{f}_minimum"] = minimum
                normalization_dict[f"{f}_maximum"] = maximum
        elif normalization_type == "standardization":
            # (an unused local `normalisation_factors` was removed here)
            for f in features:
                mean = self.df[f].mean()
                std = self.df[f].std()
                if std == 0:
                    print(f"Warning: std = {std}")
                    std = 1  # avoid division by zero on constant columns
                self.df[f] = (self.df[f] - mean) / std
                normalization_dict[f"{f}_mean"] = mean
                normalization_dict[f"{f}_std"] = std
        else:
            raise ValueError(f"{normalization_type} not a valid normalization method. Must be one of [min-max, "
                             f"standardization]")
        return normalization_type, normalization_dict

    def histogram(self, features, bins=10, ranges=None, label=None, y_field='label'):
        """N-dimensional normalized histogram of `features`.

        :param label: when given, restrict to rows where y_field == label.
        :return: array of relative frequencies summing to 1; uniform when no
            sample falls inside the ranges.
        """
        if label is not None:
            tmp = self.df[self.df[y_field] == label]
        else:
            tmp = self.df
        h = np.histogramdd(tmp[features].to_numpy(), bins, ranges)[0]
        if h.sum() == 0:
            # No sample in range: fall back to a uniform distribution.
            return np.full(h.shape, 1 / h.size)
        return h / h.sum()

    def disjointed_histogram(self, features, bins, ranges, label=None, y_field='label'):
        """One independent 1-D histogram per feature.

        :param bins: one bin count per feature, or a single int used for all.
        :param ranges: one (min, max) range per feature.
        :return: list of (counts, bin_edges) tuples as returned by np.histogram.
        """
        if label is not None:
            tmp = self.df[self.df[y_field] == label]
        else:
            tmp = self.df
        if isinstance(bins, int):  # idiom fix: was `type(bins) == int`
            bins = [bins for _ in features]
        histograms = []
        # `n_bins` renamed from `bin`, which shadowed the builtin.
        for feature, n_bins, f_range in zip(features, bins, ranges):
            histograms.append(np.histogram(tmp[feature], n_bins, f_range))
        return histograms

    def compute_diff_heading_cog(self):
        """Add a 'diff' column: angular distance in [0, 180] between heading and cog."""
        # Vectorized form of the original row-wise apply; same formula:
        # 180 - abs(abs(heading - cog) - 180).
        self.df["diff"] = 180 - ((self.df['heading'] - self.df['cog']).abs() - 180).abs()

    def clean_angles(self):
        """Drop rows whose 'cog' or 'heading' lies outside [0, 360]."""
        valid = (self.df["cog"].between(0, 360)
                 & self.df["heading"].between(0, 360))
        self.df = self.df[valid]

    def histogram_joint_x_y(self, x_fields=None, x_nb_bins=10, x_range=None,
                            y_nb_bins=2, y_fields='label', y_range=None):
        """Joint normalized histogram of the x features together with the label.

        Fix: the list defaults were mutable default arguments; they are now
        materialized per call (same effective values).
        """
        x_fields = ["sog", "diff"] if x_fields is None else x_fields
        x_range = [[0, 30], [0, 180]] if x_range is None else x_range
        y_range = [0, 1] if y_range is None else y_range
        return self.histogram(x_fields + [y_fields],
                              bins=[x_nb_bins for _ in x_fields] + [y_nb_bins],
                              ranges=x_range + [y_range])

    def histogram_x_knowing_y(self, x_fields=None, x_nb_bins=10, x_range=None,
                              y_nb_bins=2, y_field='label'):
        """Conditional histograms of x for each label value, stacked on a new last axis."""
        x_fields = ["sog", "diff"] if x_fields is None else x_fields
        x_range = [[0, 30], [0, 180]] if x_range is None else x_range
        result = []
        for i in range(y_nb_bins):
            layer = self.histogram(x_fields, bins=x_nb_bins, ranges=x_range, label=i, y_field=y_field)
            result.append(layer)
        return np.stack(result, axis=len(x_fields))

    def disjointed_histogram_x_knowing_y(self, features, x_nb_bins=10, x_range=None,
                                         y_nb_bins=4, y_field='label'):
        """Per-feature count histograms conditioned on each label value.

        :return: list (one per feature) of (y_nb_bins, x_nb_bins) count arrays.
        """
        x_range = [[0, 1]] if x_range is None else x_range
        out = []
        for feature, f_range in zip(features, x_range):
            result = []
            for i in range(y_nb_bins):
                layer, _ = np.histogram(self.df[self.df[y_field] == i][feature].to_numpy(),
                                        bins=x_nb_bins, range=f_range)
                if layer.sum() == 0:
                    # Empty class: flat pseudo-counts keep later ratios finite.
                    layer = np.full(layer.shape, 1)
                result.append(layer)
            out.append(np.stack(result))
        return out

    def histogram_y_knowing_x(self, x_fields=None, x_nb_bins=10, x_range=None,
                              y_nb_bins=2, y_field='label', y_range=None):
        """Conditional distribution of y given x over the joint histogram bins."""
        x_fields = ["sog", "diff"] if x_fields is None else x_fields
        x_range = [[0, 30], [0, 180]] if x_range is None else x_range
        y_range = [0, 1] if y_range is None else y_range
        h_joint = self.histogram_joint_x_y(x_fields, x_nb_bins, x_range, y_nb_bins, y_field, y_range)
        y_hist = self.histogram(features=y_field, bins=y_nb_bins, ranges=[y_range])
        result = np.zeros(h_joint.shape)
        for idx, x in np.ndenumerate(h_joint):
            marginal = h_joint[idx[:-1]].sum()
            if marginal == 0:
                # No x-evidence in this cell: fall back to the y prior.
                result[idx] = y_hist[idx[-1]]
            else:
                result[idx] = x / marginal
        return result

    def get_trajectories(self, time_gap=30, min_size=50, interpolation_time=None):
        """Cut the points into per-vessel trajectories split on time gaps.

        :param time_gap: maximum gap (minutes) tolerated inside one trajectory.
        :param min_size: trajectories with at most this many rows are dropped.
        :raises ValueError: if the dataframe has no 'ts' column.
        """
        if 'ts' not in self.df:
            # Fix: the original raised a bare ValueError with no message.
            raise ValueError("dataframe must contain a 'ts' column")
        self.df['ts'] = pd.to_datetime(self.df['ts'], infer_datetime_format=True)
        self.df['ts_sec'] = self.df['ts'].apply(lambda x: datetime.timestamp(x))
        dat = self.df
        trajectories = []
        for mmsi in dat.mmsi.unique():
            trajectories += compute_trajectories(dat[dat['mmsi'] == mmsi], time_gap, min_size=min_size,
                                                 interpolation_time=interpolation_time)
        return trajectories

    def describe(self):
        """Summary dict: vessel/point counts, mean sog and diff, per-label counts."""
        # Local renamed from `stats`, which shadowed the scipy.stats import.
        summary = {"nb vessels": len(self.df.mmsi.unique()),
                   "nb points": len(self.df.index),
                   "average speed": self.df['sog'].mean(),
                   "average diff": self.df['diff'].mean()
                   }
        for n in np.sort(self.df['label'].unique()):
            summary[f"labeled {n}"] = len(self.df[self.df['label'] == n].index)
        return summary

    @staticmethod
    def fuse(*args):
        """Merge several AISPoints (or one list of them) into a single AISPoints.

        A single non-list argument is returned unchanged (not copied).
        """
        if len(args) == 1:
            if not isinstance(args[0], list):
                return args[0]
            table = args[0]
        else:
            table = args
        dfs = [ais_points.df for ais_points in table]
        # NOTE(review): `.reindex()` with no arguments is a no-op copy;
        # `reset_index(drop=True)` was possibly intended — confirm with callers
        # before changing.
        return AISPoints(pd.concat(dfs).reindex())
Loading