Skip to content
Snippets Groups Projects

Develop

Merged Raphael Sturgis requested to merge develop into main
21 files
+ 1677
0
Compare changes
  • Side-by-side
  • Inline
Files
21
+ 217
0
 
import pickle
 
from datetime import datetime
 
 
import pandas as pd
 
import numpy as np
 
from numba import jit
 
from scipy.stats import stats
 
 
from skais.ais.ais_trajectory import AISTrajectory
 
 
 
def compute_trajectories(df, time_gap, min_size=50, size_limit=500, interpolation_time=None):
    """Cut a single-vessel dataframe into a list of AISTrajectory objects.

    The dataframe is scanned front to back; `compute_trajectory` decides where
    each trajectory ends (first time gap larger than `time_gap` minutes, or
    `size_limit` points). Trajectories with `min_size` points or fewer are
    discarded.

    :param df: dataframe of AIS messages for one vessel, with a 'ts_sec'
        column of timestamps in seconds, oldest first.
    :param time_gap: maximum gap between consecutive points, in minutes.
    :param min_size: trajectories must have strictly more rows than this.
    :param size_limit: hard cap on the number of points per trajectory.
    :param interpolation_time: forwarded to the AISTrajectory constructor.
    :returns: list of AISTrajectory instances.
    """
    total = len(df.index)
    trajectories = []
    remaining = df.copy()

    offset = 0
    while offset < total:
        # Length of the next trajectory, counted from the current offset.
        cut = compute_trajectory(df['ts_sec'][offset:].to_numpy(), time_gap, size_limit)
        candidate = AISTrajectory(remaining[:cut], interpolation_time=interpolation_time)
        if len(candidate.df.index) > min_size:
            trajectories.append(candidate)
        # Drop the consumed rows and advance; `cut` is always >= 1 so the
        # loop makes progress.
        remaining = remaining[cut:]
        offset += cut

    return trajectories
 
 
 
@jit(nopython=True)
def compute_trajectory(times, time_gap, size_limit):
    """Count how many leading samples of `times` form one trajectory.

    Scans at most `size_limit` timestamps and stops at the first pair of
    consecutive points separated by more than `time_gap` minutes.

    :param times: 1-D array of timestamps in seconds, oldest first.
    :param time_gap: maximum allowed gap between consecutive points (minutes).
    :param size_limit: hard cap on the trajectory length.
    :returns: number of samples in the leading trajectory; for a non-negative
        time_gap this is at least 1, so callers slicing by the result always
        make progress.
    """
    n_samples = len(times)

    previous_ts = times[0]

    idx = 0
    for idx in range(size_limit):
        # The bounds test must come first: short-circuiting keeps
        # times[idx] from being read past the end of the array.
        if idx >= n_samples or (times[idx] - previous_ts) / 60 > time_gap:
            return idx
        previous_ts = times[idx]

    # Loop exhausted without finding a gap: the whole window (size_limit
    # points) belongs to this trajectory.
    return idx + 1
 
 
 
class AISPoints:
    """Container for a dataframe of AIS position reports.

    The dataframe is expected to hold at least an ``mmsi`` column; most
    methods additionally rely on columns such as ``ts``, ``sog``, ``cog``,
    ``heading``, ``diff`` and ``label``.
    """

    def __init__(self, df=None):
        """Store the dataframe, parsing and sorting timestamps when present.

        :param df: pandas DataFrame of AIS messages, or None. When a 'ts'
            column exists it is converted to datetimes and the rows are
            sorted by (mmsi, ts); the input frame is modified in place.
        """
        # Guard against the default: the original evaluated `'ts' in df`
        # unconditionally, which raised TypeError when df was None.
        if df is not None and 'ts' in df:
            df['ts'] = pd.to_datetime(df.ts)
            df = df.sort_values(['mmsi', 'ts'])

        self.df = df

    @staticmethod
    def load_from_csv(file_name):
        """Build an AISPoints instance from a CSV file.

        :param file_name: path of the CSV file to read.
        :returns: a new AISPoints wrapping the loaded dataframe.
        """
        return AISPoints(pd.read_csv(file_name))

    def remove_outliers(self, features, rank=4):
        """Drop rows whose z-score on any given feature exceeds `rank`.

        :param features: iterable of column names to filter on.
        :param rank: strictly positive z-score threshold (default 4).
        :raises ValueError: if rank is not strictly positive.
        """
        if rank <= 0:
            raise ValueError(f"Rank is equal to {rank}, must be positive and superior to 0")
        for feature in features:
            # z-scores are recomputed after each drop, so filtering on one
            # feature affects the statistics of the next.
            outliers = self.df[np.abs(stats.zscore(self.df[feature])) > rank].index
            self.df = self.df.drop(outliers)

    def normalize(self, features, normalization_type="min-max"):
        """Normalize the given feature columns in place.

        :param features: iterable of column names to normalize.
        :param normalization_type: "min-max" or "standardization".
        :returns: (normalization_type, dict of per-feature parameters), e.g.
            {"sog_minimum": ..., "sog_maximum": ...} for min-max.
        :raises ValueError: on an unknown normalization_type.
        """
        normalization_dict = {}
        if normalization_type == "min-max":
            for f in features:
                minimum = self.df[f].min()
                maximum = self.df[f].max()
                diff = maximum - minimum
                if diff == 0:
                    # Constant column: warn and avoid division by zero.
                    print("Warning: diff = %d" % diff)
                    diff = 1
                self.df[f] = (self.df[f] - minimum) / diff
                normalization_dict[f"{f}_minimum"] = minimum
                normalization_dict[f"{f}_maximum"] = maximum

        elif normalization_type == "standardization":
            for f in features:
                mean = self.df[f].mean()
                std = self.df[f].std()
                if std == 0:
                    # Constant column: warn and avoid division by zero.
                    print("Warning: std = %d" % std)
                    std = 1
                self.df[f] = (self.df[f] - mean) / std
                normalization_dict[f"{f}_mean"] = mean
                normalization_dict[f"{f}_std"] = std

        else:
            raise ValueError(f"{normalization_type} not a valid normalization method. Must be one of [min-max, "
                             f"standardization]")
        return normalization_type, normalization_dict

    def histogram(self, features, bins=10, ranges=None, label=None, y_field='label'):
        """Normalized joint histogram of `features`.

        :param features: column name(s) passed to np.histogramdd.
        :param bins: bin specification forwarded to np.histogramdd.
        :param ranges: range specification forwarded to np.histogramdd.
        :param label: when not None, only rows with df[y_field] == label.
        :param y_field: column holding the label (default 'label').
        :returns: array summing to 1; a uniform distribution when the
            selection is empty, so the result is always a valid pmf.
        """
        if label is not None:
            tmp = self.df[self.df[y_field] == label]
        else:
            tmp = self.df
        h = np.histogramdd(tmp[features].to_numpy(), bins, ranges)[0]
        if h.sum() == 0:
            # No samples fell inside the given ranges: fall back to uniform.
            return np.full(h.shape, 1 / h.size)
        return h / h.sum()

    def disjointed_histogram(self, features, bins, ranges, label=None, y_field='label'):
        """One independent 1-D histogram per feature.

        :param features: iterable of column names.
        :param bins: int (shared by all features) or per-feature sequence.
        :param ranges: per-feature (min, max) range sequence.
        :param label: when not None, only rows with df[y_field] == label.
        :param y_field: column holding the label (default 'label').
        :returns: list of (counts, bin_edges) tuples from np.histogram.
        """
        if label is not None:
            tmp = self.df[self.df[y_field] == label]
        else:
            tmp = self.df

        if isinstance(bins, int):
            bins = [bins for _ in features]

        return [np.histogram(tmp[feature], n_bins, f_range)
                for feature, n_bins, f_range in zip(features, bins, ranges)]

    def compute_diff_heading_cog(self):
        """Add a 'diff' column: absolute angular difference (0..180 degrees)
        between 'heading' and 'cog', wrapping around 360."""
        self.df["diff"] = self.df.apply(lambda x: 180 - abs(abs(x['heading'] - x['cog']) - 180),
                                        axis=1)

    def clean_angles(self):
        """Keep only rows whose 'cog' and 'heading' lie in [0, 360]."""
        self.df = self.df[self.df["cog"] <= 360]
        self.df = self.df[self.df["cog"] >= 0]

        self.df = self.df[self.df["heading"] <= 360]
        self.df = self.df[self.df["heading"] >= 0]

    def histogram_joint_x_y(self, x_fields=["sog", "diff"], x_nb_bins=10, x_range=[[0, 30], [0, 180]]
                            , y_nb_bins=2, y_fields='label', y_range=[0, 1]):
        """Joint normalized histogram of the x features and the label field.

        NOTE: the list defaults are shared across calls but never mutated
        here; kept as-is for interface compatibility.
        """
        return self.histogram(x_fields + [y_fields],
                              bins=[x_nb_bins for i in x_fields] + [y_nb_bins],
                              ranges=x_range + [y_range])

    def histogram_x_knowing_y(self, x_fields=["sog", "diff"], x_nb_bins=10, x_range=[[0, 30], [0, 180]]
                              , y_nb_bins=2, y_field='label'):
        """Stack of per-label histograms P(x | y), label axis last."""
        result = []
        for i in range(y_nb_bins):
            layer = self.histogram(x_fields, bins=x_nb_bins, ranges=x_range, label=i, y_field=y_field)
            result.append(layer)
        return np.stack(result, axis=len(x_fields))

    def disjointed_histogram_x_knowing_y(self, features, x_nb_bins=10, x_range=[[0, 1]]
                                         , y_nb_bins=4, y_field='label'):
        """Per-feature, per-label raw histograms.

        :returns: list (one entry per feature) of arrays of shape
            (y_nb_bins, x_nb_bins); empty label slices are filled with ones
            so later normalization cannot divide by zero.
        """
        out = []
        for feature, f_range in zip(features, x_range):
            result = []
            for i in range(y_nb_bins):
                layer, _ = np.histogram(self.df[self.df[y_field] == i][feature].to_numpy(), bins=x_nb_bins,
                                        range=f_range)
                if layer.sum() == 0:
                    layer = np.full(layer.shape, 1)
                result.append(layer)
            out.append(np.stack(result))
        return out

    def histogram_y_knowing_x(self, x_fields=["sog", "diff"], x_nb_bins=10, x_range=[[0, 30], [0, 180]]
                              , y_nb_bins=2, y_field='label', y_range=[0, 1]):
        """Conditional distribution P(y | x) from the joint histogram.

        Cells whose x-slice is empty fall back to the marginal P(y).
        """
        h_joint = self.histogram_joint_x_y(x_fields, x_nb_bins, x_range, y_nb_bins, y_field, y_range)
        y_hist = self.histogram(features=y_field, bins=y_nb_bins, ranges=[y_range])

        result = np.zeros(h_joint.shape)

        for idx, x in np.ndenumerate(h_joint):
            if h_joint[idx[:-1]].sum() == 0:
                # Empty x-cell: use the label marginal instead of 0/0.
                result[idx] = y_hist[idx[-1]]
            else:
                result[idx] = x / h_joint[idx[:-1]].sum()
        return result

    def get_trajectories(self, time_gap=30, min_size=50, interpolation_time=None):
        """Split the points into per-vessel trajectories.

        :param time_gap: maximum gap between consecutive points, in minutes.
        :param min_size: trajectories must have strictly more rows than this.
        :param interpolation_time: forwarded to AISTrajectory.
        :returns: list of AISTrajectory instances over all vessels.
        :raises ValueError: when the dataframe has no 'ts' column.
        """
        if 'ts' not in self.df:
            raise ValueError("dataframe has no 'ts' column")

        # infer_datetime_format is deprecated (and a no-op) in pandas >= 2.0,
        # so it is no longer passed.
        self.df['ts'] = pd.to_datetime(self.df['ts'])
        self.df['ts_sec'] = self.df['ts'].apply(lambda x: datetime.timestamp(x))

        dat = self.df

        trajectories = []
        for mmsi in dat.mmsi.unique():
            trajectories += compute_trajectories(dat[dat['mmsi'] == mmsi], time_gap, min_size=min_size,
                                                 interpolation_time=interpolation_time)

        return trajectories

    def describe(self):
        """Return a dict of summary statistics for the dataset."""
        # Local renamed from `stats` to avoid shadowing the module-level
        # scipy stats import.
        summary = {"nb vessels": len(self.df.mmsi.unique()),
                   "nb points": len(self.df.index),
                   "average speed": self.df['sog'].mean(),
                   "average diff": self.df['diff'].mean()
                   }

        for n in np.sort(self.df['label'].unique()):
            summary[f"labeled {n}"] = len(self.df[self.df['label'] == n].index)

        return summary

    @staticmethod
    def fuse(*args):
        """Merge several AISPoints (or one list of them) into a single one.

        A single non-list argument is returned unchanged.
        """
        if len(args) == 1:
            if not isinstance(args[0], list):
                return args[0]
            table = args[0]
        else:
            table = args

        # NOTE(review): .reindex() without arguments is effectively a no-op
        # copy; reset_index(drop=True) may have been intended — confirm with
        # callers before changing.
        return AISPoints(pd.concat([points.df for points in table]).reindex())
Loading