Commit 77a9bdf2 authored by Raphael Sturgis's avatar Raphael Sturgis
Browse files

Merge branch 'develop' into 'main'

version 0.2a

See merge request !12
parents e015b9c3 184ebc0e
......@@ -3,3 +3,6 @@ build/
skais.egg-info/
*.coverage
*__pycache__*
venv/
local.*
\ No newline at end of file
skais:0.1a
skais:0.2a
pandas~=1.1.5
setuptools~=57.0.0
numpy~=1.19.5
numba~=0.53.1
numba>0.53.1
scipy~=1.5.4
hmmlearn~=0.2.6
scikit-learn~=1.0.1
\ No newline at end of file
scikit-learn~=1.0.1
tqdm~=4.62.3
\ No newline at end of file
......@@ -3,38 +3,6 @@ import pandas as pd
from scipy.stats import stats
# def compute_trajectories(df, time_gap, min_size=50, size_limit=500, interpolation_time=None):
# n_sample = len(df.index)
# result = []
# work_df = df.copy()
#
# index = 0
# while index < n_sample:
# i = compute_trajectory(df['ts_sec'][index:].to_numpy(), time_gap, size_limit)
# trajectory = AISTrajectory(work_df[:i], interpolation_time=interpolation_time)
# if len(trajectory.df.index) > min_size:
# result.append(trajectory)
# work_df = work_df[i:]
# index += i
#
# return result
#
#
# @jit(nopython=True)
# def compute_trajectory(times, time_gap, size_limit):
# n_samples = len(times)
#
# previous_date = times[0]
#
# i = 0
# for i in range(size_limit):
# if i >= n_samples or ((times[i] - previous_date) / 60 > time_gap):
# return i
# previous_date = times[i]
#
# return i + 1
class AISPoints:
# Todo: Should be more elegant
......@@ -73,36 +41,91 @@ class AISPoints:
self.df = self.df[self.df["heading"] <= 360]
self.df = self.df[self.df["heading"] >= 0]
def normalize(self, features, normalization_type="min-max"):
normalization_dict = {}
if normalization_type == "min-max":
for f in features:
minimum = self.df[f].min()
maximum = self.df[f].max()
diff = (maximum - minimum)
if diff == 0:
print("Warning: diff = %d", diff)
diff = 1
self.df[f] = (self.df[f] - minimum) / diff
normalization_dict[f"{f}_minimum"] = minimum
normalization_dict[f"{f}_maximum"] = maximum
elif normalization_type == "standardization":
normalisation_factors = ("standardization", {})
for f in features:
mean = self.df[f].mean()
std = self.df[f].std()
if std == 0:
print("Warning: std = %d", std)
std = 1
self.df[f] = (self.df[f] - mean) / std
normalization_dict[f"{f}_mean"] = mean
normalization_dict[f"{f}_std"] = std
def normalize(self, min_max_features=(), standardization_features=(), third_quartile_features=(),
divide_by_value=(), divide_by_max=(), normalization_dict=None):
if normalization_dict is None:
normalization_dict = {}
for f in min_max_features:
if f in self.df.columns:
normalization_dict[f] = {'type': 'min-max'}
minimum = self.df[f].min()
maximum = self.df[f].max()
diff = (maximum - minimum)
if diff == 0:
print("Warning: diff = 0")
self.df[f] = (self.df[f] - minimum)
else:
self.df[f] = (self.df[f] - minimum) / diff
normalization_dict[f]["minimum"] = minimum
normalization_dict[f]["maximum"] = maximum
for f in standardization_features:
if f in self.df.columns:
normalization_dict[f] = {'type': 'standardization'}
mean = self.df[f].mean()
std = self.df[f].std()
if std == 0:
print("Warning: std = %d", std)
std = 1
self.df[f] = (self.df[f] - mean) / std
normalization_dict[f]["mean"] = mean
normalization_dict[f]["std"] = std
for f in third_quartile_features:
if f in self.df.columns:
normalization_dict[f] = {'type': '3rd quartile'}
third_quartile = self.df[f].quantile(0.75)
if third_quartile == 0:
print("Warning: third quartile = %d", third_quartile)
third_quartile = 1
self.df[f] = self.df[f] / third_quartile
normalization_dict[f]["value"] = third_quartile
for t in divide_by_value:
f = t[0]
value = t[1]
if f in self.df.columns:
if value != 0:
normalization_dict[f] = {'type': 'divide by value',
'value': value}
self.df[f] = self.df[f] / value
else:
print("Warning: dividing by 0")
for f in divide_by_max:
if f in self.df.columns:
maximum = self.df[f].max()
normalization_dict[f] = {'type': 'divide by max',
'maximum': maximum}
self.df[f] = self.df[f] / maximum
else:
raise ValueError(f"{normalization_type} not a valid normalization method. Must be on of [min-max, "
f"standardization]")
return normalization_type, normalization_dict
for f in normalization_dict:
if f in self.df.columns:
if normalization_dict[f]['type'] == 'min-max':
minimum = normalization_dict[f]["minimum"]
maximum = normalization_dict[f]["maximum"]
diff = (maximum - minimum)
if diff == 0:
print("Warning: diff = 0")
diff = 1
self.df[f] = (self.df[f] - minimum) / diff
elif normalization_dict[f]['type'] == "standardization":
mean = normalization_dict[f]["mean"]
std = normalization_dict[f]["std"]
if std == 0:
print("Warning: std = 0")
std = 1
self.df[f] = (self.df[f] - mean) / std
elif normalization_dict[f]['type'] == "3rd quartile":
third_quartile = normalization_dict[f]["value"]
self.df[f] = self.df[f] / third_quartile
elif normalization_dict[f]['type'] == "divide by value":
value = normalization_dict[f]["value"]
self.df[f] = self.df[f] / value
elif normalization_dict[f]['type'] == "divide by max":
maximum = normalization_dict[f]["maximum"]
self.df[f] = self.df[f] / maximum
else:
raise ValueError(
f"{normalization_dict[f]['type']} not a valid normalization method. Must be on of [min-max,"
f" standardization, 3rd quartile, divide by value]")
return normalization_dict
# New features
def compute_drift(self):
......
import random
import pandas as pd
import numpy as np
from numba import jit
from scipy.interpolate import interp1d
from skais.utils.geography import great_circle, position_from_distance, get_coord
from skais.ais.ais_points import AISPoints
from skais.utils.geometry import bresenham
@jit(nopython=True)
......@@ -36,6 +40,13 @@ def apply_func_on_window(dat, func, radius, on_edge='copy'):
data = dat[i - radius:i + radius + 1]
result[i - radius] = func(data)
return result
elif on_edge == 'ignore':
for i in range(0, dat.shape[0]):
lower_bound = max(0, i - radius)
upper_bound = min(dat.shape[0], i + radius + 1)
data = dat[lower_bound:upper_bound]
result[i] = func(data)
return result
else:
raise ValueError
......@@ -49,9 +60,10 @@ def apply_time_sequence(dat, time, func):
class AISTrajectory(AISPoints):
def __init__(self, df, interpolation_time=None):
def __init__(self, df, mmsi=0, interpolation_time=None):
df = df.drop_duplicates(subset=['ts_sec'])
df = df.sort_values(by=['ts_sec'])
self.mmsi = mmsi
if interpolation_time and len(df.index) > 4:
float_columns = ['longitude', 'latitude', 'cog', 'heading', 'rot', 'sog', 'diff']
......@@ -76,8 +88,8 @@ class AISTrajectory(AISPoints):
kind='nearest', axis=0)(t_interp1d).astype(int)
df = new_df
# self.df = df.dropna()
if 'sog' in df.columns:
df.loc[df['sog'] < 0, 'sog'] = 0
AISPoints.__init__(self, df)
def sliding_window(self, size=10, offset=1, fields=None):
......@@ -92,9 +104,9 @@ class AISTrajectory(AISPoints):
return result
def apply_func_on_time_window(self, func, radius, column, new_column=None):
def apply_func_on_time_window(self, func, radius, column, new_column=None, on_edge='copy'):
dat = self.df[column].to_numpy()
result = apply_func_on_window(dat, func, radius, on_edge='copy')
result = apply_func_on_window(dat, func, radius, on_edge)
if new_column is None:
self.df[column] = result
......@@ -153,3 +165,104 @@ class AISTrajectory(AISPoints):
index += i
return result
    def shift_trajectory_to_coordinates(self, target_coordinate=(0, 0), point_index=None, in_place=False):
        """Translate the whole trajectory so that one point lands on ``target_coordinate``.

        The point at ``point_index`` is pinned to the target (lat, lon); every other
        point is then re-placed by propagating the original inter-point distances
        outward from the pinned point (backwards to index 0, forwards to the end),
        preserving the trajectory's shape rather than applying a naive lat/lon offset.

        :param target_coordinate: (latitude, longitude) the chosen point is moved to
        :param point_index: index of the point to pin; a random index when None
        :param in_place: when True, mutate ``self.df`` and return ``self``;
                         otherwise return a new ``AISTrajectory`` (same mmsi)
        :return: ``self`` or a new ``AISTrajectory`` holding the shifted positions
        """
        if point_index is None:
            # No anchor requested: pin a random point of the trajectory.
            point_index = random.randint(0, len(self.df.index) - 1)

        df = self.df.copy()
        new_df = df.copy()
        # NOTE(review): chained `df[col].iat[i] = v` assignment may hit pandas'
        # SettingWithCopy behaviour — `.at`/`.iat` on the frame itself would be safer.
        new_df['latitude'].iat[point_index] = target_coordinate[0]
        new_df['longitude'].iat[point_index] = target_coordinate[1]

        # Walk backwards from the pinned point, re-placing each predecessor.
        new_point = target_coordinate
        for i in range(point_index, 0, -1):
            current_point = (df.iloc[i]['latitude'], df.iloc[i]['longitude'])
            # Distances along one axis at a time; great_circle appears to take
            # (lat1, lat2, long1, long2) — assumed from usage, confirm in skais.utils.geography.
            lat_dist = great_circle(current_point[0], df.iloc[i - 1]['latitude'], current_point[1], current_point[1])
            long_dist = great_circle(current_point[0], current_point[0], current_point[1], df.iloc[i - 1]['longitude'])
            # great_circle returns an unsigned distance: restore the direction sign.
            if current_point[0] > df.iloc[i - 1]['latitude']:
                lat_dist *= -1
            if current_point[1] > df.iloc[i - 1]['longitude']:
                long_dist *= -1
            new_point = position_from_distance(new_point, (lat_dist, long_dist))
            new_df['latitude'].iat[i - 1] = new_point[0]
            new_df['longitude'].iat[i - 1] = new_point[1]

        # Walk forwards from the pinned point, re-placing each successor.
        new_point = target_coordinate
        for i in range(point_index, len(df.index) - 1):
            current_point = (df.iloc[i]['latitude'], df.iloc[i]['longitude'])
            lat_dist = great_circle(current_point[0], df.iloc[i + 1]['latitude'], current_point[1], current_point[1])
            long_dist = great_circle(current_point[0], current_point[0], current_point[1], df.iloc[i + 1]['longitude'])
            if current_point[0] > df.iloc[i + 1]['latitude']:
                lat_dist *= -1
            if current_point[1] > df.iloc[i + 1]['longitude']:
                long_dist *= -1
            new_point = position_from_distance(new_point, (lat_dist, long_dist))
            new_df['latitude'].iat[i + 1] = new_point[0]
            new_df['longitude'].iat[i + 1] = new_point[1]

        if in_place:
            self.df = new_df
            return self
        else:
            return AISTrajectory(new_df, mmsi=self.mmsi)
def get_time_per_label_shift(self, label_column='label'):
current_label = -1
result = []
for index, row in self.df.iterrows():
if current_label != row[label_column]:
current_label = row[label_column]
result.append((row['ts_sec'], current_label))
return result
    def generate_array_from_positions(self, height=256, width=256, link=True, bounding_box='fit', features=None,
                                      node_size=0):
        """Rasterize the trajectory's positions into a (height, width, channels) uint8 image.

        Each position is drawn as a square of half-size ``node_size``; when ``link``
        is True, consecutive positions are connected with Bresenham line segments.
        The drawing area is fitted to the trajectory's bounding box.

        :param height: number of image rows
        :param width: number of image columns
        :param link: draw lines between consecutive points when True
        :param bounding_box: only 'fit' is implemented (fit image to the positions)
        :param features: optional feature list; only used here to set the channel
                         count — the pixels written below are always set to [1]
        :param node_size: half-size (in pixels) of the square drawn per position
        :return: numpy uint8 array of shape (height, width, nb_channels)
        :raise ValueError: for any bounding_box mode other than 'fit'
        """
        nb_channels = 1
        if features is not None:
            nb_channels = len(features)
        data = np.zeros((height, width, nb_channels), dtype=np.uint8)
        if bounding_box != 'fit':
            raise ValueError("feature not implemented")
        positions = self.df[['longitude', 'latitude']].to_numpy()
        min_lon, max_lon = (min(positions[:, 0]), max(positions[:, 0]))
        min_lat, max_lat = (min(positions[:, 1]), max(positions[:, 1]))
        # Degenerate (single-point / straight-line) extents: widen by 1 degree on
        # each side so the lat/lon -> pixel mapping below never divides by zero.
        if min_lat == max_lat:
            min_lat -= 1
            max_lat += 1
        if min_lon == max_lon:
            min_lon -= 1
            max_lon += 1
        for longitude, latitude in positions:
            x_coord, y_coord = get_coord(latitude, longitude, height, width, min_lat, max_lat, min_lon, max_lon)
            # Clamp the node's square to the image borders.
            x_lower_bound = max(0, x_coord - node_size)
            x_upper_bound = min(height - 1, x_coord + node_size)
            y_lower_bound = max(0, y_coord - node_size)
            y_upper_bound = min(width - 1, y_coord + node_size)
            for x in range(x_lower_bound, x_upper_bound + 1):
                for y in range(y_lower_bound, y_upper_bound + 1):
                    data[x, y] = [1]
        if link:
            # Connect each consecutive pair of positions with a rasterized segment.
            lon, lat = positions[0, 0], positions[0, 1]
            for longitude, latitude in positions[1:]:
                x_prv, y_prev = get_coord(lat, lon, height, width, min_lat, max_lat, min_lon, max_lon)
                x_nxt, y_nxt = get_coord(latitude, longitude, height, width, min_lat, max_lat, min_lon, max_lon)
                lon, lat = longitude, latitude
                for x, y in bresenham(x_prv, y_prev, x_nxt, y_nxt):
                    data[x, y] = [1]
        return data
......@@ -9,6 +9,6 @@ from skais.ais.ais_trajectory import AISTrajectory
def get_trajectories(ais_points):
    """Split an AISPoints collection into one AISTrajectory per vessel.

    Groups the rows of ``ais_points.df`` by their 'mmsi' column and builds one
    trajectory per unique MMSI, tagging each trajectory with its mmsi.

    (Fix: merge-diff residue previously appended every trajectory twice —
    once without its mmsi tag; only the mmsi-tagged append is kept.)

    :param ais_points: AISPoints whose dataframe contains an 'mmsi' column
    :return: list of AISTrajectory objects, one per unique MMSI
    """
    trajectories = []
    for mmsi in ais_points.df.mmsi.unique():
        trajectories.append(AISTrajectory(ais_points.df[ais_points.df['mmsi'] == mmsi].reset_index(drop=True),
                                          mmsi=mmsi))
    return trajectories
......@@ -18,16 +18,14 @@ def angular_dispersion(angles):
def angular_mean(angles):
    """Circular (angular) mean of ``angles``, returned as a value in (0, 2*pi].

    Takes the mean direction of the average unit vector (x, y) computed by
    ``angular_average_vector`` and maps arctan(y / x) into the correct quadrant.

    (Fix: merge-diff residue previously interleaved the old and new
    implementations, leaving duplicate assignments and dangling branches;
    this is the clean new-version logic.)

    :param angles: sequence of angles in radians
    :return: mean angle in radians
    """
    x, y = angular_average_vector(angles)
    # NOTE(review): arctan(y / x) divides by zero when x == 0;
    # np.arctan2(y, x) % (2 * np.pi) would cover that case — confirm intent.
    theta = np.arctan(y / x)
    if y > 0 and x > 0:
        # First quadrant: theta already correct.
        return theta
    elif x <= 0:
        # Second and third quadrants.
        return np.pi + theta
    else:
        # Fourth quadrant.
        return 2 * np.pi + theta
def angular_std(angles):
......
import tqdm as tqdm
from skais.process.data_augmentation.data_transformer import DataTransformer
from skais.process.data_augmentation.flip import Flip
from skais.process.data_augmentation.pipeline import Pipeline
from skais.process.data_augmentation.translator import Translator
class AugmentationEngine:
    """Builds a collection of augmentation pipelines and applies them all.

    Given translation offsets and flip axes, the engine creates one pipeline
    per single transformation plus one per translation/flip combination, and
    optionally an identity pipeline that keeps the original data.
    """

    def __init__(self, translation_values=None, flip_values=None, keep_original=True):
        """Create the pipelines.

        :param translation_values: iterable of (longitude, latitude) offsets, or None
        :param flip_values: iterable of (meridian, parallel) flip axes, or None
        :param keep_original: when True, the untouched input is kept in the output
        """
        self.pipelines = []
        if keep_original:
            # Identity transformer so the original trajectories are preserved.
            self.pipelines.append(DataTransformer())
        if translation_values is not None:
            self.pipelines.extend(
                Pipeline([Translator(lon, lat)]) for lon, lat in translation_values)
        if flip_values is not None:
            self.pipelines.extend(
                Pipeline([Flip(meridian, parallel)]) for meridian, parallel in flip_values)
        if flip_values is not None and translation_values is not None:
            # Every translation combined with every flip.
            for lon, lat in translation_values:
                shift = Translator(lon, lat)
                for meridian, parallel in flip_values:
                    self.pipelines.append(Pipeline([shift, Flip(meridian, parallel)]))

    def transform(self, x, verbose=0):
        """Run every pipeline on ``x`` and concatenate all results.

        :param x: input passed to each pipeline's transform
        :param verbose: when > 0, wrap the pipeline iteration in a tqdm progress bar
        :return: list concatenating each pipeline's output
        """
        transformed = []
        pipelines = tqdm.tqdm(self.pipelines) if verbose > 0 else self.pipelines
        for pipeline in pipelines:
            transformed += pipeline.transform(x)
        return transformed
class DataTransformer:
    """Base class for data augmentation steps.

    The default transformation is the identity; concrete augmentation
    steps subclass this and override :meth:`transform`.
    """

    def transform(self, x):
        """Return ``x`` unchanged (identity transformation)."""
        return x
from skais.ais.ais_trajectory import AISTrajectory
from skais.process.data_augmentation.data_transformer import DataTransformer
class Flip(DataTransformer):
    """Augmentation step mirroring trajectories across an axis.

    :param meridian: when not None, mirror longitudes (sign flip)
    :param parallel: when not None, mirror latitudes (sign flip)
    """

    def __init__(self, meridian=None, parallel=None):
        self.meridian = meridian
        self.parallel = parallel

    def transform(self, x):
        """Return new AISTrajectory objects with the requested axes mirrored.

        Fixes two defects in the previous version: trajectories were silently
        dropped (empty result) when ``parallel`` was None, and ``meridian``
        was stored but never applied.

        :param x: iterable of AISTrajectory objects
        :return: list of new AISTrajectory objects (input is not mutated)
        """
        result = []
        for trajectory in x:
            df = trajectory.df.copy()
            if self.parallel is not None:
                df['latitude'] = -trajectory.df['latitude']
            if self.meridian is not None:
                # NOTE(review): mirrors about longitude 0; if the meridian value is
                # meant to be the reflection axis this should be
                # 2 * self.meridian - longitude — confirm intended semantics.
                df['longitude'] = -trajectory.df['longitude']
            result.append(AISTrajectory(df))
        return result
from skais.process.data_augmentation.data_transformer import DataTransformer
class Pipeline(DataTransformer):
    """Augmentation step chaining several DataTransformers in sequence."""

    def __init__(self, sequence):
        """Store the chain after checking every step is a DataTransformer.

        :param sequence: ordered iterable of DataTransformer instances
        """
        for step in sequence:
            assert (isinstance(step, DataTransformer))
        self.sequence = sequence

    def transform(self, x):
        """Apply each step of the chain in order to a copy of ``x``.

        :param x: input collection (copied before the first step runs)
        :return: output of the final step
        """
        out = x.copy()
        for step in self.sequence:
            out = step.transform(out)
        return out
from skais.ais.ais_trajectory import AISTrajectory
from skais.process.data_augmentation.data_transformer import DataTransformer


class Translator(DataTransformer):
    """Augmentation step translating trajectories by a fixed lat/lon offset.

    :param longitude: offset added to the 'longitude' column
    :param latitude: offset added to the 'latitude' column
    """

    def __init__(self, longitude, latitude):
        self.longitude = longitude
        self.latitude = latitude

    def transform(self, x):
        """Return new AISTrajectory objects shifted by (longitude, latitude).

        Fixes a defect in the previous version where the latitude offset was
        stored by the constructor but never applied.

        :param x: iterable of AISTrajectory objects
        :return: list of new AISTrajectory objects (input is not mutated)
        """
        result = []
        for trajectory in x:
            df = trajectory.df.copy()
            df['longitude'] = trajectory.df['longitude'] + self.longitude
            df['latitude'] = trajectory.df['latitude'] + self.latitude
            result.append(AISTrajectory(df))
        return result
......@@ -17,7 +17,7 @@ class TestAISPositions(unittest.TestCase):
"diff": [35, 45, 59, 12, 1, 2, 54, 5, 47, 86, 119, 68, 75, 54, 55, 12, 32, 62, 159, 157, 132],
"label": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1],
"ts": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1],
"mmsi": [0 for i in range(21)]
"mmsi": [0 for _ in range(21)]
}
)
)
......@@ -31,7 +31,7 @@ class TestAISPositions(unittest.TestCase):
"diff": [35, 45, 59, 12, 1, 2, 54, 5, 47, 86, 119, 68, 75, 54, 55, 12, 32, 62, 159, 157, 132],
"label": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1],
"ts": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1],
"mmsi": [0 for i in range(21)]
"mmsi": [0 for _ in range(21)]
}
))
......@@ -49,13 +49,13 @@ class TestAISPositions(unittest.TestCase):
ais_points = AISPoints(pd.DataFrame(
{
"cog": [i for i in range(0, 359, 10)] + [1000] + [666],
"heading": [0.0 for i in range(0, 359, 10)] + [0] + [0]}
"heading": [0.0 for _ in range(0, 359, 10)] + [0] + [0]}
)
)
expected = pd.DataFrame(
{
"cog": [i for i in range(0, 359, 10)] + [666],
"heading": [0.0 for i in range(0, 359, 10)] + [0]
"heading": [0.0 for _ in range(0, 359, 10)] + [0]
}
)
ais_points.remove_outliers(["cog", "heading"])
......@@ -65,13 +65,13 @@ class TestAISPositions(unittest.TestCase):
ais_points = AISPoints(pd.DataFrame(
{
"cog": [i for i in range(0, 359, 10)] + [1000] + [666],
"heading": [0.0 for i in range(0, 359, 10)] + [0] + [0]}
"heading": [0.0 for _ in range(0, 359, 10)] + [0] + [0]}
)
)
expected = pd.DataFrame(
{
"cog": [i for i in range(0, 359, 10)],
"heading": [0.0 for i in range(0, 359, 10)]
"heading": [0.0 for _ in range(0, 359, 10)]
}
)
ais_points.remove_outliers(["cog", "heading"], rank=2)
......@@ -81,13 +81,13 @@ class TestAISPositions(unittest.TestCase):
ais_points = AISPoints(pd.DataFrame(
{
"cog": [i / 350.0 for i in range(0, 359, 10)] + [500] + [0],
"heading": [0.0 for i in range(0, 359, 10)] + [0] + [10000]}
"heading": [0.0 for _ in range(0, 359, 10)] + [0] + [10000]}
)
)
expected = pd.DataFrame(
{
"cog": [i / 350.0 for i in range(0, 359, 10)] + [0],
"heading": [0.0 for i in range(0, 359, 10)] + [10000]
"heading": [0.0 for _ in range(0, 359, 10)] + [10000]
}
)
ais_points.remove_outliers(["cog"])
......@@ -98,7 +98,7 @@ class TestAISPositions(unittest.TestCase):
pd.DataFrame(
{
"cog": [i / 350.0 for i in range(0, 359, 10)] + [500] + [0],
"heading": [0.0 for i in range(0, 359, 10)] + [0] + [10000]
"heading": [0.0 for _ in range(0, 359, 10)] + [0] + [10000]
}
)
)
......@@ -109,7 +109,7 @@ class TestAISPositions(unittest.TestCase):
ais_points = AISPoints(pd.DataFrame(
{
"cog": [i for i in range(0, 359, 10)] + [489, 456, -12] + [180, 180, 180],
"heading": [180 for i in range(0, 359, 10)] + [489, 180, 180] + [999, 666, -333],
"heading": [180 for _ in range(0, 359, 10)] + [489, 180, 180] + [999, 666, -333],
}
)
)
......@@ -117,7 +117,7 @@ class TestAISPositions(unittest.TestCase):
expected = pd.DataFrame(
{
"cog": [i for i in range(0, 359, 10)],
"heading": [180 for i in range(0, 359, 10)]
"heading": [180 for _ in range(0, 359, 10)]
}
)
......@@ -130,17 +130,17 @@ class TestAISPositions(unittest.TestCase):
ais_points = AISPoints(pd.DataFrame(
{
"cog": [i for i in range(0, 359, 10)],
"heading": [180 for i in range(0, 359, 10)]
"heading": [180.0 for _ in range(0, 359, 10)]
}
)
)
ais_points.normalize(['cog', 'heading'])