diff --git a/skais/ais/ais_points.py b/skais/ais/ais_points.py index 450458d87deaf7b146cadc299fbd29cbb2577654..3d5758ebd99e17e446a472deef06950705e10720 100644 --- a/skais/ais/ais_points.py +++ b/skais/ais/ais_points.py @@ -3,38 +3,6 @@ import pandas as pd from scipy.stats import stats -# def compute_trajectories(df, time_gap, min_size=50, size_limit=500, interpolation_time=None): -# n_sample = len(df.index) -# result = [] -# work_df = df.copy() -# -# index = 0 -# while index < n_sample: -# i = compute_trajectory(df['ts_sec'][index:].to_numpy(), time_gap, size_limit) -# trajectory = AISTrajectory(work_df[:i], interpolation_time=interpolation_time) -# if len(trajectory.df.index) > min_size: -# result.append(trajectory) -# work_df = work_df[i:] -# index += i -# -# return result -# -# -# @jit(nopython=True) -# def compute_trajectory(times, time_gap, size_limit): -# n_samples = len(times) -# -# previous_date = times[0] -# -# i = 0 -# for i in range(size_limit): -# if i >= n_samples or ((times[i] - previous_date) / 60 > time_gap): -# return i -# previous_date = times[i] -# -# return i + 1 - - class AISPoints: # Todo: Should be more elegant @@ -73,60 +41,91 @@ class AISPoints: self.df = self.df[self.df["heading"] <= 360] self.df = self.df[self.df["heading"] >= 0] - def normalize(self, features, normalization_type="min-max", normalization_dict=None): + def normalize(self, min_max_features=(), standardization_features=(), third_quartile_features=(), + divide_by_value=(), divide_by_max=(), normalization_dict=None): if normalization_dict is None: - normalization_dict = {'normalization_type': normalization_type} - if normalization_type == "min-max": - for f in features: + normalization_dict = {} + for f in min_max_features: + if f in self.df.columns: + normalization_dict[f] = {'type': 'min-max'} minimum = self.df[f].min() maximum = self.df[f].max() diff = (maximum - minimum) if diff == 0: - print("Warning: diff = %d", diff) - diff = 1 - self.df[f] = (self.df[f] - minimum) / diff - normalization_dict[f"{f}_minimum"] = minimum - normalization_dict[f"{f}_maximum"] = maximum - - elif normalization_type == "standardization": - for f in features: + print("Warning: diff = 0") + self.df[f] = (self.df[f] - minimum) + else: + self.df[f] = (self.df[f] - minimum) / diff + normalization_dict[f]["minimum"] = minimum + normalization_dict[f]["maximum"] = maximum + for f in standardization_features: + if f in self.df.columns: + normalization_dict[f] = {'type': 'standardization'} mean = self.df[f].mean() std = self.df[f].std() if std == 0: print("Warning: std = %d", std) std = 1 self.df[f] = (self.df[f] - mean) / std - normalization_dict[f"{f}_mean"] = mean - normalization_dict[f"{f}_std"] = std - - else: - raise ValueError(f"{normalization_type} not a valid normalization method. Must be on of [min-max, " - f"standardization]") + normalization_dict[f]["mean"] = mean + normalization_dict[f]["std"] = std + for f in third_quartile_features: + if f in self.df.columns: + normalization_dict[f] = {'type': '3rd quartile'} + third_quartile = self.df[f].quantile(0.75) + if third_quartile == 0: + print("Warning: third quartile = %d", third_quartile) + third_quartile = 1 + self.df[f] = self.df[f] / third_quartile + normalization_dict[f]["value"] = third_quartile + for t in divide_by_value: + f = t[0] + value = t[1] + if f in self.df.columns: + if value != 0: + normalization_dict[f] = {'type': 'divide by value', + 'value': value} + self.df[f] = self.df[f] / value + else: + print("Warning: dividing by 0") + for f in divide_by_max: + if f in self.df.columns: + maximum = self.df[f].max() + normalization_dict[f] = {'type': 'divide by max', + 'maximum': maximum} + self.df[f] = self.df[f] / maximum else: - normalization_type = normalization_dict['normalization_type'] - if normalization_type == "min-max": - for f in features: - minimum = normalization_dict[f"{f}_minimum"] - maximum = normalization_dict[f"{f}_maximum"] - diff = (maximum - minimum) - if diff == 0: - print("Warning: diff = %d", diff) - diff = 1 - self.df[f] = (self.df[f] - minimum) / diff - - elif normalization_type == "standardization": - for f in features: - mean = normalization_dict[f"{f}_mean"] - std = normalization_dict[f"{f}_std"] - if std == 0: - print("Warning: std = %d", std) - std = 1 - self.df[f] = (self.df[f] - mean) / std - - else: - raise ValueError(f"{normalization_type} not a valid normalization method. Must be on of [min-max, " - f"standardization]") - return normalization_type, normalization_dict + for f in normalization_dict: + if f in self.df.columns: + if normalization_dict[f]['type'] == 'min-max': + minimum = normalization_dict[f]["minimum"] + maximum = normalization_dict[f]["maximum"] + diff = (maximum - minimum) + if diff == 0: + print("Warning: diff = 0") + diff = 1 + self.df[f] = (self.df[f] - minimum) / diff + elif normalization_dict[f]['type'] == "standardization": + mean = normalization_dict[f]["mean"] + std = normalization_dict[f]["std"] + if std == 0: + print("Warning: std = 0") + std = 1 + self.df[f] = (self.df[f] - mean) / std + elif normalization_dict[f]['type'] == "3rd quartile": + third_quartile = normalization_dict[f]["value"] + self.df[f] = self.df[f] / third_quartile + elif normalization_dict[f]['type'] == "divide by value": + value = normalization_dict[f]["value"] + self.df[f] = self.df[f] / value + elif normalization_dict[f]['type'] == "divide by max": + maximum = normalization_dict[f]["maximum"] + self.df[f] = self.df[f] / maximum + else: + raise ValueError( + f"{normalization_dict[f]['type']} not a valid normalization method. Must be on of [min-max," + f" standardization, 3rd quartile, divide by value]") + return normalization_dict # New features def compute_drift(self): diff --git a/skais/tests/ais/test_ais_points.py b/skais/tests/ais/test_ais_points.py index e4a0310ecd079aac64105174fcf495cca162e651..330f039bd69e5c734480f6ed8bd6a687737364e4 100644 --- a/skais/tests/ais/test_ais_points.py +++ b/skais/tests/ais/test_ais_points.py @@ -17,7 +17,7 @@ class TestAISPositions(unittest.TestCase): "diff": [35, 45, 59, 12, 1, 2, 54, 5, 47, 86, 119, 68, 75, 54, 55, 12, 32, 62, 159, 157, 132], "label": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1], "ts": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1], - "mmsi": [0 for i in range(21)] + "mmsi": [0 for _ in range(21)] } ) ) @@ -31,7 +31,7 @@ class TestAISPositions(unittest.TestCase): "diff": [35, 45, 59, 12, 1, 2, 54, 5, 47, 86, 119, 68, 75, 54, 55, 12, 32, 62, 159, 157, 132], "label": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1], "ts": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1], - "mmsi": [0 for i in range(21)] + "mmsi": [0 for _ in range(21)] } )) @@ -49,13 +49,13 @@ class TestAISPositions(unittest.TestCase): ais_points = AISPoints(pd.DataFrame( { "cog": [i for i in range(0, 359, 10)] + [1000] + [666], - "heading": [0.0 for i in range(0, 359, 10)] + [0] + [0]} + "heading": [0.0 for _ in range(0, 359, 10)] + [0] + [0]} ) ) expected = pd.DataFrame( { "cog": [i for i in range(0, 359, 10)] + [666], - "heading": [0.0 for i in range(0, 359, 10)] + [0] + "heading": [0.0 for _ in range(0, 359, 10)] + [0] } ) ais_points.remove_outliers(["cog", "heading"]) @@ -65,13 +65,13 @@ class TestAISPositions(unittest.TestCase): ais_points = AISPoints(pd.DataFrame( { "cog": [i for i in range(0, 359, 10)] + [1000] + [666], - "heading": [0.0 for i in range(0, 359, 10)] + [0] + [0]} + "heading": [0.0 for _ in range(0, 359, 10)] + [0] + [0]} ) ) expected = pd.DataFrame( { "cog": [i for i in range(0, 359, 10)], - "heading": [0.0 for i in range(0, 359, 10)] + "heading": [0.0 for _ in range(0, 359, 10)] } ) ais_points.remove_outliers(["cog", "heading"], rank=2) @@ -81,13 +81,13 @@ class TestAISPositions(unittest.TestCase): ais_points = AISPoints(pd.DataFrame( { "cog": [i / 350.0 for i in range(0, 359, 10)] + [500] + [0], - "heading": [0.0 for i in range(0, 359, 10)] + [0] + [10000]} + "heading": [0.0 for _ in range(0, 359, 10)] + [0] + [10000]} ) ) expected = pd.DataFrame( { "cog": [i / 350.0 for i in range(0, 359, 10)] + [0], - "heading": [0.0 for i in range(0, 359, 10)] + [10000] + "heading": [0.0 for _ in range(0, 359, 10)] + [10000] } ) ais_points.remove_outliers(["cog"]) @@ -98,7 +98,7 @@ class TestAISPositions(unittest.TestCase): pd.DataFrame( { "cog": [i / 350.0 for i in range(0, 359, 10)] + [500] + [0], - "heading": [0.0 for i in range(0, 359, 10)] + [0] + [10000] + "heading": [0.0 for _ in range(0, 359, 10)] + [0] + [10000] } ) ) @@ -109,7 +109,7 @@ class TestAISPositions(unittest.TestCase): ais_points = AISPoints(pd.DataFrame( { "cog": [i for i in range(0, 359, 10)] + [489, 456, -12] + [180, 180, 180], - "heading": [180 for i in range(0, 359, 10)] + [489, 180, 180] + [999, 666, -333], + "heading": [180 for _ in range(0, 359, 10)] + [489, 180, 180] + [999, 666, -333], } ) ) @@ -117,7 +117,7 @@ class TestAISPositions(unittest.TestCase): expected = pd.DataFrame( { "cog": [i for i in range(0, 359, 10)], - "heading": [180 for i in range(0, 359, 10)] + "heading": [180 for _ in range(0, 359, 10)] } ) @@ -130,17 +130,17 @@ class TestAISPositions(unittest.TestCase): ais_points = AISPoints(pd.DataFrame( { "cog": [i for i in range(0, 359, 10)], - "heading": [180 for i in range(0, 359, 10)] + "heading": [180.0 for _ in range(0, 359, 10)] } ) ) - ais_points.normalize(['cog', 'heading']) + ais_points.normalize(min_max_features=["cog", "heading"]) result = ais_points.df expected = pd.DataFrame( { "cog": [i / 350.0 for i in range(0, 359, 10)], - "heading": [0.0 for i in range(0, 359, 10)] + "heading": [0.0 for _ in range(0, 359, 10)] } ) @@ -150,12 +150,12 @@ class TestAISPositions(unittest.TestCase): ais_points = AISPoints(pd.DataFrame( { "cog": [i for i in range(0, 359, 10)], - "heading": [180 for i in range(0, 359, 10)] + "heading": [180 for _ in range(0, 359, 10)] } ) ) - ais_points.normalize(['cog', 'heading'], normalization_type="standardization") + ais_points.normalize(standardization_features=['cog', 'heading']) result = ais_points.df expected = pd.DataFrame( { @@ -167,35 +167,60 @@ class TestAISPositions(unittest.TestCase): 0.72196643, 0.81822862, 0.91449081, 1.010753, 1.10701519, 1.20327738, 1.29953957, 1.39580176, 1.49206395, 1.58832614, 1.68458833], - "heading": [0.0 for i in range(0, 359, 10)] + "heading": [0.0 for _ in range(0, 359, 10)] } ) pd.testing.assert_frame_equal(expected.reset_index(drop=True), result.reset_index(drop=True), check_exact=False, rtol=0.05) - def test_normalize_raise(self): + def test_normalize_3r_quartile(self): ais_points = AISPoints(pd.DataFrame( { "cog": [i for i in range(0, 359, 10)], - "heading": [180 for i in range(0, 359, 10)] + "heading": [180 for _ in range(0, 359, 10)] } ) ) - self.assertRaises( - ValueError, - ais_points.normalize, - ['cog', 'heading'], - normalization_type="non-existing-normalization" + ais_points.normalize(third_quartile_features=["cog", "heading"]) + result = ais_points.df + expected = pd.DataFrame( + { + "cog": [i / 270.0 for i in range(0, 359, 10)], + "heading": [1.0 for _ in range(0, 359, 10)] + } + ) + + pd.testing.assert_frame_equal(expected.reset_index(drop=True), result.reset_index(drop=True), + check_exact=False, rtol=0.05) + + def test_normalize_divide_by_value(self): + ais_points = AISPoints(pd.DataFrame( + { + "cog": [i for i in range(0, 359, 10)], + "heading": [180 for _ in range(0, 359, 10)] + } + ) ) + ais_points.normalize(divide_by_value=[("cog", 10), ("heading", 18)]) + result = ais_points.df + expected = pd.DataFrame( + { + "cog": [i / 10 for i in range(0, 359, 10)], + "heading": [10.0 for _ in range(0, 359, 10)] + } + ) + + pd.testing.assert_frame_equal(expected.reset_index(drop=True), result.reset_index(drop=True), + check_exact=False, rtol=0.05) def test_compute_drift(self): ais_points = AISPoints(pd.DataFrame( { "cog": [i for i in range(0, 359, 10)], - "heading": [180 for i in range(0, 359, 10)] + "heading": [180 for _ in range(0, 359, 10)] } ) ) @@ -217,7 +242,7 @@ class TestAISPositions(unittest.TestCase): "diff": [35, 45, 59, 12, 1, 2, 54, 5, 47, 86, 119, 68, 75, 54, 55, 12, 32, 62, 159, 157, 132], "label": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1], "ts": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1], - "mmsi": [0 for i in range(21)] + "mmsi": [0 for _ in range(21)] } )) @@ -230,7 +255,7 @@ class TestAISPositions(unittest.TestCase): "diff": [35, 45, 59, 12, 1, 2, 54, 5, 47, 86, 119, 68, 75, 54, 55, 12, 32, 62, 159, 157, 132], "label": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1], "ts": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1], - "mmsi": [0 for i in range(21)] + "mmsi": [0 for _ in range(21)] } )) @@ -243,7 +268,7 @@ class TestAISPositions(unittest.TestCase): "diff": [35, 45, 59, 12, 1, 2, 54, 5, 47, 86, 119, 68, 75, 54, 55, 12, 32, 62, 159, 157, 132], "label": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1], "ts": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1], - "mmsi": [0 for i in range(21)] + "mmsi": [0 for _ in range(21)] } )) @@ -256,7 +281,7 @@ class TestAISPositions(unittest.TestCase): "diff": [35, 45, 59, 12, 1, 2, 54, 5, 47, 86, 119, 68, 75, 54, 55, 12, 32, 62, 159, 157, 132], "label": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1], "ts": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - "mmsi": [0 for i in range(21)] + "mmsi": [0 for _ in range(21)] } )) @@ -269,11 +294,11 @@ class TestAISPositions(unittest.TestCase): "label": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1], "ts": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - "mmsi": [0 for i in range(42)] + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + "mmsi": [0 for _ in range(42)] } ) value['ts'] = pd.to_datetime(value.ts) pd.testing.assert_frame_equal(AISPoints.fuse(ais_points, ais_points).df.reset_index(drop=True), - value.reset_index(drop=True)) \ No newline at end of file + value.reset_index(drop=True)) diff --git a/skais/utils/experiment_tools.py b/skais/utils/experiment_tools.py index 85fa9b31ad945381d72c5b1bd3151a3745bc7b97..8c4900f7bd2fb51a306496821112fc3e16e00055 100644 --- a/skais/utils/experiment_tools.py +++ b/skais/utils/experiment_tools.py @@ -12,8 +12,6 @@ def make_feature_vectors(trajectories, features=None, trajectory.df.dropna(inplace=True) if len(trajectory.df.index) > length_list: trajectory.df['ts'] = trajectory.df.index - trajectory.compute_all_derivatives() - trajectory.compute_diff('heading', 'cog') windows = trajectory.sliding_window(length_list, offset=sliding_window_gap, fields=features + [label_field])