Skip to content
Snippets Groups Projects
Commit 16c2a9b2 authored by Raphael Sturgis
Browse files

modified normalization process

parent 2d27975f
Branches
Tags
2 merge requests: !12 "version 0.2a", !10 Resolve "Image creation bugs with 0 size windows"
This commit is part of merge request !10. Comments created here will be created in the context of that merge request.
......@@ -3,38 +3,6 @@ import pandas as pd
from scipy.stats import stats
# def compute_trajectories(df, time_gap, min_size=50, size_limit=500, interpolation_time=None):
# n_sample = len(df.index)
# result = []
# work_df = df.copy()
#
# index = 0
# while index < n_sample:
# i = compute_trajectory(df['ts_sec'][index:].to_numpy(), time_gap, size_limit)
# trajectory = AISTrajectory(work_df[:i], interpolation_time=interpolation_time)
# if len(trajectory.df.index) > min_size:
# result.append(trajectory)
# work_df = work_df[i:]
# index += i
#
# return result
#
#
# @jit(nopython=True)
# def compute_trajectory(times, time_gap, size_limit):
# n_samples = len(times)
#
# previous_date = times[0]
#
# i = 0
# for i in range(size_limit):
# if i >= n_samples or ((times[i] - previous_date) / 60 > time_gap):
# return i
# previous_date = times[i]
#
# return i + 1
class AISPoints:
# Todo: Should be more elegant
......@@ -73,60 +41,91 @@ class AISPoints:
self.df = self.df[self.df["heading"] <= 360]
self.df = self.df[self.df["heading"] >= 0]
def normalize(self, features, normalization_type="min-max", normalization_dict=None):
def normalize(self, min_max_features=(), standardization_features=(), third_quartile_features=(),
              divide_by_value=(), divide_by_max=(), normalization_dict=None):
    """Normalize columns of ``self.df`` in place and return the parameters used.

    Two modes, selected by ``normalization_dict``:

    * ``None`` (fit): every listed feature that is a column of ``self.df`` is
      normalized with statistics computed from ``self.df`` and the parameters
      are recorded in a new dictionary ``{feature: {'type': ..., ...}}``.
    * a dictionary (replay): the ``*_features`` / ``divide_by_*`` arguments are
      ignored and every entry whose key is a column of ``self.df`` is applied
      with the stored parameters.

    Features that are not columns of ``self.df`` are silently skipped.

    Args:
        min_max_features: columns rescaled as ``(x - min) / (max - min)``.
        standardization_features: columns rescaled as ``(x - mean) / std``.
        third_quartile_features: columns divided by their 0.75 quantile.
        divide_by_value: iterable of ``(column, value)`` pairs; each column is
            divided by its value. A zero value is reported and skipped.
        divide_by_max: columns divided by their maximum.
        normalization_dict: dictionary returned by a previous call, or ``None``.

    Returns:
        The normalization dictionary (newly built, or the one passed in).

    Raises:
        ValueError: in replay mode, when an entry has an unknown ``'type'``.
    """

    def _nonzero(divisor, label):
        # Dividing a column by 0 silently yields inf/NaN in pandas, so a zero
        # divisor is reported and replaced by the neutral divisor 1.
        if divisor == 0:
            print(f"Warning: {label} = 0")
            return 1
        return divisor

    if normalization_dict is None:
        normalization_dict = {}

        for f in min_max_features:
            if f not in self.df.columns:
                continue
            minimum = self.df[f].min()
            maximum = self.df[f].max()
            diff = maximum - minimum
            if diff == 0:
                # Constant column: only shift it, there is no range to scale by.
                print("Warning: diff = 0")
                self.df[f] = self.df[f] - minimum
            else:
                self.df[f] = (self.df[f] - minimum) / diff
            normalization_dict[f] = {'type': 'min-max',
                                     'minimum': minimum,
                                     'maximum': maximum}

        for f in standardization_features:
            if f not in self.df.columns:
                continue
            mean = self.df[f].mean()
            std = _nonzero(self.df[f].std(), "std")
            self.df[f] = (self.df[f] - mean) / std
            # The divisor actually used is stored so a replay is identical.
            normalization_dict[f] = {'type': 'standardization',
                                     'mean': mean,
                                     'std': std}

        for f in third_quartile_features:
            if f not in self.df.columns:
                continue
            third_quartile = _nonzero(self.df[f].quantile(0.75), "third quartile")
            self.df[f] = self.df[f] / third_quartile
            normalization_dict[f] = {'type': '3rd quartile',
                                     'value': third_quartile}

        for f, value in divide_by_value:
            if f not in self.df.columns:
                continue
            if value == 0:
                # Nothing is recorded for this column: it is left untouched.
                print("Warning: dividing by 0")
                continue
            normalization_dict[f] = {'type': 'divide by value',
                                     'value': value}
            self.df[f] = self.df[f] / value

        for f in divide_by_max:
            if f not in self.df.columns:
                continue
            maximum = _nonzero(self.df[f].max(), "maximum")
            normalization_dict[f] = {'type': 'divide by max',
                                     'maximum': maximum}
            self.df[f] = self.df[f] / maximum
    else:
        for f in normalization_dict:
            if f not in self.df.columns:
                continue
            params = normalization_dict[f]
            kind = params['type']
            if kind == 'min-max':
                minimum = params["minimum"]
                diff = _nonzero(params["maximum"] - minimum, "diff")
                self.df[f] = (self.df[f] - minimum) / diff
            elif kind == "standardization":
                std = _nonzero(params["std"], "std")
                self.df[f] = (self.df[f] - params["mean"]) / std
            elif kind == "3rd quartile":
                self.df[f] = self.df[f] / _nonzero(params["value"], "third quartile")
            elif kind == "divide by value":
                self.df[f] = self.df[f] / _nonzero(params["value"], "value")
            elif kind == "divide by max":
                self.df[f] = self.df[f] / _nonzero(params["maximum"], "maximum")
            else:
                raise ValueError(
                    f"{kind} not a valid normalization method. Must be one of [min-max, "
                    f"standardization, 3rd quartile, divide by value, divide by max]")
    return normalization_dict
# New features
def compute_drift(self):
......
......@@ -17,7 +17,7 @@ class TestAISPositions(unittest.TestCase):
"diff": [35, 45, 59, 12, 1, 2, 54, 5, 47, 86, 119, 68, 75, 54, 55, 12, 32, 62, 159, 157, 132],
"label": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1],
"ts": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1],
"mmsi": [0 for i in range(21)]
"mmsi": [0 for _ in range(21)]
}
)
)
......@@ -31,7 +31,7 @@ class TestAISPositions(unittest.TestCase):
"diff": [35, 45, 59, 12, 1, 2, 54, 5, 47, 86, 119, 68, 75, 54, 55, 12, 32, 62, 159, 157, 132],
"label": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1],
"ts": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1],
"mmsi": [0 for i in range(21)]
"mmsi": [0 for _ in range(21)]
}
))
......@@ -49,13 +49,13 @@ class TestAISPositions(unittest.TestCase):
ais_points = AISPoints(pd.DataFrame(
{
"cog": [i for i in range(0, 359, 10)] + [1000] + [666],
"heading": [0.0 for i in range(0, 359, 10)] + [0] + [0]}
"heading": [0.0 for _ in range(0, 359, 10)] + [0] + [0]}
)
)
expected = pd.DataFrame(
{
"cog": [i for i in range(0, 359, 10)] + [666],
"heading": [0.0 for i in range(0, 359, 10)] + [0]
"heading": [0.0 for _ in range(0, 359, 10)] + [0]
}
)
ais_points.remove_outliers(["cog", "heading"])
......@@ -65,13 +65,13 @@ class TestAISPositions(unittest.TestCase):
ais_points = AISPoints(pd.DataFrame(
{
"cog": [i for i in range(0, 359, 10)] + [1000] + [666],
"heading": [0.0 for i in range(0, 359, 10)] + [0] + [0]}
"heading": [0.0 for _ in range(0, 359, 10)] + [0] + [0]}
)
)
expected = pd.DataFrame(
{
"cog": [i for i in range(0, 359, 10)],
"heading": [0.0 for i in range(0, 359, 10)]
"heading": [0.0 for _ in range(0, 359, 10)]
}
)
ais_points.remove_outliers(["cog", "heading"], rank=2)
......@@ -81,13 +81,13 @@ class TestAISPositions(unittest.TestCase):
ais_points = AISPoints(pd.DataFrame(
{
"cog": [i / 350.0 for i in range(0, 359, 10)] + [500] + [0],
"heading": [0.0 for i in range(0, 359, 10)] + [0] + [10000]}
"heading": [0.0 for _ in range(0, 359, 10)] + [0] + [10000]}
)
)
expected = pd.DataFrame(
{
"cog": [i / 350.0 for i in range(0, 359, 10)] + [0],
"heading": [0.0 for i in range(0, 359, 10)] + [10000]
"heading": [0.0 for _ in range(0, 359, 10)] + [10000]
}
)
ais_points.remove_outliers(["cog"])
......@@ -98,7 +98,7 @@ class TestAISPositions(unittest.TestCase):
pd.DataFrame(
{
"cog": [i / 350.0 for i in range(0, 359, 10)] + [500] + [0],
"heading": [0.0 for i in range(0, 359, 10)] + [0] + [10000]
"heading": [0.0 for _ in range(0, 359, 10)] + [0] + [10000]
}
)
)
......@@ -109,7 +109,7 @@ class TestAISPositions(unittest.TestCase):
ais_points = AISPoints(pd.DataFrame(
{
"cog": [i for i in range(0, 359, 10)] + [489, 456, -12] + [180, 180, 180],
"heading": [180 for i in range(0, 359, 10)] + [489, 180, 180] + [999, 666, -333],
"heading": [180 for _ in range(0, 359, 10)] + [489, 180, 180] + [999, 666, -333],
}
)
)
......@@ -117,7 +117,7 @@ class TestAISPositions(unittest.TestCase):
expected = pd.DataFrame(
{
"cog": [i for i in range(0, 359, 10)],
"heading": [180 for i in range(0, 359, 10)]
"heading": [180 for _ in range(0, 359, 10)]
}
)
......@@ -130,17 +130,17 @@ class TestAISPositions(unittest.TestCase):
ais_points = AISPoints(pd.DataFrame(
{
"cog": [i for i in range(0, 359, 10)],
"heading": [180 for i in range(0, 359, 10)]
"heading": [180.0 for _ in range(0, 359, 10)]
}
)
)
ais_points.normalize(['cog', 'heading'])
ais_points.normalize(min_max_features=["cog", "heading"])
result = ais_points.df
expected = pd.DataFrame(
{
"cog": [i / 350.0 for i in range(0, 359, 10)],
"heading": [0.0 for i in range(0, 359, 10)]
"heading": [0.0 for _ in range(0, 359, 10)]
}
)
......@@ -150,12 +150,12 @@ class TestAISPositions(unittest.TestCase):
ais_points = AISPoints(pd.DataFrame(
{
"cog": [i for i in range(0, 359, 10)],
"heading": [180 for i in range(0, 359, 10)]
"heading": [180 for _ in range(0, 359, 10)]
}
)
)
ais_points.normalize(['cog', 'heading'], normalization_type="standardization")
ais_points.normalize(standardization_features=['cog', 'heading'])
result = ais_points.df
expected = pd.DataFrame(
{
......@@ -167,35 +167,60 @@ class TestAISPositions(unittest.TestCase):
0.72196643, 0.81822862, 0.91449081, 1.010753, 1.10701519,
1.20327738, 1.29953957, 1.39580176, 1.49206395, 1.58832614,
1.68458833],
"heading": [0.0 for i in range(0, 359, 10)]
"heading": [0.0 for _ in range(0, 359, 10)]
}
)
pd.testing.assert_frame_equal(expected.reset_index(drop=True), result.reset_index(drop=True),
check_exact=False, rtol=0.05)
def test_normalize_raise(self):
def test_normalize_3r_quartile(self):
    """Normalizing with ``third_quartile_features`` rescales both columns.

    The expected frame divides "cog" by 270 and maps the constant "heading"
    column to 1.0; frames are compared with a 5% relative tolerance.
    """
    cog_values = list(range(0, 359, 10))
    ais_points = AISPoints(pd.DataFrame({
        "cog": cog_values,
        "heading": [180] * len(cog_values),
    }))

    ais_points.normalize(third_quartile_features=["cog", "heading"])

    expected = pd.DataFrame({
        "cog": [value / 270.0 for value in cog_values],
        "heading": [1.0] * len(cog_values),
    })
    pd.testing.assert_frame_equal(
        expected.reset_index(drop=True),
        ais_points.df.reset_index(drop=True),
        check_exact=False,
        rtol=0.05,
    )
def test_normalize_divide_by_value(self):
    """Each column named in ``divide_by_value`` is divided by its paired constant."""
    cog_values = list(range(0, 359, 10))
    ais_points = AISPoints(pd.DataFrame({
        "cog": cog_values,
        "heading": [180] * len(cog_values),
    }))

    ais_points.normalize(divide_by_value=[("cog", 10), ("heading", 18)])

    expected = pd.DataFrame({
        "cog": [value / 10 for value in cog_values],
        "heading": [10.0] * len(cog_values),
    })
    pd.testing.assert_frame_equal(
        expected.reset_index(drop=True),
        ais_points.df.reset_index(drop=True),
        check_exact=False,
        rtol=0.05,
    )
def test_compute_drift(self):
ais_points = AISPoints(pd.DataFrame(
{
"cog": [i for i in range(0, 359, 10)],
"heading": [180 for i in range(0, 359, 10)]
"heading": [180 for _ in range(0, 359, 10)]
}
)
)
......@@ -217,7 +242,7 @@ class TestAISPositions(unittest.TestCase):
"diff": [35, 45, 59, 12, 1, 2, 54, 5, 47, 86, 119, 68, 75, 54, 55, 12, 32, 62, 159, 157, 132],
"label": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1],
"ts": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1],
"mmsi": [0 for i in range(21)]
"mmsi": [0 for _ in range(21)]
}
))
......@@ -230,7 +255,7 @@ class TestAISPositions(unittest.TestCase):
"diff": [35, 45, 59, 12, 1, 2, 54, 5, 47, 86, 119, 68, 75, 54, 55, 12, 32, 62, 159, 157, 132],
"label": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1],
"ts": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1],
"mmsi": [0 for i in range(21)]
"mmsi": [0 for _ in range(21)]
}
))
......@@ -243,7 +268,7 @@ class TestAISPositions(unittest.TestCase):
"diff": [35, 45, 59, 12, 1, 2, 54, 5, 47, 86, 119, 68, 75, 54, 55, 12, 32, 62, 159, 157, 132],
"label": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1],
"ts": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1],
"mmsi": [0 for i in range(21)]
"mmsi": [0 for _ in range(21)]
}
))
......@@ -256,7 +281,7 @@ class TestAISPositions(unittest.TestCase):
"diff": [35, 45, 59, 12, 1, 2, 54, 5, 47, 86, 119, 68, 75, 54, 55, 12, 32, 62, 159, 157, 132],
"label": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1],
"ts": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
"mmsi": [0 for i in range(21)]
"mmsi": [0 for _ in range(21)]
}
))
......@@ -270,7 +295,7 @@ class TestAISPositions(unittest.TestCase):
0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1],
"ts": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
"mmsi": [0 for i in range(42)]
"mmsi": [0 for _ in range(42)]
}
)
......
......@@ -12,8 +12,6 @@ def make_feature_vectors(trajectories, features=None,
trajectory.df.dropna(inplace=True)
if len(trajectory.df.index) > length_list:
trajectory.df['ts'] = trajectory.df.index
trajectory.compute_all_derivatives()
trajectory.compute_diff('heading', 'cog')
windows = trajectory.sliding_window(length_list, offset=sliding_window_gap,
fields=features + [label_field])
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment