"""Implementation of the data cleansing process."""
from logging import getLogger
from typing import Optional, Tuple
import numpy as np
logger = getLogger(__name__)
class DataCleanser:
    r"""
    Check for dead or faulty sensors, NaNs and correct shape.

    A :code:`DataCleanser` object can be used for some or all of the above tasks. Most often the
    :meth:`ecgan.preprocessing.cleansing.DataCleanser.should_cleanse` method is called which checks if the series
    passes all of the checks. Each check can also be called individually. The input series is generally expected to
    be a single 2D series of shape `(seq_len, features)` with `features` being the different data channels.

    By default, all values are accepted if no threshold/condition is set.

    Args:
        lower_fault_threshold: Lowest value accepted without removing the series from dataset.
        upper_fault_threshold: Highest value accepted without removing the series from dataset.
        nan_threshold: Upper limit of allowed fraction of NaNs per feature. Remove series if
            more than :math:`(self.nan\_threshold \cdot 100)\%` of the values of any feature are NaN.
        target_shape: Accepted shape of series. Any sequence of ints (tuple or list) is accepted.
    """

    def __init__(
        self,
        lower_fault_threshold: Optional[float] = None,
        upper_fault_threshold: Optional[float] = None,
        nan_threshold: Optional[float] = None,
        target_shape: Optional[Tuple[int, int]] = None,
    ):
        # Running count of series for which `should_cleanse` returned True.
        self.cleansed_total = 0
        self.target_shape = target_shape
        self.upper_fault_threshold = upper_fault_threshold
        self.lower_fault_threshold = lower_fault_threshold
        self.nan_threshold = nan_threshold

    def should_cleanse(self, series: np.ndarray) -> bool:
        """
        Conduct checks for a given 2D time series to determine if it should be cleansed.

        Remove sample from dataset if any check fails. The checks are evaluated lazily in the order listed below,
        so the shape check guards the remaining checks against series which are not 2D. Every series that is
        marked for cleansing increments :code:`self.cleansed_total`.

        Performed checks:

        * :meth:`ecgan.preprocessing.cleansing.DataCleanser.check_shape`
        * :meth:`ecgan.preprocessing.cleansing.DataCleanser.check_for_nan`
        * :meth:`ecgan.preprocessing.cleansing.DataCleanser.check_for_dead_sensor`
        * :meth:`ecgan.preprocessing.cleansing.DataCleanser.check_for_faulty_sensor`

        Args:
            series: 2D series of shape `seq_len, features`.

        Returns:
            Flag indicating whether the sample should be removed from the final dataset.
        """
        cleanse_sample = (
            self.check_shape(series)
            or self.check_for_nan(series)
            or self.check_for_dead_sensor(series)
            or self.check_for_faulty_sensor(series)
        )
        if cleanse_sample:
            self.cleansed_total += 1
        return cleanse_sample

    def check_shape(self, series: np.ndarray) -> bool:
        """
        Check if the sample should be removed because of its shape.

        If no target_shape is specified in the instance creation, the shape is only required to be a simple 2D
        (seq_len, features) shape.

        Args:
            series: 2D series of shape `seq_len, features`.

        Returns:
            Flag indicating whether the sample should be removed from the final dataset.
        """
        if len(series.shape) != 2:
            return True
        # Compare as tuples: a target shape given as a list (e.g. read from a config file) would never be
        # equal to the tuple returned by `ndarray.shape` and every series would be removed.
        if self.target_shape is not None and tuple(series.shape) != tuple(self.target_shape):
            return True
        return False

    def check_for_nan(self, series: np.ndarray) -> bool:
        r"""
        Check for NaN values in the data.

        Data is marked for cleansing when more than :math:`(self.nan\_threshold \cdot 100)\%` of values of one
        feature are NaN. The data is expected to be a single time series sample of shape (seq_len, features),
        i.e. a 2D array. Series with more than 0 but not more NaNs than allowed can impute the remaining NaNs
        using the :class:`ecgan.preprocessing.preprocessor.BasePreprocessor`.

        Args:
            series: 2D series of shape `seq_len, features`.

        Returns:
            Flag indicating whether the sample should be removed from the final dataset. Always False if no
            :code:`nan_threshold` is set.
        """
        if self.nan_threshold is None:
            return False
        # Amount of NaNs per feature (column), compared against the allowed amount per feature.
        nan_counts = np.count_nonzero(np.isnan(series), axis=0)
        max_allowed_nans = self.nan_threshold * series.shape[0]
        return bool(np.any(nan_counts > max_allowed_nans))

    @staticmethod
    def check_for_dead_sensor(series: np.ndarray) -> bool:
        """
        Check for dead sensors in the data.

        Data is marked as dead and subsequently as 'to be cleansed' if the standard deviation (and thus the
        variance) of at least one sensor is close to zero.

        Args:
            series: 2D series of shape `seq_len, features`.

        Returns:
            Flag indicating whether the sample should be removed from the final dataset.
        """
        # Standard deviation per feature (column); a NaN deviation is never close to zero.
        feature_stds = np.std(series, axis=0)
        return bool(np.any(np.isclose(feature_stds, 0)))

    def check_for_faulty_sensor(self, series: np.ndarray) -> bool:
        """
        Check for faulty sensors in the data.

        Data is marked for cleansing if any non-NaN value lies below the lower or above the upper fault
        threshold, or if the series does not contain a single non-NaN value (all values are NaN or the series
        is empty). A threshold which is not set does not restrict the values in its direction.

        Args:
            series: 2D series of shape `seq_len, features`.

        Returns:
            Flag indicating whether the sample should be removed from the final dataset. Always False if
            neither fault threshold is set.
        """
        lower = self.lower_fault_threshold
        upper = self.upper_fault_threshold
        if lower is None and upper is None:
            logger.debug('Threshold for faulty sensors not specified. Skipping fault checks.')
            return False

        # Without any valid value `np.nanmin`/`np.nanmax` would return NaN (or raise for an empty series) and
        # the comparisons below would silently accept the series.
        if np.isnan(series).all():
            return True

        # Explicit `is None` checks keep a threshold of 0 intact.
        lower_threshold = float('-inf') if lower is None else lower
        upper_threshold = float('inf') if upper is None else upper

        return bool(np.nanmin(series) < lower_threshold or np.nanmax(series) > upper_threshold)