Module secfsdstools.f_standardize.base_validation_rules
This module contains "Validation Rules"
Expand source code
"""
This module contains "Validation Rules"
"""
from abc import ABC
from abc import abstractmethod
from typing import List
import numpy as np
import pandas as pd
import pandera as pa
from secfsdstools.f_standardize.base_rule_framework import DescriptionEntry
class ValidationRule(ABC):
"""
Base class for all validation rules.
A Validation Rules checks if some basic requirements are met. For instance, in a Balance Sheet,
the equation Assets = AssetsCurrent + AssetsNoncurrent should be true if the data is valid.
So validation rules check if such requirements are actually true. They add two columns to the
main dataset, an "error" column, which contains the relative error, and a "cat" column which
contains the category.
The categories are: 0 for an exact match, 1 if the relative error is less than 1% off,
5 if the relativ error is less than 5% off, 10 if the relative error is less than 10 % off,
and a 100 if it is above 10%.
"""
def __init__(self, identifier: str):
"""
set the identifier of the rule. This is used as prefix when the validation columns are
added to the dataframe.
Args:
id: the id of the rule
"""
self.identifier = identifier
@abstractmethod
def mask(self, data_df: pd.DataFrame) -> pa.typing.Series[bool]:
"""
returns a Series[bool] which defines the rows to which this rule has to be applied.
For validation rules this means to select the rows for which all necessary tags
are available.
Args:
data_df: dataframe on which the rules should be applied
Returns:
pa.typing.Series[bool]: a boolean Series that marks which rows have to be calculated
"""
@abstractmethod
def calculate_error(self, data_df: pd.DataFrame, mask: pa.typing.Series[bool]) -> \
pa.typing.Series[np.float64]:
"""
implements the calculation logic.
Args:
data_df: the dataset to validate
mask: the mask masking the rows to which the calculation should be applied to.
Returns:
pa.typing.Series[np.float64]: containing the relative error
"""
def validate(self, data_df: pd.DataFrame):
"""
executes the validation on the dataframe and adds the two validation columns to the
dataframe. <id>_error containing the relative error and <id>_cat containing the category.
"""
mask = self.mask(data_df)
error = self.calculate_error(data_df, mask)
error_column_name = f'{self.identifier}_error'
cat_column_name = f'{self.identifier}_cat'
data_df[error_column_name] = np.nan
data_df.loc[mask, error_column_name] = error
data_df.loc[mask, cat_column_name] = 100 # gt > 0.1 / 10%
data_df.loc[data_df[error_column_name] <= 0.1, cat_column_name] = 10 # 5-10 %
data_df.loc[data_df[error_column_name] <= 0.05, cat_column_name] = 5 # 1-5 %
data_df.loc[data_df[error_column_name] <= 0.01, cat_column_name] = 1 # < 1%
data_df.loc[data_df[error_column_name] == 0.0, cat_column_name] = 0 # exact match
@abstractmethod
def get_description(self) -> str:
"""
Returns the description String
Returns:
str: description
"""
def collect_description(self, part: str) -> DescriptionEntry:
"""
Returns the description of this rule elements and its children as a list.
Args:
part (str): the part to which the element belongs to
Returns:
DescriptionEntry
"""
return DescriptionEntry(
part=part,
type="Validation",
ruleclass=self.__class__.__name__,
identifier=self.identifier,
description=self.get_description())
class SumValidationRule(ValidationRule):
"""
Checks whether an expected sum - equation is actually true. So for instance if the equation
Assets = AssetsCurrent + AssetsNoncurrent is actually true.
"""
def __init__(self, identifier: str, sum_tag: str, summands: List[str]):
"""
Args:
identifier: the identifier of the rule
sum_tag: the tag that should contain the sum of all summands
summands: the list with summands
"""
super().__init__(identifier=identifier)
self.sum_tag = sum_tag
self.summands = summands
def mask(self, data_df: pd.DataFrame) -> pa.typing.Series[bool]:
"""
returns a Series[bool] which defines the rows to which this rule has to be applied.
For validation rules this means to select the rows for which all necessary tags
are available.
Args:
df: dataframe on which the rules should be applied
Returns:
pa.typing.Series[bool]: a boolean Series that marks which rows have to be calculated
"""
mask = ~data_df[self.sum_tag].isna()
for summand in self.summands:
mask = mask & ~data_df[summand].isna()
return mask
def calculate_error(self, data_df: pd.DataFrame, mask: pa.typing.Series[bool]) -> \
pa.typing.Series[np.float64]:
"""
implements the calculation logic.
Args:
data_df: the dataset to validate
mask: the mask masking the rows to which the calculation should be applied to.
Returns:
pa.typing.Series[np.float64]: containing the relative error
"""
# subtract the sum of all summands from the sum_tag and make it relatvie by dividing
# with the sum_tag. finally make the result absolute, so that error is always positive
result = ((data_df[self.sum_tag] - data_df[self.summands].sum(axis='columns'))
/ data_df[self.sum_tag]).abs()
# correct if we did devide by zero
mask_zero = data_df[self.sum_tag] == 0.0
result.loc[mask_zero & (data_df[self.summands].sum(axis='columns') == 0.0)] = 0.0
return result
def get_description(self) -> str:
"""
Returns the description String
Returns:
str: description
"""
return f"Checks whether the sum of {self.summands} equals the value in '{self.sum_tag}'"
class IsSetValidationRule(ValidationRule):
"""
Checks whether a certain tag has a value and is not nan.
"""
def __init__(self, identifier: str, tag: str):
"""
Args:
identifier: the identifier of the rule
tag: the tag to check if it has a value
"""
super().__init__(identifier=identifier)
self.tag = tag
def mask(self, data_df: pd.DataFrame) -> pa.typing.Series[bool]:
"""
returns a Series[bool] which defines the rows to which this rule has to be applied.
For validation rules this means to select the rows for which all necessary tags
are available.
Args:
df: dataframe on which the rules should be applied
Returns:
pa.typing.Series[bool]: a boolean Series that marks which rows have to be calculated
"""
# we have to mask all rows
return pd.Series([True] * len(data_df), index=data_df.index)
def calculate_error(self, data_df: pd.DataFrame, mask: pa.typing.Series[bool]) -> \
pa.typing.Series[np.float64]:
"""
implements the calculation logic.
Args:
data_df: the dataset to validate
mask: the mask masking the rows to which the calculation should be applied to.
Returns:
pa.typing.Series[np.float64]: containing the relative error
"""
# create Series with same length as dataframe and use same index
result = pd.Series([0.0] * len(data_df), index=data_df.index)
# mask the entries which don't have a value and set them as error = 1
# this will lead to only two categories: 0 and 100
result.loc[data_df[self.tag].isna()] = 1.0
return result
def get_description(self) -> str:
"""
Returns the description String
Returns:
str: description
"""
return f"Checks whether the {self.tag} is set."
Classes
class IsSetValidationRule (identifier: str, tag: str)
-
Checks whether a certain tag has a value and is not nan.
Args
identifier
- the identifier of the rule
tag
- the tag to check if it has a value
Expand source code
class IsSetValidationRule(ValidationRule): """ Checks whether a certain tag has a value and is not nan. """ def __init__(self, identifier: str, tag: str): """ Args: identifier: the identifier of the rule tag: the tag to check if it has a value """ super().__init__(identifier=identifier) self.tag = tag def mask(self, data_df: pd.DataFrame) -> pa.typing.Series[bool]: """ returns a Series[bool] which defines the rows to which this rule has to be applied. For validation rules this means to select the rows for which all necessary tags are available. Args: df: dataframe on which the rules should be applied Returns: pa.typing.Series[bool]: a boolean Series that marks which rows have to be calculated """ # we have to mask all rows return pd.Series([True] * len(data_df), index=data_df.index) def calculate_error(self, data_df: pd.DataFrame, mask: pa.typing.Series[bool]) -> \ pa.typing.Series[np.float64]: """ implements the calculation logic. Args: data_df: the dataset to validate mask: the mask masking the rows to which the calculation should be applied to. Returns: pa.typing.Series[np.float64]: containing the relative error """ # create Series with same length as dataframe and use same index result = pd.Series([0.0] * len(data_df), index=data_df.index) # mask the entries which don't have a value and set them as error = 1 # this will lead to only two categories: 0 and 100 result.loc[data_df[self.tag].isna()] = 1.0 return result def get_description(self) -> str: """ Returns the description String Returns: str: description """ return f"Checks whether the {self.tag} is set."
Ancestors
- ValidationRule
- abc.ABC
Methods
def mask(self, data_df: pandas.core.frame.DataFrame) ‑> pandera.typing.pandas.Series[bool]
-
returns a Series[bool] which defines the rows to which this rule has to be applied. For validation rules this means to select the rows for which all necessary tags are available.
Args
df
- dataframe on which the rules should be applied
Returns
pa.typing.Series[bool]
- a boolean Series that marks which rows have to be calculated
Expand source code
def mask(self, data_df: pd.DataFrame) -> pa.typing.Series[bool]: """ returns a Series[bool] which defines the rows to which this rule has to be applied. For validation rules this means to select the rows for which all necessary tags are available. Args: df: dataframe on which the rules should be applied Returns: pa.typing.Series[bool]: a boolean Series that marks which rows have to be calculated """ # we have to mask all rows return pd.Series([True] * len(data_df), index=data_df.index)
Inherited members
class SumValidationRule (identifier: str, sum_tag: str, summands: List[str])
-
Checks whether an expected sum - equation is actually true. So for instance if the equation Assets = AssetsCurrent + AssetsNoncurrent is actually true.
Args
identifier
- the identifier of the rule
sum_tag
- the tag that should contain the sum of all summands
summands
- the list with summands
Expand source code
class SumValidationRule(ValidationRule): """ Checks whether an expected sum - equation is actually true. So for instance if the equation Assets = AssetsCurrent + AssetsNoncurrent is actually true. """ def __init__(self, identifier: str, sum_tag: str, summands: List[str]): """ Args: identifier: the identifier of the rule sum_tag: the tag that should contain the sum of all summands summands: the list with summands """ super().__init__(identifier=identifier) self.sum_tag = sum_tag self.summands = summands def mask(self, data_df: pd.DataFrame) -> pa.typing.Series[bool]: """ returns a Series[bool] which defines the rows to which this rule has to be applied. For validation rules this means to select the rows for which all necessary tags are available. Args: df: dataframe on which the rules should be applied Returns: pa.typing.Series[bool]: a boolean Series that marks which rows have to be calculated """ mask = ~data_df[self.sum_tag].isna() for summand in self.summands: mask = mask & ~data_df[summand].isna() return mask def calculate_error(self, data_df: pd.DataFrame, mask: pa.typing.Series[bool]) -> \ pa.typing.Series[np.float64]: """ implements the calculation logic. Args: data_df: the dataset to validate mask: the mask masking the rows to which the calculation should be applied to. Returns: pa.typing.Series[np.float64]: containing the relative error """ # subtract the sum of all summands from the sum_tag and make it relatvie by dividing # with the sum_tag. finally make the result absolute, so that error is always positive result = ((data_df[self.sum_tag] - data_df[self.summands].sum(axis='columns')) / data_df[self.sum_tag]).abs() # correct if we did devide by zero mask_zero = data_df[self.sum_tag] == 0.0 result.loc[mask_zero & (data_df[self.summands].sum(axis='columns') == 0.0)] = 0.0 return result def get_description(self) -> str: """ Returns the description String Returns: str: description """ return f"Checks whether the sum of {self.summands} equals the value in '{self.sum_tag}'"
Ancestors
- ValidationRule
- abc.ABC
Methods
def mask(self, data_df: pandas.core.frame.DataFrame) ‑> pandera.typing.pandas.Series[bool]
-
returns a Series[bool] which defines the rows to which this rule has to be applied. For validation rules this means to select the rows for which all necessary tags are available.
Args
df
- dataframe on which the rules should be applied
Returns
pa.typing.Series[bool]
- a boolean Series that marks which rows have to be calculated
Expand source code
def mask(self, data_df: pd.DataFrame) -> pa.typing.Series[bool]: """ returns a Series[bool] which defines the rows to which this rule has to be applied. For validation rules this means to select the rows for which all necessary tags are available. Args: df: dataframe on which the rules should be applied Returns: pa.typing.Series[bool]: a boolean Series that marks which rows have to be calculated """ mask = ~data_df[self.sum_tag].isna() for summand in self.summands: mask = mask & ~data_df[summand].isna() return mask
Inherited members
class ValidationRule (identifier: str)
-
Base class for all validation rules.
A Validation Rules checks if some basic requirements are met. For instance, in a Balance Sheet, the equation Assets = AssetsCurrent + AssetsNoncurrent should be true if the data is valid.
So validation rules check if such requirements are actually true. They add two columns to the main dataset, an "error" column, which contains the relative error, and a "cat" column which contains the category.
The categories are: 0 for an exact match, 1 if the relative error is less than 1% off, 5 if the relativ error is less than 5% off, 10 if the relative error is less than 10 % off, and a 100 if it is above 10%.
set the identifier of the rule. This is used as prefix when the validation columns are added to the dataframe.
Args
id
- the id of the rule
Expand source code
class ValidationRule(ABC): """ Base class for all validation rules. A Validation Rules checks if some basic requirements are met. For instance, in a Balance Sheet, the equation Assets = AssetsCurrent + AssetsNoncurrent should be true if the data is valid. So validation rules check if such requirements are actually true. They add two columns to the main dataset, an "error" column, which contains the relative error, and a "cat" column which contains the category. The categories are: 0 for an exact match, 1 if the relative error is less than 1% off, 5 if the relativ error is less than 5% off, 10 if the relative error is less than 10 % off, and a 100 if it is above 10%. """ def __init__(self, identifier: str): """ set the identifier of the rule. This is used as prefix when the validation columns are added to the dataframe. Args: id: the id of the rule """ self.identifier = identifier @abstractmethod def mask(self, data_df: pd.DataFrame) -> pa.typing.Series[bool]: """ returns a Series[bool] which defines the rows to which this rule has to be applied. For validation rules this means to select the rows for which all necessary tags are available. Args: data_df: dataframe on which the rules should be applied Returns: pa.typing.Series[bool]: a boolean Series that marks which rows have to be calculated """ @abstractmethod def calculate_error(self, data_df: pd.DataFrame, mask: pa.typing.Series[bool]) -> \ pa.typing.Series[np.float64]: """ implements the calculation logic. Args: data_df: the dataset to validate mask: the mask masking the rows to which the calculation should be applied to. Returns: pa.typing.Series[np.float64]: containing the relative error """ def validate(self, data_df: pd.DataFrame): """ executes the validation on the dataframe and adds the two validation columns to the dataframe. <id>_error containing the relative error and <id>_cat containing the category. """ mask = self.mask(data_df) error = self.calculate_error(data_df, mask) error_column_name = f'{self.identifier}_error' cat_column_name = f'{self.identifier}_cat' data_df[error_column_name] = np.nan data_df.loc[mask, error_column_name] = error data_df.loc[mask, cat_column_name] = 100 # gt > 0.1 / 10% data_df.loc[data_df[error_column_name] <= 0.1, cat_column_name] = 10 # 5-10 % data_df.loc[data_df[error_column_name] <= 0.05, cat_column_name] = 5 # 1-5 % data_df.loc[data_df[error_column_name] <= 0.01, cat_column_name] = 1 # < 1% data_df.loc[data_df[error_column_name] == 0.0, cat_column_name] = 0 # exact match @abstractmethod def get_description(self) -> str: """ Returns the description String Returns: str: description """ def collect_description(self, part: str) -> DescriptionEntry: """ Returns the description of this rule elements and its children as a list. Args: part (str): the part to which the element belongs to Returns: DescriptionEntry """ return DescriptionEntry( part=part, type="Validation", ruleclass=self.__class__.__name__, identifier=self.identifier, description=self.get_description())
Ancestors
- abc.ABC
Subclasses
Methods
def calculate_error(self, data_df: pandas.core.frame.DataFrame, mask: pandera.typing.pandas.Series[bool]) ‑> pandera.typing.pandas.Series[numpy.float64]
-
implements the calculation logic.
Args
data_df
- the dataset to validate
mask
- the mask masking the rows to which the calculation should be applied to.
Returns
pa.typing.Series[np.float64]
- containing the relative error
Expand source code
@abstractmethod def calculate_error(self, data_df: pd.DataFrame, mask: pa.typing.Series[bool]) -> \ pa.typing.Series[np.float64]: """ implements the calculation logic. Args: data_df: the dataset to validate mask: the mask masking the rows to which the calculation should be applied to. Returns: pa.typing.Series[np.float64]: containing the relative error """
def collect_description(self, part: str) ‑> DescriptionEntry
-
Returns the description of this rule elements and its children as a list.
Args
part
:str
- the part to which the element belongs to
Returns
DescriptionEntry
Expand source code
def collect_description(self, part: str) -> DescriptionEntry: """ Returns the description of this rule elements and its children as a list. Args: part (str): the part to which the element belongs to Returns: DescriptionEntry """ return DescriptionEntry( part=part, type="Validation", ruleclass=self.__class__.__name__, identifier=self.identifier, description=self.get_description())
def get_description(self) ‑> str
-
Returns the description String
Returns
str
- description
Expand source code
@abstractmethod def get_description(self) -> str: """ Returns the description String Returns: str: description """
def mask(self, data_df: pandas.core.frame.DataFrame) ‑> pandera.typing.pandas.Series[bool]
-
returns a Series[bool] which defines the rows to which this rule has to be applied. For validation rules this means to select the rows for which all necessary tags are available.
Args
data_df
- dataframe on which the rules should be applied
Returns
pa.typing.Series[bool]
- a boolean Series that marks which rows have to be calculated
Expand source code
@abstractmethod def mask(self, data_df: pd.DataFrame) -> pa.typing.Series[bool]: """ returns a Series[bool] which defines the rows to which this rule has to be applied. For validation rules this means to select the rows for which all necessary tags are available. Args: data_df: dataframe on which the rules should be applied Returns: pa.typing.Series[bool]: a boolean Series that marks which rows have to be calculated """
def validate(self, data_df: pandas.core.frame.DataFrame)
-
executes the validation on the dataframe and adds the two validation columns to the dataframe.
_error containing the relative error and _cat containing the category. Expand source code
def validate(self, data_df: pd.DataFrame): """ executes the validation on the dataframe and adds the two validation columns to the dataframe. <id>_error containing the relative error and <id>_cat containing the category. """ mask = self.mask(data_df) error = self.calculate_error(data_df, mask) error_column_name = f'{self.identifier}_error' cat_column_name = f'{self.identifier}_cat' data_df[error_column_name] = np.nan data_df.loc[mask, error_column_name] = error data_df.loc[mask, cat_column_name] = 100 # gt > 0.1 / 10% data_df.loc[data_df[error_column_name] <= 0.1, cat_column_name] = 10 # 5-10 % data_df.loc[data_df[error_column_name] <= 0.05, cat_column_name] = 5 # 1-5 % data_df.loc[data_df[error_column_name] <= 0.01, cat_column_name] = 1 # < 1% data_df.loc[data_df[error_column_name] == 0.0, cat_column_name] = 0 # exact match