Module `secfsdstools.f_standardize.base_rules`

This module contains the Base Rule implementations.

Expand source code

""" This module contains the Base Rule implementations."""
from typing import List, Set

import pandas as pd
import pandera as pa

from secfsdstools.f_standardize.base_rule_framework import Rule, RuleEntity


class PreSumUpCorrection(Rule):
    """
    Swaps the values of a sum tag and one of its summands if they were mixed up when tagged.

    It happens that the values for Assets and AssetsNoncurrent were swapped
    when tagged. An example is adsh 0001692981-19-000022.

    So instead of Assets = AssetsCurrent + AssetsNoncurrent
    the data matches AssetsNoncurrent = Assets + AssetsCurrent.

    That is obviously wrong, so this rule corrects such a constellation by swapping
    the values of the sum tag and the mixed up summand.

    This rule is usually used in a preprocess step.
    """

    def __init__(self, sum_tag: str, mixed_up_summand: str, other_summand: str):
        """

        Args:
            sum_tag: the tag that should contain the sum
            mixed_up_summand: the summand that actually does contain the sum
            other_summand: the other summand
        """
        self.sum_tag = sum_tag
        self.mixed_up_summand = mixed_up_summand
        self.other_summand = other_summand

    def get_target_tags(self) -> List[str]:
        """
        returns a list of tags that could be affected/changed by this rule

        Returns:
            List[str]: the sum tag and the mixed up summand, since their values are swapped
        """
        return [self.sum_tag, self.mixed_up_summand]

    def get_input_tags(self) -> Set[str]:
        """
        returns all tags that this rule reads.

        Returns:
            Set[str]: the sum tag, the mixed up summand and the other summand
        """
        return {self.sum_tag, self.mixed_up_summand, self.other_summand}

    def mask(self, data_df: pd.DataFrame) -> pa.typing.Series[bool]:
        """
            returns a Series[bool] which defines the rows to which this rule has to be applied.

            A row is selected if "mixed_up_summand == sum_tag + other_summand" holds and
            the other summand is greater than zero.

        Args:
            data_df: dataframe on which the rules should be applied

        Returns:
            pa.typing.Series[bool]: a boolean Series that marks which rows have to be calculated
        """
        swapped_sum = data_df[self.sum_tag] + data_df[self.other_summand]
        return (data_df[self.mixed_up_summand] == swapped_sum) \
            & (data_df[self.other_summand] > 0)

    def apply(self, data_df: pd.DataFrame, mask: pa.typing.Series[bool]) -> pd.DataFrame:
        """
        apply the rule on the provided dataframe. the rows, on which the rule has to be applied
        is defined by the provide mask Series.

        Important, the rules have to be applied "in-place", so no new dataframe is produced.

        Args:
            data_df: dataframe on which the rule has to be applied
            mask: a Series marking the rows in the dataframe on which the rule has to be applied
        Returns:
            pd.DataFrame: make the process chainable
        """

        # keep a snapshot: the first assignment overwrites the column that the
        # second assignment needs to read the original values from
        mixed_up_values = data_df[self.mixed_up_summand].copy()
        data_df.loc[mask, self.mixed_up_summand] = data_df[self.sum_tag]
        data_df.loc[mask, self.sum_tag] = mixed_up_values
        return data_df

    def get_description(self) -> str:
        """
        Returns the description String
        Returns:
            str: description
        """
        return f"Swaps the values between the tag '{self.sum_tag}' and '{self.mixed_up_summand}' " \
               f"if the following equation is True " \
               f"\"'{self.mixed_up_summand}' = '{self.sum_tag}' + '{self.other_summand}'\" " \
               f" and '{self.other_summand}' > 0"


class CopyTagRule(Rule):
    """ Fills an unset target tag with the value of the original tag, provided
    that the original tag has a value"""

    def __init__(self, original: str, target: str):
        """
        Args:
            original: name of the tag that provides the value
            target: name of the tag that receives the value
        """
        self.original = original
        self.target = target

    def mask(self, data_df: pd.DataFrame) -> pa.typing.Series[bool]:
        """
            selects the rows in which the target is nan while the original has a value.

        Args:
            data_df: dataframe on which the rules should be applied

        Returns:
            pa.typing.Series[bool]: boolean Series marking the rows to process
        """
        target_missing = data_df[self.target].isna()
        original_present = data_df[self.original].notna()
        return target_missing & original_present

    def apply(self, data_df: pd.DataFrame, mask: pa.typing.Series[bool]) -> pd.DataFrame:
        """
        writes the values of the original column into the target column for all rows
        selected by the mask.

        The dataframe is modified "in-place", no new dataframe is created.

        Args:
            data_df: dataframe that is modified
            mask: boolean Series marking the rows that have to be changed
        Returns:
            pd.DataFrame: the same dataframe, so that calls can be chained
        """
        source_values = data_df[self.original]
        data_df.loc[mask, self.target] = source_values
        return data_df

    def get_input_tags(self) -> Set[str]:
        """
        returns the tags that are read by this rule.

        Returns:
            Set[str]: the target and the original tag
        """
        return {self.target, self.original}

    def get_target_tags(self) -> List[str]:
        """
        returns the tags that may be changed by this rule.

        Returns:
            List[str]: list containing only the target tag
        """
        return [self.target]

    def set_id(self, prefix: str):
        """
        builds the identifier of this rule from the given prefix and the two tag names.

        Args:
            prefix: the prefix from the parent in the rule-tree which has to be part of the id
        """
        self.identifier = f'{prefix}_{self.target}<-{self.original}'

    def get_description(self) -> str:
        """
        Returns a human readable description of this rule.

        Returns:
            str: description
        """
        return (f"Copies the values from {self.original} to {self.target} "
                f"if {self.original} is not null and {self.target} is nan")


class MissingSumRule(Rule):
    """ fills the sum_tag with the sum of the summand_tags columns, but only
        for rows in which the sum_tag is nan and every summand tag has a value"""

    def __init__(self, sum_tag: str, summand_tags: List[str]):
        """
        Args:
            sum_tag: tag that receives the sum
            summand_tags: tags whose values are added up
        """
        self.sum_tag = sum_tag
        self.summand_tags = summand_tags

    def get_target_tags(self) -> List[str]:
        """
        returns the tags that may be changed by this rule.

        Returns:
            List[str]: list containing only the sum tag
        """
        return [self.sum_tag]

    def get_input_tags(self) -> Set[str]:
        """
        returns the tags that are read by this rule.

        Returns:
            Set[str]: the sum tag together with all summand tags
        """
        return {self.sum_tag, *self.summand_tags}

    def mask(self, data_df: pd.DataFrame) -> pa.typing.Series[bool]:
        """
            selects the rows in which the sum tag is nan and all summands have a value.

        Args:
            data_df: dataframe on which the rules should be applied

        Returns:
            pa.typing.Series[bool]: boolean Series marking the rows to process
        """
        selected = data_df[self.sum_tag].isna()
        for tag in self.summand_tags:
            selected = selected & data_df[tag].notna()

        return selected

    def apply(self, data_df: pd.DataFrame, mask: pa.typing.Series[bool]) -> pd.DataFrame:
        """
        writes the row-wise sum of the summand columns into the sum column for all rows
        selected by the mask.

        The dataframe is modified "in-place", no new dataframe is created.

        Args:
            data_df: dataframe that is modified
            mask: boolean Series marking the rows that have to be changed
        Returns:
            pd.DataFrame: the same dataframe, so that calls can be chained
        """
        row_totals = data_df[self.summand_tags].sum(axis=1)
        data_df.loc[mask, self.sum_tag] = row_totals
        return data_df

    def get_description(self) -> str:
        """
        Returns a human readable description of this rule.

        Returns:
            str: description
        """
        return (f"Sums up the values in the columns {self.summand_tags} into the column "
                f"'{self.sum_tag}', if the column '{self.sum_tag}' is nan and if all columns"
                f" {self.summand_tags} have a value")


class MissingSummandRule(Rule):
    """ derives the value of the one missing summand from the sum and all other summands,
        provided that the sum and all other summands are set.
        """

    def __init__(self, sum_tag: str, missing_summand_tag: str, existing_summands_tags: List[str]):
        """
        Args:
            sum_tag: tag holding the sum
            missing_summand_tag: summand tag without a value, which will be calculated
            existing_summands_tags: summand tags that have to have a value
        """
        self.sum_tag = sum_tag
        self.missing_summand_tag = missing_summand_tag
        self.existing_summands_tags = existing_summands_tags

    def get_target_tags(self) -> List[str]:
        """
        returns the tags that may be changed by this rule.

        Returns:
            List[str]: list containing only the missing summand tag
        """
        return [self.missing_summand_tag]

    def get_input_tags(self) -> Set[str]:
        """
        returns the tags that are read by this rule.

        Returns:
            Set[str]: the sum tag, the missing summand tag and all existing summand tags
        """
        return {self.sum_tag, self.missing_summand_tag, *self.existing_summands_tags}

    def mask(self, data_df: pd.DataFrame) -> pa.typing.Series[bool]:
        """
            selects the rows in which the sum and all existing summands have a value
            while the missing summand is nan.

        Args:
            data_df: dataframe on which the rules should be applied

        Returns:
            pa.typing.Series[bool]: boolean Series marking the rows to process
        """
        selected = data_df[self.sum_tag].notna()
        for tag in self.existing_summands_tags:
            selected = selected & data_df[tag].notna()
        # the summand to calculate must not have a value yet
        return selected & data_df[self.missing_summand_tag].isna()

    def apply(self, data_df: pd.DataFrame, mask: pa.typing.Series[bool]) -> pd.DataFrame:
        """
        writes "sum minus the row-wise total of the existing summands" into the missing
        summand column for all rows selected by the mask.

        The dataframe is modified "in-place", no new dataframe is created.

        Args:
            data_df: dataframe that is modified
            mask: boolean Series marking the rows that have to be changed
        Returns:
            pd.DataFrame: the same dataframe, so that calls can be chained
        """
        known_total = data_df[self.existing_summands_tags].sum(axis=1)
        data_df.loc[mask, self.missing_summand_tag] = data_df[self.sum_tag] - known_total
        return data_df

    def get_description(self) -> str:
        """
        Returns a human readable description of this rule.

        Returns:
            str: description
        """
        required_tags = [self.sum_tag, *self.existing_summands_tags]
        return (f"Calculates the value for the missing column '{self.missing_summand_tag}' by "
                f"subtracting the values of the columns '{self.existing_summands_tags}' from the "
                f"column '{self.sum_tag}' if all of the columns "
                f"{required_tags} are set.")


class SumUpRule(Rule):
    """Adds up whichever summands are available and stores the result in the
    target column, provided the target column has no value yet"""

    def __init__(self, sum_tag: str, potential_summands: List[str],
                 optional_summands=None):
        """
        Args:
            sum_tag: tag that receives the sum
            potential_summands: tags whose available values are added up. At least one of
                                them must have a value for a row to be processed
            optional_summands: tags that are added as well, but which do not cause a row
                               to be processed on their own. Defaults to an empty list.
        """
        self.sum_tag = sum_tag
        self.potential_summands = potential_summands
        self.optional_summands = [] if optional_summands is None else optional_summands
        self.all_summands = self.potential_summands + self.optional_summands

    def get_target_tags(self) -> List[str]:
        """
        returns the tags that may be changed by this rule.

        Returns:
            List[str]: list containing only the sum tag
        """
        return [self.sum_tag]

    def get_input_tags(self) -> Set[str]:
        """
        returns the tags that are read by this rule.

        Returns:
            Set[str]: the sum tag plus all potential and optional summands
        """
        return {self.sum_tag, *self.potential_summands, *self.optional_summands}

    def mask(self, data_df: pd.DataFrame) -> pa.typing.Series[bool]:
        """
            selects the rows in which the sum tag is nan and at least one of the
            potential summands has a value. Optional summands play no role here.

        Args:
            data_df: dataframe on which the rules should be applied

        Returns:
            pa.typing.Series[bool]: boolean Series marking the rows to process
        """
        any_summand_present = data_df[self.potential_summands].notna().any(axis=1)
        sum_missing = data_df[self.sum_tag].isna()
        return sum_missing & any_summand_present

    def apply(self, data_df: pd.DataFrame, mask: pa.typing.Series[bool]) -> pd.DataFrame:
        """
        sets the sum column to 0.0 for the selected rows and then adds every summand
        (potential and optional) that has a value in the respective row.

        The dataframe is modified "in-place", no new dataframe is created.

        Args:
            data_df: dataframe that is modified
            mask: boolean Series marking the rows that have to be changed
        Returns:
            pd.DataFrame: the same dataframe, so that calls can be chained
        """
        # start value for the accumulation
        data_df.loc[mask, self.sum_tag] = 0.0

        for tag in self.all_summands:
            # only rows that are selected and actually have a value for this summand
            rows = mask & data_df[tag].notna()
            data_df.loc[rows, self.sum_tag] = data_df[self.sum_tag] + data_df[tag]

        return data_df

    def get_description(self) -> str:
        """
        Returns a human readable description of this rule. The text mentions the
        optional summands only if there are any.

        Returns:
            str: description
        """
        if not self.optional_summands:
            return (f"Sums up the availalbe values in the columns {self.potential_summands} into"
                    f" the column '{self.sum_tag}', if the column '{self.sum_tag}' is nan")

        return (f"Sums up the available values in the columns {self.potential_summands} and"
                f" {self.optional_summands} into the column '{self.sum_tag}'. Values from "
                f"{self.optional_summands} are only added, if at least one value in "
                f" {self.potential_summands} is present. '{self.sum_tag}' must be nan.")


class SubtractFromRule(Rule):
    """Subtracts every available subtract tag from the subtract_from tag and stores the
    result in the target tag, provided the target tag has no value yet"""

    def __init__(self, target_tag: str,
                 subtract_from_tag: str,
                 potential_subtract_tags: List[str]):
        """
        Args:
            target_tag: tag that receives the result
            subtract_from_tag: tag holding the value from which is subtracted
            potential_subtract_tags: tags whose available values are subtracted
        """
        self.target_tag = target_tag
        self.subtract_from_tag = subtract_from_tag
        self.potential_subtract_tags = potential_subtract_tags

    def get_target_tags(self) -> List[str]:
        """
        returns the tags that may be changed by this rule.

        Returns:
            List[str]: list containing only the target tag
        """
        return [self.target_tag]

    def get_input_tags(self) -> Set[str]:
        """
        returns the tags that are read by this rule.

        Returns:
            Set[str]: the target tag, the subtract_from tag and all potential subtract tags
        """
        return {self.target_tag, self.subtract_from_tag, *self.potential_subtract_tags}

    def mask(self, data_df: pd.DataFrame) -> pa.typing.Series[bool]:
        """
            selects the rows in which the target is nan, the subtract_from tag has a value
            and at least one of the potential subtract tags has a value.

        Args:
            data_df: dataframe on which the rules should be applied

        Returns:
            pa.typing.Series[bool]: boolean Series marking the rows to process
        """
        target_missing = data_df[self.target_tag].isna()
        minuend_present = data_df[self.subtract_from_tag].notna()
        any_subtrahend_present = data_df[self.potential_subtract_tags].notna().any(axis=1)

        return target_missing & minuend_present & any_subtrahend_present

    def apply(self, data_df: pd.DataFrame, mask: pa.typing.Series[bool]) -> pd.DataFrame:
        """
        copies the subtract_from value into the target column for the selected rows and
        then subtracts every subtract tag that has a value in the respective row.

        The dataframe is modified "in-place", no new dataframe is created.

        Args:
            data_df: dataframe that is modified
            mask: boolean Series marking the rows that have to be changed
        Returns:
            pd.DataFrame: the same dataframe, so that calls can be chained
        """
        # start value for the running difference
        data_df.loc[mask, self.target_tag] = data_df[self.subtract_from_tag]

        for tag in self.potential_subtract_tags:
            # only rows that are selected and actually have a value for this tag
            rows = mask & data_df[tag].notna()
            data_df.loc[rows, self.target_tag] = data_df[self.target_tag] - data_df[tag]
        return data_df

    def get_description(self) -> str:
        """
        Returns a human readable description of this rule.

        Returns:
            str: description
        """
        return (f"Subtracts the available values in the columns {self.potential_subtract_tags}"
                f" from the value in '{self.subtract_from_tag}' and stores the result in "
                f" '{self.target_tag}', if '{self.target_tag}' is not set and "
                f" '{self.subtract_from_tag}' has a value and at least one value in "
                f" {self.potential_subtract_tags} is present.")


class SetSumIfOnlyOneSummand(Rule):
    """
    If the sum_tag is nan and only one summand has a value, then the
    sum_tag gets a copy of the available summand and the other summands are set to zero.
    """

    def __init__(self, sum_tag: str, summand_set: str, summands_nan: List[str]):
        """
        Args:
            sum_tag: target that contains the sum.
            summand_set: the summand that contains a value
            summands_nan: the summands without a value
        """
        self.sum_tag = sum_tag
        self.summand_set = summand_set
        self.summands_nan = summands_nan

    def get_target_tags(self) -> List[str]:
        """
        returns a list of tags that could be affected/changed by this rule

        Returns:
            List[str]: the sum tag and all summands that are set to zero
        """
        return [self.sum_tag, *self.summands_nan]

    def get_input_tags(self) -> Set[str]:
        """
        returns all tags that this rule reads.

        Returns:
            Set[str]: the sum tag, the set summand and all nan summands
        """
        result = {self.sum_tag, self.summand_set}
        result.update(self.summands_nan)
        return result

    def mask(self, data_df: pd.DataFrame) -> pa.typing.Series[bool]:
        """
            returns a Series[bool] which defines the rows to which this rule has to be applied.

            A row is selected if the sum tag is nan, the summand_set has a value and
            all tags in summands_nan are nan.

        Args:
            data_df: dataframe on which the rules should be applied

        Returns:
            pa.typing.Series[bool]: a boolean Series that marks which rows have to be calculated
        """
        mask = data_df[self.sum_tag].isna() & ~data_df[self.summand_set].isna()
        for summand_nan in self.summands_nan:
            mask = mask & data_df[summand_nan].isna()

        return mask

    def apply(self, data_df: pd.DataFrame, mask: pa.typing.Series[bool]) -> pd.DataFrame:
        """
        apply the rule on the provided dataframe. the rows, on which the rule has to be applied
        is defined by the provide mask Series.

        Important, the rules have to be applied "in-place", so no new dataframe is produced.

        Args:
            data_df: dataframe on which the rule has to be applied
            mask: a Series marking the rows in the dataframe on which the rule has to be applied
        Returns:
            pd.DataFrame: make the process chainable
        """
        # the only available summand is the sum
        data_df.loc[mask, self.sum_tag] = data_df[self.summand_set]
        # the missing summands therefore contribute nothing
        for summand_nan in self.summands_nan:
            data_df.loc[mask, summand_nan] = 0.0
        return data_df

    def get_description(self) -> str:
        """
        Returns the description String
        Returns:
            str: description
        """
        return f"Copies the value of the column '{self.summand_set}' into the column " \
               f"'{self.sum_tag}' and sets the columns {self.summands_nan} to 0.0 if the " \
               f"column '{self.summand_set}' is set and the columns " \
               f"{[self.sum_tag, *self.summands_nan]} are nan."


class PostCopyToFirstSummand(Rule):
    """
    if the sum_tag is set and the first summand and the other summands are nan,
    then copy the sum_tag value to the first summand and set the other summands to '0.0'
    """

    def __init__(self, sum_tag: str, first_summand: str, other_summands: List[str]):
        """

        Args:
            sum_tag: the tag containing the sum, which has to be set
            first_summand: the missing summand that receives the value of the sum_tag
            other_summands: the remaining missing summands, which are set to 0.0
        """
        self.sum_tag = sum_tag
        self.first_summand = first_summand
        self.other_summands = other_summands

    def get_target_tags(self) -> List[str]:
        """
        returns a list of tags that could be affected/changed by this rule

        Returns:
            List[str]: the first summand and all other summands
        """
        return [self.first_summand, *self.other_summands]

    def get_input_tags(self) -> Set[str]:
        """
        returns all tags that this rule reads.

        Returns:
            Set[str]: the sum tag, the first summand and all other summands
        """
        result = {self.sum_tag, self.first_summand}
        result.update(self.other_summands)
        return result

    def mask(self, data_df: pd.DataFrame) -> pa.typing.Series[bool]:
        """
            returns a Series[bool] which defines the rows to which this rule has to be applied.

            A row is selected if the sum tag has a value and the first summand as well as
            all other summands are nan.

        Args:
            data_df: dataframe on which the rules should be applied

        Returns:
            pa.typing.Series[bool]: a boolean Series that marks which rows have to be calculated
        """
        mask = ~data_df[self.sum_tag].isna() & data_df[self.first_summand].isna()
        for other_summand in self.other_summands:
            mask = mask & data_df[other_summand].isna()

        return mask

    def apply(self, data_df: pd.DataFrame, mask: pa.typing.Series[bool]) -> pd.DataFrame:
        """
        apply the rule on the provided dataframe. the rows, on which the rule has to be applied
        is defined by the provide mask Series.

        Important, the rules have to be applied "in-place", so no new dataframe is produced.

        Args:
            data_df: dataframe on which the rule has to be applied
            mask: a Series marking the rows in the dataframe on which the rule has to be applied
        Returns:
            pd.DataFrame: make the process chainable
        """

        # the first summand takes over the whole sum
        data_df.loc[mask, self.first_summand] = data_df[self.sum_tag]
        # so that the remaining summands have to be zero
        for other_summand in self.other_summands:
            data_df.loc[mask, other_summand] = 0.0
        return data_df

    def get_description(self) -> str:
        """
        Returns the description String
        Returns:
            str: description
        """
        return f"Copies the value of the '{self.sum_tag}' to the first summand " \
               f"'{self.first_summand}' and set the other summands {self.other_summands} to 0.0 " \
               f"if '{self.sum_tag}' is set and the summands " \
               f"{[self.first_summand, *self.other_summands]} are nan."


class PostSetToZero(Rule):
    """sets every tag of a list to zero, but only in rows where none of them has a value"""

    def __init__(self, tags: List[str]):
        """

        Args:
            tags: tags that are set to 0.0 in rows in which all of them are nan
        """
        self.tags = tags

    def get_target_tags(self) -> List[str]:
        """
        returns the tags that may be changed by this rule.

        Returns:
            List[str]: the list of tags this rule was created with
        """
        return self.tags

    def get_input_tags(self) -> Set[str]:
        """
        returns the tags that are read by this rule.

        Returns:
            Set[str]: the tags of this rule as a set
        """
        return set(self.tags)

    def mask(self, data_df: pd.DataFrame) -> pa.typing.Series[bool]:
        """
            selects the rows in which all tags of this rule are nan.

        Args:
            data_df: dataframe on which the rules should be applied

        Returns:
            pa.typing.Series[bool]: boolean Series marking the rows to process
        """
        missing_values = data_df[self.tags].isna()
        return missing_values.all(axis=1)

    def apply(self, data_df: pd.DataFrame, mask: pa.typing.Series[bool]) -> pd.DataFrame:
        """
        writes 0.0 into every tag column of this rule for all rows selected by the mask.

        The dataframe is modified "in-place", no new dataframe is created.

        Args:
            data_df: dataframe that is modified
            mask: boolean Series marking the rows that have to be changed
        Returns:
            pd.DataFrame: the same dataframe, so that calls can be chained
        """
        for column in self.tags:
            data_df.loc[mask, column] = 0.0
        return data_df

    def get_description(self) -> str:
        """
        Returns a human readable description of this rule.

        Returns:
            str: description
        """
        return (f"Set the value of the {self.tags} to 0.0 "
                f"if all {self.tags} are nan.")


class PostFixSign(Rule):
    """
    Fixes the sign of an Addition, if the Summand has the wrong sign.

    For instance, the following equation should be true:
    ProfitLoss + AllIncomeTaxExpenseBenefit =
         IncomeLossFromContinuingOperationsBeforeIncomeTaxExpenseBenefit

    However, there are cases when
    ProfitLoss - AllIncomeTaxExpenseBenefit =
        IncomeLossFromContinuingOperationsBeforeIncomeTaxExpenseBenefit

    is true, so obviously AllIncomeTaxExpenseBenefit has the wrong sign.

    In that case the sign of the summand is inverted, so that the addition holds afterwards.
    """

    def __init__(self, start_tag: str, summand_tag: str, result_tag: str):
        """
        Args:
            start_tag: the tag to which the summand is added
            summand_tag: the summand whose sign is checked and, if necessary, inverted
            result_tag: the tag that contains the expected result of the addition
        """
        self.start_tag = start_tag
        self.summand_tag = summand_tag
        self.result_tag = result_tag

    def get_target_tags(self) -> List[str]:
        """
        returns a list of tags that could be affected/changed by this rule

        Returns:
            List[str]: list containing only the summand tag
        """
        return [self.summand_tag]

    def get_input_tags(self) -> Set[str]:
        """
        returns all tags that this rule reads.

        Returns:
            Set[str]: the start tag, the summand tag and the result tag
        """
        return {self.start_tag, self.summand_tag, self.result_tag}

    def mask(self, data_df: pd.DataFrame) -> pa.typing.Series[bool]:
        """
            returns a Series[bool] which defines the rows to which this rule has to be applied.

            A row is selected if the summand is not zero and
            "start_tag - summand_tag == result_tag" holds.

        Args:
            data_df: dataframe on which the rules should be applied

        Returns:
            pa.typing.Series[bool]: a boolean Series that marks which rows have to be calculated
        """
        # a summand of zero satisfies the addition and the subtraction alike,
        # so there is nothing to fix in that case
        return (data_df[self.summand_tag] != 0) & \
            ((data_df[self.start_tag] - data_df[self.summand_tag]) == data_df[self.result_tag])

    def apply(self, data_df: pd.DataFrame, mask: pa.typing.Series[bool]) -> pd.DataFrame:
        """
        apply the rule on the provided dataframe. the rows, on which the rule has to be applied
        is defined by the provide mask Series.

        Important, the rules have to be applied "in-place", so no new dataframe is produced.

        Args:
            data_df: dataframe on which the rule has to be applied
            mask: a Series marking the rows in the dataframe on which the rule has to be applied
        Returns:
            pd.DataFrame: make the process chainable
        """
        # simply invert the sign
        data_df.loc[mask, self.summand_tag] = -data_df[self.summand_tag]

        return data_df

    def get_description(self) -> str:
        """
        Returns the description String
        Returns:
            str: description
        """
        # after the sign was inverted, the addition (not the subtraction) holds
        return f"Corrects the sign of '{self.summand_tag}' if it seems to be wrong. " \
               f"Checks if '{self.start_tag} - {self.summand_tag} = {self.result_tag}'." \
               f" If that equation holds true, the sign of '{self.summand_tag}' is inversed, " \
               f" so that '{self.start_tag} + {self.summand_tag} = {self.result_tag}' becomes true"


def missingsumparts_rules_creator(sum_tag: str, summand_tags: List[str]) -> List[RuleEntity]:
    """
    Helper method that builds the rules which cover every case in which exactly
    one element of an addition is not set: one MissingSumRule for the sum itself,
    followed by one MissingSummandRule per summand.

    For instance, called with 'Assets' and [AssetsCurrent, AssetsNoncurrent]
    it creates three rules:

    - MissingSumRule(Assets, [AssetsCurrent, AssetsNoncurrent])
    - MissingSummandRule(Assets, AssetsCurrent, [AssetsNoncurrent])
    - MissingSummandRule(Assets, AssetsNoncurrent, [AssetsCurrent])

    Args:
        sum_tag: name of the column that contains the sum
        summand_tags: names of the summand columns

    Returns:
        List[RuleEntity]: the MissingSumRule followed by the MissingSummandRules
    """

    created: List[RuleEntity] = [MissingSumRule(sum_tag=sum_tag,
                                                summand_tags=summand_tags)]
    for missing in summand_tags:
        # all summands except the first entry that equals the missing one
        position = summand_tags.index(missing)
        remaining = summand_tags[:position] + summand_tags[position + 1:]
        created.append(MissingSummandRule(sum_tag=sum_tag,
                                          missing_summand_tag=missing,
                                          existing_summands_tags=remaining))
    return created


def setsumifonlyonesummand_rules_creator(sum_tag: str, summand_tags: List[str]) -> List[RuleEntity]:
    """
    Builds one SetSumIfOnlyOneSummand rule per summand, so that the value of a summand
    can be copied to the sum if all other summands are not set.

    Each rule receives one summand as 'summand_set' and all remaining summands as
    'summands_nan'.

    Args:
        sum_tag: tag that contains the sum
        summand_tags: all summand tags

    Returns:
        List[RuleEntity]: List of SetSumIfOnlyOneSummand instances, in the order of summand_tags
    """
    created = []
    for tag in summand_tags:
        # leave out the first occurrence of the current tag; the caller's list stays untouched
        position = summand_tags.index(tag)
        unset_tags = summand_tags[:position] + summand_tags[position + 1:]
        created.append(SetSumIfOnlyOneSummand(sum_tag=sum_tag,
                                              summand_set=tag,
                                              summands_nan=unset_tags))
    return created

Functions

def missingsumparts_rules_creator(sum_tag: str, summand_tags: List[str]) ‑> List[RuleEntity]

Helper method that creates a list with rules that handle all the cases if one element of an addition is not set. For instance, called with 'Assets' and [AssetsCurrent, AssetsNoncurrent] it would create three rules:

MissingSumRule(Assets, [AssetsCurrent, AssetsNoncurrent])
MissingSummandRule(Assets, AssetsCurrent, AssetsNoncurrent)
MissingSummandRule(Assets, AssetsNoncurrent, AssetsCurrent)

Args

sum_tag: name of the column that contains the sum
summand_tags: name of the summands

Returns

List[RuleEntity]

Expand source code

def missingsumparts_rules_creator(sum_tag: str, summand_tags: List[str]) -> List[RuleEntity]:
    """
    Builds the rules that cover every case in which exactly one element of an
    addition is missing.

    The result starts with a single MissingSumRule for the sum, followed by one
    MissingSummandRule per summand. Called with 'Assets' and
    [AssetsCurrent, AssetsNoncurrent], the result is:

    - MissingSumRule(Assets, [AssetsCurrent, AssetsNoncurrent])
    - MissingSummandRule(Assets, AssetsCurrent, [AssetsNoncurrent])
    - MissingSummandRule(Assets, AssetsNoncurrent, [AssetsCurrent])

    Args:
        sum_tag: name of the column that contains the sum
        summand_tags: names of the summand columns

    Returns:
        List[RuleEntity]: the MissingSumRule followed by the MissingSummandRules in the
        order of summand_tags
    """
    sum_rule = MissingSumRule(sum_tag=sum_tag, summand_tags=summand_tags)

    summand_rules = []
    for tag in summand_tags:
        # leave out the first occurrence of the current tag; the caller's list stays untouched
        position = summand_tags.index(tag)
        remaining = summand_tags[:position] + summand_tags[position + 1:]
        summand_rules.append(MissingSummandRule(sum_tag=sum_tag,
                                                missing_summand_tag=tag,
                                                existing_summands_tags=remaining))

    return [sum_rule, *summand_rules]

def setsumifonlyonesummand_rules_creator(sum_tag: str, summand_tags: List[str]) ‑> List[RuleEntity]

Helper method that creates all the rules to copy the value from one summand to the sum, if all other summands are not set.

Args

sum_tag: tag that contains the sum
summand_tags: all summand tags

Returns

List[RuleEntity]: List of SetSumIfOnlyOneSummand instances

Expand source code

def setsumifonlyonesummand_rules_creator(sum_tag: str, summand_tags: List[str]) -> List[RuleEntity]:
    """
    Builds one SetSumIfOnlyOneSummand rule per summand, so that the value of a summand
    can be copied to the sum if all other summands are not set.

    Each rule receives one summand as 'summand_set' and all remaining summands as
    'summands_nan'.

    Args:
        sum_tag: tag that contains the sum
        summand_tags: all summand tags

    Returns:
        List[RuleEntity]: List of SetSumIfOnlyOneSummand instances, in the order of summand_tags
    """
    created = []
    for tag in summand_tags:
        # leave out the first occurrence of the current tag; the caller's list stays untouched
        position = summand_tags.index(tag)
        unset_tags = summand_tags[:position] + summand_tags[position + 1:]
        created.append(SetSumIfOnlyOneSummand(sum_tag=sum_tag,
                                              summand_set=tag,
                                              summands_nan=unset_tags))
    return created

Classes

class CopyTagRule (original: str, target: str)

Copies the content of the original tag into the target tag, if the target tag is not set and the original tag is set

Args

original: the name of the tag from which the value is copied
target: the name of the tag to which the value is copied

Expand source code

class CopyTagRule(Rule):
    """
    Fills an unset target tag with the value of the original tag.

    Only rows in which the target tag is nan and the original tag holds a value are
    selected by this rule.
    """

    def __init__(self, original: str, target: str):
        """
        Args:
            original: the name of the tag the value is read from
            target: the name of the tag the value is written to
        """
        self.target = target
        self.original = original

    def set_id(self, prefix: str):
        """
        sets the identifier of this rule entity. The identifier has the form
        '<prefix>_<target><-<original>'.

        Args:
            prefix: the prefix from the parent in the rule-tree which has to be part of the id
        """
        self.identifier = f'{prefix}_{self.target}<-{self.original}'

    def get_input_tags(self) -> Set[str]:
        """
        returns the tags this rule reads: the target tag and the original tag.

        Returns:
            Set[str]: set of the input tags used by this rule
        """
        return {self.original, self.target}

    def get_target_tags(self) -> List[str]:
        """
        returns the tags that could be changed by this rule, which is only the target tag.

        Returns:
            List[str]: list with affected tags/columns
        """
        return [self.target]

    def mask(self, data_df: pd.DataFrame) -> pa.typing.Series[bool]:
        """
        selects the rows in which the target tag is nan and the original tag is set.

        Args:
            data_df: dataframe on which the rules should be applied

        Returns:
            pa.typing.Series[bool]: a boolean Series that marks which rows have to be calculated
        """
        target_missing = data_df[self.target].isna()
        original_present = data_df[self.original].notna()
        return target_missing & original_present

    def apply(self, data_df: pd.DataFrame, mask: pa.typing.Series[bool]) -> pd.DataFrame:
        """
        writes the value of the original tag into the target tag for all rows selected
        by the mask. The dataframe is changed "in-place", no new dataframe is produced.

        Args:
            data_df: dataframe on which the rule has to be applied
            mask: a Series marking the rows in the dataframe on which the rule has to be applied

        Returns:
            pd.DataFrame: the provided dataframe, to make the process chainable
        """
        source_values = data_df[self.original]
        data_df.loc[mask, self.target] = source_values
        return data_df

    def get_description(self) -> str:
        """
        Returns the description String

        Returns:
            str: description
        """
        return (f"Copies the values from {self.original} to {self.target} "
                f"if {self.original} is not null and {self.target} is nan")

Ancestors

Methods

def apply(self, data_df: pandas.core.frame.DataFrame, mask: pandera.typing.pandas.Series[bool]) ‑> pandas.core.frame.DataFrame

Apply the rule to the provided dataframe. The rows to which the rule has to be applied are defined by the provided mask Series.

Important, the rules have to be applied "in-place", so no new dataframe is produced.

Args

data_df: dataframe on which the rule has to be applied
mask: a Series marking the rows in the dataframe on which the rule has to be applied

Returns

pd.DataFrame: make the process chainable

Expand source code

def apply(self, data_df: pd.DataFrame, mask: pa.typing.Series[bool]) -> pd.DataFrame:
    """
    writes the value of the original tag into the target tag for all rows selected
    by the mask. The dataframe is changed "in-place", no new dataframe is produced.

    Args:
        data_df: dataframe on which the rule has to be applied
        mask: a Series marking the rows in the dataframe on which the rule has to be applied

    Returns:
        pd.DataFrame: the provided dataframe, to make the process chainable
    """
    source_values = data_df[self.original]
    data_df.loc[mask, self.target] = source_values
    return data_df

def get_input_tags(self) ‑> Set[str]

return all tags that the rules within this group need.

Returns

Set[str]: set of all input tags that are used by rules in this group

Expand source code

def get_input_tags(self) -> Set[str]:
    """
    returns the tags this rule reads: the target tag and the original tag.

    Returns:
        Set[str]: set of the input tags used by this rule
    """
    required = {self.original}
    required.add(self.target)
    return required

Inherited members

Rule:
- collect_description
- get_description
- get_mask
- get_target_tags
- get_target_tags_str
- mask
- process
- set_id

class MissingSumRule (sum_tag: str, summand_tags: List[str])

sums up the value from the summand_tags columns into the sum_tag if the sum_tag is nan and all summand_tags have values

Args

sum_tag: tag which shall contain the sum of the summand_tags
summand_tags: tags which values have to be summed up

Expand source code

class MissingSumRule(Rule):
    """
    Calculates a missing sum: writes the sum of the summand_tags columns into the
    sum_tag column for rows in which the sum_tag is nan and all summand_tags have values.
    """

    def __init__(self, sum_tag: str, summand_tags: List[str]):
        """
        Args:
            sum_tag: tag which shall contain the sum of the summand_tags
            summand_tags: tags whose values have to be summed up
        """
        self.summand_tags = summand_tags
        self.sum_tag = sum_tag

    def get_input_tags(self) -> Set[str]:
        """
        returns the tags this rule reads: the sum tag and all summand tags.

        Returns:
            Set[str]: set of the input tags used by this rule
        """
        return {self.sum_tag, *self.summand_tags}

    def get_target_tags(self) -> List[str]:
        """
        returns the tags that could be changed by this rule, which is only the sum tag.

        Returns:
            List[str]: list with affected tags/columns
        """
        return [self.sum_tag]

    def mask(self, data_df: pd.DataFrame) -> pa.typing.Series[bool]:
        """
        selects the rows in which the sum tag is nan and every summand tag is set.

        Args:
            data_df: dataframe on which the rules should be applied

        Returns:
            pa.typing.Series[bool]: a boolean Series that marks which rows have to be calculated
        """
        sum_missing = data_df[self.sum_tag].isna()
        all_summands_present = data_df[self.summand_tags].notna().all(axis=1)
        return sum_missing & all_summands_present

    def apply(self, data_df: pd.DataFrame, mask: pa.typing.Series[bool]) -> pd.DataFrame:
        """
        writes the row-wise sum of the summand columns into the sum tag for all rows
        selected by the mask. The dataframe is changed "in-place", no new dataframe
        is produced.

        Args:
            data_df: dataframe on which the rule has to be applied
            mask: a Series marking the rows in the dataframe on which the rule has to be applied

        Returns:
            pd.DataFrame: the provided dataframe, to make the process chainable
        """
        row_totals = data_df[self.summand_tags].sum(axis=1)
        data_df.loc[mask, self.sum_tag] = row_totals
        return data_df

    def get_description(self) -> str:
        """
        Returns the description String

        Returns:
            str: description
        """
        return (f"Sums up the values in the columns {self.summand_tags} into the column "
                f"'{self.sum_tag}', if the column '{self.sum_tag}' is nan and if all columns"
                f" {self.summand_tags} have a value")

Ancestors

Methods

def apply(self, data_df: pandas.core.frame.DataFrame, mask: pandera.typing.pandas.Series[bool]) ‑> pandas.core.frame.DataFrame

Apply the rule to the provided dataframe. The rows to which the rule has to be applied are defined by the provided mask Series.

Important, the rules have to be applied "in-place", so no new dataframe is produced.

Args

data_df: dataframe on which the rule has to be applied
mask: a Series marking the rows in the dataframe on which the rule has to be applied

Returns

pd.DataFrame: make the process chainable

Expand source code

def apply(self, data_df: pd.DataFrame, mask: pa.typing.Series[bool]) -> pd.DataFrame:
    """
    writes the row-wise sum of the summand columns into the sum tag for all rows
    selected by the mask. The dataframe is changed "in-place", no new dataframe
    is produced.

    Args:
        data_df: dataframe on which the rule has to be applied
        mask: a Series marking the rows in the dataframe on which the rule has to be applied

    Returns:
        pd.DataFrame: the provided dataframe, to make the process chainable
    """
    row_totals = data_df[self.summand_tags].sum(axis=1)
    data_df.loc[mask, self.sum_tag] = row_totals
    return data_df

def get_input_tags(self) ‑> Set[str]

return all tags that the rules within this group need.

Returns

Set[str]: set of all input tags that are used by rules in this group

Expand source code

def get_input_tags(self) -> Set[str]:
    """
    returns the tags this rule reads: the sum tag and all summand tags.

    Returns:
        Set[str]: set of the input tags used by this rule
    """
    return {self.sum_tag, *self.summand_tags}

def mask(self, data_df: pandas.core.frame.DataFrame) ‑> pandera.typing.pandas.Series[bool]

returns a Series[bool] which defines the rows to which this rule has to be applied.

Args

data_df: dataframe on which the rules should be applied

Returns

pa.typing.Series[bool]: a boolean Series that marks which rows have to be calculated

Expand source code

def mask(self, data_df: pd.DataFrame) -> pa.typing.Series[bool]:
    """
    selects the rows in which the sum tag is nan and every summand tag is set.

    Args:
        data_df: dataframe on which the rules should be applied

    Returns:
        pa.typing.Series[bool]: a boolean Series that marks which rows have to be calculated
    """
    sum_missing = data_df[self.sum_tag].isna()
    all_summands_present = data_df[self.summand_tags].notna().all(axis=1)
    return sum_missing & all_summands_present

Inherited members

Rule:
- collect_description
- get_description
- get_mask
- get_target_tags
- get_target_tags_str
- process
- set_id

class MissingSummandRule (sum_tag: str, missing_summand_tag: str, existing_summands_tags: List[str])

Calculates the value of a missing summand if the sum and all other summands are set.

Args

sum_tag: the tag that contains the sum
missing_summand_tag: the summand tag that is not set
existing_summands_tags: the summand tags which have values

Expand source code

class MissingSummandRule(Rule):
    """
    Calculates the value of a missing summand if the sum and all other summands are set.
    """

    def __init__(self, sum_tag: str, missing_summand_tag: str, existing_summands_tags: List[str]):
        """
        Args:
            sum_tag: the tag that contains the sum
            missing_summand_tag: the summand tag that is not set
            existing_summands_tags: the summand tags which have values
        """
        self.existing_summands_tags = existing_summands_tags
        self.missing_summand_tag = missing_summand_tag
        self.sum_tag = sum_tag

    def get_input_tags(self) -> Set[str]:
        """
        returns the tags this rule reads: the sum tag, the missing summand tag and the
        existing summand tags.

        Returns:
            Set[str]: set of the input tags used by this rule
        """
        return {self.sum_tag, self.missing_summand_tag, *self.existing_summands_tags}

    def get_target_tags(self) -> List[str]:
        """
        returns the tags that could be changed by this rule, which is only the missing
        summand tag.

        Returns:
            List[str]: list with affected tags/columns
        """
        return [self.missing_summand_tag]

    def mask(self, data_df: pd.DataFrame) -> pa.typing.Series[bool]:
        """
        selects the rows in which the sum tag and all existing summand tags are set
        while the missing summand tag is nan.

        Args:
            data_df: dataframe on which the rules should be applied

        Returns:
            pa.typing.Series[bool]: a boolean Series that marks which rows have to be calculated
        """
        sum_present = data_df[self.sum_tag].notna()
        others_present = data_df[self.existing_summands_tags].notna().all(axis=1)
        summand_missing = data_df[self.missing_summand_tag].isna()
        return sum_present & others_present & summand_missing

    def apply(self, data_df: pd.DataFrame, mask: pa.typing.Series[bool]) -> pd.DataFrame:
        """
        writes 'sum - (sum of the existing summands)' into the missing summand tag for all
        rows selected by the mask. The dataframe is changed "in-place", no new dataframe
        is produced.

        Args:
            data_df: dataframe on which the rule has to be applied
            mask: a Series marking the rows in the dataframe on which the rule has to be applied

        Returns:
            pd.DataFrame: the provided dataframe, to make the process chainable
        """
        total = data_df[self.sum_tag]
        known_part = data_df[self.existing_summands_tags].sum(axis=1)
        data_df.loc[mask, self.missing_summand_tag] = total - known_part
        return data_df

    def get_description(self) -> str:
        """
        Returns the description String

        Returns:
            str: description
        """
        return (f"Calculates the value for the missing column '{self.missing_summand_tag}' by "
                f"subtracting the values of the columns '{self.existing_summands_tags}' from the "
                f"column '{self.sum_tag}' if all of the columns "
                f"{[self.sum_tag, *self.existing_summands_tags]} are set.")

Ancestors

Methods

def apply(self, data_df: pandas.core.frame.DataFrame, mask: pandera.typing.pandas.Series[bool]) ‑> pandas.core.frame.DataFrame

Apply the rule to the provided dataframe. The rows to which the rule has to be applied are defined by the provided mask Series.

Important, the rules have to be applied "in-place", so no new dataframe is produced.

Args

data_df: dataframe on which the rule has to be applied
mask: a Series marking the rows in the dataframe on which the rule has to be applied

Returns

pd.DataFrame: make the process chainable

Expand source code

def apply(self, data_df: pd.DataFrame, mask: pa.typing.Series[bool]) -> pd.DataFrame:
    """
    writes 'sum - (sum of the existing summands)' into the missing summand tag for all
    rows selected by the mask. The dataframe is changed "in-place", no new dataframe
    is produced.

    Args:
        data_df: dataframe on which the rule has to be applied
        mask: a Series marking the rows in the dataframe on which the rule has to be applied

    Returns:
        pd.DataFrame: the provided dataframe, to make the process chainable
    """
    total = data_df[self.sum_tag]
    known_part = data_df[self.existing_summands_tags].sum(axis=1)
    data_df.loc[mask, self.missing_summand_tag] = total - known_part
    return data_df

def get_input_tags(self) ‑> Set[str]

return all tags that the rules within this group need.

Returns

Set[str]: set of all input tags that are used by rules in this group

Expand source code

def get_input_tags(self) -> Set[str]:
    """
    returns the tags this rule reads: the sum tag, the missing summand tag and the
    existing summand tags.

    Returns:
        Set[str]: set of the input tags used by this rule
    """
    return {self.sum_tag, self.missing_summand_tag, *self.existing_summands_tags}

def mask(self, data_df: pandas.core.frame.DataFrame) ‑> pandera.typing.pandas.Series[bool]

returns a Series[bool] which defines the rows to which this rule has to be applied.

Args

data_df: dataframe on which the rules should be applied

Returns

pa.typing.Series[bool]: a boolean Series that marks which rows have to be calculated

Expand source code

def mask(self, data_df: pd.DataFrame) -> pa.typing.Series[bool]:
    """
    selects the rows in which the sum tag and all existing summand tags are set
    while the missing summand tag is nan.

    Args:
        data_df: dataframe on which the rules should be applied

    Returns:
        pa.typing.Series[bool]: a boolean Series that marks which rows have to be calculated
    """
    sum_present = data_df[self.sum_tag].notna()
    others_present = data_df[self.existing_summands_tags].notna().all(axis=1)
    summand_missing = data_df[self.missing_summand_tag].isna()
    return sum_present & others_present & summand_missing

Inherited members

Rule:
- collect_description
- get_description
- get_mask
- get_target_tags
- get_target_tags_str
- process
- set_id

class PostCopyToFirstSummand (sum_tag: str, first_summand: str, other_summands: List[str])

if the sum_tag is set and the first summand and the other summands are nan, then copy the sum_tag value to the first summand and set the other summands to '0.0'

Args

sum_tag: set sum_tag
first_summand: missing first summand
other_summands: the remaining missing summands

Expand source code

class PostCopyToFirstSummand(Rule):
    """
    If the sum_tag is set while the first summand and all other summands are nan,
    the sum_tag value is copied to the first summand and the other summands are
    set to '0.0'.
    """

    def __init__(self, sum_tag: str, first_summand: str, other_summands: List[str]):
        """
        Args:
            sum_tag: the tag that contains the sum and has to be set
            first_summand: the summand that receives the value of the sum
            other_summands: the remaining summands, which are set to 0.0
        """
        self.other_summands = other_summands
        self.first_summand = first_summand
        self.sum_tag = sum_tag

    def get_input_tags(self) -> Set[str]:
        """
        returns the tags this rule reads: the sum tag, the first summand and the
        other summands.

        Returns:
            Set[str]: set of the input tags used by this rule
        """
        return {self.sum_tag, self.first_summand, *self.other_summands}

    def get_target_tags(self) -> List[str]:
        """
        returns the tags that could be changed by this rule: the first summand followed
        by the other summands.

        Returns:
            List[str]: list with affected tags/columns
        """
        targets = [self.first_summand]
        targets.extend(self.other_summands)
        return targets

    def mask(self, data_df: pd.DataFrame) -> pa.typing.Series[bool]:
        """
        selects the rows in which the sum tag is set while the first summand and all
        other summands are nan.

        Args:
            data_df: dataframe on which the rules should be applied

        Returns:
            pa.typing.Series[bool]: a boolean Series that marks which rows have to be calculated
        """
        sum_present = data_df[self.sum_tag].notna()
        summand_columns = [self.first_summand, *self.other_summands]
        all_summands_missing = data_df[summand_columns].isna().all(axis=1)
        return sum_present & all_summands_missing

    def apply(self, data_df: pd.DataFrame, mask: pa.typing.Series[bool]) -> pd.DataFrame:
        """
        copies the sum into the first summand and sets every other summand to 0.0 for all
        rows selected by the mask. The dataframe is changed "in-place", no new dataframe
        is produced.

        Args:
            data_df: dataframe on which the rule has to be applied
            mask: a Series marking the rows in the dataframe on which the rule has to be applied

        Returns:
            pd.DataFrame: the provided dataframe, to make the process chainable
        """
        sum_values = data_df[self.sum_tag]
        data_df.loc[mask, self.first_summand] = sum_values
        for zeroed_tag in self.other_summands:
            data_df.loc[mask, zeroed_tag] = 0.0
        return data_df

    def get_description(self) -> str:
        """
        Returns the description String

        Returns:
            str: description
        """
        return (f"Copies the value of the '{self.sum_tag}' to the first summand "
                f"'{self.first_summand}' and set the other summands {self.other_summands} to 0.0 "
                f"if '{self.sum_tag} is set and the summands "
                f"{[self.first_summand, *self.other_summands]} are nan.")

Ancestors

Methods

def apply(self, data_df: pandas.core.frame.DataFrame, mask: pandera.typing.pandas.Series[bool]) ‑> pandas.core.frame.DataFrame

Apply the rule to the provided dataframe. The rows to which the rule has to be applied are defined by the provided mask Series.

Important, the rules have to be applied "in-place", so no new dataframe is produced.

Args

data_df: dataframe on which the rule has to be applied
mask: a Series marking the rows in the dataframe on which the rule has to be applied

Returns

pd.DataFrame: make the process chainable

Expand source code

def apply(self, data_df: pd.DataFrame, mask: pa.typing.Series[bool]) -> pd.DataFrame:
    """
    copies the sum into the first summand and sets every other summand to 0.0 for all
    rows selected by the mask. The dataframe is changed "in-place", no new dataframe
    is produced.

    Args:
        data_df: dataframe on which the rule has to be applied
        mask: a Series marking the rows in the dataframe on which the rule has to be applied

    Returns:
        pd.DataFrame: the provided dataframe, to make the process chainable
    """
    sum_values = data_df[self.sum_tag]
    data_df.loc[mask, self.first_summand] = sum_values
    for zeroed_tag in self.other_summands:
        data_df.loc[mask, zeroed_tag] = 0.0
    return data_df

def get_input_tags(self) ‑> Set[str]

return all tags that the rules within this group need.

Returns

Set[str]: set of all input tags that are used by rules in this group

Expand source code

def get_input_tags(self) -> Set[str]:
    """
    returns the tags this rule reads: the sum tag, the first summand and the
    other summands.

    Returns:
        Set[str]: set of the input tags used by this rule
    """
    return {self.sum_tag, self.first_summand, *self.other_summands}

def mask(self, data_df: pandas.core.frame.DataFrame) ‑> pandera.typing.pandas.Series[bool]

returns a Series[bool] which defines the rows to which this rule has to be applied.

Args

data_df: dataframe on which the rules should be applied

Returns

pa.typing.Series[bool]: a boolean Series that marks which rows have to be calculated

Expand source code

def mask(self, data_df: pd.DataFrame) -> pa.typing.Series[bool]:
    """
    selects the rows in which the sum tag is set while the first summand and all
    other summands are nan.

    Args:
        data_df: dataframe on which the rules should be applied

    Returns:
        pa.typing.Series[bool]: a boolean Series that marks which rows have to be calculated
    """
    sum_present = data_df[self.sum_tag].notna()
    summand_columns = [self.first_summand, *self.other_summands]
    all_summands_missing = data_df[summand_columns].isna().all(axis=1)
    return sum_present & all_summands_missing

Inherited members

Rule:
- collect_description
- get_description
- get_mask
- get_target_tags
- get_target_tags_str
- process
- set_id

class PostFixSign (start_tag: str, summand_tag: str, result_tag: str)

Fixes the sign of an Addition, if the Summand has the wrong sign.

For instance, the following equation should be true: ProfitLoss + AllIncomeTaxExpenseBenefit = IncomeLossFromContinuingOperationsBeforeIncomeTaxExpenseBenefit

However, there are cases when ProfitLoss - AllIncomeTaxExpenseBenefit = IncomeLossFromContinuingOperationsBeforeIncomeTaxExpenseBenefit

is true, so obviously AllIncomeTaxExpenseBenefit has the wrong sign.

Expand source code

class PostFixSign(Rule):
    """
    Fixes the sign of an Addition, if the Summand has the wrong sign.

    For instance, the following equation should be true:
    ProfitLoss + AllIncomeTaxExpenseBenefit =
         IncomeLossFromContinuingOperationsBeforeIncomeTaxExpenseBenefit

    However, there are cases when
    ProfitLoss - AllIncomeTaxExpenseBenefit =
        IncomeLossFromContinuingOperationsBeforeIncomeTaxExpenseBenefit

    is true, so obviously AllIncomeTaxExpenseBenefit has the wrong sign.

    """

    def __init__(self, start_tag: str, summand_tag: str, result_tag: str):
        """
        Args:
            start_tag: the tag to which the summand should be added
            summand_tag: the summand tag whose sign is checked and, if necessary, inverted
            result_tag: the tag that contains the expected result of the addition
        """
        self.start_tag = start_tag
        self.summand_tag = summand_tag
        self.result_tag = result_tag

    def get_target_tags(self) -> List[str]:
        """
        returns a list of tags that could be affected/changed by this rule

        Returns:
            List[str]: list with affected tags/columns
        """
        return [self.summand_tag]

    def get_input_tags(self) -> Set[str]:
        """
        return all tags that this rule needs.

        Returns:
            Set[str]: set of all input tags that are used by this rule
        """
        return {self.start_tag, self.summand_tag, self.result_tag}

    def mask(self, data_df: pd.DataFrame) -> pa.typing.Series[bool]:
        """
            returns a Series[bool] which defines the rows to which this rule has to be applied.
            A row is selected if the summand is not 0 and 'start - summand' equals the result.

        Args:
            data_df: dataframe on which the rules should be applied

        Returns:
            pa.typing.Series[bool]: a boolean Series that marks which rows have to be calculated
        """
        # a summand of 0 is excluded, since inverting its sign would not change anything
        return (data_df[self.summand_tag] != 0) & \
            ((data_df[self.start_tag] - data_df[self.summand_tag]) == data_df[self.result_tag])

    def apply(self, data_df: pd.DataFrame, mask: pa.typing.Series[bool]) -> pd.DataFrame:
        """
        apply the rule on the provided dataframe. the rows, on which the rule has to be applied
        are defined by the provided mask Series.

        Important, the rules have to be applied "in-place", so no new dataframe is produced.

        Args:
            data_df: dataframe on which the rule has to be applied
            mask: a Series marking the rows in the dataframe on which the rule has to be applied
        Returns:
            pd.DataFrame: make the process chainable
        """
        # simply invert the sign
        data_df.loc[mask, self.summand_tag] = -data_df[self.summand_tag]

        return data_df

    def get_description(self) -> str:
        """
        Returns the description String

        Returns:
            str: description of the equation that is checked and of the correction that
                 is applied to the summand tag
        """
        # 'start - summand = result' is the condition that is checked; once the sign of the
        # summand is flipped, it is 'start + summand = result' that holds for those rows.
        return f"Corrects the sign of '{self.summand_tag}' if it seems to be wrong. " \
               f"Checks if '{self.start_tag} - {self.summand_tag} = {self.result_tag}'." \
               f" If that equation holds true, the sign of '{self.summand_tag}' is inversed," \
               f" so that '{self.start_tag} + {self.summand_tag} = {self.result_tag}' becomes true"

Ancestors

Methods

def apply(self, data_df: pandas.core.frame.DataFrame, mask: pandera.typing.pandas.Series[bool]) ‑> pandas.core.frame.DataFrame

Apply the rule to the provided dataframe. The rows to which the rule has to be applied are defined by the provided mask Series.

Important, the rules have to be applied "in-place", so no new dataframe is produced.

Args

data_df: dataframe on which the rule has to be applied
mask: a Series marking the rows in the dataframe on which the rule has to be applied

Returns

pd.DataFrame: make the process chainable

Expand source code

def apply(self, data_df: pd.DataFrame, mask: pa.typing.Series[bool]) -> pd.DataFrame:
    """
    inverts the sign of the summand tag for all rows selected by the mask.
    The dataframe is changed "in-place", no new dataframe is produced.

    Args:
        data_df: dataframe on which the rule has to be applied
        mask: a Series marking the rows in the dataframe on which the rule has to be applied

    Returns:
        pd.DataFrame: the provided dataframe, to make the process chainable
    """
    negated_summand = -data_df[self.summand_tag]
    data_df.loc[mask, self.summand_tag] = negated_summand
    return data_df

def get_input_tags(self) ‑> Set[str]

return all tags that the rules within this group need.

Returns

Set[str]: set of all input tags that are used by rules in this group

Expand source code

def get_input_tags(self) -> Set[str]:
    """
    Return the tags (columns) this rule reads.

    Returns:
        Set[str]: set containing the start tag, the summand tag and the result tag
    """
    input_tags = set()
    input_tags.update((self.start_tag, self.summand_tag, self.result_tag))
    return input_tags

def mask(self, data_df: pandas.core.frame.DataFrame) ‑> pandera.typing.pandas.Series[bool]

returns a Series[bool] which defines the rows to which this rule has to be applied.

Args

data_df dataframe on which the rules should be applied

Returns

pa.typing.Series[bool]: a boolean Series that marks which rows have to be calculated

Expand source code

def mask(self, data_df: pd.DataFrame) -> pa.typing.Series[bool]:
    """
    Select the rows in which the summand is not zero and
    'start_tag - summand_tag' equals 'result_tag'.

    Args:
        data_df: dataframe on which the rules should be applied

    Returns:
        pa.typing.Series[bool]: a boolean Series that marks which rows have to be calculated
    """
    summand = data_df[self.summand_tag]
    difference = data_df[self.start_tag] - summand
    equation_holds = difference == data_df[self.result_tag]
    return (summand != 0) & equation_holds

Inherited members

Rule:
- collect_description
- get_description
- get_mask
- get_target_tags
- get_target_tags_str
- process
- set_id

class PostSetToZero (tags: List[str])

sets all tags to zero if all tags are not set

Args

tags: list of tags that should be zeroed if all tags not set

Expand source code

class PostSetToZero(Rule):
    """Zeroes a group of tags in every row in which none of these tags has a value."""

    def __init__(self, tags: List[str]):
        """
        Args:
            tags: list of tags that should be zeroed if none of them is set
        """
        self.tags = tags

    def get_target_tags(self) -> List[str]:
        """
        Return the tags (columns) that could be changed by this rule.

        Returns:
            List[str]: the configured list of tags
        """
        return self.tags

    def get_input_tags(self) -> Set[str]:
        """
        Return the tags (columns) this rule reads.

        Returns:
            Set[str]: the configured tags as a set
        """
        return {*self.tags}

    def mask(self, data_df: pd.DataFrame) -> pa.typing.Series[bool]:
        """
        Select the rows in which every one of the configured tags is nan.

        Args:
            data_df: dataframe on which the rules should be applied

        Returns:
            pa.typing.Series[bool]: a boolean Series that marks which rows have to be calculated
        """
        # "all are nan" expressed as "not any has a value"
        any_value_present = data_df[self.tags].notna().any(axis=1)
        return ~any_value_present

    def apply(self, data_df: pd.DataFrame, mask: pa.typing.Series[bool]) -> pd.DataFrame:
        """
        Write 0.0 into every configured tag for the rows selected by the mask.

        The dataframe is modified "in-place", so no new dataframe is produced.

        Args:
            data_df: dataframe on which the rule has to be applied
            mask: a Series marking the rows in the dataframe on which the rule has to be applied
        Returns:
            pd.DataFrame: the dataframe that was passed in, to make the process chainable
        """
        for column in self.tags:
            data_df.loc[mask, column] = 0.0
        return data_df

    def get_description(self) -> str:
        """
        Return the description of this rule.

        Returns:
            str: description
        """
        tags = self.tags
        return f"Set the value of the {tags} to 0.0 if all {tags} are nan."

Ancestors

Methods

def apply(self, data_df: pandas.core.frame.DataFrame, mask: pandera.typing.pandas.Series[bool]) ‑> pandas.core.frame.DataFrame

Apply the rule on the provided dataframe. The rows on which the rule has to be applied are defined by the provided mask Series.

Important, the rules have to be applied "in-place", so no new dataframe is produced.

Args

data_df dataframe on which the rule has to be applied
mask: a Series marking the rows in the dataframe on which the rule has to be applied

Returns

pd.DataFrame: make the process chainable

Expand source code

def apply(self, data_df: pd.DataFrame, mask: pa.typing.Series[bool]) -> pd.DataFrame:
    """
    Write 0.0 into every configured tag for the rows selected by the mask.

    The dataframe is modified "in-place", so no new dataframe is produced.

    Args:
        data_df: dataframe on which the rule has to be applied
        mask: a Series marking the rows in the dataframe on which the rule has to be applied
    Returns:
        pd.DataFrame: the dataframe that was passed in, to make the process chainable
    """
    zero = 0.0
    for column in self.tags:
        data_df.loc[mask, column] = zero
    return data_df

def get_input_tags(self) ‑> Set[str]

return all tags that the rules within this group need.

Returns

Set[str]: set of all input tags that are used by rules in this group

Expand source code

def get_input_tags(self) -> Set[str]:
    """
    Return the tags (columns) this rule reads.

    Returns:
        Set[str]: the configured tags as a set
    """
    return {*self.tags}

def mask(self, data_df: pandas.core.frame.DataFrame) ‑> pandera.typing.pandas.Series[bool]

returns a Series[bool] which defines the rows to which this rule has to be applied.

Args

data_df dataframe on which the rules should be applied

Returns

pa.typing.Series[bool]: a boolean Series that marks which rows have to be calculated

Expand source code

def mask(self, data_df: pd.DataFrame) -> pa.typing.Series[bool]:
    """
    Select the rows in which every one of the configured tags is nan.

    Args:
        data_df: dataframe on which the rules should be applied

    Returns:
        pa.typing.Series[bool]: a boolean Series that marks which rows have to be calculated
    """
    # "all are nan" expressed as "not any has a value"
    any_value_present = data_df[self.tags].notna().any(axis=1)
    return ~any_value_present

Inherited members

Rule:
- collect_description
- get_description
- get_mask
- get_target_tags
- get_target_tags_str
- process
- set_id

class PreSumUpCorrection (sum_tag: str, mixed_up_summand: str, other_summand: str)

it happens, that the values for Assets and AssetsNoncurrent were mixed up, resp. swapped when tagged: Example is adsh 0001692981-19-000022

So instead of Assets = AssetsCurrent + AssetsNoncurrent it matches AssetsNoncurrent = Assets + AssetsCurrent.

That is obviously wrong, so this rule corrects such a constellation.

This rule is usually used in a preprocess step.

Args

sum_tag: the tag that should contain the sum
mixed_up_summand: the summand that actually does contain the sum
other_summand: the other summand

Expand source code

class PreSumUpCorrection(Rule):
    """
    Swaps the values of a sum tag and one of its summands if they were mixed up when tagged.

    It happens that the values for Assets and AssetsNoncurrent
    were mixed up, resp. swapped when tagged: Example is adsh 0001692981-19-000022

    So instead of Assets = AssetsCurrent + AssetsNoncurrent
    it matches AssetsNoncurrent = Assets + AssetsCurrent.

    That is obviously wrong, so this rule corrects such a constellation.

    This rule is usually used in a preprocess step.
    """

    def __init__(self, sum_tag: str, mixed_up_summand: str, other_summand: str):
        """

        Args:
            sum_tag: the tag that should contain the sum
            mixed_up_summand: the summand that actually does contain the sum
            other_summand: the other summand
        """
        self.sum_tag = sum_tag
        self.mixed_up_summand = mixed_up_summand
        self.other_summand = other_summand

    def get_target_tags(self) -> List[str]:
        """
        returns a list of tags that could be affected/changed by this rule

        Returns:
            List[str]: list with affected tags/columns
        """
        return [self.sum_tag, self.mixed_up_summand]

    def get_input_tags(self) -> Set[str]:
        """
        return all tags that this rule reads.

        Returns:
            Set[str]: set with the sum tag and both summand tags
        """
        return {self.sum_tag, self.mixed_up_summand, self.other_summand}

    def mask(self, data_df: pd.DataFrame) -> pa.typing.Series[bool]:
        """
            returns a Series[bool] which defines the rows to which this rule has to be applied.

            A row is selected if 'mixed_up_summand = sum_tag + other_summand' holds
            and 'other_summand' is greater than zero.

        Args:
            data_df: dataframe on which the rules should be applied

        Returns:
            pa.typing.Series[bool]: a boolean Series that marks which rows have to be calculated
        """
        return (data_df[self.mixed_up_summand] ==
                data_df[self.sum_tag] + data_df[self.other_summand]) \
            & (data_df[self.other_summand] > 0)

    def apply(self, data_df: pd.DataFrame, mask: pa.typing.Series[bool]) -> pd.DataFrame:
        """
        apply the rule on the provided dataframe. the rows, on which the rule has to be applied
        are defined by the provided mask Series.

        Important, the rules have to be applied "in-place", so no new dataframe is produced.

        Args:
            data_df: dataframe on which the rule has to be applied
            mask: a Series marking the rows in the dataframe on which the rule has to be applied
        Returns:
            pd.DataFrame: make the process chainable
        """

        # keep a copy of the summand values, since that column is overwritten first
        mixed_up_values = data_df[self.mixed_up_summand].copy()
        data_df.loc[mask, self.mixed_up_summand] = data_df[self.sum_tag]
        data_df.loc[mask, self.sum_tag] = mixed_up_values
        return data_df

    def get_description(self) -> str:
        """
        Returns the description String
        Returns:
            str: description
        """
        return f"Swaps the values between the tag '{self.sum_tag}' and '{self.mixed_up_summand}' " \
               f"if the following equation is True " \
               f"\"'{self.mixed_up_summand}' = '{self.sum_tag}' + '{self.other_summand}'\" " \
               f" and '{self.other_summand}' > 0"

Ancestors

Methods

def apply(self, data_df: pandas.core.frame.DataFrame, mask: pandera.typing.pandas.Series[bool]) ‑> pandas.core.frame.DataFrame

Apply the rule on the provided dataframe. The rows on which the rule has to be applied are defined by the provided mask Series.

Important, the rules have to be applied "in-place", so no new dataframe is produced.

Args

data_df: dataframe on which the rule has to be applied
mask: a Series marking the rows in the dataframe on which the rule has to be applied

Returns

pd.DataFrame: make the process chainable

Expand source code

def apply(self, data_df: pd.DataFrame, mask: pa.typing.Series[bool]) -> pd.DataFrame:
    """
    Swap the values of the sum tag and the mixed up summand in the rows selected by the mask.

    The dataframe is modified "in-place", so no new dataframe is produced.

    Args:
        data_df: dataframe on which the rule has to be applied
        mask: a Series marking the rows in the dataframe on which the rule has to be applied
    Returns:
        pd.DataFrame: the dataframe that was passed in, to make the process chainable
    """
    sum_column, summand_column = self.sum_tag, self.mixed_up_summand

    # the summand column is overwritten first, so its old values have to be saved
    saved_summand_values = data_df[summand_column].copy()
    current_sum_values = data_df[sum_column]

    data_df.loc[mask, summand_column] = current_sum_values
    data_df.loc[mask, sum_column] = saved_summand_values
    return data_df

def get_input_tags(self) ‑> Set[str]

return all tags that the rules within this group need.

Returns

Set[str]: set of all input tags that are used by rules in this group

Expand source code

def get_input_tags(self) -> Set[str]:
    """
    Return the tags (columns) this rule reads.

    Returns:
        Set[str]: set containing the sum tag, the mixed up summand and the other summand
    """
    input_tags = set()
    input_tags.update((self.sum_tag, self.mixed_up_summand, self.other_summand))
    return input_tags

Inherited members

Rule:
- collect_description
- get_description
- get_mask
- get_target_tags
- get_target_tags_str
- mask
- process
- set_id

class SetSumIfOnlyOneSummand (sum_tag: str, summand_set: str, summands_nan: List[str])

If the sum_tag is nan and only one summand has a value, then the sum_tag gets a copy of the available summand and the other summands are set to zero.

Args

sum_tag: target that contains the sum.
summand_set: the summand that contains a value
summands_nan: the summands without a value

Expand source code

class SetSumIfOnlyOneSummand(Rule):
    """
    If the sum_tag is nan and only one summand has a value, then the
    sum_tag gets a copy of the available summand and the other summands are set to zero.
    """

    def __init__(self, sum_tag: str, summand_set: str, summands_nan: List[str]):
        """
        Args:
            sum_tag: target that contains the sum.
            summand_set: the summand that contains a value
            summands_nan: the summands without a value
        """
        self.sum_tag = sum_tag
        self.summand_set = summand_set
        self.summands_nan = summands_nan

    def get_target_tags(self) -> List[str]:
        """
        returns a list of tags that could be affected/changed by this rule

        Returns:
            List[str]: list with affected tags/columns
        """
        return [self.sum_tag, *self.summands_nan]

    def get_input_tags(self) -> Set[str]:
        """
        return all tags that this rule reads.

        Returns:
            Set[str]: set with the sum tag, the set summand and the nan summands
        """
        result = {self.sum_tag, self.summand_set}
        result.update(self.summands_nan)
        return result

    def mask(self, data_df: pd.DataFrame) -> pa.typing.Series[bool]:
        """
            returns a Series[bool] which defines the rows to which this rule has to be applied.

            A row is selected if the sum tag is nan, the 'summand_set' column has a value
            and all the 'summands_nan' columns are nan.

        Args:
            data_df: dataframe on which the rules should be applied

        Returns:
            pa.typing.Series[bool]: a boolean Series that marks which rows have to be calculated
        """
        mask = data_df[self.sum_tag].isna() & ~data_df[self.summand_set].isna()
        for summand_nan in self.summands_nan:
            mask = mask & data_df[summand_nan].isna()

        return mask

    def apply(self, data_df: pd.DataFrame, mask: pa.typing.Series[bool]) -> pd.DataFrame:
        """
        apply the rule on the provided dataframe. the rows, on which the rule has to be applied
        are defined by the provided mask Series.

        Important, the rules have to be applied "in-place", so no new dataframe is produced.

        Args:
            data_df: dataframe on which the rule has to be applied
            mask: a Series marking the rows in the dataframe on which the rule has to be applied
        Returns:
            pd.DataFrame: make the process chainable
        """
        # the sum becomes a copy of the only summand that has a value
        data_df.loc[mask, self.sum_tag] = data_df[self.summand_set]
        for summand_nan in self.summands_nan:
            data_df.loc[mask, summand_nan] = 0.0
        return data_df

    def get_description(self) -> str:
        """
        Returns the description String
        Returns:
            str: description
        """
        return f"Copies the value of the column '{self.summand_set}' into the column " \
               f"'{self.sum_tag}' and sets the columns {self.summands_nan} to 0.0 if the " \
               f"column '{self.summand_set}' is set and the columns " \
               f"{[self.sum_tag, *self.summands_nan]} are nan."

Ancestors

Methods

def apply(self, data_df: pandas.core.frame.DataFrame, mask: pandera.typing.pandas.Series[bool]) ‑> pandas.core.frame.DataFrame

Apply the rule on the provided dataframe. The rows on which the rule has to be applied are defined by the provided mask Series.

Important, the rules have to be applied "in-place", so no new dataframe is produced.

Args

data_df dataframe on which the rule has to be applied
mask: a Series marking the rows in the dataframe on which the rule has to be applied

Returns

pd.DataFrame: make the process chainable

Expand source code

def apply(self, data_df: pd.DataFrame, mask: pa.typing.Series[bool]) -> pd.DataFrame:
    """
    Copy the value of the set summand into the sum tag and write 0.0 into the
    other summands, for the rows selected by the mask.

    The dataframe is modified "in-place", so no new dataframe is produced.

    Args:
        data_df: dataframe on which the rule has to be applied
        mask: a Series marking the rows in the dataframe on which the rule has to be applied
    Returns:
        pd.DataFrame: the dataframe that was passed in, to make the process chainable
    """
    available_values = data_df[self.summand_set]
    data_df.loc[mask, self.sum_tag] = available_values

    for empty_column in self.summands_nan:
        data_df.loc[mask, empty_column] = 0.0
    return data_df

def get_input_tags(self) ‑> Set[str]

return all tags that the rules within this group need.

Returns

Set[str]: set of all input tags that are used by rules in this group

Expand source code

def get_input_tags(self) -> Set[str]:
    """
    Return the tags (columns) this rule reads.

    Returns:
        Set[str]: set containing the sum tag, the set summand and all the nan summands
    """
    return {self.sum_tag, self.summand_set, *self.summands_nan}

def mask(self, data_df: pandas.core.frame.DataFrame) ‑> pandera.typing.pandas.Series[bool]

returns a Series[bool] which defines the rows to which this rule has to be applied.

Args

data_df dataframe on which the rules should be applied

Returns

pa.typing.Series[bool]: a boolean Series that marks which rows have to be calculated

Expand source code

def mask(self, data_df: pd.DataFrame) -> pa.typing.Series[bool]:
    """
    Select the rows in which the sum tag is nan, the set summand has a value
    and every one of the nan summands is nan.

    Args:
        data_df: dataframe on which the rules should be applied

    Returns:
        pa.typing.Series[bool]: a boolean Series that marks which rows have to be calculated
    """
    sum_missing = data_df[self.sum_tag].isna()
    summand_present = data_df[self.summand_set].notna()

    selected = sum_missing & summand_present
    for column in self.summands_nan:
        selected = selected & data_df[column].isna()
    return selected

Inherited members

Rule:
- collect_description
- get_description
- get_mask
- get_target_tags
- get_target_tags_str
- process
- set_id

class SubtractFromRule (target_tag: str, subtract_from_tag: str, potential_subtract_tags: List[str])

Subtracts the available subtract tags from the subtract_from tag and stores the result in the target tag, if the target column is not set.

Args

target_tag: the tag to store the results in
subtract_from_tag: the tag from which values have to be subtracted
potential_subtract_tags: tags to subtract

Expand source code

class SubtractFromRule(Rule):
    """Subtracts the available subtract tags from the subtract_from tag and stores the
    result in the target tag, if the target column is not set."""

    def __init__(self, target_tag: str,
                 subtract_from_tag: str,
                 potential_subtract_tags: List[str]):
        """
        Args:
            target_tag: the tag to store the results in
            subtract_from_tag: the tag from which values have to be subtracted
            potential_subtract_tags: tags to subtract
        """
        self.target_tag = target_tag
        self.subtract_from_tag = subtract_from_tag
        self.potential_subtract_tags = potential_subtract_tags

    def get_target_tags(self) -> List[str]:
        """
        Return the tags (columns) that could be changed by this rule.

        Returns:
            List[str]: list containing only the target tag
        """
        return [self.target_tag]

    def get_input_tags(self) -> Set[str]:
        """
        Return the tags (columns) this rule reads.

        Returns:
            Set[str]: set containing the target tag, the subtract_from tag and all
                      potential subtract tags
        """
        return {self.target_tag, self.subtract_from_tag, *self.potential_subtract_tags}

    def mask(self, data_df: pd.DataFrame) -> pa.typing.Series[bool]:
        """
        Select the rows in which the target is nan, the subtract_from tag has a value
        and at least one of the potential subtract tags has a value.

        Args:
            data_df: dataframe on which the rules should be applied

        Returns:
            pa.typing.Series[bool]: a boolean Series that marks which rows have to be calculated
        """
        any_subtrahend_present = data_df[self.potential_subtract_tags].notna().any(axis=1)
        target_missing = data_df[self.target_tag].isna()
        minuend_present = data_df[self.subtract_from_tag].notna()

        return target_missing & minuend_present & any_subtrahend_present

    def apply(self, data_df: pd.DataFrame, mask: pa.typing.Series[bool]) -> pd.DataFrame:
        """
        Compute 'subtract_from_tag' minus every available subtract tag and store the
        result in the target tag, for the rows selected by the mask.

        The dataframe is modified "in-place", so no new dataframe is produced.

        Args:
            data_df: dataframe on which the rule has to be applied
            mask: a Series marking the rows in the dataframe on which the rule has to be applied
        Returns:
            pd.DataFrame: the dataframe that was passed in, to make the process chainable
        """
        target = self.target_tag

        # start with the value from which the subtrahends are deducted
        data_df.loc[mask, target] = data_df[self.subtract_from_tag]

        for subtrahend in self.potential_subtract_tags:
            # nan subtrahends are skipped, so they do not turn the result into nan
            rows = mask & data_df[subtrahend].notna()
            data_df.loc[rows, target] = data_df[target] - data_df[subtrahend]
        return data_df

    def get_description(self) -> str:
        """
        Return the description of this rule.

        Returns:
            str: description
        """
        return (
            f"Subtracts the available values in the columns {self.potential_subtract_tags}"
            f" from the value in '{self.subtract_from_tag}' and stores the result in "
            f" '{self.target_tag}', if '{self.target_tag}' is not set and "
            f" '{self.subtract_from_tag}' has a value and at least one value in "
            f" {self.potential_subtract_tags} is present."
        )

Ancestors

Methods

def apply(self, data_df: pandas.core.frame.DataFrame, mask: pandera.typing.pandas.Series[bool]) ‑> pandas.core.frame.DataFrame

Apply the rule on the provided dataframe. The rows on which the rule has to be applied are defined by the provided mask Series.

Important, the rules have to be applied "in-place", so no new dataframe is produced.

Args

data_df dataframe on which the rule has to be applied
mask: a Series marking the rows in the dataframe on which the rule has to be applied

Returns

pd.DataFrame: make the process chainable

Expand source code

def apply(self, data_df: pd.DataFrame, mask: pa.typing.Series[bool]) -> pd.DataFrame:
    """
    Compute 'subtract_from_tag' minus every available subtract tag and store the
    result in the target tag, for the rows selected by the mask.

    The dataframe is modified "in-place", so no new dataframe is produced.

    Args:
        data_df: dataframe on which the rule has to be applied
        mask: a Series marking the rows in the dataframe on which the rule has to be applied
    Returns:
        pd.DataFrame: the dataframe that was passed in, to make the process chainable
    """
    target = self.target_tag

    # start with the value from which the subtrahends are deducted
    data_df.loc[mask, target] = data_df[self.subtract_from_tag]

    for subtrahend in self.potential_subtract_tags:
        # nan subtrahends are skipped, so they do not turn the result into nan
        rows = mask & data_df[subtrahend].notna()
        data_df.loc[rows, target] = data_df[target] - data_df[subtrahend]
    return data_df

def get_input_tags(self) ‑> Set[str]

return all tags that the rules within this group need.

Returns

Set[str]: set of all input tags that are used by rules in this group

Expand source code

def get_input_tags(self) -> Set[str]:
    """
    Return the tags (columns) this rule reads.

    Returns:
        Set[str]: set containing the target tag, the subtract_from tag and all
                  potential subtract tags
    """
    return {self.target_tag, self.subtract_from_tag, *self.potential_subtract_tags}

def mask(self, data_df: pandas.core.frame.DataFrame) ‑> pandera.typing.pandas.Series[bool]

returns a Series[bool] which defines the rows to which this rule has to be applied.

Args

data_df dataframe on which the rules should be applied

Returns

pa.typing.Series[bool]: a boolean Series that marks which rows have to be calculated

Expand source code

def mask(self, data_df: pd.DataFrame) -> pa.typing.Series[bool]:
    """
    Select the rows in which the target is nan, the subtract_from tag has a value
    and at least one of the potential subtract tags has a value.

    Args:
        data_df: dataframe on which the rules should be applied

    Returns:
        pa.typing.Series[bool]: a boolean Series that marks which rows have to be calculated
    """
    any_subtrahend_present = data_df[self.potential_subtract_tags].notna().any(axis=1)
    target_missing = data_df[self.target_tag].isna()
    minuend_present = data_df[self.subtract_from_tag].notna()

    return target_missing & minuend_present & any_subtrahend_present

Inherited members

Rule:
- collect_description
- get_description
- get_mask
- get_target_tags
- get_target_tags_str
- process
- set_id

class SumUpRule (sum_tag: str, potential_summands: List[str], optional_summands=None)

Sums up the available summands into a target column if the target column is not set.

Args

sum_tag: the tag containing the sum
potential_summands: potential tags with values that have to be summed up
optional_summands: optional summands are only added if at least one potential summand is present

Expand source code

class SumUpRule(Rule):
    """Sums up the available summands into a target column if the target column is not set"""

    def __init__(self, sum_tag: str, potential_summands: List[str],
                 optional_summands=None):
        """
        Args:
            sum_tag: the tag containing the sum
            potential_summands: potential tags with values that have to be summed up
            optional_summands: optional summands are only added if at least one potential summand
                               is present
        """
        if optional_summands is None:
            optional_summands = []

        self.sum_tag = sum_tag
        self.potential_summands = potential_summands
        self.optional_summands = optional_summands
        self.all_summands = potential_summands + optional_summands

    def get_target_tags(self) -> List[str]:
        """
        returns a list of tags that could be affected/changed by this rule

        Returns:
            List[str]: list with affected tags/columns

        """
        return [self.sum_tag]

    def get_input_tags(self) -> Set[str]:
        """
        return all tags that this rule reads.

        Returns:
            Set[str]: set with the sum tag, the potential summands and the optional summands
        """
        result = {self.sum_tag}
        result.update(self.potential_summands)
        result.update(self.optional_summands)
        return result

    def mask(self, data_df: pd.DataFrame) -> pa.typing.Series[bool]:
        """
            returns a Series[bool] which defines the rows to which this rule has to be applied.

        Args:
            data_df: dataframe on which the rules should be applied

        Returns:
            pa.typing.Series[bool]: a boolean Series that marks which rows have to be calculated
        """

        # mask if the target was not set and at least one of the summands is present.
        # only potential summands are relevant to mask which entries are selected
        mask_summands = (~data_df[self.potential_summands].isna()).any(axis=1)

        return data_df[self.sum_tag].isna() & mask_summands

    def apply(self, data_df: pd.DataFrame, mask: pa.typing.Series[bool]) -> pd.DataFrame:
        """
        apply the rule on the provided dataframe. the rows, on which the rule has to be applied
        are defined by the provided mask Series.

        Important, the rules have to be applied "in-place", so no new dataframe is produced.

        Args:
            data_df: dataframe on which the rule has to be applied
            mask: a Series marking the rows in the dataframe on which the rule has to be applied
        Returns:
            pd.DataFrame: make the process chainable
        """
        data_df.loc[mask, self.sum_tag] = 0.0  # initialize

        for summand in self.all_summands:
            # nan summands are skipped, so they do not turn the sum into nan
            summand_mask = mask & ~data_df[summand].isna()
            data_df.loc[summand_mask, self.sum_tag] = data_df[self.sum_tag] + data_df[
                summand]

        return data_df

    def get_description(self) -> str:
        """
        Returns the description String
        Returns:
            str: description
        """

        if len(self.optional_summands) > 0:
            return f"Sums up the available values in the columns {self.potential_summands} and" \
                   f" {self.optional_summands} into the column '{self.sum_tag}'. Values from " \
                   f"{self.optional_summands} are only added, if at least one value in " \
                   f" {self.potential_summands} is present. '{self.sum_tag}' must be nan."

        return f"Sums up the available values in the columns {self.potential_summands} into" \
               f" the column '{self.sum_tag}', if the column '{self.sum_tag}' is nan"

Ancestors

Methods

def apply(self, data_df: pandas.core.frame.DataFrame, mask: pandera.typing.pandas.Series[bool]) ‑> pandas.core.frame.DataFrame

Apply the rule on the provided dataframe. The rows on which the rule has to be applied are defined by the provided mask Series.

Important, the rules have to be applied "in-place", so no new dataframe is produced.

Args

data_df dataframe on which the rule has to be applied
mask: a Series marking the rows in the dataframe on which the rule has to be applied

Returns

pd.DataFrame: make the process chainable

Expand source code

def apply(self, data_df: pd.DataFrame, mask: pa.typing.Series[bool]) -> pd.DataFrame:
    """
    Add up every available summand and store the total in the sum tag,
    for the rows selected by the mask.

    The dataframe is modified "in-place", so no new dataframe is produced.

    Args:
        data_df: dataframe on which the rule has to be applied
        mask: a Series marking the rows in the dataframe on which the rule has to be applied
    Returns:
        pd.DataFrame: the dataframe that was passed in, to make the process chainable
    """
    total = self.sum_tag

    # the running total starts at zero for the selected rows
    data_df.loc[mask, total] = 0.0

    for column in self.all_summands:
        # nan summands are skipped, so they do not turn the sum into nan
        rows = mask & data_df[column].notna()
        data_df.loc[rows, total] = data_df[total] + data_df[column]

    return data_df

def get_input_tags(self) ‑> Set[str]

return all tags that the rules within this group need.

Returns

Set[str]: set of all input tags that are used by rules in this group

Expand source code

def get_input_tags(self) -> Set[str]:
    """
    Return the tags (columns) this rule reads.

    Returns:
        Set[str]: set containing the sum tag, the potential summands and the optional summands
    """
    return {self.sum_tag, *self.potential_summands, *self.optional_summands}

def mask(self, data_df: pandas.core.frame.DataFrame) ‑> pandera.typing.pandas.Series[bool]

returns a Series[bool] which defines the rows to which this rule has to be applied.

Args

data_df dataframe on which the rules should be applied

Returns

pa.typing.Series[bool]: a boolean Series that marks which rows have to be calculated

Expand source code

def mask(self, data_df: pd.DataFrame) -> pa.typing.Series[bool]:
    """
    Select the rows in which the sum tag is nan and at least one of the
    potential summands has a value.

    Optional summands are not considered when selecting the rows.

    Args:
        data_df: dataframe on which the rules should be applied

    Returns:
        pa.typing.Series[bool]: a boolean Series that marks which rows have to be calculated
    """
    any_summand_present = data_df[self.potential_summands].notna().any(axis=1)
    sum_missing = data_df[self.sum_tag].isna()

    return sum_missing & any_summand_present

Inherited members

Rule:
- collect_description
- get_description
- get_mask
- get_target_tags
- get_target_tags_str
- process
- set_id