Module secfsdstools.f_standardize.base_prepivot_rules
Contains PrePivotRules Definitions.
Expand source code
"""
Contains PrePivotRules Definitions.
"""
from typing import List
import pandas as pd
import pandera as pa
from secfsdstools.f_standardize.base_rule_framework import PrePivotRule
class PrePivotDeduplicate(PrePivotRule):
    """
    Deduplicates the dataset based on the index_cols that are defined in the base class.

    Sometimes only single tags are duplicated; however, there are also reports
    where all tags of a report are duplicated.
    Without deduplication, the pivot command will fail.
    """

    def __init__(self):
        super().__init__("DeDup")

    def mask(self, data_df: pd.DataFrame) -> pa.typing.Series[bool]:
        """
        returns a Series[bool] which marks the rows that are duplicates with regard
        to the index_cols and therefore have to be removed.

        Args:
            data_df: dataframe on which the rules should be applied

        Returns:
            pa.typing.Series[bool]: a boolean Series that marks the duplicated rows
        """
        # duplicated() keeps the first occurrence unmarked by default, so one row
        # per index_cols combination survives the deduplication
        return data_df.duplicated(self.index_cols)

    def apply(self, data_df: pd.DataFrame, mask: pa.typing.Series[bool]) -> pd.DataFrame:
        """
        removes the rows marked by the mask from the provided dataframe.

        Important, the rules have to be applied "in-place", so no new dataframe is produced.

        Args:
            data_df: dataframe on which the rule has to be applied
            mask: a Series marking the rows in the dataframe that have to be dropped

        Returns:
            pd.DataFrame: the same (modified) dataframe, to make the process chainable
        """
        # rows are dropped by index label
        duplicate_labels = data_df[mask].index
        data_df.drop(duplicate_labels, inplace=True)
        return data_df

    def get_description(self) -> str:
        """
        Returns the description String

        Returns:
            str: description
        """
        # report exactly the columns that mask() uses to detect duplicates
        return f"Deduplicates the dataframe based on the columns {self.index_cols}"
class PrePivotCorrectSign(PrePivotRule):
    """
    Certain tags are expected to be either positive or negative when present, resp. should
    be displayed the same way in all reports
    (e.g. Assets in the BS is always shown as a positive number).

    However, sometimes these values are mixed up, also when considering the "negating" flag.
    E.g. CostOfGoodAndServices is displayed positive in 95% of the cases and
    in 5% of the cases it is displayed "positive" but with the negating flag set.

    This rule ensures that the provided tags have the expected sign.
    """

    def __init__(self, tag_list: List[str], is_positive: bool):
        """
        ensure that the values of the tags in the tag_list are positive (if is_positive is true),
        or are negative (if is_positive is false).

        Args:
            tag_list: list with the names of tag that have to be checked
            is_positive: flag that indicates whether positive or negative values are expected
        """
        super().__init__("CorSign")
        self.tag_list = tag_list
        self.is_positive = is_positive

    def mask(self, data_df: pd.DataFrame) -> pa.typing.Series[bool]:
        """
        returns a Series[bool] which marks the rows whose value has the wrong sign.

        Args:
            data_df: dataframe on which the rules should be applied

        Returns:
            pa.typing.Series[bool]: a boolean Series that marks which rows have to be corrected
        """
        values = data_df.value

        is_checked_tag = data_df.tag.isin(self.tag_list)
        # values that are not set or that are 0 are never considered
        is_set = values.notna()
        is_nonzero = values != 0.0
        # a negative value is wrong if positive is expected and vice versa
        has_wrong_sign = (values < 0) == self.is_positive

        return is_checked_tag & is_set & is_nonzero & has_wrong_sign

    def apply(self, data_df: pd.DataFrame, mask: pa.typing.Series[bool]) -> pd.DataFrame:
        """
        flips the sign of the 'value' column for the rows marked by the mask.

        Important, the rules have to be applied "in-place", so no new dataframe is produced.

        Args:
            data_df: dataframe on which the rule has to be applied
            mask: a Series marking the rows in the dataframe on which the rule has to be applied

        Returns:
            pd.DataFrame: the same (modified) dataframe, to make the process chainable
        """
        flipped = data_df.value * -1
        # only the masked rows receive the flipped value
        data_df.loc[mask, 'value'] = flipped
        return data_df

    def get_description(self) -> str:
        """
        Returns the description String

        Returns:
            str: description
        """
        expected_sign = "positive" if self.is_positive else "negative"
        return (
            f"Ensures that the tags {self.tag_list} have a {expected_sign} value. Applied when "
            "the expectation of having a negative or positive value is not met"
        )
class PrePivotMaxQtrs(PrePivotRule):
    """
    Keeps only the entries whose qtrs value is equal to or below the configured max_qtrs;
    entries with a bigger qtrs value are removed.
    """

    def __init__(self, max_qtrs: int = 4):
        """
        Args:
            max_qtrs: the highest qtrs value an entry may have in order to be kept
        """
        super().__init__("MaxQtr")
        self.max_qtrs = max_qtrs

    def mask(self, data_df: pd.DataFrame) -> pa.typing.Series[bool]:
        """
        returns a Series[bool] which marks the rows that are kept by this rule,
        i.e. the rows with qtrs <= max_qtrs.

        Args:
            data_df: dataframe on which the rules should be applied

        Returns:
            pa.typing.Series[bool]: a boolean Series that marks the rows to keep
        """
        return data_df.qtrs <= self.max_qtrs

    def apply(self, data_df: pd.DataFrame, mask: pa.typing.Series[bool]) -> pd.DataFrame:
        """
        removes all rows that are not marked by the mask from the provided dataframe.

        Important, the rules have to be applied "in-place", so no new dataframe is produced.

        Args:
            data_df: dataframe on which the rule has to be applied
            mask: a Series marking the rows in the dataframe that have to be kept

        Returns:
            pd.DataFrame: the same (modified) dataframe, to make the process chainable
        """
        # Returning data_df[mask] would create a new dataframe and leave the passed
        # one untouched; drop the rejected rows in place instead (by index label,
        # like PrePivotDeduplicate does).
        rejected_labels = data_df[~mask].index
        data_df.drop(rejected_labels, inplace=True)
        return data_df

    def get_description(self) -> str:
        """
        Returns the description String

        Returns:
            str: description
        """
        return f"Removes the entries that have a bigger qtrs value than {self.max_qtrs}"
Classes
class PrePivotCorrectSign (tag_list: List[str], is_positive: bool)
-
Certain tags are expected to be either positive or negative when present, resp. should in all reports be displayed the same way: (e.g. Assets in the BS is always shown as a positive number)
However, sometimes these values are mixed up, also when considering the "negating" flag.
E.g. CostOfGoodAndServices is displayed positive in 95% of the cases and in 5% of the cases it is displayed "positive" but with the negating flag set.
This rule ensures that the provided tags have the expected sign.
ensure that the values of the tags in the tag_list are positive (if is_positive is true), or are negative (if is_positive is false).
Args
tag_list
- list with the names of tag that have to be checked
is_positive
- flag that indicates whether positive or negative values are expected
Expand source code
class PrePivotCorrectSign(PrePivotRule):
    """
    Certain tags are expected to be either positive or negative when present, resp. should
    be displayed the same way in all reports
    (e.g. Assets in the BS is always shown as a positive number).

    However, sometimes these values are mixed up, also when considering the "negating" flag.
    E.g. CostOfGoodAndServices is displayed positive in 95% of the cases and
    in 5% of the cases it is displayed "positive" but with the negating flag set.

    This rule ensures that the provided tags have the expected sign.
    """

    def __init__(self, tag_list: List[str], is_positive: bool):
        """
        ensure that the values of the tags in the tag_list are positive (if is_positive is true),
        or are negative (if is_positive is false).

        Args:
            tag_list: list with the names of tag that have to be checked
            is_positive: flag that indicates whether positive or negative values are expected
        """
        super().__init__("CorSign")
        self.tag_list = tag_list
        self.is_positive = is_positive

    def mask(self, data_df: pd.DataFrame) -> pa.typing.Series[bool]:
        """
        returns a Series[bool] which marks the rows whose value has the wrong sign.

        Args:
            data_df: dataframe on which the rules should be applied

        Returns:
            pa.typing.Series[bool]: a boolean Series that marks which rows have to be corrected
        """
        values = data_df.value

        is_checked_tag = data_df.tag.isin(self.tag_list)
        # values that are not set or that are 0 are never considered
        is_set = values.notna()
        is_nonzero = values != 0.0
        # a negative value is wrong if positive is expected and vice versa
        has_wrong_sign = (values < 0) == self.is_positive

        return is_checked_tag & is_set & is_nonzero & has_wrong_sign

    def apply(self, data_df: pd.DataFrame, mask: pa.typing.Series[bool]) -> pd.DataFrame:
        """
        flips the sign of the 'value' column for the rows marked by the mask.

        Important, the rules have to be applied "in-place", so no new dataframe is produced.

        Args:
            data_df: dataframe on which the rule has to be applied
            mask: a Series marking the rows in the dataframe on which the rule has to be applied

        Returns:
            pd.DataFrame: the same (modified) dataframe, to make the process chainable
        """
        flipped = data_df.value * -1
        # only the masked rows receive the flipped value
        data_df.loc[mask, 'value'] = flipped
        return data_df

    def get_description(self) -> str:
        """
        Returns the description String

        Returns:
            str: description
        """
        expected_sign = "positive" if self.is_positive else "negative"
        return (
            f"Ensures that the tags {self.tag_list} have a {expected_sign} value. Applied when "
            "the expectation of having a negative or positive value is not met"
        )
Ancestors
- PrePivotRule
- AbstractRule
- RuleEntity
- abc.ABC
Class variables
var identifier : str
Methods
def apply(self, data_df: pandas.core.frame.DataFrame, mask: pandera.typing.pandas.Series[bool]) ‑> pandas.core.frame.DataFrame
-
apply the rule on the provided dataframe. the rows, on which the rule has to be applied, are defined by the provided mask Series.
Important, the rules have to be applied "in-place", so no new dataframe is produced.
Args
data_df
- dataframe on which the rule has to be applied
mask
- a Series marking the rows in the dataframe on which the rule has to be applied
Returns
pd.DataFrame
- make the process chainable
Expand source code
def apply(self, data_df: pd.DataFrame, mask: pa.typing.Series[bool]) -> pd.DataFrame:
    """
    flips the sign of the 'value' column for the rows marked by the mask.

    Important, the rules have to be applied "in-place", so no new dataframe is produced.

    Args:
        data_df: dataframe on which the rule has to be applied
        mask: a Series marking the rows in the dataframe on which the rule has to be applied

    Returns:
        pd.DataFrame: the same (modified) dataframe, to make the process chainable
    """
    flipped = data_df.value * -1
    # only the masked rows receive the flipped value
    data_df.loc[mask, 'value'] = flipped
    return data_df
Inherited members
class PrePivotDeduplicate
-
Deduplicates the dataset based on the index_cols that are defined in the base class.
sometimes, only single tags are duplicated, however, there are also reports where all tags of a report are duplicated.
without deduplication, the pivot command will fail.
Expand source code
class PrePivotDeduplicate(PrePivotRule):
    """
    Deduplicates the dataset based on the index_cols that are defined in the base class.

    Sometimes only single tags are duplicated; however, there are also reports
    where all tags of a report are duplicated.
    Without deduplication, the pivot command will fail.
    """

    def __init__(self):
        super().__init__("DeDup")

    def mask(self, data_df: pd.DataFrame) -> pa.typing.Series[bool]:
        """
        returns a Series[bool] which marks the rows that are duplicates with regard
        to the index_cols and therefore have to be removed.

        Args:
            data_df: dataframe on which the rules should be applied

        Returns:
            pa.typing.Series[bool]: a boolean Series that marks the duplicated rows
        """
        # duplicated() keeps the first occurrence unmarked by default, so one row
        # per index_cols combination survives the deduplication
        return data_df.duplicated(self.index_cols)

    def apply(self, data_df: pd.DataFrame, mask: pa.typing.Series[bool]) -> pd.DataFrame:
        """
        removes the rows marked by the mask from the provided dataframe.

        Important, the rules have to be applied "in-place", so no new dataframe is produced.

        Args:
            data_df: dataframe on which the rule has to be applied
            mask: a Series marking the rows in the dataframe that have to be dropped

        Returns:
            pd.DataFrame: the same (modified) dataframe, to make the process chainable
        """
        # rows are dropped by index label
        duplicate_labels = data_df[mask].index
        data_df.drop(duplicate_labels, inplace=True)
        return data_df

    def get_description(self) -> str:
        """
        Returns the description String

        Returns:
            str: description
        """
        # report exactly the columns that mask() uses to detect duplicates
        return f"Deduplicates the dataframe based on the columns {self.index_cols}"
Ancestors
- PrePivotRule
- AbstractRule
- RuleEntity
- abc.ABC
Class variables
var identifier : str
Methods
def apply(self, data_df: pandas.core.frame.DataFrame, mask: pandera.typing.pandas.Series[bool]) ‑> pandas.core.frame.DataFrame
-
apply the rule on the provided dataframe. the rows, on which the rule has to be applied, are defined by the provided mask Series.
Important, the rules have to be applied "in-place", so no new dataframe is produced.
Args
data_df
- dataframe on which the rule has to be applied
mask
- a Series marking the rows in the dataframe on which the rule has to be applied
Returns
pd.DataFrame
- make the process chainable
Expand source code
def apply(self, data_df: pd.DataFrame, mask: pa.typing.Series[bool]) -> pd.DataFrame:
    """
    removes the rows marked by the mask from the provided dataframe.

    Important, the rules have to be applied "in-place", so no new dataframe is produced.

    Args:
        data_df: dataframe on which the rule has to be applied
        mask: a Series marking the rows in the dataframe that have to be dropped

    Returns:
        pd.DataFrame: the same (modified) dataframe, to make the process chainable
    """
    # rows are dropped by index label
    duplicate_labels = data_df[mask].index
    data_df.drop(duplicate_labels, inplace=True)
    return data_df
Inherited members
class PrePivotMaxQtrs (max_qtrs: int = 4)
-
Keeps only the entries whose qtrs value is equal to or below the configured max_qtrs; entries with a bigger qtrs value are removed.
Expand source code
class PrePivotMaxQtrs(PrePivotRule):
    """
    Keeps only the entries whose qtrs value is equal to or below the configured max_qtrs;
    entries with a bigger qtrs value are removed.
    """

    def __init__(self, max_qtrs: int = 4):
        """
        Args:
            max_qtrs: the highest qtrs value an entry may have in order to be kept
        """
        super().__init__("MaxQtr")
        self.max_qtrs = max_qtrs

    def mask(self, data_df: pd.DataFrame) -> pa.typing.Series[bool]:
        """
        returns a Series[bool] which marks the rows that are kept by this rule,
        i.e. the rows with qtrs <= max_qtrs.

        Args:
            data_df: dataframe on which the rules should be applied

        Returns:
            pa.typing.Series[bool]: a boolean Series that marks the rows to keep
        """
        return data_df.qtrs <= self.max_qtrs

    def apply(self, data_df: pd.DataFrame, mask: pa.typing.Series[bool]) -> pd.DataFrame:
        """
        removes all rows that are not marked by the mask from the provided dataframe.

        Important, the rules have to be applied "in-place", so no new dataframe is produced.

        Args:
            data_df: dataframe on which the rule has to be applied
            mask: a Series marking the rows in the dataframe that have to be kept

        Returns:
            pd.DataFrame: the same (modified) dataframe, to make the process chainable
        """
        # Returning data_df[mask] would create a new dataframe and leave the passed
        # one untouched; drop the rejected rows in place instead (by index label,
        # like PrePivotDeduplicate does).
        rejected_labels = data_df[~mask].index
        data_df.drop(rejected_labels, inplace=True)
        return data_df

    def get_description(self) -> str:
        """
        Returns the description String

        Returns:
            str: description
        """
        return f"Removes the entries that have a bigger qtrs value than {self.max_qtrs}"
Ancestors
- PrePivotRule
- AbstractRule
- RuleEntity
- abc.ABC
Class variables
var identifier : str
Methods
def apply(self, data_df: pandas.core.frame.DataFrame, mask: pandera.typing.pandas.Series[bool]) ‑> pandas.core.frame.DataFrame
-
apply the rule on the provided dataframe. the rows, on which the rule has to be applied, are defined by the provided mask Series.
Important, the rules have to be applied "in-place", so no new dataframe is produced.
Args
data_df
- dataframe on which the rule has to be applied
mask
- a Series marking the rows in the dataframe on which the rule has to be applied
Returns
pd.DataFrame
- make the process chainable
Expand source code
def apply(self, data_df: pd.DataFrame, mask: pa.typing.Series[bool]) -> pd.DataFrame:
    """
    removes all rows that are not marked by the mask from the provided dataframe.

    Important, the rules have to be applied "in-place", so no new dataframe is produced.

    Args:
        data_df: dataframe on which the rule has to be applied
        mask: a Series marking the rows in the dataframe that have to be kept

    Returns:
        pd.DataFrame: the same (modified) dataframe, to make the process chainable
    """
    # Returning data_df[mask] would create a new dataframe and leave the passed
    # one untouched; drop the rejected rows in place instead, by index label.
    rejected_labels = data_df[~mask].index
    data_df.drop(rejected_labels, inplace=True)
    return data_df
Inherited members