Module secfsdstools.u_usecases.analyzes

Provides some helper methods to analyze the data in the bags.

Expand source code
"""
Provides some helper methods to analyze the data in the bags.
"""
from typing import List

import pandas as pd

from secfsdstools.d_container.databagmodel import JoinedDataBag


def find_adshs_with_all_tags(bag: JoinedDataBag, tag_list: List[str]) -> List[str]:
    """
    Returns a list with the adshs which contain all the tags mentioned in the tag_list.

    Args:
        bag: the bag whose ``pre_num_df`` is searched; it has to provide
            the columns 'adsh' and 'tag'
        tag_list: the tags that all have to be present for an adsh.
            Repeated entries are counted only once.

    Returns:
        List[str]: the adsh values (ordered by the groupby on 'adsh') for which
        every tag of tag_list appears at least once. An empty tag_list
        results in an empty list.

    """
    pre_num_df = bag.pre_num_df
    filtered_tags_df = pre_num_df[pre_num_df.tag.isin(tag_list)]

    # a tag usually appears in several rows of the same adsh,
    # so reduce the data to distinct (adsh, tag) pairs before counting
    unique_pairs_df = filtered_tags_df[['adsh', 'tag']].drop_duplicates()
    tags_per_adsh = unique_pairs_df.groupby('adsh').tag.count()

    # only adshs that matched every distinct requested tag are kept
    expected_count = len(set(tag_list))
    return tags_per_adsh[tags_per_adsh == expected_count].index.tolist()


def find_tags_containing(bag: JoinedDataBag, contains: str) -> pd.Series:
    """
    Returns the value counts of all tags that contain the provided string.

    Args:
        bag: the bag to check; its ``pre_num_df`` has to provide the column 'tag'
        contains: text that should be contained in the tag name. It is handed
            to ``Series.str.contains`` unchanged, so it is interpreted as a
            regular expression (the pandas default).

    Returns:
        pd.Series: the result of ``value_counts()`` on the matching rows, i.e.
        the tag names as index and the number of occurrences as values

    """
    # na=False: rows without a tag value count as "no match". Without it
    # the mask contains NaN and cannot be used for boolean indexing.
    matches = bag.pre_num_df.tag.str.contains(contains, na=False)
    filtered_df = bag.pre_num_df[matches]
    return filtered_df.tag.value_counts()


def count_tags(bag: JoinedDataBag) -> pd.DataFrame:
    """
    Counts how often every tag is present in the bag.

    In addition to the absolute count, a relative number is calculated: the count
    divided by the number of unique statements in the bag (unique combinations
    of ['adsh', 'stmt', 'coreg', 'report', 'ddate', 'uom', 'qtrs']).

    Args:
        bag: the bag to check

    Returns:
        pd.DataFrame: a Dataframe with the columns 'tag', 'count' and 'rel',
        in the order produced by value_counts()

    """
    statement_key = ['adsh', 'stmt', 'coreg', 'report', 'ddate', 'uom', 'qtrs']
    source_df = bag.pre_num_df

    # denominator for the relative number: how many distinct statements exist
    number_of_statements = source_df[statement_key].drop_duplicates().shape[0]

    tag_counts = source_df.tag.value_counts()
    result_df = tag_counts.reset_index()
    result_df.columns = ['tag', 'count']
    result_df['rel'] = result_df['count'] / number_of_statements

    return result_df


def reports_using_tags_count(bag: JoinedDataBag, used_tags: List[str]) -> pd.DataFrame:
    """
    Counts for every report how many of the provided tags it uses.

    Args:
        bag: the bag to check; its ``pre_num_df`` has to provide
            the columns 'adsh' and 'tag'
        used_tags: the tags to look for

    Returns:
        pd.DataFrame: a Dataframe with 'adsh' as index and a single column 'tag',
        which holds the number of distinct tags from used_tags that appear
        in that report. Reports that use none of the tags are not contained.

    """
    adsh_tag_df = bag.pre_num_df[['adsh', 'tag']]
    uses_wanted_tag = adsh_tag_df.tag.isin(used_tags)

    # every tag shall be counted only once per report
    distinct_pairs_df = adsh_tag_df[uses_wanted_tag].drop_duplicates()

    return distinct_pairs_df.groupby('adsh').count()


def reports_using_all(bag: JoinedDataBag, used_tags: List[str]) -> List[str]:
    """
    Returns a list with the adsh numbers of the reports that use all the provided tags.

    Args:
        bag: the bag to check
        used_tags: the tags that all have to be used by a report.
            Repeated entries are counted only once.

    Returns:
        List[str]: the adsh numbers of the reports which use every tag of used_tags

    """
    counted_df = reports_using_tags_count(bag=bag, used_tags=used_tags)

    # the helper counts distinct tags per report, so the comparison has to be done
    # against the number of distinct requested tags; a repeated entry in used_tags
    # would otherwise make a match impossible
    expected_count = len(set(used_tags))
    filtered_df = counted_df[counted_df.tag == expected_count]

    return filtered_df.index.to_list()

Functions

def count_tags(bag: JoinedDataBag) ‑> pandas.core.frame.DataFrame

returns a value counts of all tags that are present in the bag. gives also a relative number to the number of unique statements in the bag (unique combinations of ['adsh', 'stmt', 'coreg', 'report', 'ddate', 'uom', 'qtrs']).

Args

bag
the bag to check

Returns

pd.DataFrame
a Dataframe with the tagname and the value_counts as columns
Expand source code
def count_tags(bag: JoinedDataBag) -> pd.DataFrame:
    """
    Counts how often every tag is present in the bag.

    In addition to the absolute count, a relative number is calculated: the count
    divided by the number of unique statements in the bag (unique combinations
    of ['adsh', 'stmt', 'coreg', 'report', 'ddate', 'uom', 'qtrs']).

    Args:
        bag: the bag to check

    Returns:
        pd.DataFrame: a Dataframe with the columns 'tag', 'count' and 'rel',
        in the order produced by value_counts()

    """
    statement_key = ['adsh', 'stmt', 'coreg', 'report', 'ddate', 'uom', 'qtrs']
    source_df = bag.pre_num_df

    # denominator for the relative number: how many distinct statements exist
    number_of_statements = source_df[statement_key].drop_duplicates().shape[0]

    tag_counts = source_df.tag.value_counts()
    result_df = tag_counts.reset_index()
    result_df.columns = ['tag', 'count']
    result_df['rel'] = result_df['count'] / number_of_statements

    return result_df
def find_adshs_with_all_tags(bag: JoinedDataBag, tag_list: List[str]) ‑> List[str]

Returns a list with adshs, which contain all the tags mentioned in the taglist

Args

bag: the bag to search. tag_list: the tags that all have to be present for an adsh.

Returns

List[str]: the adshs which contain all the tags of the tag list

Expand source code
def find_adshs_with_all_tags(bag: JoinedDataBag, tag_list: List[str]) -> List[str]:
    """
    Returns a list with the adshs which contain all the tags mentioned in the tag_list.

    Args:
        bag: the bag whose ``pre_num_df`` is searched; it has to provide
            the columns 'adsh' and 'tag'
        tag_list: the tags that all have to be present for an adsh.
            Repeated entries are counted only once.

    Returns:
        List[str]: the adsh values (ordered by the groupby on 'adsh') for which
        every tag of tag_list appears at least once. An empty tag_list
        results in an empty list.

    """
    pre_num_df = bag.pre_num_df
    filtered_tags_df = pre_num_df[pre_num_df.tag.isin(tag_list)]

    # a tag usually appears in several rows of the same adsh,
    # so reduce the data to distinct (adsh, tag) pairs before counting
    unique_pairs_df = filtered_tags_df[['adsh', 'tag']].drop_duplicates()
    tags_per_adsh = unique_pairs_df.groupby('adsh').tag.count()

    # only adshs that matched every distinct requested tag are kept
    expected_count = len(set(tag_list))
    return tags_per_adsh[tags_per_adsh == expected_count].index.tolist()
def find_tags_containing(bag: JoinedDataBag, contains: str) ‑> pandas.core.frame.DataFrame

Returns the value counts of all tags that contain the provided string.

Args

bag
the bag to check
contains
text that should be contained in the tag name

Returns

pd.Series
a Series with the tag names as index and their value counts as values
Expand source code
def find_tags_containing(bag: JoinedDataBag, contains: str) -> pd.Series:
    """
    Returns the value counts of all tags that contain the provided string.

    Args:
        bag: the bag to check; its ``pre_num_df`` has to provide the column 'tag'
        contains: text that should be contained in the tag name. It is handed
            to ``Series.str.contains`` unchanged, so it is interpreted as a
            regular expression (the pandas default).

    Returns:
        pd.Series: the result of ``value_counts()`` on the matching rows, i.e.
        the tag names as index and the number of occurrences as values

    """
    # na=False: rows without a tag value count as "no match". Without it
    # the mask contains NaN and cannot be used for boolean indexing.
    matches = bag.pre_num_df.tag.str.contains(contains, na=False)
    filtered_df = bag.pre_num_df[matches]
    return filtered_df.tag.value_counts()
def reports_using_all(bag: JoinedDataBag, used_tags: List[str]) ‑> List[str]

returns a list with adsh numbers that use all the provided tags.

Args

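bag: the bag to check. used_tags: the tags that all have to be used by a report. Returns: the adsh numbers of the reports that use all the provided tags.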

Expand source code
def reports_using_all(bag: JoinedDataBag, used_tags: List[str]) -> List[str]:
    """
    Returns a list with the adsh numbers of the reports that use all the provided tags.

    Args:
        bag: the bag to check
        used_tags: the tags that all have to be used by a report.
            Repeated entries are counted only once.

    Returns:
        List[str]: the adsh numbers of the reports which use every tag of used_tags

    """
    counted_df = reports_using_tags_count(bag=bag, used_tags=used_tags)

    # the helper counts distinct tags per report, so the comparison has to be done
    # against the number of distinct requested tags; a repeated entry in used_tags
    # would otherwise make a match impossible
    expected_count = len(set(used_tags))
    filtered_df = counted_df[counted_df.tag == expected_count]

    return filtered_df.index.to_list()
def reports_using_tags_count(bag: JoinedDataBag, used_tags: List[str]) ‑> pandas.core.frame.DataFrame

Returns a dataframe that counts the number of used tags within a report.

Args

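bag: the bag to check. used_tags: the tags to look for. Returns: a dataframe with the number of used tags per report (adsh).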

Expand source code
def reports_using_tags_count(bag: JoinedDataBag, used_tags: List[str]) -> pd.DataFrame:
    """
    Counts for every report how many of the provided tags it uses.

    Args:
        bag: the bag to check; its ``pre_num_df`` has to provide
            the columns 'adsh' and 'tag'
        used_tags: the tags to look for

    Returns:
        pd.DataFrame: a Dataframe with 'adsh' as index and a single column 'tag',
        which holds the number of distinct tags from used_tags that appear
        in that report. Reports that use none of the tags are not contained.

    """
    adsh_tag_df = bag.pre_num_df[['adsh', 'tag']]
    uses_wanted_tag = adsh_tag_df.tag.isin(used_tags)

    # every tag shall be counted only once per report
    distinct_pairs_df = adsh_tag_df[uses_wanted_tag].drop_duplicates()

    return distinct_pairs_df.groupby('adsh').count()