Module secfsdstools.d_container.databagmodel
Defines the container that keeps the data of sub.txt, num.txt, and pre.txt together.
Expand source code
"""
Defines the container that keeps the data of sub.txt, num.txt, and  pre.txt together.
"""
import os
from dataclasses import dataclass
from typing import Dict, List, TypeVar, Generic
import pandas as pd
from secfsdstools.a_utils.constants import SUB_TXT, PRE_TXT, NUM_TXT, PRE_NUM_TXT
from secfsdstools.d_container.filter import FilterBase
from secfsdstools.d_container.presentation import Presenter
RAW = TypeVar('RAW', bound='RawDataBag')
JOINED = TypeVar('JOINED', bound='JoinedDataBag')
T = TypeVar('T')
class DataBagBase(Generic[T]):
    """
    Common behaviour shared by the DataBag containers.

    A bag neither filters nor presents itself: it hands itself over to the
    filter or presenter object it receives and returns whatever that object
    produces.
    """

    def __getitem__(self, bagfilter: FilterBase[T]) -> T:
        """
        index-syntax shortcut for filter(), so that filters can be chained:
        bag[filter1][filter2] gives the same result as
        bag.filter(filter1).filter(filter2)

        Args:
            bagfilter: the filter to be applied

        Returns:
            T: the result of calling filter() with the given filter
        """
        filtered_bag: T = self.filter(bagfilter)
        return filtered_bag

    def filter(self, bagfilter: FilterBase[T]) -> T:
        """
        passes this bag to the given filter and returns the filter's result.
        The same can be written with the "index" syntax:
        bag[filter1][filter2] gives the same result as
        bag.filter(filter1).filter(filter2)

        Args:
            bagfilter: the filter to be applied

        Returns:
            T: the object returned by bagfilter.filter(); expected to be a
               databag with the filtered content
        """
        filtered_bag: T = bagfilter.filter(self)
        return filtered_bag

    def present(self, presenter: Presenter[T]) -> pd.DataFrame:
        """
        passes this bag to the given presenter and returns the presenter's result.

        Args:
            presenter: the presenter to be applied

        Returns:
            pd.DataFrame: the object returned by presenter.present()
        """
        presentation: pd.DataFrame = presenter.present(self)
        return presentation
class JoinedDataBag(DataBagBase[JOINED]):
    """
    the DataBag in which the pre.txt and the num.txt are joined based on the
    adsh, tag, and version.

    The bag holds two dataframes: sub_df (content of sub.txt) and pre_num_df
    (the joined content of pre.txt and num.txt).
    """

    @classmethod
    def create(cls, sub_df: pd.DataFrame, pre_num_df: pd.DataFrame) -> JOINED:
        """
        create a new JoinedDataBag.

        Args:
            sub_df: sub.txt dataframe
            pre_num_df: joined pre.txt and num.txt dataframe

        Returns:
            JoinedDataBag: new instance of the class the method was called on
        """
        # use cls instead of the hard-coded class name, so that calling create
        # on a subclass produces an instance of that subclass
        return cls(sub_df=sub_df, pre_num_df=pre_num_df)

    def __init__(self, sub_df: pd.DataFrame, pre_num_df: pd.DataFrame):
        """
        constructor. The dataframes are stored as they are, no copies are made.

        Args:
            sub_df: sub.txt dataframe
            pre_num_df: joined pre.txt and num.txt dataframe
        """
        self.sub_df = sub_df
        self.pre_num_df = pre_num_df

    def get_sub_copy(self) -> pd.DataFrame:
        """
        Returns a copy of the sub dataframe.

        Returns:
            pd.DataFrame: copy of the sub dataframe.
        """
        return self.sub_df.copy()

    def get_pre_num_copy(self) -> pd.DataFrame:
        """
        Returns a copy of the joined pre_num dataframe.

        Returns:
            pd.DataFrame: copy of joined pre_num dataframe.
        """
        return self.pre_num_df.copy()

    def copy_bag(self) -> JOINED:
        """
        creates a bag with new copies of the internal dataframes.

        Returns:
            JoinedDataBag: new instance of JoinedDataBag
        """
        return JoinedDataBag.create(sub_df=self.sub_df.copy(),
                                    pre_num_df=self.pre_num_df.copy())

    def save(self, target_path: str):
        """
        Stores the bag under the given directory.
        The directory has to exist and must be empty.

        Args:
            target_path: the directory under which the parquet files for sub and pre_num
                  will be created

        Raises:
            ValueError: if target_path is not an existing directory or if it is not empty
        """
        if not os.path.isdir(target_path):
            raise ValueError(f"the path {target_path} does not exist")

        # an empty listing is falsy, any entry makes the directory "not empty"
        if os.listdir(target_path):
            raise ValueError(f"the target_path {target_path} is not empty")

        self.sub_df.to_parquet(os.path.join(target_path, f'{SUB_TXT}.parquet'))
        self.pre_num_df.to_parquet(os.path.join(target_path, f'{PRE_NUM_TXT}.parquet'))

    @staticmethod
    def load(target_path: str) -> JOINED:
        """
        Loads a bag from the parquet files in the specified directory.
        The file names are the ones that are used by save().

        Args:
            target_path: the directory which contains the parquet files for sub and pre_num

        Returns:
            JoinedDataBag: the loaded Databag
        """
        sub_df = pd.read_parquet(os.path.join(target_path, f'{SUB_TXT}.parquet'))
        pre_num_df = pd.read_parquet(os.path.join(target_path, f'{PRE_NUM_TXT}.parquet'))

        return JoinedDataBag.create(sub_df=sub_df, pre_num_df=pre_num_df)

    @staticmethod
    def concat(bags: List[JOINED]) -> JOINED:
        """
        Merges multiple Bags together into one bag.
        Note: merge does not check if DataBags with the same reports are merged together.

        Args:
            bags: List of bags to be merged, must contain at least one bag

        Returns:
            JoinedDataBag: a Bag with the merged content

        Raises:
            ValueError: if bags is empty
        """
        if not bags:
            # pd.concat would also raise a ValueError, but with a message that
            # doesn't point to the actual problem
            raise ValueError("bags must contain at least one DataBag")

        sub_dfs = [db.sub_df for db in bags]
        pre_num_dfs = [db.pre_num_df for db in bags]

        # ignore_index=True gives the merged dataframes a fresh, unique index
        # instead of repeating the index labels of every single bag
        # (same behavior as RawDataBag.concat)
        return JoinedDataBag.create(sub_df=pd.concat(sub_dfs, ignore_index=True),
                                    pre_num_df=pd.concat(pre_num_dfs, ignore_index=True))
@dataclass
class RawDataBagStats:
    """
    Contains simple statistics of a report.

    Instances are produced by RawDataBag.statistics().

    Attributes:
        num_entries: number of rows in the num dataframe
        pre_entries: number of rows in the pre dataframe
        number_of_reports: number of rows in the sub dataframe
        reports_per_form: value counts of the "form" column of the sub dataframe
        reports_per_period_date: value counts of the "period" column of the sub dataframe
    """
    num_entries: int
    pre_entries: int
    number_of_reports: int
    # key: value of the form column, value: number of rows with that form
    reports_per_form: Dict[str, int]
    # key: value of the period column (annotated as int), value: number of rows with that period
    reports_per_period_date: Dict[int, int]
class RawDataBag(DataBagBase[RAW]):
    """
    Container class to keep the data for sub.txt, pre.txt, and num.txt together.

    The bag holds three dataframes: sub_df, pre_df, and num_df.
    """

    @classmethod
    def create(cls, sub_df: pd.DataFrame, pre_df: pd.DataFrame, num_df: pd.DataFrame) -> RAW:
        """
        create method for RawDataBag

        Args:
            sub_df(pd.DataFrame): sub.txt dataframe
            pre_df(pd.DataFrame): pre.txt dataframe
            num_df(pd.DataFrame): num.txt dataframe

        Returns:
            RawDataBag: new instance of the class the method was called on
        """
        # use cls instead of the hard-coded class name, so that calling create
        # on a subclass produces an instance of that subclass
        return cls(sub_df=sub_df, pre_df=pre_df, num_df=num_df)

    def __init__(self, sub_df: pd.DataFrame, pre_df: pd.DataFrame, num_df: pd.DataFrame):
        """
        constructor. The dataframes are stored as they are, no copies are made.

        Args:
            sub_df: sub.txt dataframe
            pre_df: pre.txt dataframe
            num_df: num.txt dataframe
        """
        self.sub_df = sub_df
        self.pre_df = pre_df
        self.num_df = num_df

    def copy_bag(self) -> RAW:
        """
        creates a bag with new copies of the internal dataframes.

        Returns:
            RawDataBag: new instance of RawDataBag
        """
        return RawDataBag.create(sub_df=self.sub_df.copy(),
                                 pre_df=self.pre_df.copy(),
                                 num_df=self.num_df.copy())

    def get_sub_copy(self) -> pd.DataFrame:
        """
        Returns a copy of the sub.txt dataframe.

        Returns:
            pd.DataFrame: copy of the sub.txt dataframe.
        """
        return self.sub_df.copy()

    def get_pre_copy(self) -> pd.DataFrame:
        """
        Returns a copy of the pre.txt dataframe.

        Returns:
            pd.DataFrame: copy of the pre.txt dataframe.
        """
        return self.pre_df.copy()

    def get_num_copy(self) -> pd.DataFrame:
        """
        Returns a copy of the num.txt dataframe.

        Returns:
            pd.DataFrame: copy of the num.txt dataframe.
        """
        return self.num_df.copy()

    def join(self) -> JoinedDataBag:
        """
        merges the raw data of pre and num together.

        Note: the sub dataframe is handed over to the new bag as it is,
        it is not copied.

        Returns:
            JoinedDataBag: the DataBag where pre and num are merged
        """
        # pd.merge performs an inner join by default: only combinations of
        # adsh, tag, and version which are present in num and in pre are kept
        pre_num_df = pd.merge(self.num_df,
                              self.pre_df,
                              on=['adsh', 'tag', 'version'])

        return JoinedDataBag.create(sub_df=self.sub_df, pre_num_df=pre_num_df)

    def statistics(self) -> RawDataBagStats:
        """
        calculate a few simple statistics of a report.
        - number of entries in the num-file
        - number of entries in the pre-file
        - number of reports in the zip-file (equals number of entries in sub-file)
        - number of reports per form (10-K, 10-Q, ...)
        - number of reports per period date (counts per value in the period column of sub-file)

        Returns:
            RawDataBagStats: instance with basic report infos
        """
        num_entries = len(self.num_df)
        pre_entries = len(self.pre_df)
        number_of_reports = len(self.sub_df)
        reports_per_period_date: Dict[int, int] = self.sub_df.period.value_counts().to_dict()
        reports_per_form: Dict[str, int] = self.sub_df.form.value_counts().to_dict()

        return RawDataBagStats(num_entries=num_entries,
                               pre_entries=pre_entries,
                               number_of_reports=number_of_reports,
                               reports_per_form=reports_per_form,
                               reports_per_period_date=reports_per_period_date
                               )

    def save(self, target_path: str):
        """
        Stores the bag under the given directory.
        The directory has to exist and must be empty.

        Args:
            target_path: the directory under which three parquet files for sub_txt, pre_txt,
                  and num_txt will be created

        Raises:
            ValueError: if target_path is not an existing directory or if it is not empty
        """
        if not os.path.isdir(target_path):
            raise ValueError(f"the path {target_path} does not exist")

        # an empty listing is falsy, any entry makes the directory "not empty"
        if os.listdir(target_path):
            raise ValueError(f"the target_path {target_path} is not empty")

        self.sub_df.to_parquet(os.path.join(target_path, f'{SUB_TXT}.parquet'))
        self.pre_df.to_parquet(os.path.join(target_path, f'{PRE_TXT}.parquet'))
        self.num_df.to_parquet(os.path.join(target_path, f'{NUM_TXT}.parquet'))

    @staticmethod
    def load(target_path: str) -> RAW:
        """
        Loads a bag from the parquet files in the specified directory.
        The file names are the ones that are used by save().

        Args:
            target_path: the directory which contains the three parquet files for sub_txt, pre_txt,
             and num_txt

        Returns:
            RawDataBag: the loaded Databag
        """
        sub_df = pd.read_parquet(os.path.join(target_path, f'{SUB_TXT}.parquet'))
        pre_df = pd.read_parquet(os.path.join(target_path, f'{PRE_TXT}.parquet'))
        num_df = pd.read_parquet(os.path.join(target_path, f'{NUM_TXT}.parquet'))

        return RawDataBag.create(sub_df=sub_df, pre_df=pre_df, num_df=num_df)

    @staticmethod
    def concat(bags: List[RAW]) -> RAW:
        """
        Merges multiple Bags together into one bag.
        Note: merge does not check if DataBags with the same reports are merged together.

        Args:
            bags: List of bags to be merged, must contain at least one bag

        Returns:
            RawDataBag: a Bag with the merged content

        Raises:
            ValueError: if bags is empty
        """
        if not bags:
            # pd.concat would also raise a ValueError, but with a message that
            # doesn't point to the actual problem
            raise ValueError("bags must contain at least one DataBag")

        sub_dfs = [db.sub_df for db in bags]
        pre_dfs = [db.pre_df for db in bags]
        num_dfs = [db.num_df for db in bags]

        # ignore_index=True gives the merged dataframes a fresh, unique index
        # instead of repeating the index labels of every single bag
        return RawDataBag.create(sub_df=pd.concat(sub_dfs, ignore_index=True),
                                 pre_df=pd.concat(pre_dfs, ignore_index=True),
                                 num_df=pd.concat(num_dfs, ignore_index=True))
- class DataBagBase (*args, **kwds)
- 
Base class for the DataBag types Expand source codeclass DataBagBase(Generic[T]): """ Base class for the DataBag types """ def __getitem__(self, bagfilter: FilterBase[T]) -> T: """ forwards to the filter method, so that filters can be chained in a simple syntax: bag[filter1][filter2] is equal to bag.filter(filter1).filter(filter2) Args: bagfilter: the filter to be applied Returns: RawDataBag: the databag with the filtered content """ return self.filter(bagfilter) def filter(self, bagfilter: FilterBase[T]) -> T: """ applies a filter to the bag and produces a new bag based on the filter. instead of using the filter, you can also use the "index" syntax to apply filters: bag[filter1][filter2] is equal to bag.filter(filter1).filter(filter2) Args: bagfilter: the filter to be applied Returns: RawDataBag: the databag with the filtered content """ return bagfilter.filter(self) def present(self, presenter: Presenter[T]) -> pd.DataFrame: """ apply a presenter """ return presenter.present(self)Ancestors- typing.Generic
 SubclassesMethods- def filter(self, bagfilter: FilterBase[~T]) ‑> ~T
- 
applies a filter to the bag and produces a new bag based on the filter. instead of using the filter, you can also use the "index" syntax to apply filters: bag[filter1][filter2] is equal to bag.filter(filter1).filter(filter2) Args- bagfilter
- the filter to be applied
 Returns- RawDataBag
- the databag with the filtered content
 Expand source codedef filter(self, bagfilter: FilterBase[T]) -> T: """ applies a filter to the bag and produces a new bag based on the filter. instead of using the filter, you can also use the "index" syntax to apply filters: bag[filter1][filter2] is equal to bag.filter(filter1).filter(filter2) Args: bagfilter: the filter to be applied Returns: RawDataBag: the databag with the filtered content """ return bagfilter.filter(self)
- def present(self, presenter: Presenter[~T]) ‑> pandas.core.frame.DataFrame
- 
apply a presenter Expand source codedef present(self, presenter: Presenter[T]) -> pd.DataFrame: """ apply a presenter """ return presenter.present(self)
 
- class JoinedDataBag (sub_df: pandas.core.frame.DataFrame, pre_num_df: pandas.core.frame.DataFrame)
- 
the DataBag in which the pre.txt and the num.txt are joined based on the adsh, tag, and version. constructor. Args- sub_df
- sub.txt dataframe
- pre_num_df
- joined pre.txt and num.txt dataframe
 Expand source codeclass JoinedDataBag(DataBagBase[JOINED]): """ the DataBag in which the pre.txt and the num.txt are joined based on the adsh, tag, and version. """ @classmethod def create(cls, sub_df: pd.DataFrame, pre_num_df: pd.DataFrame) -> JOINED: """ create a new JoinedDataBag. Args: sub_df: sub.txt dataframe pre_num_df: joined pre.txt and num.txt dataframe Returns: JoinedDataBag: new instance of JoinedDataBag """ return JoinedDataBag(sub_df=sub_df, pre_num_df=pre_num_df) def __init__(self, sub_df: pd.DataFrame, pre_num_df: pd.DataFrame): """ constructor. Args: sub_df: sub.txt dataframe pre_num_df: joined pre.txt and num.txt dataframe """ self.sub_df = sub_df self.pre_num_df = pre_num_df def get_sub_copy(self) -> pd.DataFrame: """ Returns a copy of the sub dataframe. Returns: pd.DataFrame: copy of the sub dataframe. """ return self.sub_df.copy() def get_pre_num_copy(self) -> pd.DataFrame: """ Returns a copy of the joined pre_num dataframe. Returns: pd.DataFrame: copy of joined pre_num dataframe. """ return self.pre_num_df.copy() def copy_bag(self) -> JOINED: """ creates a bag with new copies of the internal dataframes. Returns: JoinedDataBag: new instance of JoinedDataBag """ return JoinedDataBag.create(sub_df=self.sub_df.copy(), pre_num_df=self.pre_num_df.copy()) def save(self, target_path: str): """ Stores the bag under the given directory. The directory has to exist and must be empty. Args: target_path: the directory under which the parquet files for sub and pre_num will be created """ if not os.path.isdir(target_path): raise ValueError(f"the path {target_path} does not exist") if len(os.listdir(target_path)) > 0: raise ValueError(f"the target_path {target_path} is not empty") self.sub_df.to_parquet(os.path.join(target_path, f'{SUB_TXT}.parquet')) self.pre_num_df.to_parquet(os.path.join(target_path, f'{PRE_NUM_TXT}.parquet')) @staticmethod def load(target_path: str) -> JOINED: """ Loads the content of the current bag at the specified location. 
Args: target_path: the directory which contains the parquet files for sub and pre_num Returns: JoinedDataBag: the loaded Databag """ sub_df = pd.read_parquet(os.path.join(target_path, f'{SUB_TXT}.parquet')) pre_num_df = pd.read_parquet(os.path.join(target_path, f'{PRE_NUM_TXT}.parquet')) return JoinedDataBag.create(sub_df=sub_df, pre_num_df=pre_num_df) @staticmethod def concat(bags: List[JOINED]) -> JOINED: """ Merges multiple Bags together into one bag. Note: merge does not check if DataBags with the same reports are merged together. Args: bags: List of bags to be merged Returns: JoinedDataBag: a Bag with the merged content """ sub_dfs = [db.sub_df for db in bags] pre_num_dfs = [db.pre_num_df for db in bags] return JoinedDataBag.create(sub_df=pd.concat(sub_dfs), pre_num_df=pd.concat(pre_num_dfs))Ancestors- DataBagBase
- typing.Generic
 Static methods- def concat(bags: List[~JOINED]) ‑> ~JOINED
- 
Merges multiple Bags together into one bag. Note: merge does not check if DataBags with the same reports are merged together. Args- bags
- List of bags to be merged
 Returns- JoinedDataBag
- a Bag with the merged content
 Expand source code@staticmethod def concat(bags: List[JOINED]) -> JOINED: """ Merges multiple Bags together into one bag. Note: merge does not check if DataBags with the same reports are merged together. Args: bags: List of bags to be merged Returns: JoinedDataBag: a Bag with the merged content """ sub_dfs = [db.sub_df for db in bags] pre_num_dfs = [db.pre_num_df for db in bags] return JoinedDataBag.create(sub_df=pd.concat(sub_dfs), pre_num_df=pd.concat(pre_num_dfs))
- def create(sub_df: pandas.core.frame.DataFrame, pre_num_df: pandas.core.frame.DataFrame) ‑> ~JOINED
- 
create a new JoinedDataBag. Args- sub_df
- sub.txt dataframe
- pre_num_df
- joined pre.txt and num.txt dataframe
 Returns- JoinedDataBag
- new instance of JoinedDataBag
 Expand source code@classmethod def create(cls, sub_df: pd.DataFrame, pre_num_df: pd.DataFrame) -> JOINED: """ create a new JoinedDataBag. Args: sub_df: sub.txt dataframe pre_num_df: joined pre.txt and num.txt dataframe Returns: JoinedDataBag: new instance of JoinedDataBag """ return JoinedDataBag(sub_df=sub_df, pre_num_df=pre_num_df)
- def load(target_path: str) ‑> ~JOINED
- 
Loads the content of the current bag at the specified location. Args- target_path
- the directory which contains the parquet files for sub and pre_num
 Returns- JoinedDataBag
- the loaded Databag
 Expand source code@staticmethod def load(target_path: str) -> JOINED: """ Loads the content of the current bag at the specified location. Args: target_path: the directory which contains the parquet files for sub and pre_num Returns: JoinedDataBag: the loaded Databag """ sub_df = pd.read_parquet(os.path.join(target_path, f'{SUB_TXT}.parquet')) pre_num_df = pd.read_parquet(os.path.join(target_path, f'{PRE_NUM_TXT}.parquet')) return JoinedDataBag.create(sub_df=sub_df, pre_num_df=pre_num_df)
 Methods- def copy_bag(self) ‑> ~JOINED
- 
creates a bag with new copies of the internal dataframes. Returns- JoinedDataBag
- new instance of JoinedDataBag
 Expand source codedef copy_bag(self) -> JOINED: """ creates a bag with new copies of the internal dataframes. Returns: JoinedDataBag: new instance of JoinedDataBag """ return JoinedDataBag.create(sub_df=self.sub_df.copy(), pre_num_df=self.pre_num_df.copy())
- def get_pre_num_copy(self) ‑> pandas.core.frame.DataFrame
- 
Returns a copy of the joined pre_num dataframe. Returns- pd.DataFrame
- copy of joined pre_num dataframe.
 Expand source codedef get_pre_num_copy(self) -> pd.DataFrame: """ Returns a copy of the joined pre_num dataframe. Returns: pd.DataFrame: copy of joined pre_num dataframe. """ return self.pre_num_df.copy()
- def get_sub_copy(self) ‑> pandas.core.frame.DataFrame
- 
Returns a copy of the sub dataframe. Returns- pd.DataFrame
- copy of the sub dataframe.
 Expand source codedef get_sub_copy(self) -> pd.DataFrame: """ Returns a copy of the sub dataframe. Returns: pd.DataFrame: copy of the sub dataframe. """ return self.sub_df.copy()
- def save(self, target_path: str)
- 
Stores the bag under the given directory. The directory has to exist and must be empty. Args- target_path
- the directory under which the parquet files for sub and pre_num will be created
 Expand source codedef save(self, target_path: str): """ Stores the bag under the given directory. The directory has to exist and must be empty. Args: target_path: the directory under which the parquet files for sub and pre_num will be created """ if not os.path.isdir(target_path): raise ValueError(f"the path {target_path} does not exist") if len(os.listdir(target_path)) > 0: raise ValueError(f"the target_path {target_path} is not empty") self.sub_df.to_parquet(os.path.join(target_path, f'{SUB_TXT}.parquet')) self.pre_num_df.to_parquet(os.path.join(target_path, f'{PRE_NUM_TXT}.parquet'))
 Inherited members
- class RawDataBag (sub_df: pandas.core.frame.DataFrame, pre_df: pandas.core.frame.DataFrame, num_df: pandas.core.frame.DataFrame)
- 
Container class to keep the data for sub.txt, pre.txt, and num.txt together. Expand source codeclass RawDataBag(DataBagBase[RAW]): """ Container class to keep the data for sub.txt, pre.txt, and num.txt together. """ @classmethod def create(cls, sub_df: pd.DataFrame, pre_df: pd.DataFrame, num_df: pd.DataFrame) -> RAW: """ create method for RawDataBag Args: sub_df(pd.DataFrame): sub.txt dataframe pre_df(pd.DataFrame): pre.txt dataframe num_df(pd.DataFrame): num.txt dataframe Returns: RawDataBag: """ return RawDataBag(sub_df=sub_df, pre_df=pre_df, num_df=num_df) def __init__(self, sub_df: pd.DataFrame, pre_df: pd.DataFrame, num_df: pd.DataFrame): self.sub_df = sub_df self.pre_df = pre_df self.num_df = num_df def copy_bag(self): """ creates a bag with new copies of the internal dataframes. Returns: RawDataBag: new instance of JoinedDataBag """ return RawDataBag.create(sub_df=self.sub_df.copy(), pre_df=self.pre_df.copy(), num_df=self.num_df.copy()) def get_sub_copy(self) -> pd.DataFrame: """ Returns a copy of the sub.txt dataframe. Returns: pd.DataFrame: copy of the sub.txt dataframe. """ return self.sub_df.copy() def get_pre_copy(self) -> pd.DataFrame: """ Returns a copy of the pre.txt dataframe. Returns: pd.DataFrame: copy of the pre.txt dataframe. """ return self.pre_df.copy() def get_num_copy(self) -> pd.DataFrame: """ Returns a copy of the num.txt dataframe. Returns: pd.DataFrame: copy of the num.txt dataframe. """ return self.num_df.copy() def join(self) -> JoinedDataBag: """ merges the raw data of pre and num together. Returns: JoinedDataBag: the DataBag where pre and num are merged """ # merge num and pre together. only rows in num are considered for which entries in pre exist pre_num_df = pd.merge(self.num_df, self.pre_df, on=['adsh', 'tag', 'version']) # don't produce index_x and index_y columns return JoinedDataBag.create(sub_df=self.sub_df, pre_num_df=pre_num_df) def statistics(self) -> RawDataBagStats: """ calculate a few simple statistics of a report. 
- number of entries in the num-file - number of entries in the pre-file - number of reports in the zip-file (equals number of entries in sub-file) - number of reports per form (10-K, 10-Q, ...) - number of reports per period date (counts per value in the period column of sub-file) Returns: RawDataBagStats: instance with basic report infos """ num_entries = len(self.num_df) pre_entries = len(self.pre_df) number_of_reports = len(self.sub_df) reports_per_period_date: Dict[int, int] = self.sub_df.period.value_counts().to_dict() reports_per_form: Dict[str, int] = self.sub_df.form.value_counts().to_dict() return RawDataBagStats(num_entries=num_entries, pre_entries=pre_entries, number_of_reports=number_of_reports, reports_per_form=reports_per_form, reports_per_period_date=reports_per_period_date ) def save(self, target_path: str): """ Stores the bag under the given directory. The directory has to exist and must be empty. Args: target_path: the directory under which three parquet files for sub_txt, pre_text, and num_txt will be created """ if not os.path.isdir(target_path): raise ValueError(f"the path {target_path} does not exist") if len(os.listdir(target_path)) > 0: raise ValueError(f"the target_path {target_path} is not empty") self.sub_df.to_parquet(os.path.join(target_path, f'{SUB_TXT}.parquet')) self.pre_df.to_parquet(os.path.join(target_path, f'{PRE_TXT}.parquet')) self.num_df.to_parquet(os.path.join(target_path, f'{NUM_TXT}.parquet')) @staticmethod def load(target_path: str) -> RAW: """ Loads the content of the current bag at the specified location. 
Args: target_path: the directory which contains the three parquet files for sub_txt, pre_txt, and num_txt Returns: RawDataBag: the loaded Databag """ sub_df = pd.read_parquet(os.path.join(target_path, f'{SUB_TXT}.parquet')) pre_df = pd.read_parquet(os.path.join(target_path, f'{PRE_TXT}.parquet')) num_df = pd.read_parquet(os.path.join(target_path, f'{NUM_TXT}.parquet')) return RawDataBag.create(sub_df=sub_df, pre_df=pre_df, num_df=num_df) @staticmethod def concat(bags: List[RAW]) -> RAW: """ Merges multiple Bags together into one bag. Note: merge does not check if DataBags with the same reports are merged together. Args: bags: List of bags to be merged Returns: RawDataBag: a Bag with the merged content """ sub_dfs = [db.sub_df for db in bags] pre_dfs = [db.pre_df for db in bags] num_dfs = [db.num_df for db in bags] # todo: might be more efficient if the contained maps were just combined # instead of being recalculated return RawDataBag.create(sub_df=pd.concat(sub_dfs, ignore_index=True), pre_df=pd.concat(pre_dfs, ignore_index=True), num_df=pd.concat(num_dfs, ignore_index=True))Ancestors- DataBagBase
- typing.Generic
 Static methods- def concat(bags: List[~RAW]) ‑> ~RAW
- 
Merges multiple Bags together into one bag. Note: merge does not check if DataBags with the same reports are merged together. Args- bags
- List of bags to be merged
 Returns- RawDataBag
- a Bag with the merged content
 Expand source code@staticmethod def concat(bags: List[RAW]) -> RAW: """ Merges multiple Bags together into one bag. Note: merge does not check if DataBags with the same reports are merged together. Args: bags: List of bags to be merged Returns: RawDataBag: a Bag with the merged content """ sub_dfs = [db.sub_df for db in bags] pre_dfs = [db.pre_df for db in bags] num_dfs = [db.num_df for db in bags] # todo: might be more efficient if the contained maps were just combined # instead of being recalculated return RawDataBag.create(sub_df=pd.concat(sub_dfs, ignore_index=True), pre_df=pd.concat(pre_dfs, ignore_index=True), num_df=pd.concat(num_dfs, ignore_index=True))
- def create(sub_df: pandas.core.frame.DataFrame, pre_df: pandas.core.frame.DataFrame, num_df: pandas.core.frame.DataFrame) ‑> ~RAW
- 
create method for RawDataBag Argssub_df(pd.DataFrame): sub.txt dataframe pre_df(pd.DataFrame): pre.txt dataframe num_df(pd.DataFrame): num.txt dataframe ReturnsRawDataBag: Expand source code@classmethod def create(cls, sub_df: pd.DataFrame, pre_df: pd.DataFrame, num_df: pd.DataFrame) -> RAW: """ create method for RawDataBag Args: sub_df(pd.DataFrame): sub.txt dataframe pre_df(pd.DataFrame): pre.txt dataframe num_df(pd.DataFrame): num.txt dataframe Returns: RawDataBag: """ return RawDataBag(sub_df=sub_df, pre_df=pre_df, num_df=num_df)
- def load(target_path: str) ‑> ~RAW
- 
Loads the content of the current bag at the specified location. Args- target_path
- the directory which contains the three parquet files for sub_txt, pre_txt,
 and num_txt Returns- RawDataBag
- the loaded Databag
 Expand source code@staticmethod def load(target_path: str) -> RAW: """ Loads the content of the current bag at the specified location. Args: target_path: the directory which contains the three parquet files for sub_txt, pre_txt, and num_txt Returns: RawDataBag: the loaded Databag """ sub_df = pd.read_parquet(os.path.join(target_path, f'{SUB_TXT}.parquet')) pre_df = pd.read_parquet(os.path.join(target_path, f'{PRE_TXT}.parquet')) num_df = pd.read_parquet(os.path.join(target_path, f'{NUM_TXT}.parquet')) return RawDataBag.create(sub_df=sub_df, pre_df=pre_df, num_df=num_df)
 Methods- def copy_bag(self)
- 
creates a bag with new copies of the internal dataframes. Returns- RawDataBag
- new instance of RawDataBag
 Expand source codedef copy_bag(self): """ creates a bag with new copies of the internal dataframes. Returns: RawDataBag: new instance of JoinedDataBag """ return RawDataBag.create(sub_df=self.sub_df.copy(), pre_df=self.pre_df.copy(), num_df=self.num_df.copy())
- def get_num_copy(self) ‑> pandas.core.frame.DataFrame
- 
Returns a copy of the num.txt dataframe. Returns- pd.DataFrame
- copy of the num.txt dataframe.
 Expand source codedef get_num_copy(self) -> pd.DataFrame: """ Returns a copy of the num.txt dataframe. Returns: pd.DataFrame: copy of the num.txt dataframe. """ return self.num_df.copy()
- def get_pre_copy(self) ‑> pandas.core.frame.DataFrame
- 
Returns a copy of the pre.txt dataframe. Returns- pd.DataFrame
- copy of the pre.txt dataframe.
 Expand source codedef get_pre_copy(self) -> pd.DataFrame: """ Returns a copy of the pre.txt dataframe. Returns: pd.DataFrame: copy of the pre.txt dataframe. """ return self.pre_df.copy()
- def get_sub_copy(self) ‑> pandas.core.frame.DataFrame
- 
Returns a copy of the sub.txt dataframe. Returns- pd.DataFrame
- copy of the sub.txt dataframe.
 Expand source codedef get_sub_copy(self) -> pd.DataFrame: """ Returns a copy of the sub.txt dataframe. Returns: pd.DataFrame: copy of the sub.txt dataframe. """ return self.sub_df.copy()
- def join(self) ‑> JoinedDataBag
- 
merges the raw data of pre and num together. Returns- JoinedDataBag
- the DataBag where pre and num are merged
 Expand source codedef join(self) -> JoinedDataBag: """ merges the raw data of pre and num together. Returns: JoinedDataBag: the DataBag where pre and num are merged """ # merge num and pre together. only rows in num are considered for which entries in pre exist pre_num_df = pd.merge(self.num_df, self.pre_df, on=['adsh', 'tag', 'version']) # don't produce index_x and index_y columns return JoinedDataBag.create(sub_df=self.sub_df, pre_num_df=pre_num_df)
- def save(self, target_path: str)
- 
Stores the bag under the given directory. The directory has to exist and must be empty. Args- target_path
- the directory under which three parquet files for sub_txt, pre_txt, and num_txt will be created
 Expand source codedef save(self, target_path: str): """ Stores the bag under the given directory. The directory has to exist and must be empty. Args: target_path: the directory under which three parquet files for sub_txt, pre_text, and num_txt will be created """ if not os.path.isdir(target_path): raise ValueError(f"the path {target_path} does not exist") if len(os.listdir(target_path)) > 0: raise ValueError(f"the target_path {target_path} is not empty") self.sub_df.to_parquet(os.path.join(target_path, f'{SUB_TXT}.parquet')) self.pre_df.to_parquet(os.path.join(target_path, f'{PRE_TXT}.parquet')) self.num_df.to_parquet(os.path.join(target_path, f'{NUM_TXT}.parquet'))
- def statistics(self) ‑> RawDataBagStats
- 
calculate a few simple statistics of a report. - number of entries in the num-file - number of entries in the pre-file - number of reports in the zip-file (equals number of entries in sub-file) - number of reports per form (10-K, 10-Q, …) - number of reports per period date (counts per value in the period column of sub-file) Returns- RawDataBagStats
- instance with basic report infos
 Expand source codedef statistics(self) -> RawDataBagStats: """ calculate a few simple statistics of a report. - number of entries in the num-file - number of entries in the pre-file - number of reports in the zip-file (equals number of entries in sub-file) - number of reports per form (10-K, 10-Q, ...) - number of reports per period date (counts per value in the period column of sub-file) Returns: RawDataBagStats: instance with basic report infos """ num_entries = len(self.num_df) pre_entries = len(self.pre_df) number_of_reports = len(self.sub_df) reports_per_period_date: Dict[int, int] = self.sub_df.period.value_counts().to_dict() reports_per_form: Dict[str, int] = self.sub_df.form.value_counts().to_dict() return RawDataBagStats(num_entries=num_entries, pre_entries=pre_entries, number_of_reports=number_of_reports, reports_per_form=reports_per_form, reports_per_period_date=reports_per_period_date )
 Inherited members
- class RawDataBagStats (num_entries: int, pre_entries: int, number_of_reports: int, reports_per_form: Dict[str, int], reports_per_period_date: Dict[int, int])
- 
Contains simple statistics of a report. Expand source codeclass RawDataBagStats: """ Contains simple statistics of a report. """ num_entries: int pre_entries: int number_of_reports: int reports_per_form: Dict[str, int] reports_per_period_date: Dict[int, int]Class variables- var num_entries : int
- var number_of_reports : int
- var pre_entries : int
- var reports_per_form : Dict[str, int]
- var reports_per_period_date : Dict[int, int]