Module secfsdstools.d_container.databagmodel
Defines the container that keeps the data of sub.txt, num.txt, and pre.txt together.
Expand source code
"""
Defines the container that keeps the data of sub.txt, num.txt, and pre.txt together.
"""
import os
from dataclasses import dataclass
from typing import Dict, List, TypeVar, Generic
import pandas as pd
from secfsdstools.a_utils.constants import SUB_TXT, PRE_TXT, NUM_TXT, PRE_NUM_TXT
from secfsdstools.d_container.filter import FilterBase
from secfsdstools.d_container.presentation import Presenter
RAW = TypeVar('RAW', bound='RawDataBag')
JOINED = TypeVar('JOINED', bound='JoinedDataBag')
T = TypeVar('T')
class DataBagBase(Generic[T]):
"""
Base class for the DataBag types
"""
def __getitem__(self, bagfilter: FilterBase[T]) -> T:
"""
forwards to the filter method, so that filters can be chained in a simple syntax:
bag[filter1][filter2] is equal to bag.filter(filter1).filter(filter2)
Args:
bagfilter: the filter to be applied
Returns:
RawDataBag: the databag with the filtered content
"""
return self.filter(bagfilter)
def filter(self, bagfilter: FilterBase[T]) -> T:
"""
applies a filter to the bag and produces a new bag based on the filter.
instead of using the filter, you can also use the "index" syntax to apply filters:
bag[filter1][filter2] is equal to bag.filter(filter1).filter(filter2)
Args:
bagfilter: the filter to be applied
Returns:
RawDataBag: the databag with the filtered content
"""
return bagfilter.filter(self)
def present(self, presenter: Presenter[T]) -> pd.DataFrame:
"""
apply a presenter
"""
return presenter.present(self)
class JoinedDataBag(DataBagBase[JOINED]):
"""
the DataBag in which the pre.txt and the num.txt are joined based on the
adsh, tag, and version.
"""
@classmethod
def create(cls, sub_df: pd.DataFrame, pre_num_df: pd.DataFrame) -> JOINED:
"""
create a new JoinedDataBag.
Args:
sub_df: sub.txt dataframe
pre_num_df: joined pre.txt and num.txt dataframe
Returns:
JoinedDataBag: new instance of JoinedDataBag
"""
return JoinedDataBag(sub_df=sub_df, pre_num_df=pre_num_df)
def __init__(self, sub_df: pd.DataFrame, pre_num_df: pd.DataFrame):
"""
constructor.
Args:
sub_df: sub.txt dataframe
pre_num_df: joined pre.txt and num.txt dataframe
"""
self.sub_df = sub_df
self.pre_num_df = pre_num_df
def get_sub_copy(self) -> pd.DataFrame:
"""
Returns a copy of the sub dataframe.
Returns:
pd.DataFrame: copy of the sub dataframe.
"""
return self.sub_df.copy()
def get_pre_num_copy(self) -> pd.DataFrame:
"""
Returns a copy of the joined pre_num dataframe.
Returns:
pd.DataFrame: copy of joined pre_num dataframe.
"""
return self.pre_num_df.copy()
def copy_bag(self) -> JOINED:
"""
creates a bag with new copies of the internal dataframes.
Returns:
JoinedDataBag: new instance of JoinedDataBag
"""
return JoinedDataBag.create(sub_df=self.sub_df.copy(),
pre_num_df=self.pre_num_df.copy())
def save(self, target_path: str):
"""
Stores the bag under the given directory.
The directory has to exist and must be empty.
Args:
target_path: the directory under which the parquet files for sub and pre_num
will be created
"""
if not os.path.isdir(target_path):
raise ValueError(f"the path {target_path} does not exist")
if len(os.listdir(target_path)) > 0:
raise ValueError(f"the target_path {target_path} is not empty")
self.sub_df.to_parquet(os.path.join(target_path, f'{SUB_TXT}.parquet'))
self.pre_num_df.to_parquet(os.path.join(target_path, f'{PRE_NUM_TXT}.parquet'))
@staticmethod
def load(target_path: str) -> JOINED:
"""
Loads the content of the current bag at the specified location.
Args:
target_path: the directory which contains the parquet files for sub and pre_num
Returns:
JoinedDataBag: the loaded Databag
"""
sub_df = pd.read_parquet(os.path.join(target_path, f'{SUB_TXT}.parquet'))
pre_num_df = pd.read_parquet(os.path.join(target_path, f'{PRE_NUM_TXT}.parquet'))
return JoinedDataBag.create(sub_df=sub_df, pre_num_df=pre_num_df)
@staticmethod
def concat(bags: List[JOINED]) -> JOINED:
"""
Merges multiple Bags together into one bag.
Note: merge does not check if DataBags with the same reports are merged together.
Args:
bags: List of bags to be merged
Returns:
JoinedDataBag: a Bag with the merged content
"""
sub_dfs = [db.sub_df for db in bags]
pre_num_dfs = [db.pre_num_df for db in bags]
return JoinedDataBag.create(sub_df=pd.concat(sub_dfs),
pre_num_df=pd.concat(pre_num_dfs))
@dataclass
class RawDataBagStats:
"""
Contains simple statistics of a report.
"""
num_entries: int
pre_entries: int
number_of_reports: int
reports_per_form: Dict[str, int]
reports_per_period_date: Dict[int, int]
class RawDataBag(DataBagBase[RAW]):
"""
Container class to keep the data for sub.txt, pre.txt, and num.txt together.
"""
@classmethod
def create(cls, sub_df: pd.DataFrame, pre_df: pd.DataFrame, num_df: pd.DataFrame) -> RAW:
"""
create method for RawDataBag
Args:
sub_df(pd.DataFrame): sub.txt dataframe
pre_df(pd.DataFrame): pre.txt dataframe
num_df(pd.DataFrame): num.txt dataframe
Returns:
RawDataBag:
"""
return RawDataBag(sub_df=sub_df, pre_df=pre_df, num_df=num_df)
def __init__(self, sub_df: pd.DataFrame, pre_df: pd.DataFrame, num_df: pd.DataFrame):
self.sub_df = sub_df
self.pre_df = pre_df
self.num_df = num_df
def copy_bag(self):
"""
creates a bag with new copies of the internal dataframes.
Returns:
RawDataBag: new instance of JoinedDataBag
"""
return RawDataBag.create(sub_df=self.sub_df.copy(),
pre_df=self.pre_df.copy(),
num_df=self.num_df.copy())
def get_sub_copy(self) -> pd.DataFrame:
"""
Returns a copy of the sub.txt dataframe.
Returns:
pd.DataFrame: copy of the sub.txt dataframe.
"""
return self.sub_df.copy()
def get_pre_copy(self) -> pd.DataFrame:
"""
Returns a copy of the pre.txt dataframe.
Returns:
pd.DataFrame: copy of the pre.txt dataframe.
"""
return self.pre_df.copy()
def get_num_copy(self) -> pd.DataFrame:
"""
Returns a copy of the num.txt dataframe.
Returns:
pd.DataFrame: copy of the num.txt dataframe.
"""
return self.num_df.copy()
def join(self) -> JoinedDataBag:
"""
merges the raw data of pre and num together.
Returns:
JoinedDataBag: the DataBag where pre and num are merged
"""
# merge num and pre together. only rows in num are considered for which entries in pre exist
pre_num_df = pd.merge(self.num_df,
self.pre_df,
on=['adsh', 'tag',
'version']) # don't produce index_x and index_y columns
return JoinedDataBag.create(sub_df=self.sub_df, pre_num_df=pre_num_df)
def statistics(self) -> RawDataBagStats:
"""
calculate a few simple statistics of a report.
- number of entries in the num-file
- number of entries in the pre-file
- number of reports in the zip-file (equals number of entries in sub-file)
- number of reports per form (10-K, 10-Q, ...)
- number of reports per period date (counts per value in the period column of sub-file)
Returns:
RawDataBagStats: instance with basic report infos
"""
num_entries = len(self.num_df)
pre_entries = len(self.pre_df)
number_of_reports = len(self.sub_df)
reports_per_period_date: Dict[int, int] = self.sub_df.period.value_counts().to_dict()
reports_per_form: Dict[str, int] = self.sub_df.form.value_counts().to_dict()
return RawDataBagStats(num_entries=num_entries,
pre_entries=pre_entries,
number_of_reports=number_of_reports,
reports_per_form=reports_per_form,
reports_per_period_date=reports_per_period_date
)
def save(self, target_path: str):
"""
Stores the bag under the given directory.
The directory has to exist and must be empty.
Args:
target_path: the directory under which three parquet files for sub_txt, pre_text,
and num_txt will be created
"""
if not os.path.isdir(target_path):
raise ValueError(f"the path {target_path} does not exist")
if len(os.listdir(target_path)) > 0:
raise ValueError(f"the target_path {target_path} is not empty")
self.sub_df.to_parquet(os.path.join(target_path, f'{SUB_TXT}.parquet'))
self.pre_df.to_parquet(os.path.join(target_path, f'{PRE_TXT}.parquet'))
self.num_df.to_parquet(os.path.join(target_path, f'{NUM_TXT}.parquet'))
@staticmethod
def load(target_path: str) -> RAW:
"""
Loads the content of the current bag at the specified location.
Args:
target_path: the directory which contains the three parquet files for sub_txt, pre_txt,
and num_txt
Returns:
RawDataBag: the loaded Databag
"""
sub_df = pd.read_parquet(os.path.join(target_path, f'{SUB_TXT}.parquet'))
pre_df = pd.read_parquet(os.path.join(target_path, f'{PRE_TXT}.parquet'))
num_df = pd.read_parquet(os.path.join(target_path, f'{NUM_TXT}.parquet'))
return RawDataBag.create(sub_df=sub_df, pre_df=pre_df, num_df=num_df)
@staticmethod
def concat(bags: List[RAW]) -> RAW:
"""
Merges multiple Bags together into one bag.
Note: merge does not check if DataBags with the same reports are merged together.
Args:
bags: List of bags to be merged
Returns:
RawDataBag: a Bag with the merged content
"""
sub_dfs = [db.sub_df for db in bags]
pre_dfs = [db.pre_df for db in bags]
num_dfs = [db.num_df for db in bags]
# todo: might be more efficient if the contained maps were just combined
# instead of being recalculated
return RawDataBag.create(sub_df=pd.concat(sub_dfs, ignore_index=True),
pre_df=pd.concat(pre_dfs, ignore_index=True),
num_df=pd.concat(num_dfs, ignore_index=True))
Classes
class DataBagBase (*args, **kwds)
-
Base class for the DataBag types
Expand source code
class DataBagBase(Generic[T]): """ Base class for the DataBag types """ def __getitem__(self, bagfilter: FilterBase[T]) -> T: """ forwards to the filter method, so that filters can be chained in a simple syntax: bag[filter1][filter2] is equal to bag.filter(filter1).filter(filter2) Args: bagfilter: the filter to be applied Returns: RawDataBag: the databag with the filtered content """ return self.filter(bagfilter) def filter(self, bagfilter: FilterBase[T]) -> T: """ applies a filter to the bag and produces a new bag based on the filter. instead of using the filter, you can also use the "index" syntax to apply filters: bag[filter1][filter2] is equal to bag.filter(filter1).filter(filter2) Args: bagfilter: the filter to be applied Returns: RawDataBag: the databag with the filtered content """ return bagfilter.filter(self) def present(self, presenter: Presenter[T]) -> pd.DataFrame: """ apply a presenter """ return presenter.present(self)
Ancestors
- typing.Generic
Subclasses
Methods
def filter(self, bagfilter: FilterBase[~T]) ‑> ~T
-
applies a filter to the bag and produces a new bag based on the filter. instead of using the filter, you can also use the "index" syntax to apply filters: bag[filter1][filter2] is equal to bag.filter(filter1).filter(filter2)
Args
bagfilter
- the filter to be applied
Returns
RawDataBag
- the databag with the filtered content
Expand source code
def filter(self, bagfilter: FilterBase[T]) -> T: """ applies a filter to the bag and produces a new bag based on the filter. instead of using the filter, you can also use the "index" syntax to apply filters: bag[filter1][filter2] is equal to bag.filter(filter1).filter(filter2) Args: bagfilter: the filter to be applied Returns: RawDataBag: the databag with the filtered content """ return bagfilter.filter(self)
def present(self, presenter: Presenter[~T]) ‑> pandas.core.frame.DataFrame
-
apply a presenter
Expand source code
def present(self, presenter: Presenter[T]) -> pd.DataFrame: """ apply a presenter """ return presenter.present(self)
class JoinedDataBag (sub_df: pandas.core.frame.DataFrame, pre_num_df: pandas.core.frame.DataFrame)
-
the DataBag in which the pre.txt and the num.txt are joined based on the adsh, tag, and version.
constructor.
Args
sub_df
- sub.txt dataframe
pre_num_df
- joined pre.txt and num.txt dataframe
Expand source code
class JoinedDataBag(DataBagBase[JOINED]): """ the DataBag in which the pre.txt and the num.txt are joined based on the adsh, tag, and version. """ @classmethod def create(cls, sub_df: pd.DataFrame, pre_num_df: pd.DataFrame) -> JOINED: """ create a new JoinedDataBag. Args: sub_df: sub.txt dataframe pre_num_df: joined pre.txt and num.txt dataframe Returns: JoinedDataBag: new instance of JoinedDataBag """ return JoinedDataBag(sub_df=sub_df, pre_num_df=pre_num_df) def __init__(self, sub_df: pd.DataFrame, pre_num_df: pd.DataFrame): """ constructor. Args: sub_df: sub.txt dataframe pre_num_df: joined pre.txt and num.txt dataframe """ self.sub_df = sub_df self.pre_num_df = pre_num_df def get_sub_copy(self) -> pd.DataFrame: """ Returns a copy of the sub dataframe. Returns: pd.DataFrame: copy of the sub dataframe. """ return self.sub_df.copy() def get_pre_num_copy(self) -> pd.DataFrame: """ Returns a copy of the joined pre_num dataframe. Returns: pd.DataFrame: copy of joined pre_num dataframe. """ return self.pre_num_df.copy() def copy_bag(self) -> JOINED: """ creates a bag with new copies of the internal dataframes. Returns: JoinedDataBag: new instance of JoinedDataBag """ return JoinedDataBag.create(sub_df=self.sub_df.copy(), pre_num_df=self.pre_num_df.copy()) def save(self, target_path: str): """ Stores the bag under the given directory. The directory has to exist and must be empty. Args: target_path: the directory under which the parquet files for sub and pre_num will be created """ if not os.path.isdir(target_path): raise ValueError(f"the path {target_path} does not exist") if len(os.listdir(target_path)) > 0: raise ValueError(f"the target_path {target_path} is not empty") self.sub_df.to_parquet(os.path.join(target_path, f'{SUB_TXT}.parquet')) self.pre_num_df.to_parquet(os.path.join(target_path, f'{PRE_NUM_TXT}.parquet')) @staticmethod def load(target_path: str) -> JOINED: """ Loads the content of the current bag at the specified location. Args: target_path: the directory which contains the parquet files for sub and pre_num Returns: JoinedDataBag: the loaded Databag """ sub_df = pd.read_parquet(os.path.join(target_path, f'{SUB_TXT}.parquet')) pre_num_df = pd.read_parquet(os.path.join(target_path, f'{PRE_NUM_TXT}.parquet')) return JoinedDataBag.create(sub_df=sub_df, pre_num_df=pre_num_df) @staticmethod def concat(bags: List[JOINED]) -> JOINED: """ Merges multiple Bags together into one bag. Note: merge does not check if DataBags with the same reports are merged together. Args: bags: List of bags to be merged Returns: JoinedDataBag: a Bag with the merged content """ sub_dfs = [db.sub_df for db in bags] pre_num_dfs = [db.pre_num_df for db in bags] return JoinedDataBag.create(sub_df=pd.concat(sub_dfs), pre_num_df=pd.concat(pre_num_dfs))
Ancestors
- DataBagBase
- typing.Generic
Static methods
def concat(bags: List[~JOINED]) ‑> ~JOINED
-
Merges multiple Bags together into one bag. Note: merge does not check if DataBags with the same reports are merged together.
Args
bags
- List of bags to be merged
Returns
JoinedDataBag
- a Bag with the merged content
Expand source code
@staticmethod def concat(bags: List[JOINED]) -> JOINED: """ Merges multiple Bags together into one bag. Note: merge does not check if DataBags with the same reports are merged together. Args: bags: List of bags to be merged Returns: JoinedDataBag: a Bag with the merged content """ sub_dfs = [db.sub_df for db in bags] pre_num_dfs = [db.pre_num_df for db in bags] return JoinedDataBag.create(sub_df=pd.concat(sub_dfs), pre_num_df=pd.concat(pre_num_dfs))
def create(sub_df: pandas.core.frame.DataFrame, pre_num_df: pandas.core.frame.DataFrame) ‑> ~JOINED
-
create a new JoinedDataBag.
Args
sub_df
- sub.txt dataframe
pre_num_df
- joined pre.txt and num.txt dataframe
Returns
JoinedDataBag
- new instance of JoinedDataBag
Expand source code
@classmethod def create(cls, sub_df: pd.DataFrame, pre_num_df: pd.DataFrame) -> JOINED: """ create a new JoinedDataBag. Args: sub_df: sub.txt dataframe pre_num_df: joined pre.txt and num.txt dataframe Returns: JoinedDataBag: new instance of JoinedDataBag """ return JoinedDataBag(sub_df=sub_df, pre_num_df=pre_num_df)
def load(target_path: str) ‑> ~JOINED
-
Loads the content of the current bag at the specified location.
Args
target_path
- the directory which contains the parquet files for sub and pre_num
Returns
JoinedDataBag
- the loaded Databag
Expand source code
@staticmethod def load(target_path: str) -> JOINED: """ Loads the content of the current bag at the specified location. Args: target_path: the directory which contains the parquet files for sub and pre_num Returns: JoinedDataBag: the loaded Databag """ sub_df = pd.read_parquet(os.path.join(target_path, f'{SUB_TXT}.parquet')) pre_num_df = pd.read_parquet(os.path.join(target_path, f'{PRE_NUM_TXT}.parquet')) return JoinedDataBag.create(sub_df=sub_df, pre_num_df=pre_num_df)
Methods
def copy_bag(self) ‑> ~JOINED
-
creates a bag with new copies of the internal dataframes.
Returns
JoinedDataBag
- new instance of JoinedDataBag
Expand source code
def copy_bag(self) -> JOINED: """ creates a bag with new copies of the internal dataframes. Returns: JoinedDataBag: new instance of JoinedDataBag """ return JoinedDataBag.create(sub_df=self.sub_df.copy(), pre_num_df=self.pre_num_df.copy())
def get_pre_num_copy(self) ‑> pandas.core.frame.DataFrame
-
Returns a copy of the joined pre_num dataframe.
Returns
pd.DataFrame
- copy of joined pre_num dataframe.
Expand source code
def get_pre_num_copy(self) -> pd.DataFrame: """ Returns a copy of the joined pre_num dataframe. Returns: pd.DataFrame: copy of joined pre_num dataframe. """ return self.pre_num_df.copy()
def get_sub_copy(self) ‑> pandas.core.frame.DataFrame
-
Returns a copy of the sub dataframe.
Returns
pd.DataFrame
- copy of the sub dataframe.
Expand source code
def get_sub_copy(self) -> pd.DataFrame: """ Returns a copy of the sub dataframe. Returns: pd.DataFrame: copy of the sub dataframe. """ return self.sub_df.copy()
def save(self, target_path: str)
-
Stores the bag under the given directory. The directory has to exist and must be empty.
Args
target_path
- the directory under which the parquet files for sub and pre_num will be created
Expand source code
def save(self, target_path: str): """ Stores the bag under the given directory. The directory has to exist and must be empty. Args: target_path: the directory under which the parquet files for sub and pre_num will be created """ if not os.path.isdir(target_path): raise ValueError(f"the path {target_path} does not exist") if len(os.listdir(target_path)) > 0: raise ValueError(f"the target_path {target_path} is not empty") self.sub_df.to_parquet(os.path.join(target_path, f'{SUB_TXT}.parquet')) self.pre_num_df.to_parquet(os.path.join(target_path, f'{PRE_NUM_TXT}.parquet'))
Inherited members
class RawDataBag (sub_df: pandas.core.frame.DataFrame, pre_df: pandas.core.frame.DataFrame, num_df: pandas.core.frame.DataFrame)
-
Container class to keep the data for sub.txt, pre.txt, and num.txt together.
Expand source code
class RawDataBag(DataBagBase[RAW]): """ Container class to keep the data for sub.txt, pre.txt, and num.txt together. """ @classmethod def create(cls, sub_df: pd.DataFrame, pre_df: pd.DataFrame, num_df: pd.DataFrame) -> RAW: """ create method for RawDataBag Args: sub_df(pd.DataFrame): sub.txt dataframe pre_df(pd.DataFrame): pre.txt dataframe num_df(pd.DataFrame): num.txt dataframe Returns: RawDataBag: """ return RawDataBag(sub_df=sub_df, pre_df=pre_df, num_df=num_df) def __init__(self, sub_df: pd.DataFrame, pre_df: pd.DataFrame, num_df: pd.DataFrame): self.sub_df = sub_df self.pre_df = pre_df self.num_df = num_df def copy_bag(self): """ creates a bag with new copies of the internal dataframes. Returns: RawDataBag: new instance of JoinedDataBag """ return RawDataBag.create(sub_df=self.sub_df.copy(), pre_df=self.pre_df.copy(), num_df=self.num_df.copy()) def get_sub_copy(self) -> pd.DataFrame: """ Returns a copy of the sub.txt dataframe. Returns: pd.DataFrame: copy of the sub.txt dataframe. """ return self.sub_df.copy() def get_pre_copy(self) -> pd.DataFrame: """ Returns a copy of the pre.txt dataframe. Returns: pd.DataFrame: copy of the pre.txt dataframe. """ return self.pre_df.copy() def get_num_copy(self) -> pd.DataFrame: """ Returns a copy of the num.txt dataframe. Returns: pd.DataFrame: copy of the num.txt dataframe. """ return self.num_df.copy() def join(self) -> JoinedDataBag: """ merges the raw data of pre and num together. Returns: JoinedDataBag: the DataBag where pre and num are merged """ # merge num and pre together. only rows in num are considered for which entries in pre exist pre_num_df = pd.merge(self.num_df, self.pre_df, on=['adsh', 'tag', 'version']) # don't produce index_x and index_y columns return JoinedDataBag.create(sub_df=self.sub_df, pre_num_df=pre_num_df) def statistics(self) -> RawDataBagStats: """ calculate a few simple statistics of a report. - number of entries in the num-file - number of entries in the pre-file - number of reports in the zip-file (equals number of entries in sub-file) - number of reports per form (10-K, 10-Q, ...) - number of reports per period date (counts per value in the period column of sub-file) Returns: RawDataBagStats: instance with basic report infos """ num_entries = len(self.num_df) pre_entries = len(self.pre_df) number_of_reports = len(self.sub_df) reports_per_period_date: Dict[int, int] = self.sub_df.period.value_counts().to_dict() reports_per_form: Dict[str, int] = self.sub_df.form.value_counts().to_dict() return RawDataBagStats(num_entries=num_entries, pre_entries=pre_entries, number_of_reports=number_of_reports, reports_per_form=reports_per_form, reports_per_period_date=reports_per_period_date ) def save(self, target_path: str): """ Stores the bag under the given directory. The directory has to exist and must be empty. Args: target_path: the directory under which three parquet files for sub_txt, pre_text, and num_txt will be created """ if not os.path.isdir(target_path): raise ValueError(f"the path {target_path} does not exist") if len(os.listdir(target_path)) > 0: raise ValueError(f"the target_path {target_path} is not empty") self.sub_df.to_parquet(os.path.join(target_path, f'{SUB_TXT}.parquet')) self.pre_df.to_parquet(os.path.join(target_path, f'{PRE_TXT}.parquet')) self.num_df.to_parquet(os.path.join(target_path, f'{NUM_TXT}.parquet')) @staticmethod def load(target_path: str) -> RAW: """ Loads the content of the current bag at the specified location. Args: target_path: the directory which contains the three parquet files for sub_txt, pre_txt, and num_txt Returns: RawDataBag: the loaded Databag """ sub_df = pd.read_parquet(os.path.join(target_path, f'{SUB_TXT}.parquet')) pre_df = pd.read_parquet(os.path.join(target_path, f'{PRE_TXT}.parquet')) num_df = pd.read_parquet(os.path.join(target_path, f'{NUM_TXT}.parquet')) return RawDataBag.create(sub_df=sub_df, pre_df=pre_df, num_df=num_df) @staticmethod def concat(bags: List[RAW]) -> RAW: """ Merges multiple Bags together into one bag. Note: merge does not check if DataBags with the same reports are merged together. Args: bags: List of bags to be merged Returns: RawDataBag: a Bag with the merged content """ sub_dfs = [db.sub_df for db in bags] pre_dfs = [db.pre_df for db in bags] num_dfs = [db.num_df for db in bags] # todo: might be more efficient if the contained maps were just combined # instead of being recalculated return RawDataBag.create(sub_df=pd.concat(sub_dfs, ignore_index=True), pre_df=pd.concat(pre_dfs, ignore_index=True), num_df=pd.concat(num_dfs, ignore_index=True))
Ancestors
- DataBagBase
- typing.Generic
Static methods
def concat(bags: List[~RAW]) ‑> ~RAW
-
Merges multiple Bags together into one bag. Note: merge does not check if DataBags with the same reports are merged together.
Args
bags
- List of bags to be merged
Returns
RawDataBag
- a Bag with the merged content
Expand source code
@staticmethod def concat(bags: List[RAW]) -> RAW: """ Merges multiple Bags together into one bag. Note: merge does not check if DataBags with the same reports are merged together. Args: bags: List of bags to be merged Returns: RawDataBag: a Bag with the merged content """ sub_dfs = [db.sub_df for db in bags] pre_dfs = [db.pre_df for db in bags] num_dfs = [db.num_df for db in bags] # todo: might be more efficient if the contained maps were just combined # instead of being recalculated return RawDataBag.create(sub_df=pd.concat(sub_dfs, ignore_index=True), pre_df=pd.concat(pre_dfs, ignore_index=True), num_df=pd.concat(num_dfs, ignore_index=True))
def create(sub_df: pandas.core.frame.DataFrame, pre_df: pandas.core.frame.DataFrame, num_df: pandas.core.frame.DataFrame) ‑> ~RAW
-
create method for RawDataBag
Args
sub_df(pd.DataFrame): sub.txt dataframe pre_df(pd.DataFrame): pre.txt dataframe num_df(pd.DataFrame): num.txt dataframe
Returns
RawDataBag:
Expand source code
@classmethod def create(cls, sub_df: pd.DataFrame, pre_df: pd.DataFrame, num_df: pd.DataFrame) -> RAW: """ create method for RawDataBag Args: sub_df(pd.DataFrame): sub.txt dataframe pre_df(pd.DataFrame): pre.txt dataframe num_df(pd.DataFrame): num.txt dataframe Returns: RawDataBag: """ return RawDataBag(sub_df=sub_df, pre_df=pre_df, num_df=num_df)
def load(target_path: str) ‑> ~RAW
-
Loads the content of the current bag at the specified location.
Args
target_path
- the directory which contains the three parquet files for sub_txt, pre_txt,
and num_txt
Returns
RawDataBag
- the loaded Databag
Expand source code
@staticmethod def load(target_path: str) -> RAW: """ Loads the content of the current bag at the specified location. Args: target_path: the directory which contains the three parquet files for sub_txt, pre_txt, and num_txt Returns: RawDataBag: the loaded Databag """ sub_df = pd.read_parquet(os.path.join(target_path, f'{SUB_TXT}.parquet')) pre_df = pd.read_parquet(os.path.join(target_path, f'{PRE_TXT}.parquet')) num_df = pd.read_parquet(os.path.join(target_path, f'{NUM_TXT}.parquet')) return RawDataBag.create(sub_df=sub_df, pre_df=pre_df, num_df=num_df)
Methods
def copy_bag(self)
-
creates a bag with new copies of the internal dataframes.
Returns
RawDataBag
- new instance of JoinedDataBag
Expand source code
def copy_bag(self): """ creates a bag with new copies of the internal dataframes. Returns: RawDataBag: new instance of JoinedDataBag """ return RawDataBag.create(sub_df=self.sub_df.copy(), pre_df=self.pre_df.copy(), num_df=self.num_df.copy())
def get_num_copy(self) ‑> pandas.core.frame.DataFrame
-
Returns a copy of the num.txt dataframe.
Returns
pd.DataFrame
- copy of the num.txt dataframe.
Expand source code
def get_num_copy(self) -> pd.DataFrame: """ Returns a copy of the num.txt dataframe. Returns: pd.DataFrame: copy of the num.txt dataframe. """ return self.num_df.copy()
def get_pre_copy(self) ‑> pandas.core.frame.DataFrame
-
Returns a copy of the pre.txt dataframe.
Returns
pd.DataFrame
- copy of the pre.txt dataframe.
Expand source code
def get_pre_copy(self) -> pd.DataFrame: """ Returns a copy of the pre.txt dataframe. Returns: pd.DataFrame: copy of the pre.txt dataframe. """ return self.pre_df.copy()
def get_sub_copy(self) ‑> pandas.core.frame.DataFrame
-
Returns a copy of the sub.txt dataframe.
Returns
pd.DataFrame
- copy of the sub.txt dataframe.
Expand source code
def get_sub_copy(self) -> pd.DataFrame: """ Returns a copy of the sub.txt dataframe. Returns: pd.DataFrame: copy of the sub.txt dataframe. """ return self.sub_df.copy()
def join(self) ‑> JoinedDataBag
-
merges the raw data of pre and num together.
Returns
JoinedDataBag
- the DataBag where pre and num are merged
Expand source code
def join(self) -> JoinedDataBag: """ merges the raw data of pre and num together. Returns: JoinedDataBag: the DataBag where pre and num are merged """ # merge num and pre together. only rows in num are considered for which entries in pre exist pre_num_df = pd.merge(self.num_df, self.pre_df, on=['adsh', 'tag', 'version']) # don't produce index_x and index_y columns return JoinedDataBag.create(sub_df=self.sub_df, pre_num_df=pre_num_df)
def save(self, target_path: str)
-
Stores the bag under the given directory. The directory has to exist and must be empty.
Args
target_path
- the directory under which three parquet files for sub_txt, pre_text, and num_txt will be created
Expand source code
def save(self, target_path: str): """ Stores the bag under the given directory. The directory has to exist and must be empty. Args: target_path: the directory under which three parquet files for sub_txt, pre_text, and num_txt will be created """ if not os.path.isdir(target_path): raise ValueError(f"the path {target_path} does not exist") if len(os.listdir(target_path)) > 0: raise ValueError(f"the target_path {target_path} is not empty") self.sub_df.to_parquet(os.path.join(target_path, f'{SUB_TXT}.parquet')) self.pre_df.to_parquet(os.path.join(target_path, f'{PRE_TXT}.parquet')) self.num_df.to_parquet(os.path.join(target_path, f'{NUM_TXT}.parquet'))
def statistics(self) ‑> RawDataBagStats
-
calculate a few simple statistics of a report. - number of entries in the num-file - number of entries in the pre-file - number of reports in the zip-file (equals number of entries in sub-file) - number of reports per form (10-K, 10-Q, …) - number of reports per period date (counts per value in the period column of sub-file)
Returns
RawDataBagStats
- instance with basic report infos
Expand source code
def statistics(self) -> RawDataBagStats: """ calculate a few simple statistics of a report. - number of entries in the num-file - number of entries in the pre-file - number of reports in the zip-file (equals number of entries in sub-file) - number of reports per form (10-K, 10-Q, ...) - number of reports per period date (counts per value in the period column of sub-file) Returns: RawDataBagStats: instance with basic report infos """ num_entries = len(self.num_df) pre_entries = len(self.pre_df) number_of_reports = len(self.sub_df) reports_per_period_date: Dict[int, int] = self.sub_df.period.value_counts().to_dict() reports_per_form: Dict[str, int] = self.sub_df.form.value_counts().to_dict() return RawDataBagStats(num_entries=num_entries, pre_entries=pre_entries, number_of_reports=number_of_reports, reports_per_form=reports_per_form, reports_per_period_date=reports_per_period_date )
Inherited members
class RawDataBagStats (num_entries: int, pre_entries: int, number_of_reports: int, reports_per_form: Dict[str, int], reports_per_period_date: Dict[int, int])
-
Contains simple statistics of a report.
Expand source code
class RawDataBagStats: """ Contains simple statistics of a report. """ num_entries: int pre_entries: int number_of_reports: int reports_per_form: Dict[str, int] reports_per_period_date: Dict[int, int]
Class variables
var num_entries : int
var number_of_reports : int
var pre_entries : int
var reports_per_form : Dict[str, int]
var reports_per_period_date : Dict[int, int]