Module secfsdstools.c_download.secdownloading_process
Downloading zip files of the financial statement data sets from the SEC.
"""
Downloading zip files of the financial statement data sets from the sec.
"""
import logging
import os
import re
from typing import List, Tuple
from secfsdstools.a_utils.downloadutils import UrlDownloader
from secfsdstools.c_download.basedownloading_process import BaseDownloadingProcess
LOGGER = logging.getLogger(__name__)
class SecDownloadingProcess(BaseDownloadingProcess):
"""
Downloading the quarterly zip files of the financial statement data sets
"""
FIN_STAT_DATASET_URL = 'https://www.sec.gov/dera/data/financial-statement-data-sets.html'
# FIN_STAT_DATASET_ARCHIVE_URL = \
# 'https://www.sec.gov/data-research/sec-markets-data/financial-statement-data-sets-archive'
table_re = re.compile('<TABLE.*?>.*</TABLE>', re.IGNORECASE + re.MULTILINE + re.DOTALL)
href_re = re.compile("href=\".*?\"", re.IGNORECASE + re.MULTILINE + re.DOTALL)
def __init__(self,
zip_dir: str,
parquet_root_dir: str,
urldownloader: UrlDownloader,
execute_serial: bool = False):
super().__init__(zip_dir=zip_dir,
urldownloader=urldownloader,
parquet_dir=os.path.join(parquet_root_dir, 'quarter'),
execute_serial=execute_serial
)
def _get_available_zips(self) -> List[Tuple[str, str]]:
# # reading data from the archived page - until 2024q3.zip
# LOGGER.info("reading table in archive: %s", self.FIN_STAT_DATASET_ARCHIVE_URL)
# archive_content = self.urldownloader.get_url_content(self.FIN_STAT_DATASET_ARCHIVE_URL)
# archive_tables = self.table_re.findall(archive_content.text)
#
# archive_hrefs: List[str] = []
#
# if len(archive_tables) == 0:
# LOGGER.warning("No archive table found at: %s", self.FIN_STAT_DATASET_ARCHIVE_URL)
# else:
# archive_first_table = archive_tables[0]
# archive_hrefs = self.href_re.findall(archive_first_table)
# archive_hrefs = [f'https://www.sec.gov{href[6:-1]}' for href in archive_hrefs]
# reading data from the main url - starting with 2024q4.zip
LOGGER.info("reading table in main page: %s", self.FIN_STAT_DATASET_URL)
main_content = self.urldownloader.get_url_content(self.FIN_STAT_DATASET_URL)
main_tables = self.table_re.findall(main_content.text)
main_hrefs: List[str] = []
if len(main_tables) == 0:
LOGGER.warning("No table found at: %s", self.FIN_STAT_DATASET_URL)
else:
main_first_table = main_tables[0]
main_hrefs = self.href_re.findall(main_first_table)
main_hrefs = [f'https://www.sec.gov{href[6:-1]}' for href in main_hrefs]
# hrefs = archive_hrefs + main_hrefs
# return_value: List[Tuple[str, str]] = [(os.path.basename(href), href) for href in hrefs]
#
# return_value = [(n.replace("-archive", ""), p) for n, p in return_value]
# return return_value
return_value: List[Tuple[str, str]] = \
[(os.path.basename(href), href) for href in main_hrefs]
return return_value
def _calculate_missing_zips(self) -> List[Tuple[str, str]]:
downloaded_zip_files = self._get_downloaded_zips()
transformed_parquet = self._get_transformed_parquet()
available_zips_to_dld_dict = self._get_available_zips()
# define which zip files don't have to be downloaded
download_or_transformed_zips = set(downloaded_zip_files).union(set(transformed_parquet))
return [(name, href) for name, href in available_zips_to_dld_dict if
name not in download_or_transformed_zips]
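The scraping logic is worth a closer look: table_re grabs the first <TABLE> element of the page, href_re pulls the href="..." attributes out of it, and the slice href[6:-1] strips the leading 'href="' (six characters) and the trailing quote before the path is prefixed with https://www.sec.gov. Below is a minimal sketch of that extraction and of the set-based filtering in _calculate_missing_zips; the HTML snippet is invented for illustration and stands in for the real SEC page content.

import os
import re

# same patterns as in SecDownloadingProcess
table_re = re.compile('<TABLE.*?>.*</TABLE>', re.IGNORECASE + re.MULTILINE + re.DOTALL)
href_re = re.compile("href=\".*?\"", re.IGNORECASE + re.MULTILINE + re.DOTALL)

# invented snippet standing in for the downloaded page content
html = """
<table>
  <tr><td><a href="/files/dera/data/financial-statement-data-sets/2024q4.zip">2024 Q4</a></td></tr>
  <tr><td><a href="/files/dera/data/financial-statement-data-sets/2025q1.zip">2025 Q1</a></td></tr>
</table>
"""

first_table = table_re.findall(html)[0]
hrefs = href_re.findall(first_table)  # e.g. 'href="/files/.../2024q4.zip"'

# href[6:-1] drops the leading 'href="' (6 chars) and the trailing quote
urls = [f'https://www.sec.gov{href[6:-1]}' for href in hrefs]
names_and_urls = [(os.path.basename(url), url) for url in urls]
# -> [('2024q4.zip', 'https://www.sec.gov/files/.../2024q4.zip'),
#     ('2025q1.zip', 'https://www.sec.gov/files/.../2025q1.zip')]

# _calculate_missing_zips then subtracts what is already on disk: the union
# of downloaded zip names and transformed parquet names acts as an exclusion set
already_processed = {'2024q4.zip'}
missing = [(name, url) for name, url in names_and_urls
           if name not in already_processed]
# -> only the 2025q1.zip entry would still be downloaded

Note that the method only ever looks at the first table on the page; if the SEC changes the page layout so that no table is found, the warning branch fires and no zips are reported.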
Classes
class SecDownloadingProcess (zip_dir: str, parquet_root_dir: str, urldownloader: UrlDownloader, execute_serial: bool = False)
Downloading the quarterly zip files of the financial statement data sets
Constructor.
Args
zip_dir
- target folder in which the downloaded files are stored.
parquet_root_dir
- root directory for the transformed content of the zip files (a 'quarter' subfolder is used).
urldownloader
- UrlDownloader instance used to fetch the files.
execute_serial
- whether to execute the download serially instead of in parallel.
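A minimal usage sketch follows. The user_agent argument of UrlDownloader and the inherited process() entry point are assumptions about the surrounding library, not defined in this module, and the paths are hypothetical:

from secfsdstools.a_utils.downloadutils import UrlDownloader
from secfsdstools.c_download.secdownloading_process import SecDownloadingProcess

# assumption: UrlDownloader is constructed with a descriptive user agent,
# which the SEC expects for automated requests
downloader = UrlDownloader(user_agent="your name your.email@example.com")

download_process = SecDownloadingProcess(
    zip_dir="/data/secfs/dld",               # hypothetical folder for the downloaded zips
    parquet_root_dir="/data/secfs/parquet",  # hypothetical root; 'quarter' is appended internally
    urldownloader=downloader,
    execute_serial=False,                    # fetch the files in parallel
)

# assumption: process() is the entry point inherited from BaseDownloadingProcess;
# it downloads every zip that _calculate_missing_zips() reports
download_process.process()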
Ancestors
- secfsdstools.c_download.basedownloading_process.BaseDownloadingProcess
Class variables
var FIN_STAT_DATASET_URL
var href_re
var table_re
Inherited members