Module secfsdstools.a_utils.downloadutils
Download utils to download data from the SEC website.
Source code
"""
Download utils to download data from the SEC website.
"""
import logging
from time import sleep
from typing import Dict
import requests
from secfsdstools.a_utils.fileutils import write_content_to_zip
LOGGER = logging.getLogger(__name__)
class UrlDownloader:
"""
Main downloader class
"""
def __init__(self, user_agent: str = "<not set>"):
"""
Args:
user_agent (str): according to https://www.sec.gov/os/accessing-edgar-data in the form
User-Agent: Sample Company Name AdminContact@<sample company domain>.com
"""
self.user_agent = user_agent
def download_url_to_file(self, file_url: str, target_file: str,
expected_size: int = None,
max_tries: int = 6,
sleep_time: int = 1,
headers: Dict[str, str] = None):
"""
downloads the content auf an url and stores it into the target-file.
retries a download several times, if it fails
Args:
file_url (str): url that referencese the file to be downloaded
target_file (str): the file to store the content into
(it will be written into a zipfile)
expected_size (str, optional, None): the expected size of
the data that is downloaded.
logs a warning if the size doesn't match
max_tries (int, optional, 6): maximum retries, default is 6
sleep_time (int, optional, 1): wait time between retries,
default is one second
headers (Dict[str, str], optional, None}): additional headers
Returns:
str: the written file name
"""
response = self.get_url_content(file_url, max_tries, sleep_time, headers=headers)
content = response.text
if expected_size is not None:
if len(content) != expected_size:
LOGGER.info('warning expected size %d - real size %d', expected_size, len(content))
return write_content_to_zip(content, target_file)
def binary_download_url_to_file(self, file_url: str,
target_file: str,
max_tries: int = 6,
sleep_time: int = 1,
headers: Dict[str, str] = None):
"""
downloads the binary of an url and stores it into the target-file.
retries a download several times, if it fails
Args:
file_url (str): url that referencese the file to be downloaded
target_file (str): the file to store the content into
(it will be written into a zipfile)
max_tries (int, optional, 6): maximum retries, default is 6
sleep_time (int, optional, 1): wait time between retries, default is one second
headers (Dict[str, str], optional, None}): additional headers
"""
response = self.get_url_content(file_url, max_tries, sleep_time, headers=headers)
with open(target_file, "wb") as target_fp:
target_fp.write(response.content)
def get_url_content(self, url: str, max_tries: int = 6,
sleep_time: int = 1, headers: Dict[str, str] = None) \
-> requests.models.Response:
"""
downloads the content auf an url and returns it as a string.
retries a download several times, if it fails.
Uses the defined user-agent as header information
Args:
url (str): url that referencese the file to be downloaded
max_tries (int, optional, 6): maximum number of tries to get the data
sleep_time (int, optional, 1): wait time between retries, default is one second
headers (Dict[str, str], optional, None}): additional headers
Returns:
requests.models.Response
"""
response = None
current_try = 0
while current_try < max_tries:
current_try += 1
try:
if headers is None:
headers = {'User-Agent': self.user_agent}
else:
headers.update({'User-Agent': self.user_agent})
response = requests.get(url, timeout=10,
headers=headers, stream=True)
response.raise_for_status()
break
except requests.exceptions.RequestException as err:
if current_try >= max_tries:
LOGGER.info('RequestException: failed to download %s2', url)
raise err
sleep(sleep_time)
return response
Classes
class UrlDownloader (user_agent: str = '<not set>')
Main downloader class

Args
    user_agent : str
        according to https://www.sec.gov/os/accessing-edgar-data in the form
        User-Agent: Sample Company Name AdminContact@<sample company domain>.com
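A minimal usage sketch. The user agent string is a placeholder to be replaced with your own contact details, and the URL is an illustrative EDGAR resource:

    from secfsdstools.a_utils.downloadutils import UrlDownloader

    # the SEC requires a descriptive user agent, see https://www.sec.gov/os/accessing-edgar-data
    downloader = UrlDownloader(user_agent="Sample Company Name AdminContact@sample-company.com")

    # fetch an EDGAR resource; the URL is an example, any SEC URL works
    response = downloader.get_url_content("https://www.sec.gov/files/company_tickers.json")
    print(response.status_code)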
Methods
def binary_download_url_to_file(self, file_url: str, target_file: str, max_tries: int = 6, sleep_time: int = 1, headers: Optional[Dict[str, str]] = None)
Downloads the binary content of a URL and stores it in the target file. Retries the download several times if it fails.

Args
    file_url : str
        URL that references the file to be downloaded
    target_file : str
        the file to store the content in
    max_tries : int, optional, default 6
        maximum number of tries
    sleep_time : int, optional, default 1
        wait time between retries, in seconds
    headers : Dict[str, str], optional, default None
        additional headers
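A sketch of a binary download; the archive URL is illustrative and may change on the SEC site:

    from secfsdstools.a_utils.downloadutils import UrlDownloader

    downloader = UrlDownloader(user_agent="Sample Company Name AdminContact@sample-company.com")

    # store the response bytes unchanged on disk (no zip wrapping here)
    downloader.binary_download_url_to_file(
        file_url="https://www.sec.gov/files/dera/data/financial-statement-data-sets/2022q4.zip",
        target_file="2022q4.zip",
    )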
def download_url_to_file(self, file_url: str, target_file: str, expected_size: Optional[int] = None, max_tries: int = 6, sleep_time: int = 1, headers: Optional[Dict[str, str]] = None)
Downloads the content of a URL and stores it in the target file. Retries the download several times if it fails.

Args
    file_url : str
        URL that references the file to be downloaded
    target_file : str
        the file to store the content in (it will be written into a zip file)
    expected_size : int, optional, default None
        the expected size of the downloaded data;
        logs a warning if the size doesn't match
    max_tries : int, optional, default 6
        maximum number of tries
    sleep_time : int, optional, default 1
        wait time between retries, in seconds
    headers : Dict[str, str], optional, default None
        additional headers

Returns
    str
        the written file name
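A sketch of a text download; the index URL and the target name are illustrative:

    from secfsdstools.a_utils.downloadutils import UrlDownloader

    downloader = UrlDownloader(user_agent="Sample Company Name AdminContact@sample-company.com")

    # the text content is compressed via write_content_to_zip;
    # the returned value is the name of the written file
    written_file = downloader.download_url_to_file(
        file_url="https://www.sec.gov/Archives/edgar/full-index/2022/QTR4/company.idx",
        target_file="company_idx",  # hypothetical target name, stored as a zip file
    )
    print(written_file)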
def get_url_content(self, url: str, max_tries: int = 6, sleep_time: int = 1, headers: Optional[Dict[str, str]] = None) -> requests.models.Response
Downloads the content of a URL and returns the response object. Retries the download several times if it fails. Uses the configured user agent as header information.

Args
    url : str
        URL that references the file to be downloaded
    max_tries : int, optional, default 6
        maximum number of tries to get the data
    sleep_time : int, optional, default 1
        wait time between retries, in seconds
    headers : Dict[str, str], optional, default None
        additional headers

Returns
    requests.models.Response
        the response object
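A sketch showing how additional headers and the retry parameters interact; the header value is an arbitrary example:

    from secfsdstools.a_utils.downloadutils import UrlDownloader

    downloader = UrlDownloader(user_agent="Sample Company Name AdminContact@sample-company.com")

    # the configured User-Agent always overrides any User-Agent passed in headers;
    # a failing request is retried up to max_tries times, sleeping sleep_time seconds in between
    response = downloader.get_url_content(
        url="https://www.sec.gov/files/company_tickers.json",
        max_tries=3,
        sleep_time=2,
        headers={"Accept-Encoding": "gzip, deflate"},
    )
    print(len(response.content))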