# Source code for smoke_forcing.utils.url_parse

import requests
from requests.auth import HTTPBasicAuth
from bs4 import BeautifulSoup


def get_auth(username, password):
    """Build an HTTP basic-authentication object for ``requests`` calls.

    Args:
        username (str): Account name to authenticate with.
        password (str): Password for that account.

    Returns:
        HTTPBasicAuth: Object suitable for the ``auth=`` argument of
            ``requests`` request functions.
    """
    return HTTPBasicAuth(username, password)


def parse_protected_html(url, username, password, timeout=30):
    """Fetch a password-protected page and parse it as HTML.

    Args:
        url (str): URL of the secured page.
        username (str): Username for HTTP basic authentication.
        password (str): Password for HTTP basic authentication.
        timeout (float or tuple, optional): Seconds to wait for the server,
            passed unchanged to ``requests.get``. Defaults to 30.

    Returns:
        BeautifulSoup: Parsed HTML of the response body.

    Raises:
        requests.HTTPError: If the server responds with a 4xx/5xx status, or
            if the returned page's ``<title>`` contains "404".
        requests.RequestException: Other request failures (connection
            errors, timeouts) propagate from ``requests.get``.
    """
    response = requests.get(
        url, auth=get_auth(username, password), timeout=timeout
    )
    # Check the status code first: a rejected login (401/403) or a server
    # error would otherwise be parsed and returned as if it were the listing.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")

    # Some servers answer 200 but serve an error page whose title carries the
    # "404"; keep that heuristic, but tolerate pages without any <title>.
    title = soup.find("title")
    if title is not None and "404" in title.text:
        raise requests.HTTPError(
            f"Invalid authentication for {url}", response=response
        )
    return soup


def get_url_list(base_url, soup, container="a"):
    """Collect the links found in a parsed page, prefixed with ``base_url``.

    Each ``href`` is appended to ``base_url`` by plain string concatenation,
    so ``base_url`` should already end with any separator (e.g. ``/``) the
    links need.

    Args:
        base_url (str): Prefix prepended to every ``href`` value.
        soup (BeautifulSoup): Parsed webpage.
        container (str, optional): What HTML container to look for hrefs in.
            Defaults to "a".

    Returns:
        list of str: One URL per matching tag that has an ``href``
            attribute, in document order.
    """
    urls = []
    for node in soup.find_all(container):
        href = node.get("href")
        # Tags without an href (e.g. named anchors) yield None, which cannot
        # be concatenated onto base_url; skip them instead of crashing.
        if href is not None:
            urls.append(base_url + href)
    return urls


def filter_by_ext(url_list, ext):
    """Keep only the URLs that end with the given extension.

    The comparison is a case-sensitive suffix match (``str.endswith``).

    Args:
        url_list (list of str): URLs to filter.
        ext (str): File extension to filter by, e.g. ".nc".

    Returns:
        list of str: The entries of ``url_list`` ending with ``ext``, in
            their original order.
    """
    return [url for url in url_list if url.endswith(ext)]