Source code for smoke_forcing.utils.url_parse

import requests
from requests.auth import HTTPBasicAuth
from bs4 import BeautifulSoup


def get_auth(username, password):
    """
    Build the HTTP basic-auth credentials object for a request

    Args:
        username (str)
        password (str)

    Returns:
        HTTPBasicAuth: credentials wrapping the given username and password
    """
    credentials = HTTPBasicAuth(username, password)
    return credentials
def parse_protected_html(url, username, password, timeout=None):
    """
    Parse password protected site

    Args:
        url (str): url for secure site
        username (str)
        password (str)
        timeout (float or tuple, optional): Seconds to wait for the server,
            passed unchanged to ``requests.get``. Defaults to None (no
            timeout), which is what the call did before this parameter
            existed.

    Returns:
        BeautifulSoup: parsed HTML

    Raises:
        requests.HTTPError: if the returned page's title contains "404"
    """
    response = requests.get(
        url, auth=get_auth(username, password), timeout=timeout
    )
    soup = BeautifulSoup(response.text, "html.parser")

    # Failure is detected from the page title, not the HTTP status code.
    # A page with no <title> at all is accepted as-is instead of crashing
    # with IndexError on an empty find_all() result.
    title = soup.find("title")
    if title is not None and "404" in title.text:
        raise requests.HTTPError(f"Invalid authentication for {url}")
    return soup
def get_url_list(base_url, soup, container="a"):
    """
    Get all urls from webpage in container type container

    Args:
        base_url (str): prefix prepended verbatim to every href found
        soup (BeautifulSoup): parsed webpage
        container (str, optional): What HTML container to look for hrefs in.
            Defaults to "a".

    Returns:
        list of str: ``base_url + href`` for every matching element that has
        an href, in the order ``soup.find_all`` yields them
    """
    # .get("href") is None for elements lacking the attribute (e.g. named
    # anchors); concatenating that onto base_url would raise TypeError, so
    # those elements are skipped.
    hrefs = (node.get("href") for node in soup.find_all(container))
    return [base_url + href for href in hrefs if href is not None]
def filter_by_ext(url_list, ext):
    """
    Keep only the entries that end with the given extension

    Args:
        url_list (list of str)
        ext (str): File extension to filter by

    Returns:
        list of str: the entries of url_list ending with ext, in their
        original order
    """
    matches = []
    for candidate in url_list:
        if not candidate.endswith(ext):
            continue
        matches.append(candidate)
    return matches