import requests
from requests.auth import HTTPBasicAuth
from bs4 import BeautifulSoup
def get_auth(username, password):
    """ Build an HTTP Basic authentication object

    Args:
        username (str)
        password (str)

    Returns:
        HTTPBasicAuth: auth object built from ``username`` and ``password``
    """
    credentials = HTTPBasicAuth(username, password)
    return credentials
def parse_protected_html(url, username, password):
    """ Parse password protected site

    The page is requested with the auth object returned by ``get_auth`` and
    the response body is parsed with BeautifulSoup's ``html.parser`` backend.

    Args:
        url (str): url for secure site
        username (str)
        password (str)

    Returns:
        BeautifulSoup: parsed HTML

    Raises:
        requests.HTTPError: if the server answers with HTTP 401, or if the
            title of the returned page contains "404".
    """
    response = requests.get(url, auth=get_auth(username, password))

    # A 401 status is the server's explicit signal that the credentials were
    # rejected; without this check the error page would be returned as if the
    # request had succeeded.
    if response.status_code == 401:
        raise requests.HTTPError(
            f"Invalid authentication for {url}", response=response
        )

    soup = BeautifulSoup(response.text, "html.parser")

    # Pages without a <title> are valid; only inspect the title when present
    # instead of indexing into a possibly empty result list.
    title = soup.find("title")
    if title is not None and "404" in title.text:
        raise requests.HTTPError(f"Invalid authentication for {url}")
    return soup
def get_url_list(base_url, soup, container="a"):
    """ Get all urls from webpage in container type container

    Each ``href`` value is appended to ``base_url`` by plain string
    concatenation. Tags of the requested type that carry no ``href``
    attribute are skipped.

    Args:
        base_url (str): prefix prepended to every href
        soup (BeautifulSoup): parsed webpage
        container (str, optional): What HTML container to look for hrefs in. Defaults to "a".

    Returns:
        list of str: ``base_url + href`` for every matching tag with an href,
            in the order returned by ``soup.find_all``
    """
    urls = []
    for node in soup.find_all(container):
        href = node.get("href")
        # Tags without an href (e.g. named anchors) yield None, which cannot
        # be concatenated to base_url.
        if href is None:
            continue
        urls.append(base_url + href)
    return urls
def filter_by_ext(url_list, ext):
    """ Keep only the urls that end with a given extension

    Args:
        url_list (list of str): urls to filter
        ext (str): File extension to filter by; compared as a plain suffix

    Returns:
        list of str: the entries of ``url_list`` ending with ``ext``, in
            their original order
    """
    matching = []
    for candidate in url_list:
        if candidate.endswith(ext):
            matching.append(candidate)
    return matching