Files
website-v2/core/htmlhelper.py
Greg Newman 7c00554bf4 cleans up antora header
* Cleans up the remnants of the antora header during injection.
* Turns on the dark mode switcher for the user-guide
2023-09-13 09:29:54 -04:00

212 lines
7.5 KiB
Python

from bs4 import BeautifulSoup
# List HTML elements (with relevant attributes) to remove the FIRST occurrence
REMOVE_TAGS = [
    # Each entry is a (tag_name, attrs_dict) pair passed to soup.find();
    # only the FIRST match in a document is decomposed (see
    # modernize_legacy_page). Attribute dicts must match the legacy markup
    # exactly, which is why near-duplicate entries differ only in case.
    # Remove custom headers, these vary from lib to lib, it's usually a table
    # /docs/libs/1_82_0/ (maps to index.html and has removable headers)
    (
        "table",
        {
            "bgcolor": "#D7EEFF",
            "border": "0",
            "bordercolor": "#111111",
            "cellpadding": "5",
            "cellspacing": "0",
            "style": "border-collapse: collapse",
        },
    ),
    # /doc/libs/1_82_0/libs/functional/index.html
    # /doc/libs/1_82_0/libs/functional/negators.html
    # /doc/libs/1_82_0/libs/functional/ptr_fun.html
    # /doc/libs/1_82_0/libs/functional/function_traits.html
    # /doc/libs/1_82_0/libs/functional/mem_fun.html
    # /doc/libs/1_82_0/libs/functional/binders.html
    # /doc/libs/1_82_0/libs/uuid/doc/index.html
    # /doc/libs/1_82_0/libs/rational/index.html
    # /doc/libs/1_82_0/libs/format/index.html
    ("table", {"bgcolor": "#007F7F", "border": "1", "cellpadding": "2"}),
    # /docs/libs/1_82_0/libs/multi_array/doc/index.html (lowercase)
    ("table", {"bgcolor": "#007f7f", "border": "1", "cellpadding": "2"}),
    # /docs/libs/1_82_0/libs/gil/doc/html/index.html
    (
        "table",
        {
            "summary": "header",
            "width": "100%",
            "cellspacing": "0",
            "cellpadding": "7",
            "border": "0",
        },
    ),
    # very prominent header
    # /docs/libs/1_82_0/libs/locale/doc/html/index.html
    ("div", {"id": "top"}),
    # almost every other page has this as a header
    # NOTE: kept last on purpose — it is the most generic selector and
    # could otherwise shadow the more specific table patterns above.
    ("table", {"cellpadding": "2", "width": "100%"}),
]
# List HTML elements (with relevant attributes) to remove ALL occurrences
REMOVE_ALL = [
    # Each (tag_name, attrs_dict) pair is passed to soup.find_all(); ALL
    # matches are decomposed (see modernize_legacy_page).
    # the legacy logo referenced from multiple pages at different depths
    ("header", {"class": "header"}),
    # The "src" attribute must match verbatim, so every relative depth of
    # the legacy boost.png logo gets its own entry.
    ("img", {"src": "../../../../boost.png"}),
    ("img", {"src": "../../../boost.png"}),
    ("img", {"src": "../../boost.png"}),
    ("img", {"src": "../boost.png"}),
    ("img", {"src": "boost.png"}),
    ("img", {"src": "images/boost.png"}),
    # These are navigation controls, like next/up/prev. Do not remove for now.
    # most pages, e.g. /docs/libs/1_82_0/libs/iterator/doc/html/index.html
    # ("div", {"class": "spirit-nav"}),
    # /docs/libs/1_82_0/libs/gil/doc/html/index.html
    # ("div", {"class": "navbar"}),
    # /docs/libs/1_82_0/libs/iostreams/doc/guide/generic_streams.html
    # ("div", {"class": "nav"}),
]
# List HTML elements (with relevant attributes) to remove ONLY their CSS class
# NOTE(review): name has a typo ("CLASSESS"), but it is module-level API and
# may be imported elsewhere, so it is kept as-is.
REMOVE_CSS_CLASSESS = [
    # Each (tag_name, attrs_dict) pair is passed to soup.find_all(); every
    # match keeps its tag but has its entire "class" attribute popped
    # (see modernize_legacy_page).
    # /docs/libs/1_55_0/libs/exception/doc/boost_exception_all_hpp.html
    ("div", {"class": "body-0"}),
    ("div", {"class": "body-1"}),
    ("div", {"class": "body-2"}),
    # /docs/libs/1_82_0/libs/numeric/conversion/doc/html/index.html
    ("div", {"class": "toc"}),
    ("dl", {"class": "toc"}),
]
def _insert_in_doc(target, elements, append=True):
    """Insert *elements* into *target*, bracketed by marker comments.

    Args:
        target: a BeautifulSoup tag (or soup) to receive the elements.
        elements: an iterable of elements/tags to insert.
        append (bool): when True, extend at the end of *target*; when False,
            insert the whole group at position 0, preserving its order.
    """
    # Pass an explicit parser: the rest of this module always uses
    # "html.parser", and omitting it both emits bs4's GuessedAtParserWarning
    # and makes the parse depend on which optional parsers are installed.
    to_add = [
        BeautifulSoup("<!-- BEGIN Manually appending items -->", "html.parser"),
        *elements,
        BeautifulSoup("<!-- END Manually appending items -->", "html.parser"),
    ]
    if append:
        target.extend(to_add)
    else:
        # Insert in reverse so the group ends up at the top in original order.
        for i in reversed(to_add):
            target.insert(0, i)
def _insert_head(result, head_adding):
    """Merge the elements in *head_adding* into ``result``'s <head>.

    Creates a <head> tag first if the document lacks one. If the inserted
    content itself contained a <head> wrapper, that nested wrapper is
    unwrapped so its children sit directly inside the real <head>.
    """
    if result.head is None:
        new_head = result.new_tag("head")
        result.html.insert(0, new_head)
    _insert_in_doc(result.head, head_adding)
    nested_head = result.head.head
    if nested_head is not None:
        nested_head.unwrap()
def _replace_body(result, original_body, base_body):
    """Swap ``result``'s <body> for *base_body*, embedding the original.

    Looks for the ``<div id="boost-legacy-docs-body">`` placeholder inside
    *base_body*; when present, the document's body becomes *base_body* and
    *original_body* is appended inside that placeholder div. A stray nested
    <body> wrapper left over from the swap is unwrapped. When the
    placeholder div is absent, the document is left untouched.
    """
    container = base_body.find("div", {"id": "boost-legacy-docs-body"})
    if container is None:
        return
    result.body.replace_with(base_body)
    _insert_in_doc(container, [original_body])
    result.body.body.unwrap()
def modernize_legacy_page(content, base_html, head_selector="head", insert_body=True):
    """Clean a legacy Boost docs page and graft on the modern site chrome.

    Args:
        content (str): raw HTML of the legacy page.
        base_html (str): HTML of the modern base template providing the
            <head> and (optionally) the <body> scaffolding.
        head_selector (str | dict): tag name, or keyword-argument dict for
            ``find_all``, selecting which parts of *base_html* to merge into
            the page's <head>. Any other type skips the head merge.
        insert_body (bool): when True, embed the legacy body inside the
            template's ``<div id="boost-legacy-docs-body">``; when False,
            only prepend the template's ``boost-legacy-docs-header`` div.

    Returns:
        str: the prettified, rewritten HTML; or *content* unchanged when it
        is not an HTML document.
    """
    result = BeautifulSoup(content, "html.parser")
    if result.html is None:
        # Not an HTML file we care about
        return content
    # Remove the first occurrence of legacy header(s) and other stuff
    for tag_name, tag_attrs in REMOVE_TAGS:
        tag = result.find(tag_name, tag_attrs)
        if tag:
            tag.decompose()
    # Remove all navbar-like divs, if any
    for tag_name, tag_attrs in REMOVE_ALL:
        for tag in result.find_all(tag_name, tag_attrs):
            tag.decompose()
    # Remove CSS classes that produce visual harm
    for tag_name, tag_attrs in REMOVE_CSS_CLASSESS:
        for tag in result.find_all(tag_name, tag_attrs):
            # Safe: find_all matched on "class", so the attribute exists.
            tag.attrs.pop("class")
    # Use the base HTML to later extract the <head> and (part of) the <body>
    placeholder = BeautifulSoup(base_html, "html.parser")
    if isinstance(head_selector, str):
        target_head = placeholder.find_all(head_selector)
    elif isinstance(head_selector, dict):
        target_head = placeholder.find_all(**head_selector)
    else:
        target_head = None
    if target_head:
        # Append the <head> taken from the base HTML to the existing (legacy) head
        _insert_head(result, target_head)
    original_body = result.body
    if original_body is not None and placeholder.body is not None:
        if insert_body:
            # Beautify the legacy body with structure and classes from the
            # modern one, and embed the original body into a:
            # <div id="boost-legacy-docs-body"></div> block
            _replace_body(result, original_body, base_body=placeholder.body)
        else:
            header = placeholder.find("div", {"id": "boost-legacy-docs-header"})
            # BUG FIX: _insert_in_doc unpacks its "elements" argument, so a
            # bare Tag was unpacked into its *children* (dropping the wrapper
            # div) and a missing header div (find() -> None) raised
            # TypeError. Guard None and wrap the tag in a list, matching the
            # call in _replace_body.
            if header is not None:
                _insert_in_doc(result.body, [header], append=False)
    content = result.prettify()
    # Replace all links to boost.org with a local link
    content = content.replace("https://www.boost.org/doc/libs/", "/docs/libs/")
    return content
def get_library_documentation_urls(content, name="Alphabetically", parent="h2"):
    """
    Takes HTML content and returns a list of tuples containing library
    names and the paths to those libraries' docs. This is used to
    update the documentation_url field on LibraryVersion objects.

    Args:
        content (str): HTML content from the libraries.htm file. For example, the
            HTML content from `/docs/libs/1_82_0/libs/libraries.htm`.
        name (str): The name of the section to search for. Defaults to "Alphabetically".
        parent (str): The parent tag of the section to search for. Defaults to "h2".

        Together, parent and string define what HTML tag to search for. For example,
        if parent="h2" and name="Alphabetically", this function will search for
        <h2 name="Alphabetically">.

    Returns:
        list: A list of tuples containing library names and the paths to those
        libraries' docs. For example, `[(library_name, path), ...]`.
        Returns [] when the expected structure is missing; list items
        without a hyperlink are skipped.
    """
    soup = BeautifulSoup(content, "html.parser")
    # Find the tag that contains the list of libraries
    tag = soup.find("a", attrs={"name": name})
    if not tag:
        return []
    # Guard: the anchor may not be nested inside the expected parent tag.
    # Previously a missing parent raised AttributeError on None.
    section = tag.find_parent(parent)
    if section is None:
        return []
    # Get the next <ul> tag, which contains the list of libraries.
    # Guard: the section may not be followed by a <ul> at all.
    library_list_tag = section.find_next_sibling("ul")
    if library_list_tag is None:
        return []
    # Now get all the items in the list
    library_tags = library_list_tag.find_all("li")
    if not library_tags:
        return []
    results = []
    for library_tag in library_tags:
        # Look up the anchor once (was two identical find() calls) and skip
        # malformed items instead of raising TypeError/KeyError.
        anchor = library_tag.find("a")
        if anchor is None or not anchor.has_attr("href"):
            continue
        # (library name, url path for the documentation)
        results.append((anchor.get_text(), anchor["href"]))
    return results