import re from bs4 import BeautifulSoup, Comment, Tag from django.http import HttpHeaders from django.template.loader import render_to_string from django.templatetags.static import static from structlog import get_logger from core.boostrenderer import get_body_from_html from core.constants import ( SourceDocType, NO_PROCESS_LIBS, NO_WRAPPER_LIBS, FULLY_MODERNIZED_LIB_VERSIONS, ) logger = get_logger() # List HTML elements (with relevant attributes) to remove the FIRST occurrence REMOVE_TAGS = [ # Remove custom headers, these vary from lib to lib, it's usually a table # /docs/libs/1_82_0/ (maps to index.html and has removable headers) ( "table", { "border": "0", "bordercolor": "#111111", "cellpadding": "5", "cellspacing": "0", "style": "border-collapse: collapse", "width": "750", }, ), # /doc/libs/1_82_0/libs/functional/index.html # /doc/libs/1_82_0/libs/functional/negators.html # /doc/libs/1_82_0/libs/functional/ptr_fun.html # /doc/libs/1_82_0/libs/functional/function_traits.html # /doc/libs/1_82_0/libs/functional/mem_fun.html # /doc/libs/1_82_0/libs/functional/binders.html # /doc/libs/1_82_0/libs/uuid/doc/index.html # /doc/libs/1_82_0/libs/rational/index.html # /doc/libs/1_82_0/libs/format/index.html ("table", {"bgcolor": "#007F7F", "border": "1", "cellpadding": "2"}), # /docs/libs/1_82_0/libs/multi_array/doc/index.html (lowercase) ("table", {"bgcolor": "#007f7f", "border": "1", "cellpadding": "2"}), # /docs/libs/1_88_0/libs/statechart/doc/index.html ("td", {"valign": "top", "width": "300"}), # almost every other page has this as a header ("table", {"cellpadding": "2", "width": "100%"}), # Remove the first hr from the page ("hr", {}), # remove canonical tags ("link", {"rel": "canonical"}), ] # these tags are only removed on the release page, update REMOVE_TAGS for all pages REMOVE_TAGS_RELEASE = [ ("div", {"id": "footer"}), ] # List HTML elements (with relevant attributes) to remove ALL occurrences REMOVE_ALL = [ # the legacy logo referenced from multiple pages at different depths ("header", {"class": "header"}), ("img", {"src": "../../../../boost.png"}), ("img", {"src": "../../../boost.png"}), ("img", {"src": "../../boost.png"}), ("img", {"src": "../boost.png"}), ("img", {"src": "boost.png"}), ("img", {"src": "images/boost.png"}), # These are navigation controls, like next/up/prev. Do not remove for now. # most pages, e.g. /docs/libs/1_82_0/libs/iterator/doc/html/index.html # ("div", {"class": "spirit-nav"}), # /docs/libs/1_82_0/libs/gil/doc/html/index.html # ("div", {"class": "navbar"}), # /docs/libs/1_82_0/libs/iostreams/doc/guide/generic_streams.html # ("div", {"class": "nav"}), ] # List HTML elements (with relevant attributes) to remove ONLY their CSS class REMOVE_CSS_CLASSES = [ # /docs/libs/1_55_0/libs/exception/doc/boost_exception_all_hpp.html ("div", {"class": "body-0"}), ("div", {"class": "body-1"}), ("div", {"class": "body-2"}), # /docs/libs/1_82_0/libs/numeric/conversion/doc/html/index.html # ("div", {"class": "toc"}), # ("dl", {"class": "toc"}), # /doc/libs/boost_1_84_0/libs/container_hash/doc/html/hash.html # ("div", {"class": "toc2"}), ] def _insert_in_doc(target, elements, append=True): to_add = [ BeautifulSoup(""), *elements, BeautifulSoup(""), ] if append: target.extend(to_add) else: for i in reversed(to_add): target.insert(0, i) def _insert_head(result, head_adding): if result.head is None: result.html.insert(0, result.new_tag("head")) _insert_in_doc(result.head, head_adding) if result.head.head is not None: result.head.head.unwrap() def _replace_body(result, original_body, base_body): base_body_content = base_body.find("div", {"id": "boost-legacy-docs-body"}) if base_body_content is not None: result.body.replace_with(base_body) _insert_in_doc(base_body_content, [original_body]) result.body.body.unwrap() def wrap_main_body_elements( result: BeautifulSoup, original_docs_type: SourceDocType | None = None ): def is_end_comment(html_element): return ( isinstance(html_element, Comment) and html_element == " END Manually appending items " ) start_index = None elements_to_wrap = [] wrapper_div = result.new_tag("div") wrapper_class_list = [] # add classes based on the original docs type # TODO: this is a hack that adds "-antora" to anything that's missing an original_docs_type. # We should either ensure we always have a doc type, or boostlook should handle "source-docs-other" # or something similar docs_type_suffix = ( original_docs_type.value if original_docs_type else SourceDocType.ANTORA.value ) wrapper_class_list.append(f"source-docs-{docs_type_suffix}") if original_docs_type != SourceDocType.ANTORA: # Antora docs have a boostlook class already; others need it. wrapper_class_list.append("boostlook") wrapper_div["class"] = " ".join(wrapper_class_list) for index, element in enumerate(result.find("body").children): if is_end_comment(element): start_index = index # we want to leave the comment where it is continue if start_index: elements_to_wrap.append(element) for index, element in enumerate(elements_to_wrap): wrapper_div.append(element) result.append(wrapper_div) def modernize_legacy_page( soup: BeautifulSoup, base_html: str, head_selector: str | dict[str, str] = "head", insert_body: bool = True, original_docs_type: SourceDocType | None = None, skip_replace_boostlook: bool = False, show_footer: bool = True, show_navbar: bool = True, ) -> str: """Modernize a legacy Boost documentation page.""" HIDE_TAGS_BASE = [] if not show_navbar: HIDE_TAGS_BASE.append(("div", {"class": "header-menu-bar topnavbar"})), if soup.html is None: # Not an HTML file we care about return str(soup) # Remove CSS classes that produce visual harm for tag_name, tag_attrs in REMOVE_CSS_CLASSES: for tag in soup.find_all(tag_name, tag_attrs): tag.attrs.pop("class") soup = convert_name_to_id(soup) if not skip_replace_boostlook: soup = remove_library_boostlook(soup) soup = remove_embedded_boostlook(soup) # Use the base HTML to later extract the and (part of) the placeholder = BeautifulSoup(base_html, "html.parser") if isinstance(head_selector, str): target_head = placeholder.find_all(head_selector) elif isinstance(head_selector, dict): target_head = placeholder.find_all(**head_selector) else: target_head = None if target_head: # Append the taken from the base HTML to the existing (legacy) head _insert_head(soup, target_head) original_body = soup.body if original_body is None: pass elif placeholder.body is not None: if insert_body: # Beautify the legacy body with structure and classes from the # modern one, and embed the original body into a: #
block _replace_body(soup, original_body, base_body=placeholder.body) else: _insert_in_doc( soup.body, placeholder.find("div", {"id": "boost-legacy-docs-header"}), append=False, ) wrap_main_body_elements(soup, original_docs_type) if show_footer: rendered_template = render_to_string("includes/_footer.html", {}) rendered_template_as_dom = BeautifulSoup( rendered_template, "html.parser" ) soup.append(rendered_template_as_dom) # Remove tags from the base template soup = hide_tags(soup, HIDE_TAGS_BASE) return str(soup) def minimize_uris(content: str) -> str: # Replace all links to boost.org with a local link content = content.replace("https://www.boost.org/doc/libs/", "/doc/libs/") return content def remove_unwanted(content: BeautifulSoup) -> BeautifulSoup: # Remove the first occurrence of legacy header(s) and other stuff for tag_name, tag_attrs in REMOVE_TAGS: tag = content.find(tag_name, tag_attrs) if tag: tag.decompose() # Remove all navbar-like divs, if any for tag_name, tag_attrs in REMOVE_ALL: for tag in content.find_all(tag_name, tag_attrs): tag.decompose() return content def build_xpath(tag, attrs): parts = [f"@{key}='{val}'" for key, val in attrs.items()] condition = " and ".join(parts) return f".//{tag}[{condition}]" if condition else f".//{tag}" def get_library_documentation_urls(content, name="Alphabetically", parent="h2"): """ Takes HTML content and returns a list of tuples containing library names and the paths to those libraries' docs. This is used to update the documentation_url field on LibraryVersion objects. Args: content (str): HTML content from the libraries.htm file. For example, the HTML content from `/docs/libs/1_82_0/libs/libraries.htm`. name (str): The name of the section to search for. Defaults to "Alphabetically". parent (str): The parent tag of the section to search for. Defaults to "h2". Together, parent and string define what HTML tag to search for. For example, if parent="h2" and name="Alphabetically", this function will search for

. Returns: list: A list of tuples containing library names and the paths to those libraries' docs. For example, `[(library_name, path), ...]`. """ soup = BeautifulSoup(content, "html.parser") # Find the tag that contains the list of libraries tag = soup.find("a", attrs={"name": name}) if not tag: return [] # Get the next