import re
from bs4 import BeautifulSoup, Comment, Tag
from django.http import HttpHeaders
from django.template.loader import render_to_string
from django.templatetags.static import static
from structlog import get_logger
from core.boostrenderer import get_body_from_html
from core.constants import (
SourceDocType,
NO_PROCESS_LIBS,
NO_WRAPPER_LIBS,
FULLY_MODERNIZED_LIB_VERSIONS,
)
logger = get_logger()
# HTML elements (with relevant attributes) of which only the FIRST occurrence
# is removed. Each entry is a ``(tag_name, attrs)`` pair; ``remove_unwanted``
# looks up the first element matching each entry and deletes it.
REMOVE_TAGS = [
    # Remove custom headers; these vary from lib to lib, it's usually a table
    # /docs/libs/1_82_0/ (maps to index.html and has removable headers)
    (
        "table",
        {
            "border": "0",
            "bordercolor": "#111111",
            "cellpadding": "5",
            "cellspacing": "0",
            "style": "border-collapse: collapse",
            "width": "750",
        },
    ),
    # /doc/libs/1_82_0/libs/functional/index.html
    # /doc/libs/1_82_0/libs/functional/negators.html
    # /doc/libs/1_82_0/libs/functional/ptr_fun.html
    # /doc/libs/1_82_0/libs/functional/function_traits.html
    # /doc/libs/1_82_0/libs/functional/mem_fun.html
    # /doc/libs/1_82_0/libs/functional/binders.html
    # /doc/libs/1_82_0/libs/uuid/doc/index.html
    # /doc/libs/1_82_0/libs/rational/index.html
    # /doc/libs/1_82_0/libs/format/index.html
    ("table", {"bgcolor": "#007F7F", "border": "1", "cellpadding": "2"}),
    # /docs/libs/1_82_0/libs/multi_array/doc/index.html
    # (same table as above, but with the colour written in lowercase)
    ("table", {"bgcolor": "#007f7f", "border": "1", "cellpadding": "2"}),
    # /docs/libs/1_88_0/libs/statechart/doc/index.html
    ("td", {"valign": "top", "width": "300"}),
    # almost every other page has this as a header
    ("table", {"cellpadding": "2", "width": "100%"}),
    # Remove the first hr from the page
    ("hr", {}),
    # Remove the canonical <link> tag
    ("link", {"rel": "canonical"}),
]
# These ``(tag_name, attrs)`` entries are removed only on the release page.
# To remove a tag on all pages, add it to REMOVE_TAGS instead.
REMOVE_TAGS_RELEASE = [
    ("div", {"id": "footer"}),
]
# HTML elements (with relevant attributes) of which ALL occurrences are
# removed. Each entry is a ``(tag_name, attrs)`` pair; ``remove_unwanted``
# deletes every element matching an entry.
REMOVE_ALL = [
    # any <header class="header"> element
    ("header", {"class": "header"}),
    # the legacy logo, referenced from multiple pages at different depths
    ("img", {"src": "../../../../boost.png"}),
    ("img", {"src": "../../../boost.png"}),
    ("img", {"src": "../../boost.png"}),
    ("img", {"src": "../boost.png"}),
    ("img", {"src": "boost.png"}),
    ("img", {"src": "images/boost.png"}),
    # These are navigation controls, like next/up/prev. Do not remove for now.
    # most pages, e.g. /docs/libs/1_82_0/libs/iterator/doc/html/index.html
    # ("div", {"class": "spirit-nav"}),
    # /docs/libs/1_82_0/libs/gil/doc/html/index.html
    # ("div", {"class": "navbar"}),
    # /docs/libs/1_82_0/libs/iostreams/doc/guide/generic_streams.html
    # ("div", {"class": "nav"}),
]
# HTML elements (with relevant attributes) that are kept in the page but lose
# their CSS class. Each entry is a ``(tag_name, attrs)`` pair.
# NOTE: ``modernize_legacy_page`` drops the whole ``class`` attribute of every
# matching element, not just the class named in the entry.
REMOVE_CSS_CLASSES = [
    # /docs/libs/1_55_0/libs/exception/doc/boost_exception_all_hpp.html
    ("div", {"class": "body-0"}),
    ("div", {"class": "body-1"}),
    ("div", {"class": "body-2"}),
    # Disabled for now:
    # /docs/libs/1_82_0/libs/numeric/conversion/doc/html/index.html
    # ("div", {"class": "toc"}),
    # ("dl", {"class": "toc"}),
    # /doc/libs/boost_1_84_0/libs/container_hash/doc/html/hash.html
    # ("div", {"class": "toc2"}),
]
def _insert_in_doc(target, elements, append=True):
    """Insert ``elements`` into ``target``, fenced by BEGIN/END marker comments.

    The inserted nodes are surrounded by two HTML comments,
    ``<!-- BEGIN Manually appending items -->`` and
    ``<!-- END Manually appending items -->``. The END marker is the comment
    that ``wrap_main_body_elements`` searches for to decide where the
    original page content starts, so its text must stay in sync with that
    function.

    Args:
        target: Element that receives the new children.
        elements: Iterable of nodes to insert. It is unpacked with ``*``, so
            passing a single tag inserts whatever iterating that tag yields
            (with BeautifulSoup, its children) rather than the tag itself.
        append: When True, add the nodes at the end of ``target``; otherwise
            insert them at the beginning, preserving their relative order.
    """
    # An explicit parser keeps the markers independent of whichever optional
    # parser happens to be installed, matching the other BeautifulSoup calls
    # in this module.
    to_add = [
        BeautifulSoup("<!-- BEGIN Manually appending items -->", "html.parser"),
        *elements,
        BeautifulSoup("<!-- END Manually appending items -->", "html.parser"),
    ]
    if append:
        target.extend(to_add)
    else:
        # Inserting back-to-front at index 0 leaves the nodes in the same
        # order as ``to_add`` at the start of ``target``.
        for node in reversed(to_add):
            target.insert(0, node)
def _insert_head(result, head_adding):
    """Merge the nodes in ``head_adding`` into the ``<head>`` of ``result``.

    A ``<head>`` is created as the first child of ``<html>`` when the document
    has none. If the inserted nodes bring their own ``<head>`` element, the
    first such nested ``<head>`` is unwrapped so its children sit directly in
    the document head.

    Args:
        result: Parsed document that is modified in place.
        head_adding: Iterable of nodes to add to the head.
    """
    if result.head is None:
        empty_head = result.new_tag("head")
        result.html.insert(0, empty_head)

    _insert_in_doc(result.head, head_adding)

    nested_head = result.head.head
    if nested_head is not None:
        # Avoid a <head> inside the <head>: keep the children, drop the tag.
        nested_head.unwrap()
def _replace_body(result, original_body, base_body):
    """Swap the body of ``result`` for ``base_body`` with the original embedded.

    ``base_body`` must contain a ``<div id="boost-legacy-docs-body">``; the
    original body is inserted into that div and then unwrapped so only its
    children remain. When the div is missing the document is left untouched.

    Args:
        result: Parsed document that is modified in place.
        original_body: The ``<body>`` element of the legacy page.
        base_body: The ``<body>`` element taken from the base template.
    """
    container = base_body.find("div", {"id": "boost-legacy-docs-body"})
    if container is None:
        return

    result.body.replace_with(base_body)
    _insert_in_doc(container, [original_body])
    # The legacy <body> now sits inside the new one; keep only its contents.
    result.body.body.unwrap()
def wrap_main_body_elements(
    result: BeautifulSoup, original_docs_type: SourceDocType | None = None
):
    """Move the page content that follows the END marker into a wrapper div.

    Every direct child of ``<body>`` that comes after the
    ``<!-- END Manually appending items -->`` comment is moved into a new
    ``<div>``, which is then appended to the end of ``result``. The marker
    comment itself, and everything before it, stays where it is. If the
    marker is not found, an empty wrapper div is still appended.

    The wrapper gets the class ``source-docs-<type>`` and, for every docs type
    other than Antora, the additional class ``boostlook``.

    Args:
        result: Parsed document that is modified in place. It must contain a
            ``<body>`` element.
        original_docs_type: Docs type of the source page; ``None`` is treated
            as Antora for the class suffix (see the TODO below).
    """

    def is_end_comment(html_element):
        return (
            isinstance(html_element, Comment)
            and html_element == " END Manually appending items "
        )

    wrapper_div = result.new_tag("div")

    # add classes based on the original docs type
    # TODO: this is a hack that adds "-antora" to anything that's missing an
    #  original_docs_type. We should either ensure we always have a doc type,
    #  or boostlook should handle "source-docs-other" or something similar
    docs_type_suffix = (
        original_docs_type.value if original_docs_type else SourceDocType.ANTORA.value
    )
    wrapper_class_list = [f"source-docs-{docs_type_suffix}"]
    if original_docs_type != SourceDocType.ANTORA:
        # Antora docs have a boostlook class already; others need it.
        wrapper_class_list.append("boostlook")
    wrapper_div["class"] = " ".join(wrapper_class_list)

    # Collect first and move afterwards, so the child list is not mutated
    # while it is being iterated.
    # A boolean flag (rather than the marker's index) is used so that a marker
    # sitting at position 0 is still recognised.
    marker_seen = False
    elements_to_wrap = []
    for element in result.find("body").children:
        if is_end_comment(element):
            marker_seen = True
            # we want to leave the comment where it is
            continue
        if marker_seen:
            elements_to_wrap.append(element)

    for element in elements_to_wrap:
        wrapper_div.append(element)

    result.append(wrapper_div)
def modernize_legacy_page(
    soup: BeautifulSoup,
    base_html: str,
    head_selector: str | dict[str, str] = "head",
    insert_body: bool = True,
    original_docs_type: SourceDocType | None = None,
    skip_replace_boostlook: bool = False,
    show_footer: bool = True,
    show_navbar: bool = True,
) -> str:
    """Modernize a legacy Boost documentation page.

    Args:
        soup: Parsed legacy page; it is modified in place.
        base_html: Rendered base template that provides the modern ``<head>``
            and body structure.
        head_selector: Selects the nodes from ``base_html`` that are merged
            into the legacy head. A string is used as a tag name; a dict is
            expanded into keyword arguments for ``find_all``. Any other type
            means nothing is merged.
        insert_body: When True, replace the legacy body with the base body and
            embed the legacy body inside ``<div id="boost-legacy-docs-body">``.
            When False, keep the legacy body, insert the contents of
            ``<div id="boost-legacy-docs-header">`` at its top and wrap the
            remaining content in a styled div.
        original_docs_type: Docs type used to pick the wrapper CSS classes
            (only used when ``insert_body`` is False).
        skip_replace_boostlook: When True, keep ``<link>`` tags pointing at a
            library-provided ``boostlook.css``.
        show_footer: When True and ``insert_body`` is False, append the
            rendered ``includes/_footer.html`` template to the document.
        show_navbar: When False, the ``header-menu-bar topnavbar`` div is
            passed to ``hide_tags``.

    Returns:
        The resulting document serialized as a string. Documents without an
        ``<html>`` element are returned unchanged.
    """
    hide_tags_base = []
    if not show_navbar:
        hide_tags_base.append(("div", {"class": "header-menu-bar topnavbar"}))

    if soup.html is None:
        # Not an HTML file we care about
        return str(soup)

    # Remove CSS classes that produce visual harm
    for tag_name, tag_attrs in REMOVE_CSS_CLASSES:
        for tag in soup.find_all(tag_name, tag_attrs):
            tag.attrs.pop("class", None)

    soup = convert_name_to_id(soup)
    if not skip_replace_boostlook:
        soup = remove_library_boostlook(soup)
    # NOTE(review): applied regardless of skip_replace_boostlook - confirm
    # that this is intended.
    soup = remove_embedded_boostlook(soup)

    # Use the base HTML to later extract the <head> and (part of) the <body>
    placeholder = BeautifulSoup(base_html, "html.parser")
    if isinstance(head_selector, str):
        target_head = placeholder.find_all(head_selector)
    elif isinstance(head_selector, dict):
        target_head = placeholder.find_all(**head_selector)
    else:
        target_head = None

    if target_head:
        # Append the <head> taken from the base HTML to the existing
        # (legacy) head
        _insert_head(soup, target_head)

    original_body = soup.body
    if original_body is not None and placeholder.body is not None:
        if insert_body:
            # Beautify the legacy body with structure and classes from the
            # modern one, and embed the original body into the
            # <div id="boost-legacy-docs-body"> block
            _replace_body(soup, original_body, base_body=placeholder.body)
        else:
            header = placeholder.find("div", {"id": "boost-legacy-docs-header"})
            # A base template without the header div must not crash the
            # page: insert only the marker comments in that case.
            _insert_in_doc(
                soup.body,
                header if header is not None else [],
                append=False,
            )
            wrap_main_body_elements(soup, original_docs_type)
            if show_footer:
                rendered_template = render_to_string("includes/_footer.html", {})
                rendered_template_as_dom = BeautifulSoup(
                    rendered_template, "html.parser"
                )
                soup.append(rendered_template_as_dom)

    # Remove tags from the base template
    soup = hide_tags(soup, hide_tags_base)
    return str(soup)
def minimize_uris(content: str) -> str:
    """Turn absolute boost.org documentation URLs into site-local paths.

    Every occurrence of ``https://www.boost.org/doc/libs/`` in ``content`` is
    replaced with ``/doc/libs/``; all other text is returned unchanged.
    """
    absolute_prefix = "https://www.boost.org/doc/libs/"
    local_prefix = "/doc/libs/"
    return content.replace(absolute_prefix, local_prefix)
def remove_unwanted(content: BeautifulSoup) -> BeautifulSoup:
    """Strip legacy chrome from ``content`` and return the same object.

    For each entry of ``REMOVE_TAGS`` only the first matching element is
    deleted; for each entry of ``REMOVE_ALL`` every matching element is
    deleted. The document is modified in place.
    """
    # First occurrence only: legacy header(s) and other one-off elements
    for name, attrs in REMOVE_TAGS:
        first_match = content.find(name, attrs)
        if not first_match:
            continue
        first_match.decompose()

    # Every occurrence
    for name, attrs in REMOVE_ALL:
        for match in content.find_all(name, attrs):
            match.decompose()

    return content
def build_xpath(tag, attrs):
    """Build a relative XPath expression for ``tag`` filtered by ``attrs``.

    Each attribute becomes an ``@key='value'`` test and the tests are joined
    with ``and``. Without attributes the result is just ``.//tag``. Values are
    interpolated as-is, without any quoting or escaping.
    """
    predicates = " and ".join(f"@{key}='{val}'" for key, val in attrs.items())
    if not predicates:
        return f".//{tag}"
    return f".//{tag}[{predicates}]"
def get_library_documentation_urls(content, name="Alphabetically", parent="h2"):
    """
    Takes HTML content and returns a list of tuples containing library
    names and the paths to those libraries' docs. This is used to
    update the documentation_url field on LibraryVersion objects.

    Args:
        content (str): HTML content from the libraries.htm file. For example, the
            HTML content from `/docs/libs/1_82_0/libs/libraries.htm`.
        name (str): The name of the section to search for. Defaults to "Alphabetically".
        parent (str): The parent tag of the section to search for. Defaults to "h2".
            Together, parent and name define what HTML tag to search for. For example,
            if parent="h2" and name="Alphabetically", this function will search for
            an `<a name="Alphabetically">` anchor nested inside an `<h2>`.

    Returns:
        list: A list of tuples containing library names and the paths to those
            libraries' docs. For example, `[(library_name, path), ...]`.
            An empty list is returned when the anchor, its parent tag, or the
            following `<ul>` cannot be found. List items without a link that
            carries an `href` are skipped.
    """
    soup = BeautifulSoup(content, "html.parser")

    # Find the tag that contains the list of libraries
    anchor = soup.find("a", attrs={"name": name})
    if not anchor:
        return []

    heading = anchor.find_parent(parent)
    if heading is None:
        return []

    # Get the next <ul> tag, which contains the list of libraries
    library_list_tag = heading.find_next_sibling("ul")
    if library_list_tag is None:
        return []

    results = []
    # Now go through all the items in the list
    for library_tag in library_list_tag.find_all("li"):
        # The first link with an href gives both the docs path and the name
        link = library_tag.find("a", href=True)
        if link is None:
            continue
        results.append((link.get_text(), link["href"]))

    return results
### Code to modernize legacy release notes ###
def convert_h1_to_h2(soup):
    """Demote every ``<h1>`` in ``soup`` to ``<h2>`` in place and return ``soup``."""
    top_level_headings = soup.find_all("h1")
    for heading in top_level_headings:
        # Renaming the tag keeps its attributes and children untouched.
        heading.name = "h2"
    return soup
def convert_name_to_id(soup):
    """Replace the ``name`` attribute with ``id`` on every tag that has one.

    The document is modified in place and returned. The lookup is not limited
    to a particular tag, so any element carrying a ``name`` attribute is
    affected, and an existing ``id`` on such an element is overwritten.
    """
    named_tags = soup.find_all(attrs={"name": True})
    for element in named_tags:
        anchor_name = element["name"]
        del element["name"]
        element["id"] = anchor_name
    return soup
def remove_library_boostlook(soup):
    """Remove ``<link>`` tags that load a library-provided ``boostlook.css``.

    A link is removed when its ``href`` ends with ``boostlook.css`` and is not
    the site's own ``/static/css/boostlook.css``. Links without an ``href``
    are left alone. The document is modified in place and returned.
    """
    site_stylesheet = "/static/css/boostlook.css"
    for tag in soup.find_all("link"):
        href = tag.get("href")
        if not href:
            # <link> without an href: nothing to compare, keep it.
            continue
        if href.endswith("boostlook.css") and href != site_stylesheet:
            tag.decompose()
    return soup
def add_canonical_link(soup, canonical_uri):
    """Append ``<link rel="canonical" href=canonical_uri>`` to the document head.

    Nothing is added when ``canonical_uri`` is empty or the document has no
    ``<head>``. The (possibly modified) ``soup`` is returned either way.
    """
    if not (canonical_uri and soup.head):
        return soup

    link_tag = soup.new_tag("link", rel="canonical", href=canonical_uri)
    soup.head.append(link_tag)
    return soup
def modernize_preprocessor_docs(soup: BeautifulSoup) -> tuple[BeautifulSoup, bool]:
"""Special case handling for Boost.Preprocessor docs.
Returns a two-tuple:
0. BeautifulSoup object
1. Boolean indicating whether framesets were present (and modified).
"""
# Only transform if