Files
website-v2/core/htmlhelper.py
2025-01-09 10:07:03 -08:00

624 lines
20 KiB
Python

from bs4 import BeautifulSoup, Comment
from django.template.loader import render_to_string
from core.boostrenderer import get_body_from_html
# Legacy elements, described as (tag name, attributes) pairs. Only the FIRST
# element matching each pair is removed from a page.
REMOVE_TAGS = [
    # Custom headers: these vary from lib to lib, and are usually a table.
    # /docs/libs/1_82_0/ (maps to index.html and has removable headers)
    (
        "table",
        dict(
            bgcolor="#D7EEFF",
            border="0",
            bordercolor="#111111",
            cellpadding="5",
            cellspacing="0",
            style="border-collapse: collapse",
        ),
    ),
    # Teal header table, matched with both spellings of the colour.
    # Uppercase "#007F7F":
    #   /doc/libs/1_82_0/libs/functional/index.html
    #   /doc/libs/1_82_0/libs/functional/negators.html
    #   /doc/libs/1_82_0/libs/functional/ptr_fun.html
    #   /doc/libs/1_82_0/libs/functional/function_traits.html
    #   /doc/libs/1_82_0/libs/functional/mem_fun.html
    #   /doc/libs/1_82_0/libs/functional/binders.html
    #   /doc/libs/1_82_0/libs/uuid/doc/index.html
    #   /doc/libs/1_82_0/libs/rational/index.html
    #   /doc/libs/1_82_0/libs/format/index.html
    # Lowercase "#007f7f":
    #   /docs/libs/1_82_0/libs/multi_array/doc/index.html
    *(
        ("table", {"bgcolor": teal, "border": "1", "cellpadding": "2"})
        for teal in ("#007F7F", "#007f7f")
    ),
    # /docs/libs/1_82_0/libs/gil/doc/html/index.html
    (
        "table",
        dict(
            summary="header",
            width="100%",
            cellspacing="0",
            cellpadding="7",
            border="0",
        ),
    ),
    # very prominent header
    # /docs/libs/1_82_0/libs/locale/doc/html/index.html
    ("div", dict(id="top")),
    # almost every other page has this as a header
    ("table", dict(cellpadding="2", width="100%")),
]
# Removed (first occurrence only) on the release page alone. Anything that
# should be stripped from every page belongs in REMOVE_TAGS instead.
REMOVE_TAGS_RELEASE = [("div", dict(id="footer"))]
# Elements, as (tag name, attributes) pairs, of which ALL occurrences are removed
REMOVE_ALL = [
    ("header", {"class": "header"}),
    # the legacy logo, referenced from multiple pages at different depths
    *(
        ("img", {"src": logo_path})
        for logo_path in (
            "../../../../boost.png",
            "../../../boost.png",
            "../../boost.png",
            "../boost.png",
            "boost.png",
            "images/boost.png",
        )
    ),
    # These are navigation controls, like next/up/prev. Do not remove for now.
    # most pages, e.g. /docs/libs/1_82_0/libs/iterator/doc/html/index.html
    # ("div", {"class": "spirit-nav"}),
    # /docs/libs/1_82_0/libs/gil/doc/html/index.html
    # ("div", {"class": "navbar"}),
    # /docs/libs/1_82_0/libs/iostreams/doc/guide/generic_streams.html
    # ("div", {"class": "nav"}),
]
# Elements, as (tag name, attributes) pairs, that are kept in the page but
# lose their CSS class attribute
REMOVE_CSS_CLASSES = [
    # "body-0", "body-1" and "body-2", seen on
    # /docs/libs/1_55_0/libs/exception/doc/boost_exception_all_hpp.html
    *(("div", {"class": f"body-{depth}"}) for depth in range(3)),
    # /docs/libs/1_82_0/libs/numeric/conversion/doc/html/index.html
    # ("div", {"class": "toc"}),
    # ("dl", {"class": "toc"}),
    # /doc/libs/boost_1_84_0/libs/container_hash/doc/html/hash.html
    # ("div", {"class": "toc2"}),
]
def _insert_in_doc(target, elements, append=True):
    """Insert ``elements`` into ``target`` between BEGIN/END marker comments.

    The inserted run is always
    ``<!-- BEGIN Manually appending items -->``, the elements, then
    ``<!-- END Manually appending items -->``.

    Args:
        target: The tag that receives the new children.
        elements: Iterable of nodes to insert. It is unpacked with ``*``, so
            passing a single bs4 ``Tag`` inserts that tag's children rather
            than the tag itself. ``None`` is treated as "no elements".
        append: When True the run is added at the end of ``target``;
            otherwise it is inserted at the beginning, order preserved.
    """
    if elements is None:
        # Callers may pass the result of ``find()``, which is None on a miss.
        elements = ()
    # Name the parser explicitly, like every other call in this module: without
    # it bs4 picks whichever parser is installed, and some of them wrap the
    # fragment in <html>/<body> tags that would then be inserted as well.
    to_add = [
        BeautifulSoup("<!-- BEGIN Manually appending items -->", "html.parser"),
        *elements,
        BeautifulSoup("<!-- END Manually appending items -->", "html.parser"),
    ]
    if append:
        target.extend(to_add)
    else:
        # Inserting in reverse at index 0 leaves the run in its original order.
        for item in reversed(to_add):
            target.insert(0, item)
def _insert_head(result, head_adding):
    """Append ``head_adding`` to the ``<head>`` of ``result``.

    A ``<head>`` is created as the first child of ``<html>`` when the document
    has none. If the inserted nodes leave a ``<head>`` nested inside the
    document's ``<head>``, the nested tag is unwrapped so that only its
    children remain.
    """
    if result.head is None:
        fresh_head = result.new_tag("head")
        result.html.insert(0, fresh_head)

    _insert_in_doc(result.head, head_adding)

    nested_head = result.head.head
    if nested_head is not None:
        nested_head.unwrap()
def _replace_body(result, original_body, base_body):
    """Swap the body of ``result`` for ``base_body`` with the original embedded.

    Nothing happens unless ``base_body`` contains a
    ``<div id="boost-legacy-docs-body">``. When it does, ``base_body`` replaces
    the current body, ``original_body`` is inserted into that div, and the
    nested ``<body>`` tag is then unwrapped so only its contents remain.
    """
    legacy_slot = base_body.find("div", {"id": "boost-legacy-docs-body"})
    if legacy_slot is None:
        return

    result.body.replace_with(base_body)
    _insert_in_doc(legacy_slot, [original_body])
    # The original <body> now sits inside the new one; keep only its contents.
    result.body.body.unwrap()
def wrap_main_body_elements(result, original_docs_type=None):
    """Move the body nodes that follow the END marker into a wrapper div.

    Every direct child of ``<body>`` that comes after an
    ``<!-- END Manually appending items -->`` comment is moved into a new
    ``<div id="boost-legacy-docs-wrapper">``, which is then appended to
    ``result``. The marker comments themselves stay where they are.

    Args:
        result: The parsed document; modified in place.
        original_docs_type: Optional object whose ``.value`` is used to build
            the wrapper's ``source-docs-<value> boostlook`` class.
            Presumably an enum member; confirm against the caller.
    """

    def is_end_comment(element):
        return (
            isinstance(element, Comment) and element == " END Manually appending items "
        )

    body = result.find("body")
    if body is None:
        # Nothing to wrap in a document without a <body>.
        return

    wrapper_div = result.new_tag("div", id="boost-legacy-docs-wrapper")
    if original_docs_type:
        # add a class based on the original docs type
        wrapper_div["class"] = f"source-docs-{original_docs_type.value} boostlook"

    # Track the marker with a flag rather than its index: an index of 0 is
    # falsy, which used to make a marker in first position count as "not seen".
    seen_end_marker = False
    elements_to_wrap = []
    for element in body.children:
        if is_end_comment(element):
            seen_end_marker = True
            # we want to leave the comment where it is
            continue
        if seen_end_marker:
            elements_to_wrap.append(element)

    # Move the nodes only after the scan, so the body is not mutated while
    # it is being iterated.
    for element in elements_to_wrap:
        wrapper_div.append(element)
    result.append(wrapper_div)
def modernize_legacy_page(
    content,
    base_html,
    head_selector="head",
    insert_body=True,
    original_docs_type=None,
    show_footer=True,
    show_navbar=True,
):
    """Modernize a legacy Boost documentation page.

    Args:
        content: HTML of the legacy page. It is returned unchanged when it
            contains no ``<html>`` element.
        base_html: HTML of the modern base template, used as the source of
            the ``<head>`` additions and of the surrounding body structure.
        head_selector: A tag name (str) or a dict of ``find_all`` keyword
            arguments selecting the base-template nodes to add to the legacy
            ``<head>``. Any other type disables the head insertion.
        insert_body: When True, the legacy body is embedded into the base
            template's ``<div id="boost-legacy-docs-body">``. When False, the
            children of the template's ``<div id="boost-legacy-docs-header">``
            are inserted at the top of the legacy body and the remaining
            body content is wrapped (see ``wrap_main_body_elements``).
        original_docs_type: Forwarded to ``wrap_main_body_elements``.
        show_footer: Render ``includes/_footer.html`` and append it.
        show_navbar: When False, the first
            ``<div class="header-menu-bar topnavbar">`` is hidden.

    Returns:
        The processed page as a string, with links to
        ``https://www.boost.org/doc/libs/`` rewritten to ``/doc/libs/``.
    """
    hidden_tags = []
    if not show_navbar:
        hidden_tags.append(("div", {"class": "header-menu-bar topnavbar"}))

    result = BeautifulSoup(content, "html.parser")
    if result.html is None:
        # Not an HTML file we care about
        return content

    # Remove the first occurrence of legacy header(s) and other stuff
    result = remove_first_tag(result, REMOVE_TAGS)
    # Remove all navbar-like divs, if any
    result = remove_tags(result, REMOVE_ALL)
    # Remove CSS classes that produce visual harm
    for tag_name, tag_attrs in REMOVE_CSS_CLASSES:
        for tag in result.find_all(tag_name, tag_attrs):
            tag.attrs.pop("class", None)
    result = convert_name_to_id(result)

    # Use the base HTML to later extract the <head> and (part of) the <body>
    placeholder = BeautifulSoup(base_html, "html.parser")
    if isinstance(head_selector, str):
        target_head = placeholder.find_all(head_selector)
    elif isinstance(head_selector, dict):
        target_head = placeholder.find_all(**head_selector)
    else:
        target_head = None
    if target_head:
        # Append the <head> taken from the base HTML to the existing (legacy) head
        _insert_head(result, target_head)

    original_body = result.body
    if original_body is not None and placeholder.body is not None:
        if insert_body:
            # Beautify the legacy body with structure and classes from the
            # modern one, and embed the original body into a:
            # <div id="boost-legacy-docs-body"></div> block
            _replace_body(result, original_body, base_body=placeholder.body)
        else:
            header = placeholder.find("div", {"id": "boost-legacy-docs-header"})
            # A template without the header div used to raise a TypeError.
            # Fall back to no header nodes so the marker comments are still
            # inserted and the body still gets wrapped.
            _insert_in_doc(
                result.body,
                header if header is not None else [],
                append=False,
            )
            wrap_main_body_elements(result, original_docs_type)
            # NOTE(review): the indentation of this file was lost, so the
            # nesting of the footer block is reconstructed. It is placed in
            # the non-insert_body path on the assumption that the base body
            # used by the other path carries its own footer. Confirm.
            if show_footer:
                rendered_template = render_to_string("includes/_footer.html", {})
                rendered_template_as_dom = BeautifulSoup(
                    rendered_template, "html.parser"
                )
                result.append(rendered_template_as_dom)

    # Remove tags from the base template
    result = hide_tags(result, hidden_tags)
    content = str(result)
    # Replace all links to boost.org with a local link
    content = content.replace("https://www.boost.org/doc/libs/", "/doc/libs/")
    return content
def get_library_documentation_urls(content, name="Alphabetically", parent="h2"):
    """
    Takes HTML content and returns a list of tuples containing library
    names and the paths to those libraries' docs. This is used to
    update the documentation_url field on LibraryVersion objects.

    Args:
        content (str): HTML content from the libraries.htm file. For example, the
            HTML content from `/docs/libs/1_82_0/libs/libraries.htm`.
        name (str): The `name` attribute of the anchor that marks the section.
            Defaults to "Alphabetically".
        parent (str): The tag that encloses that anchor. Defaults to "h2".

    Together, parent and name define the section heading to search for. For
    example, if parent="h2" and name="Alphabetically", this function looks for
    <a name="Alphabetically"> inside an <h2>, and reads the first <ul> that
    follows that <h2> as a sibling.

    Returns:
        list: A list of tuples containing library names and the paths to those
            libraries' docs. For example, `[(library_name, path), ...]`. The
            list is empty when the anchor, its parent or the following <ul>
            is missing. List items without an <a href="..."> are skipped.
    """
    soup = BeautifulSoup(content, "html.parser")
    # Find the anchor that marks the section listing the libraries
    anchor = soup.find("a", attrs={"name": name})
    if anchor is None:
        return []
    heading = anchor.find_parent(parent)
    if heading is None:
        return []
    # Get the next <ul> tag, which contains the list of libraries
    library_list_tag = heading.find_next_sibling("ul")
    if library_list_tag is None:
        return []

    results = []
    for library_tag in library_list_tag.find_all("li"):
        link = library_tag.find("a")
        # An item without a usable link has no documentation path to report.
        if link is None or not link.has_attr("href"):
            continue
        results.append((link.get_text(), link["href"]))
    return results
### Code to modernize legacy release notes ###
def convert_h1_to_h2(soup):
    """Demote every <h1> in ``soup`` to an <h2>, in place, and return ``soup``."""
    top_level_headings = soup.find_all("h1")
    for heading in top_level_headings:
        # Renaming the tag keeps its attributes and children untouched.
        heading.name = "h2"
    return soup
def convert_name_to_id(soup):
    """Rename the (deprecated) ``name`` attribute to ``id`` on every tag.

    All tags carrying a ``name`` attribute are affected, and an existing
    ``id`` on such a tag is overwritten. ``soup`` is modified in place and
    returned.
    """
    named_elements = soup.find_all(attrs={"name": True})
    for element in named_elements:
        # Take the value out of ``name`` and store it under ``id`` in one step.
        element["id"] = element.attrs.pop("name")
    return soup
def format_nested_lists(soup):
    """Flattens nested lists

    For every <ul>, the non-<ul> content of each direct <li> is re-parsed and
    wrapped in a new <h4>, which is appended to that <ul>; the <li> itself is
    then decomposed. Afterwards, for each <h4> followed by a sibling <ul>, the
    <div> tags inside that <ul>'s items are unwrapped and the <ul> is moved to
    sit directly after the <h4>.

    ``soup`` is modified in place and returned.

    NOTE(review): the call to this function in ``modernize_release_notes`` is
    commented out, and the indentation of this file was lost, so the nesting
    below is reconstructed. Confirm it before re-enabling.
    """
    try:
        top_level_ul = soup.find_all("ul")
    except AttributeError:
        # ``soup`` has no find_all (e.g. it is None); hand it back untouched.
        # A soup without <ul> tags yields an empty result instead of raising.
        return soup
    for ul in top_level_ul:
        list_items = ul.find_all("li", recursive=False)  # Only direct children of <ul>
        for li in list_items:
            # Extract and remove the non-<ul> contents from <li>
            non_ul_contents = [
                content for content in li.contents if not content.name == "ul"
            ]
            for content in non_ul_contents:
                content.extract()
            # Convert the extracted contents to a string and parse it as HTML
            text_content = "".join(str(content) for content in non_ul_contents).strip()
            if text_content:
                new_soup = BeautifulSoup(text_content, "html.parser")
                h4_tag = soup.new_tag("h4")
                h4_tag.append(new_soup)
                # decompose the li tag and append the new h4 tag
                # NOTE(review): decompose() also destroys whatever is still
                # inside the <li>, which appears to include any nested <ul>.
                li.decompose()
                ul.append(h4_tag)
    # Correct the HTML structure
    # Process <h4> and associated <ul> tags
    for h4 in soup.find_all("h4"):
        next_ul = h4.find_next_sibling("ul")
        if next_ul:
            # Clean up <ul> and <li> tags under <h4>
            for li in next_ul.find_all("li"):
                # Unwrap or clean <div> tags inside <li>
                for div in li.find_all("div"):
                    div.unwrap()
            # Reinsert the cleaned <ul> directly after the <h4>
            # (the <h4> itself is kept)
            h4.insert_after(next_ul)
    return soup
def process_new_libraries(soup):
    """Custom function to process the new libraries section
    of legacy release notes.

    For every <div> whose class contains "new_libraries":
      * an <h3> holding a <span> is reduced to the text of that <span>;
      * in the first <ul>, each direct <li> that contains a <div> has the
        children of that <div> moved to the end of the <li>, and the emptied
        <div> is removed.

    ``soup`` is modified in place and returned. Input without a ``find_all``
    attribute (e.g. None) is returned untouched.
    """
    try:
        new_libraries_divs = soup.find_all(
            "div", class_=lambda x: x and "new_libraries" in x
        )
    except AttributeError:
        # ``soup`` is not a parsed document; nothing to process.
        return soup

    for div in new_libraries_divs:
        h3_tag = div.find("h3")
        if h3_tag and h3_tag.span:
            # Extract text from h3 span and update h3 tag
            h3_text = h3_tag.span.get_text()
            h3_tag.clear()
            h3_tag.append(h3_text)

        ul_tag = div.find("ul")
        if not ul_tag:
            continue
        for li in ul_tag.find_all("li", recursive=False):
            # Extract and restructure contents of div inside li
            inner_div = li.find("div")
            if not inner_div:
                continue
            # Move the anchor tag and text outside of the inner div.
            # Iterate over a snapshot: append() removes each node from
            # ``inner_div.contents``, and iterating the live list skipped
            # every other child, which decompose() below then destroyed.
            for content in list(inner_div.contents):
                li.append(content)
            # Remove the now-empty div
            inner_div.decompose()
    return soup
def remove_css(soup, tags):
    """Drop the ``class`` attribute from every tag matching ``tags``.

    ``tags`` is an iterable of ``(tag_name, attrs)`` pairs, and each ``attrs``
    dict is passed to ``find_all`` as keyword arguments. ``soup`` is modified
    in place and returned.
    """
    for name, attrs in tags:
        try:
            matches = soup.find_all(name, **attrs)
        except AttributeError:
            # ``soup`` has no find_all; skip this entry.
            continue
        for match in matches:
            # A default makes pop() a no-op when the attribute is absent.
            match.attrs.pop("class", None)
    return soup
def remove_duplicate_tag(soup, tag_name):
    """Remove the first ``tag_name`` tag that repeats its predecessor's text.

    Consecutive matches are compared by their stripped text. At the first
    pair with equal text, the second tag of the pair is removed and the scan
    stops, so at most one tag is removed per call. ``soup`` is returned.
    """
    try:
        candidates = soup.find_all(tag_name)
    except AttributeError:
        # ``soup`` has no find_all; nothing to compare.
        return soup

    for current, following in zip(candidates, candidates[1:]):
        if current.get_text(strip=True) == following.get_text(strip=True):
            following.decompose()
            break
    return soup
def remove_first_tag(soup, tags):
    """Remove the first match of each ``(tag_name, attrs)`` pair in ``tags``.

    Used for legacy headers and similar one-off elements. ``soup`` is
    modified in place and returned.
    """
    for name, attrs in tags:
        first_match = soup.find(name, attrs)
        if not first_match:
            continue
        first_match.decompose()
    return soup
def hide_tags(soup, tags):
    """Hide the first match of each ``(tag_name, attrs)`` pair in ``tags``.

    A matched tag is kept in the document, but its ``style`` attribute is set
    to ``display: none;`` (replacing any previous value). ``soup`` is
    returned.
    """
    for name, attrs in tags:
        first_match = soup.find(name, attrs)
        if not first_match:
            continue
        first_match["style"] = "display: none;"
    return soup
def remove_ids(soup, ids):
    """Strip the first tag carrying each id in ``ids``.

    A tag that contains text is unwrapped (its children stay in the
    document); a tag without text is removed entirely. ``soup`` is returned.
    """
    for wanted_id in ids:
        try:
            match = soup.find(id=wanted_id)
        except AttributeError:
            # ``soup`` has no find; skip this id.
            continue
        if not match:
            continue
        if match.get_text(strip=True) != "":
            match.unwrap()
        else:
            match.decompose()
    return soup
def remove_release_classes(soup, classes):
    """Strip every tag carrying one of the given class names.

    A tag that contains text is unwrapped (its children stay in the
    document); any other match is removed entirely. ``soup`` is returned.
    """
    for wanted_class in classes:
        try:
            matches = soup.find_all(class_=wanted_class)
        except AttributeError:
            # ``soup`` has no find_all; skip this class.
            continue
        for match in matches:
            has_text = match and match.get_text(strip=True) != ""
            if has_text:
                match.unwrap()
            else:
                match.decompose()
    return soup
def remove_tables(soup, class_name):
    """Delete every <table> with class ``class_name`` and return ``soup``."""
    matching_tables = soup.find_all("table", class_=class_name)
    for unwanted_table in matching_tables:
        unwanted_table.decompose()
    return soup
def remove_cpp_alliance_links(soup):
    """Remove download link sponsorship notes about The C++ Alliance.

    The soup is serialised, the exact sponsorship sentence is removed from
    the markup, and the result is parsed again. The return value is therefore
    a NEW soup object, not the one passed in.
    """
    sponsorship_note = (
        '* The download links are supported by grants from <a class="text-sky-600" '
        'href="https://cppalliance.org/" target="_blank">The C++ Alliance</a>.'
    )
    cleaned_markup = str(soup).replace(sponsorship_note, "")
    return BeautifulSoup(cleaned_markup, "html.parser")
def remove_tags(soup, tags):
    """Delete every tag matching any ``(tag_name, attrs)`` pair in ``tags``.

    ``soup`` is modified in place and returned.
    """
    for name, attrs in tags:
        matches = soup.find_all(name, attrs)
        for match in matches:
            match.decompose()
    return soup
def style_links(soup, class_name):
    """Set the ``class`` of every <a> tag to ``class_name``.

    Any class a link already had is replaced. ``soup`` is returned.
    """
    links = soup.find_all("a")
    for link in links:
        link["class"] = class_name
    return soup
def add_class_to_sibling_by_header(
    soup, header_text, class_to_add, header_tag="h3", target_tag="ul"
):
    """
    Adds a class to the next sibling of a specified tag
    if it matches the search criteria.

    Each ``header_tag`` whose string equals ``header_text`` is examined in
    document order. The first one that has a following ``target_tag`` sibling
    gets ``class_to_add`` appended to that sibling's classes (unless already
    present), and the sibling is returned. Returns None when nothing matches.

    NOTE(review): the indentation of this file was lost. The ``return`` is
    reconstructed as belonging to the ``if target`` branch, which is what the
    caller's None check implies. Confirm.
    """
    for header in soup.find_all(header_tag, string=header_text):
        sibling = header.find_next_sibling(target_tag)
        if not sibling:
            continue
        current_classes = sibling.get("class", [])
        if class_to_add not in current_classes:
            sibling["class"] = current_classes + [class_to_add]
        return sibling
    return None
def reformat_new_libraries_list(soup):
    """Restructure the "New Libraries" list of the release notes.

    The <ul> following the "New Libraries" <h3> gets the class
    ``new-libraries``. Each of its <li> items that contains a link is rebuilt
    as: the link, the text ": ", and a nested <ul> with a single <li> holding
    the item's description. Items without a link are left untouched.

    ``soup`` is modified in place and returned.
    """
    new_libraries_list = add_class_to_sibling_by_header(
        soup,
        header_text="New Libraries",
        class_to_add="new-libraries",
        header_tag="h3",
        target_tag="ul",
    )
    if not new_libraries_list:
        return soup

    for li in new_libraries_list.find_all("li"):
        a_tag = li.find("a")
        if not a_tag:
            continue
        # Remove only the first occurrence of the link text. Without the
        # count, every mention of the library name was removed, including
        # those inside the description itself.
        description_text = li.text.replace(a_tag.text, "", 1).strip(": ").strip()
        description_text = description_text.lstrip(":")

        nested_ul = soup.new_tag("ul")
        nested_li = soup.new_tag("li")
        nested_li.string = description_text
        nested_ul.append(nested_li)

        li.clear()
        li.append(a_tag)
        li.append(": ")
        li.append(nested_ul)
    return soup
def modernize_release_notes(html_content):
    """Clean up legacy release notes HTML for display on the new site.

    The markup is parsed, stripped of legacy tables, headers, logos, ids,
    classes and wrapper <div>s, restyled (link class, "New Libraries" list),
    and links to ``https://www.boost.org/doc/libs/`` are rewritten to
    ``/docs/libs/``.

    Args:
        html_content (str): The legacy release notes HTML.

    Returns:
        The result of ``get_body_from_html`` applied to the processed markup.
    """
    # "body" and "body-inner" are two separate ids. A missing comma used to
    # merge them into the single, never-matching id "bodybody-inner".
    IDS_TO_REMOVE = ["heading", "body", "body-inner", "content"]
    CLASSES_TO_REMOVE = [
        "section",
        "section-0",
        "section-title",
        "section-body",
        "news-title",
        "news-date",
        "news-description",
        "description",
        "link",
        "identifier",
        "library",
    ]
    soup = BeautifulSoup(html_content, "html.parser")
    # Remove unwanted tables
    soup = remove_tables(soup, "download-table")
    # Normalize headings
    soup = convert_h1_to_h2(soup)
    # Remove the first occurrence of legacy header(s) and other stuff
    soup = remove_first_tag(soup, REMOVE_TAGS)
    # Same, for the tags that are only removed on the release page
    soup = remove_first_tag(soup, REMOVE_TAGS_RELEASE)
    # Remove all navbar-like divs, if any
    soup = remove_tags(soup, REMOVE_ALL)
    # Remove CSS classes that produce visual harm
    soup = remove_css(soup, REMOVE_CSS_CLASSES)
    # Add custom class to all <a> tags
    soup = style_links(soup, "text-sky-600")
    # Strip what's left of other things we don't want
    # Unwrap elements with specific IDs
    soup = remove_ids(soup, IDS_TO_REMOVE)
    # Unwrap elements with specific classes
    soup = remove_release_classes(soup, CLASSES_TO_REMOVE)
    # Process divs with class 'new_libraries'
    soup = process_new_libraries(soup)
    # Convert nested <ul>'s to <h4>s with single <ul> inside
    # NOTE: Commented out because it's not needed for the current release notes.
    # soup = format_nested_lists(soup)
    # Restructure New Libraries' HTML structure
    soup = reformat_new_libraries_list(soup)
    # Remove duplicate header tags
    soup = remove_duplicate_tag(soup, "h2")
    # Remove sponsorship links about downloads from release notes.
    soup = remove_cpp_alliance_links(soup)
    # Remove unnecessary divs. ``soup`` is always a parsed document here, so
    # find_all needs no AttributeError guard.
    for div in soup.find_all("div"):
        div.unwrap()
    result = str(soup)
    # Replace all links to boost.org with a local link
    # NOTE(review): this rewrites to "/docs/libs/" while modernize_legacy_page
    # rewrites to "/doc/libs/". Confirm that the difference is intended.
    content = result.replace("https://www.boost.org/doc/libs/", "/docs/libs/")
    return get_body_from_html(content)