# Mirror of https://github.com/boostorg/website-v2.git
# (synced 2026-02-28 05:52:21 +00:00; 211 lines, 7.5 KiB, Python).
from bs4 import BeautifulSoup
|
|
|
|
|
|
# List HTML elements (with relevant attributes) to remove the FIRST occurrence.
# Each entry is a ``(tag_name, attrs)`` pair; entries are tried in order, so the
# most generic matcher is kept last.
REMOVE_TAGS = [
    # Remove custom headers, these vary from lib to lib, it's usually a table
    # /docs/libs/1_82_0/ (maps to index.html and has removable headers)
    (
        "table",
        {
            "bgcolor": "#D7EEFF",
            "border": "0",
            "bordercolor": "#111111",
            "cellpadding": "5",
            "cellspacing": "0",
            "style": "border-collapse: collapse",
        },
    ),
    # /doc/libs/1_82_0/libs/functional/index.html
    # /doc/libs/1_82_0/libs/functional/negators.html
    # /doc/libs/1_82_0/libs/functional/ptr_fun.html
    # /doc/libs/1_82_0/libs/functional/function_traits.html
    # /doc/libs/1_82_0/libs/functional/mem_fun.html
    # /doc/libs/1_82_0/libs/functional/binders.html
    # /doc/libs/1_82_0/libs/uuid/doc/index.html
    # /doc/libs/1_82_0/libs/rational/index.html
    # /doc/libs/1_82_0/libs/format/index.html
    ("table", {"bgcolor": "#007F7F", "border": "1", "cellpadding": "2"}),
    # /docs/libs/1_82_0/libs/multi_array/doc/index.html (lowercase)
    ("table", {"bgcolor": "#007f7f", "border": "1", "cellpadding": "2"}),
    # /docs/libs/1_82_0/libs/gil/doc/html/index.html
    (
        "table",
        {
            "summary": "header",
            "width": "100%",
            "cellspacing": "0",
            "cellpadding": "7",
            "border": "0",
        },
    ),
    # very prominent header
    # /docs/libs/1_82_0/libs/locale/doc/html/index.html
    ("div", {"id": "top"}),
    # almost every other page has this as a header
    ("table", {"cellpadding": "2", "width": "100%"}),
]
|
|
|
|
# List HTML elements (with relevant attributes) to remove ALL occurrences.
# Each entry is a ``(tag_name, attrs)`` pair.
REMOVE_ALL = [
    # the legacy logo referenced from multiple pages at different depths
    ("img", {"src": "../../../../boost.png"}),
    ("img", {"src": "../../../boost.png"}),
    ("img", {"src": "../../boost.png"}),
    ("img", {"src": "../boost.png"}),
    ("img", {"src": "boost.png"}),
    ("img", {"src": "images/boost.png"}),
    # These are navigation controls, like next/up/prev. Do not remove for now.
    # most pages, e.g. /docs/libs/1_82_0/libs/iterator/doc/html/index.html
    # ("div", {"class": "spirit-nav"}),
    # /docs/libs/1_82_0/libs/gil/doc/html/index.html
    # ("div", {"class": "navbar"}),
    # /docs/libs/1_82_0/libs/iostreams/doc/guide/generic_streams.html
    # ("div", {"class": "nav"}),
]
|
|
|
|
# List HTML elements (with relevant attributes) to remove ONLY their CSS class;
# the elements themselves stay in the page. Each entry is a
# ``(tag_name, attrs)`` pair.
REMOVE_CSS_CLASSESS = [
    # /docs/libs/1_55_0/libs/exception/doc/boost_exception_all_hpp.html
    ("div", {"class": "body-0"}),
    ("div", {"class": "body-1"}),
    ("div", {"class": "body-2"}),
    # /docs/libs/1_82_0/libs/numeric/conversion/doc/html/index.html
    ("div", {"class": "toc"}),
    ("dl", {"class": "toc"}),
]
|
|
|
|
|
|
def _insert_in_doc(target, elements, append=True):
    """Insert ``elements`` into ``target`` between BEGIN/END marker comments.

    Args:
        target: Tag that receives the new nodes.
        elements: Iterable of nodes to insert. It is unpacked with ``*``, so
            passing a single Tag inserts that tag's children rather than the
            tag itself.
        append (bool): When true (the default) the nodes are added after the
            existing children of ``target``; otherwise they are placed before
            them, keeping their relative order.
    """
    # Name the parser explicitly, as the rest of this module does. Without it
    # BeautifulSoup guesses from whatever parsers are installed (and warns),
    # and some parsers wrap the lone comment in an <html>/<head>/<body>
    # skeleton that would then be inserted into the page.
    to_add = [
        BeautifulSoup("<!-- BEGIN Manually appending items -->", "html.parser"),
        *elements,
        BeautifulSoup("<!-- END Manually appending items -->", "html.parser"),
    ]
    if append:
        target.extend(to_add)
    else:
        # Insert back to front so the nodes end up in their listed order.
        for node in reversed(to_add):
            target.insert(0, node)
|
|
|
|
|
|
def _insert_head(result, head_adding):
    """Append ``head_adding`` to the ``<head>`` of ``result``.

    A ``<head>`` is created as the first child of ``<html>`` when the document
    has none. If the appended nodes leave a ``<head>`` nested inside the
    document head, the first such nested tag is unwrapped so that only its
    children remain.
    """
    head = result.head
    if head is None:
        # No head in the legacy page: create one at the top of <html>.
        head = result.new_tag("head")
        result.html.insert(0, head)

    _insert_in_doc(head, head_adding)

    nested_head = head.head
    if nested_head is not None:
        nested_head.unwrap()
|
|
|
|
|
|
def _replace_body(result, original_body, base_body):
    """Swap the body of ``result`` for ``base_body``, embedding the old body.

    The original body's content ends up inside the
    ``<div id="boost-legacy-docs-body">`` element of ``base_body``. When
    ``base_body`` has no such element, ``result`` is left untouched.
    """
    legacy_slot = base_body.find("div", {"id": "boost-legacy-docs-body"})
    if legacy_slot is None:
        return

    result.body.replace_with(base_body)
    _insert_in_doc(legacy_slot, [original_body])
    # The legacy <body> is now nested inside the new one; keep only its
    # children.
    result.body.body.unwrap()
|
|
|
|
|
|
def modernize_legacy_page(content, base_html, head_selector="head", insert_body=True):
    """Strip legacy chrome from a docs page and graft on parts of ``base_html``.

    Args:
        content (str): HTML of the legacy documentation page.
        base_html (str): HTML supplying the modern ``<head>`` and ``<body>``.
        head_selector (str | dict): How to pick the nodes of ``base_html`` that
            are appended to the page's ``<head>``. A string is used as a tag
            name for ``find_all``; a dict is expanded as keyword arguments to
            ``find_all``; any other value adds nothing to the head.
        insert_body (bool): When true, the page body is replaced by the body
            of ``base_html`` with the legacy body embedded in its
            ``<div id="boost-legacy-docs-body">``. When false, the contents of
            ``<div id="boost-legacy-docs-header">`` from ``base_html`` are
            placed at the start of the legacy body instead.

    Returns:
        str: The prettified, modernized HTML with absolute
        ``https://www.boost.org/doc/libs/`` links rewritten to ``/docs/libs/``,
        or ``content`` unchanged when it contains no ``<html>`` element.
    """
    result = BeautifulSoup(content, "html.parser")
    if result.html is None:
        # Not an HTML file we care about
        return content

    # Remove the first occurrence of legacy header(s) and other stuff
    for tag_name, tag_attrs in REMOVE_TAGS:
        tag = result.find(tag_name, tag_attrs)
        if tag is not None:
            tag.decompose()

    # Remove every occurrence of the elements listed in REMOVE_ALL
    # (currently the legacy logo images)
    for tag_name, tag_attrs in REMOVE_ALL:
        for tag in result.find_all(tag_name, tag_attrs):
            tag.decompose()

    # Remove CSS classes that produce visual harm
    for tag_name, tag_attrs in REMOVE_CSS_CLASSESS:
        for tag in result.find_all(tag_name, tag_attrs):
            tag.attrs.pop("class", None)

    # Use the base HTML to later extract the <head> and (part of) the <body>
    placeholder = BeautifulSoup(base_html, "html.parser")
    if isinstance(head_selector, str):
        target_head = placeholder.find_all(head_selector)
    elif isinstance(head_selector, dict):
        target_head = placeholder.find_all(**head_selector)
    else:
        target_head = None

    if target_head:
        # Append the <head> taken from the base HTML to the existing (legacy) head
        _insert_head(result, target_head)

    original_body = result.body
    if original_body is not None and placeholder.body is not None:
        if insert_body:
            # Beautify the legacy body with structure and classes from the
            # modern one, and embed the original body into a:
            # <div id="boost-legacy-docs-body"></div> block
            _replace_body(result, original_body, base_body=placeholder.body)
        else:
            header = placeholder.find("div", {"id": "boost-legacy-docs-header"})
            # A base template without the header div used to raise TypeError
            # (None was unpacked by _insert_in_doc); skip the insertion instead.
            if header is not None:
                _insert_in_doc(result.body, header, append=False)

    content = result.prettify()

    # Replace all links to boost.org with a local link
    content = content.replace("https://www.boost.org/doc/libs/", "/docs/libs/")

    return content
|
|
|
|
|
|
def get_library_documentation_urls(content, name="Alphabetically", parent="h2"):
    """
    Takes HTML content and returns a list of tuples containing library
    names and the paths to those libraries' docs. This is used to
    update the documentation_url field on LibraryVersion objects.

    Args:
        content (str): HTML content from the libraries.htm file. For example, the
            HTML content from `/docs/libs/1_82_0/libs/libraries.htm`.
        name (str): The ``name`` attribute of the ``<a>`` anchor that marks the
            section to search for. Defaults to "Alphabetically".
        parent (str): The tag enclosing that anchor. Defaults to "h2".
            Together, parent and name define where the list starts. For example,
            if parent="h2" and name="Alphabetically", this function looks for
            ``<a name="Alphabetically">`` inside an ``<h2>`` and reads the first
            ``<ul>`` sibling that follows that ``<h2>``.

    Returns:
        list: A list of tuples containing library names and the paths to those
            libraries' docs. For example, `[(library_name, path), ...]`.
            Empty when the anchor, its parent tag or the following ``<ul>``
            cannot be found. List items without a link carrying an ``href``
            are skipped.
    """
    soup = BeautifulSoup(content, "html.parser")

    # Find the anchor that marks the section containing the list of libraries
    anchor = soup.find("a", attrs={"name": name})
    if anchor is None:
        return []

    heading = anchor.find_parent(parent)
    if heading is None:
        return []

    # Get the next <ul> tag, which contains the list of libraries
    library_list_tag = heading.find_next_sibling("ul")
    if library_list_tag is None:
        return []

    results = []
    for library_tag in library_list_tag.find_all("li"):
        # Only links that actually carry an href describe a documentation path
        link = library_tag.find("a", href=True)
        if link is None:
            continue
        results.append((link.get_text(), link["href"]))

    return results
|