diff --git a/core/htmlhelper.py b/core/htmlhelper.py
index ceb60e34..5703b839 100644
--- a/core/htmlhelper.py
+++ b/core/htmlhelper.py
@@ -3,6 +3,7 @@ import re
from bs4 import BeautifulSoup, Comment, Tag
from django.template.loader import render_to_string
from django.templatetags.static import static
+from lxml import html
from core.boostrenderer import get_body_from_html
from core.constants import SourceDocType
@@ -241,26 +242,85 @@ def modernize_legacy_page(

 def slightly_modernize_legacy_library_doc_page(content):
-    """Modernize a legacy Boost library documentation page, but only minimally."""
-    result = BeautifulSoup(content, "html.parser")
-    if result.html is None:
-        # Not an HTML file we care about
-        return content
-    # Remove the first occurrence of legacy header(s) and other stuff
-    for tag_name, tag_attrs in REMOVE_TAGS:
-        tag = result.find(tag_name, tag_attrs)
-        if tag:
-            tag.decompose()
+    """Modernize a legacy Boost library documentation page, but only minimally.
+
+    Removes the first element matching each REMOVE_TAGS selector and every
+    element matching each REMOVE_ALL selector, then replaces every
+    ``https://www.boost.org/doc/libs/`` prefix with ``/doc/libs/``.
+
+    Args:
+        content: Page markup to rewrite.
+
+    Returns:
+        The rewritten markup as a string, or ``content`` itself when
+        parsing raises.
+    """
+    try:
+        root = html.fromstring(content)
+    except Exception:
+        # Best effort: input lxml cannot parse is handed back untouched.
+        return content
+    # NOTE(review): the BeautifulSoup version also returned early when the
+    # document had no <html> element; confirm fragments should be rewritten.

-    for tag_name, tag_attrs in REMOVE_ALL:
-        for tag in result.find_all(tag_name, tag_attrs):
-            tag.decompose()
+    # Remove only the first match of each REMOVE_TAGS selector.
+    for tag_name, attrs in REMOVE_TAGS:
+        matches = root.xpath(build_xpath(tag_name, attrs))
+        if matches:
+            # drop_tree() keeps the text that follows the element (its tail);
+            # getparent().remove() would delete that text along with it.
+            matches[0].drop_tree()

-    content = str(result)
+    for tag_name, attrs in REMOVE_ALL:
+        for el in root.xpath(build_xpath(tag_name, attrs)):
+            el.drop_tree()

-    # Replace all links to boost.org with a local link
-    content = content.replace("https://www.boost.org/doc/libs/", "/doc/libs/")
+    # NOTE(review): serializing the root element rather than its ElementTree
+    # looks like it omits any <!DOCTYPE>; confirm that is acceptable.
+    content = html.tostring(root, encoding="unicode", method="html")
+    # Replace all links to boost.org with a local link
+    return content.replace("https://www.boost.org/doc/libs/", "/doc/libs/")

-    return content
+
+def build_xpath(tag, attrs):
+    """Build a relative XPath for ``tag`` descendants matching ``attrs``.
+
+    A ``class`` value without whitespace is matched as a single class token,
+    the way the BeautifulSoup search this replaces matched classes; every
+    other attribute must equal its value exactly.
+
+    Args:
+        tag: Element name to select.
+        attrs: Mapping of attribute name to required value.
+
+    Returns:
+        An XPath 1.0 expression such as ``.//div[@id='x']``, or ``.//div``
+        when ``attrs`` is empty.
+    """
+    conditions = []
+    for key, val in attrs.items():
+        text = str(val)
+        if key == "class" and text.split() == [text]:
+            token = _xpath_literal(f" {text} ")
+            conditions.append(
+                f"contains(concat(' ', normalize-space(@class), ' '), {token})"
+            )
+        else:
+            conditions.append(f"@{key}={_xpath_literal(text)}")
+    if not conditions:
+        return f".//{tag}"
+    predicate = " and ".join(conditions)
+    return f".//{tag}[{predicate}]"
+
+
+def _xpath_literal(value):
+    """Return ``value`` quoted as an XPath 1.0 string literal."""
+    if "'" not in value:
+        return f"'{value}'"
+    if '"' not in value:
+        return f'"{value}"'
+    # XPath 1.0 has no escape sequences, so splice the quote in via concat().
+    pieces = ", \"'\", ".join(f"'{part}'" for part in value.split("'"))
+    return f"concat({pieces})"
def get_library_documentation_urls(content, name="Alphabetically", parent="h2"):
diff --git a/docs/dependencies.md b/docs/dependencies.md
index a0eb9273..20f77514 100644
--- a/docs/dependencies.md
+++ b/docs/dependencies.md
@@ -6,4 +6,4 @@
1. Add the package to `requirements.in`
1. Run `just pip-compile`, which will add the dependency to `requirements.txt`
1. Run `just rebuild` to rebuild your Docker image to include the new dependencies
-2. Run `docker compose up` and continue with development
+2. Run `just up` and continue with development
diff --git a/requirements-dev.txt b/requirements-dev.txt
index bddfe9ef..7d058647 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -10,7 +10,7 @@ django==4.2.16
# django-debug-toolbar
django-debug-toolbar==4.4.6
# via -r ./requirements-dev.in
-pydevd-pycharm==243.22562.180
+pydevd-pycharm==243.26053.29
# via -r ./requirements-dev.in
sqlparse==0.5.1
# via
diff --git a/requirements.in b/requirements.in
index 363a21e6..fe1c8b81 100644
--- a/requirements.in
+++ b/requirements.in
@@ -33,6 +33,7 @@ boto3
jsoncomment
unidecode
wordcloud
+lxml
# Logging
django-tracer
diff --git a/requirements.txt b/requirements.txt
index acdb7513..cc081dbe 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -206,6 +206,8 @@ kiwisolver==1.4.7
# via matplotlib
kombu==5.4.2
# via celery
+lxml==5.4.0
+ # via -r ./requirements.in
marshmallow==3.22.0
# via environs
matplotlib==3.9.2