diff --git a/core/htmlhelper.py b/core/htmlhelper.py
index ceb60e34..5703b839 100644
--- a/core/htmlhelper.py
+++ b/core/htmlhelper.py
@@ -3,6 +3,7 @@ import re
 from bs4 import BeautifulSoup, Comment, Tag
 from django.template.loader import render_to_string
 from django.templatetags.static import static
+from lxml import etree, html
 
 from core.boostrenderer import get_body_from_html
 from core.constants import SourceDocType
@@ -241,26 +242,51 @@ def modernize_legacy_page(
 
 def slightly_modernize_legacy_library_doc_page(content):
     """Modernize a legacy Boost library documentation page, but only minimally."""
-    result = BeautifulSoup(content, "html.parser")
-    if result.html is None:
-        # Not an HTML file we care about
-        return content
-    # Remove the first occurrence of legacy header(s) and other stuff
-    for tag_name, tag_attrs in REMOVE_TAGS:
-        tag = result.find(tag_name, tag_attrs)
-        if tag:
-            tag.decompose()
+    try:
+        root = html.fromstring(content)
+    except (etree.LxmlError, ValueError):
+        # LxmlError: unparseable or empty input. ValueError: a str that
+        # carries an XML encoding declaration, which lxml refuses to parse.
+        return content
 
-    for tag_name, tag_attrs in REMOVE_ALL:
-        for tag in result.find_all(tag_name, tag_attrs):
-            tag.decompose()
+    # Remove only the first occurrence of each legacy header tag
+    for tag_name, attrs in REMOVE_TAGS:
+        elements = root.xpath(build_xpath(tag_name, attrs))
+        if elements:
+            # drop_tree() keeps the tail text that getparent().remove() discards
+            elements[0].drop_tree()
 
-    content = str(result)
+    for tag_name, attrs in REMOVE_ALL:
+        for element in root.xpath(build_xpath(tag_name, attrs)):
+            element.drop_tree()
 
-    # Replace all links to boost.org with a local link
-    content = content.replace("https://www.boost.org/doc/libs/", "/doc/libs/")
+    # NOTE(review): serializing the root element omits any DOCTYPE the page had
+    content = html.tostring(root, encoding="unicode", method="html")
+    # Replace all links to boost.org with a local link
+    return content.replace("https://www.boost.org/doc/libs/", "/doc/libs/")
 
-    return content
+
+def _xpath_literal(value):
+    """Quote *value* as an XPath 1.0 string literal."""
+    value = str(value)
+    if "'" not in value:
+        return f"'{value}'"
+    if '"' not in value:
+        return f'"{value}"'
+    # XPath 1.0 has no escape syntax; splice the pieces around each quote
+    pieces = ", \"'\", ".join(f"'{piece}'" for piece in value.split("'"))
+    return f"concat({pieces})"
+
+
+def build_xpath(tag, attrs):
+    """Build a relative XPath that matches descendant ``tag`` elements.
+
+    Every key/value pair in ``attrs`` becomes an exact attribute-equality
+    test, joined with ``and``; an empty ``attrs`` matches every ``tag``.
+    """
+    parts = [f"@{key}={_xpath_literal(val)}" for key, val in attrs.items()]
+    condition = " and ".join(parts)
+    return f".//{tag}[{condition}]" if condition else f".//{tag}"
 
 
 def get_library_documentation_urls(content, name="Alphabetically", parent="h2"):
diff --git a/docs/dependencies.md b/docs/dependencies.md
index a0eb9273..20f77514 100644
--- a/docs/dependencies.md
+++ b/docs/dependencies.md
@@ -6,4 +6,4 @@
 1. Add the package to `requirements.in`
 1. Run `just pip-compile`, which will add the dependency to `requirements.txt`
 1. Run `just rebuild` to rebuild your Docker image to include the new dependencies
-2. Run `docker compose up` and continue with development
+2. Run `just up` and continue with development
diff --git a/requirements-dev.txt b/requirements-dev.txt
index bddfe9ef..7d058647 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -10,7 +10,7 @@ django==4.2.16
     #   django-debug-toolbar
 django-debug-toolbar==4.4.6
     # via -r ./requirements-dev.in
-pydevd-pycharm==243.22562.180
+pydevd-pycharm==243.26053.29
     # via -r ./requirements-dev.in
 sqlparse==0.5.1
     # via
diff --git a/requirements.in b/requirements.in
index 363a21e6..fe1c8b81 100644
--- a/requirements.in
+++ b/requirements.in
@@ -33,6 +33,7 @@ boto3
 jsoncomment
 unidecode
 wordcloud
+lxml
 
 # Logging
 django-tracer
diff --git a/requirements.txt b/requirements.txt
index acdb7513..cc081dbe 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -206,6 +206,8 @@ kiwisolver==1.4.7
     # via matplotlib
 kombu==5.4.2
     # via celery
+lxml==5.4.0
+    # via -r ./requirements.in
 marshmallow==3.22.0
     # via environs
 matplotlib==3.9.2