diff --git a/README.md b/README.md index a00645d1..afe2fb7f 100644 --- a/README.md +++ b/README.md @@ -56,6 +56,7 @@ There are two options for development setups, nix and native. The native setup d 1. [Non-Dev Server Allauth Setup](docs/non-dev-server-allauth-setup.md) 1. [Admin Features](docs/admin.md) 1. [Mailing List Setup](docs/mailing_list.md) + 1. [Connecting to Servers](https://github.com/cppalliance/website-v2-operations/blob/master/gcp/README.md) After going through the "Development System setup" steps above to create the Docker image, install dependencies, and start the services in `docker-compose.yml`, run: diff --git a/core/admin.py b/core/admin.py index 834e75a2..c7db40bd 100644 --- a/core/admin.py +++ b/core/admin.py @@ -2,18 +2,31 @@ from django.contrib import admin from django.urls import path from django.shortcuts import redirect, render from django.contrib import messages +from django.utils import timezone from .models import RenderedContent, SiteSettings from .tasks import delete_all_rendered_content @admin.register(RenderedContent) class RenderedContentAdmin(admin.ModelAdmin): - list_display = ("cache_key", "content_type", "modified") + list_display = ( + "cache_key", + "content_type", + "modified", + "latest_path_matched_indicator", + "latest_path_match_class", + ) search_fields = ("cache_key",) + readonly_fields = ("latest_path_match_class",) def get_urls(self): urls = super().get_urls() custom_urls = [ + path( + "start-content-refresh/", + self.admin_site.admin_view(self.start_content_refresh_view), + name="core_renderedcontent_start_content_refresh", + ), path( "delete-all/", self.admin_site.admin_view(self.delete_all_view), @@ -22,6 +35,27 @@ class RenderedContentAdmin(admin.ModelAdmin): ] return custom_urls + urls + def start_content_refresh_view(self, request): + if request.method == "POST": + settings = SiteSettings.load() + settings.rendered_content_replacement_start = timezone.now() + settings.save() + messages.success( + request, + 
f"Content refresh start time set to {settings.rendered_content_replacement_start}", + ) + return redirect("..") + + context = { + **self.admin_site.each_context(request), + "title": "Start Content Refresh", + } + return render( + request, + "admin/core/renderedcontent/start_content_refresh_confirmation.html", + context, + ) + def delete_all_view(self, request): if request.method == "POST": delete_all_rendered_content.delay() @@ -42,13 +76,15 @@ class RenderedContentAdmin(admin.ModelAdmin): def changelist_view(self, request, extra_context=None): extra_context = extra_context or {} + extra_context["has_start_content_refresh"] = True extra_context["has_delete_all"] = True return super().changelist_view(request, extra_context=extra_context) @admin.register(SiteSettings) class SiteSettingsAdmin(admin.ModelAdmin): - list_display = ("id", "wordcloud_ignore") + list_display = ("id", "wordcloud_ignore", "rendered_content_replacement_start") + readonly_fields = ("rendered_content_replacement_start",) def has_add_permission(self, request): return super().has_add_permission(request) and SiteSettings.objects.count() == 0 diff --git a/core/migrations/0004_renderedcontent_latest_docs_path_and_more.py b/core/migrations/0004_renderedcontent_latest_docs_path_and_more.py new file mode 100644 index 00000000..90aba374 --- /dev/null +++ b/core/migrations/0004_renderedcontent_latest_docs_path_and_more.py @@ -0,0 +1,36 @@ +# Generated by Django 6.0.2 on 2026-02-18 20:54 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ("core", "0003_sitesettings_and_more"), + ] + + operations = [ + migrations.AddField( + model_name="renderedcontent", + name="latest_docs_path", + field=models.CharField(blank=True, default=""), + ), + migrations.AddField( + model_name="renderedcontent", + name="latest_path_match_class", + field=models.CharField(blank=True, default="", max_length=128), + ), + migrations.AddField( + model_name="renderedcontent", + 
name="latest_path_matched_indicator", + field=models.IntegerField( + choices=[ + (0, "Undetermined"), + (1, "Direct match exists"), + (2, "Determined by matcher"), + ], + default=0, + help_text="Indicates how the latest path should be determined.", + ), + ), + ] diff --git a/core/migrations/0005_sitesettings_rendered_content_replacement_start.py b/core/migrations/0005_sitesettings_rendered_content_replacement_start.py new file mode 100644 index 00000000..8622139b --- /dev/null +++ b/core/migrations/0005_sitesettings_rendered_content_replacement_start.py @@ -0,0 +1,23 @@ +# Generated by Django 5.2.8 on 2026-01-27 18:47 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ("core", "0004_renderedcontent_latest_docs_path_and_more"), + ] + + operations = [ + migrations.AddField( + model_name="sitesettings", + name="rendered_content_replacement_start", + field=models.DateTimeField( + blank=True, + editable=False, + help_text="Set via RenderedContent admin action.", + null=True, + ), + ), + ] diff --git a/core/models.py b/core/models.py index 877d38d4..5cb99a18 100644 --- a/core/models.py +++ b/core/models.py @@ -1,10 +1,20 @@ +import re + from django.db import models from django.utils.translation import gettext_lazy as _ from django_extensions.db.models import TimeStampedModel +from libraries.path_matcher.utils import determine_latest_url +from versions.models import Version from .managers import RenderedContentManager +class LatestPathMatchIndicator(models.IntegerChoices): + UNDETERMINED = 0, _("Undetermined") + DIRECT_MATCH = 1, _("Direct match exists") + CUSTOM_MATCH = 2, _("Determined by matcher") + + class RenderedContent(TimeStampedModel): """Stores a copy of rendered content. Generally, this content is retrieved from the S3 buckets and, if necessary, converted to HTML. 
@@ -41,6 +51,16 @@ class RenderedContent(TimeStampedModel): blank=True, ) + latest_path_matched_indicator = models.IntegerField( + choices=LatestPathMatchIndicator, + default=LatestPathMatchIndicator.UNDETERMINED, + null=False, + blank=False, + help_text=_("Indicates how the latest path should be determined."), + ) + latest_docs_path = models.CharField(blank=True, default="") + latest_path_match_class = models.CharField(max_length=128, blank=True, default="") + objects = RenderedContentManager() class Meta: @@ -50,6 +70,23 @@ class RenderedContent(TimeStampedModel): def __str__(self): return self.cache_key + @property + def latest_path(self) -> str | None: + indicator = self.latest_path_matched_indicator + if indicator == LatestPathMatchIndicator.DIRECT_MATCH: + return re.sub( + r"static_content_[\d_]+/(?P[^/]\S+)", + "doc/libs/latest/\g", + self.cache_key, + ) + elif indicator == LatestPathMatchIndicator.CUSTOM_MATCH: + return self.latest_docs_path + elif indicator == LatestPathMatchIndicator.UNDETERMINED: + return determine_latest_url( + self.cache_key.replace("static_content_", ""), + Version.objects.most_recent(), + ) + def save(self, *args, **kwargs): if isinstance(self.content_original, bytes): self.content_original = self.content_original.decode("utf-8") @@ -66,6 +103,12 @@ class SiteSettings(models.Model): default="", help_text="A comma-separated list of words to ignore in the release report wordcloud.", # noqa E501 ) + rendered_content_replacement_start = models.DateTimeField( + null=True, + blank=True, + editable=False, + help_text="Set via RenderedContent admin action.", + ) class Meta: constraints = [ diff --git a/core/tasks.py b/core/tasks.py index 467f31b1..93a28922 100644 --- a/core/tasks.py +++ b/core/tasks.py @@ -4,11 +4,14 @@ from celery import shared_task from dateutil.parser import parse from django.core.cache import caches +from django.utils import timezone from core.asciidoc import convert_adoc_to_html +from libraries.path_matcher.utils import 
get_path_match_from_chain +from versions.models import Version from .boostrenderer import get_content_from_s3 from .constants import RENDERED_CONTENT_BATCH_DELETE_SIZE -from .models import RenderedContent +from .models import RenderedContent, LatestPathMatchIndicator logger = structlog.get_logger() @@ -66,9 +69,27 @@ def refresh_content_from_s3(s3_key, cache_key): @shared_task def save_rendered_content(cache_key, content_type, content_html, last_updated_at=None): """Saves a RenderedContent object to database.""" + + match_result = get_path_match_from_chain( + cache_key.replace("static_content_", ""), Version.objects.most_recent() + ) + + indicator = ( + LatestPathMatchIndicator.DIRECT_MATCH + if match_result.is_direct_equivalent + else LatestPathMatchIndicator.CUSTOM_MATCH + ) + + # we don't set the latest_docs_path if it's a direct match, for db size reduction defaults = { "content_type": content_type, "content_html": content_html, + "latest_path_matched_indicator": indicator, + "latest_docs_path": ( + match_result.latest_path if not match_result.is_direct_equivalent else None + ), + "latest_path_match_class": match_result.matcher, + "modified": timezone.now(), } if last_updated_at: diff --git a/core/tests/test_views.py b/core/tests/test_views.py index 3b0086f1..7f5a2381 100644 --- a/core/tests/test_views.py +++ b/core/tests/test_views.py @@ -398,9 +398,7 @@ def test_static_content_blocks_direct_doc_paths(request_factory): @pytest.mark.django_db -@override_settings( - CACHES=TEST_CACHES, -) +@override_settings(CACHES=TEST_CACHES) def test_static_content_allows_non_direct_doc_paths(request_factory): """Test that non-direct doc paths are allowed and processed normally.""" diff --git a/core/views.py b/core/views.py index e55fde72..bc247ac4 100644 --- a/core/views.py +++ b/core/views.py @@ -71,7 +71,7 @@ from .htmlhelper import ( add_canonical_link, ) from .markdown import process_md -from .models import RenderedContent +from .models import RenderedContent, 
SiteSettings from .tasks import ( clear_rendered_content_cache_by_cache_key, clear_rendered_content_cache_by_content_type, @@ -393,6 +393,7 @@ class BaseStaticContentTemplateView(TemplateView): return { "content": content_obj.content_html.encode("utf-8"), "content_type": content_obj.content_type, + "updated": content_obj.modified, } except RenderedContent.DoesNotExist: return None @@ -607,6 +608,13 @@ class DocLibsTemplateView(VersionAlertMixin, BaseStaticContentTemplateView): result = self.get_from_database(cache_key) if not result and (result := self.get_from_s3(content_path)): self.save_to_database(cache_key, result) + if result: + refresh_start = SiteSettings.load().rendered_content_replacement_start + last_updated = result.get("updated", timezone.now()) + if refresh_start and last_updated < refresh_start: + refresh_content_from_s3.delay( + f"/archives/boost_{content_path}", cache_key + ) elif content_data := self.get_from_s3(content_path): # structure is to allow for redirect/return to be handled in a unified way result = { diff --git a/flake.nix b/flake.nix index 9ffc6744..64c5cb89 100644 --- a/flake.nix +++ b/flake.nix @@ -54,6 +54,7 @@ awscli gdk just + kubectl opentofu # frontend nodejs_22 # matches Dockerfile, due for upgrade? 
diff --git a/libraries/mixins.py b/libraries/mixins.py index a8cd3815..19d2e6c9 100644 --- a/libraries/mixins.py +++ b/libraries/mixins.py @@ -8,6 +8,7 @@ from django.db.models.functions import Lower from django.shortcuts import get_object_or_404 from django.urls import reverse +from core.models import RenderedContent from libraries.constants import ( LATEST_RELEASE_URL_PATH_STR, MASTER_RELEASE_URL_PATH_STR, @@ -20,6 +21,7 @@ from libraries.models import ( Library, LibraryVersion, ) +from libraries.path_matcher.utils import determine_latest_url from versions.models import Version logger = structlog.get_logger() @@ -37,15 +39,33 @@ class VersionAlertMixin: current_version_kwargs = self.kwargs.copy() if url_name == "docs-libs-page": - alert_visible = not current_version_kwargs.get("content_path").startswith( - LATEST_RELEASE_URL_PATH_STR - ) + allowed_types = getattr(self, "html_content_types", []) + if allowed_types and context.get("content_type") not in allowed_types: + return context + content_path = current_version_kwargs.get("content_path") + + alert_visible = not content_path.startswith(LATEST_RELEASE_URL_PATH_STR) + if alert_visible: + content = RenderedContent.objects.filter( + cache_key=f"static_content_{content_path}" + ).first() + + version_alert_url = ( + content.latest_path + if content + else determine_latest_url( + content_path, + Version.objects.most_recent(), + ) + ) + context["version_alert_url"] = f"/{version_alert_url}" + # TODO: this hack is here because the BoostVersionMixin only handles the # libraries format (boost-1-90-0-beta-1) for betas, while this path uses # 1_90_beta1 so we need to retrieve and set the selected_version # specifically for this use, db slug = "boost-1-90-0-beta1" # path_slug = 1_90_beta1 - path_slug = current_version_kwargs.get("content_path").split("/")[0] + path_slug = content_path.split("/")[0] if path_slug == LATEST_RELEASE_URL_PATH_STR: context["selected_version"] = Version.objects.most_recent() elif path_slug in 
("master", "develop"): @@ -59,7 +79,7 @@ class VersionAlertMixin: "content_path": re.sub( r"([_0-9a-zA-Z]+|master|develop)/(\S+)", rf"{LATEST_RELEASE_URL_PATH_STR}/\2", - current_version_kwargs.get("content_path"), + content_path, ) } ) @@ -68,7 +88,10 @@ class VersionAlertMixin: alert_visible = ( self.kwargs.get("version_slug") != LATEST_RELEASE_URL_PATH_STR ) - context["version_alert_url"] = reverse(url_name, kwargs=current_version_kwargs) + context["version_alert_url"] = reverse( + url_name, kwargs=current_version_kwargs + ) + context["version_alert"] = alert_visible return context diff --git a/libraries/path_matcher/__init__.py b/libraries/path_matcher/__init__.py new file mode 100644 index 00000000..29bf18b8 --- /dev/null +++ b/libraries/path_matcher/__init__.py @@ -0,0 +1,25 @@ +from .base_path_matcher import BasePathMatcher, PathSegments, PathMatchResult +from .matchers import ( + DirectMatcher, + LibsPathToLatestDirectMatcher, + LibsPathToLatestFallbackMatcher, + LibsToAntoraPathDirectMatcher, + DocHtmlBoostPathToFallbackMatcher, + DocHtmlPathToDirectMatcher, + DocHtmlBoostHtmlFallbackPathMatcher, + ToLibsLatestRootFallbackMatcher, +) + +__all__ = [ + BasePathMatcher, + PathSegments, + PathMatchResult, + DirectMatcher, + LibsPathToLatestDirectMatcher, + LibsPathToLatestFallbackMatcher, + LibsToAntoraPathDirectMatcher, + DocHtmlBoostPathToFallbackMatcher, + DocHtmlPathToDirectMatcher, + DocHtmlBoostHtmlFallbackPathMatcher, + ToLibsLatestRootFallbackMatcher, +] diff --git a/libraries/path_matcher/base_path_matcher.py b/libraries/path_matcher/base_path_matcher.py new file mode 100644 index 00000000..68cce2c2 --- /dev/null +++ b/libraries/path_matcher/base_path_matcher.py @@ -0,0 +1,156 @@ +import re +from abc import ABCMeta, abstractmethod +from dataclasses import dataclass + +from botocore.client import BaseClient +from botocore.exceptions import ClientError +from django.conf import settings + +from versions.models import Version +import structlog + +logger = 
structlog.get_logger(__name__) + + +@dataclass +class PathSegments: + library_name: str + content_path: str + + +@dataclass +class PathMatchResult: + is_direct_equivalent: bool + latest_path: str + matcher: str + + +class BasePathMatcher(metaclass=ABCMeta): + """ + Extended class names should follow the format of "(FromDescription)To(ToDescription)(Exact|Index)Matcher". + + * ...Direct - should be used when we're going to return a direct matching file in the latest library docs + * ...Fallback - should be used when we're going return an index.htm(l) file in the latest library docs or otherwise + don't mind there being no exact match on the db/s3 + + Operation: + 1. we check to see if the provided path matches the Extended class's path_re regex. + 2. if no regex match we move to the next matcher in the chain + 3. if regex matches we check the DB to see if a matching path is found and fallback to a checking S3 to see if + it just hasn't been cached. + 4. if no match on db or s3 and the matcher is flagged as is_index_fallback=True we return that as a match + 5. otherwise we then move on to the next matcher in the chain + + class properties: + has_equivalent: default false, set to true if this class provides a direct equivalent path and no path translation + is needed + is_index_fallback: default false, set to true if this matcher accepts that the path may not actually exist. + path_re: returns a compiled regex() as documented on the property + """ + + has_equivalent: bool = False + is_index_fallback: bool = False + + @property + @abstractmethod + def path_re(self) -> re.Pattern[str]: + """ + returns a Pattern object with group names of 'library_name', 'content_path' + e.g. re.compile(rf"{BOOST_VERSION_REGEX}/libs/(?P[\w]+)/(?P\S+)") + All groups must be filled, don't necessarily need to be used in your generate_... methods. 
+ """ + raise NotImplementedError + + def __init__(self, latest_version: Version, s3_client: BaseClient): + self.latest_version: Version = latest_version + self.s3_client: BaseClient = s3_client + self.next: BasePathMatcher | None = None + self.latest_slug: str = self.latest_version.stripped_boost_url_slug + + def set_next(self, next_matcher: "BasePathMatcher"): + self.next = next_matcher + + @abstractmethod + def generate_latest_s3_path(self, path: str, segments: PathSegments) -> str: + """ + Generates a string to match the s3/cache_key path which will be checked for existence, + returns something similar to: + static_content_1_84_0/libs/algorithm/doc/html/index.html + static_content_1_84_0/doc/html/accumulators.html + """ + raise NotImplementedError + + @abstractmethod + def generate_latest_url(self, path_data: PathSegments) -> str: + """returns the actual latest url the user should be presented with""" + raise NotImplementedError + + def determine_match(self, path: str) -> PathMatchResult: + if (details := self.get_group_items(path)) is not None: + if self.confirm_path_exists(path, details) or self.is_index_fallback: + logger.debug(f"regex match on {self.get_class_name()}") + return self.get_result(details) + + logger.debug(f"no regex match determined on {self.get_class_name()}") + if self.next: + return self.next.determine_match(path) + else: + msg = f"No redirect path match for {path=}" + logger.warning(msg) + raise ValueError(msg) + + def get_group_items(self, path: str) -> PathSegments | None: + """ + returns tuple (library_name, content_path) + """ + if src_match := self.path_re.match(path): + group_values = src_match.groupdict() + library_name = group_values.get("library_name") + content_path = group_values.get("content_path") + if all([library_name, content_path]): + return PathSegments(library_name, content_path) + return None + + def confirm_path_exists(self, path: str, segments: PathSegments) -> bool: + s3_path = self.generate_latest_s3_path(path, 
segments) + logger.debug(f"{s3_path=}") + return ( + self.confirm_db_path_exists(s3_path) + or self.confirm_s3_path_exists(s3_path) + ) # fmt: skip + + def confirm_s3_path_exists(self, path: str) -> bool: + # s3 stored, e.g. archives/boost_1_90_0/doc/html/accumulators.html + archive_key = path.replace("static_content_", "archives/boost_") + logger.debug(f"Checking S3 for {path=} ~ {archive_key=} ") + try: + bucket_name = settings.STATIC_CONTENT_BUCKET_NAME + self.s3_client.head_object(Bucket=bucket_name, Key=archive_key) + logger.debug(f"S3 key exists: {path}") + return True + except ClientError: + logger.debug(f"S3 key does not exist: {path}") + return False + + @staticmethod + def confirm_db_path_exists(path: str) -> bool: + from core.models import RenderedContent + + logger.debug(f"{path=}") + if is_path := RenderedContent.objects.filter(cache_key=path).exists(): + logger.debug(f"RenderedContent match {is_path=}") + return True + return False + + def get_class_name(self): + return self.__class__.__name__ + + def get_result(self, path_data: PathSegments) -> PathMatchResult: + return PathMatchResult( + self.has_equivalent, + self.generate_latest_url(path_data), + self.get_class_name(), + ) + + def handle(self, test_path: str) -> PathMatchResult: + return self.determine_match(test_path) diff --git a/libraries/path_matcher/matchers.py b/libraries/path_matcher/matchers.py new file mode 100644 index 00000000..a42447b0 --- /dev/null +++ b/libraries/path_matcher/matchers.py @@ -0,0 +1,222 @@ +import os +import re + +from core.constants import BOOST_VERSION_REGEX +from libraries.constants import LATEST_RELEASE_URL_PATH_STR +from libraries.path_matcher import BasePathMatcher, PathSegments + + +class DirectMatcher(BasePathMatcher): + # pseudo-example 1_84_0/*/CXX11.html + # pseudo-expected s3 dest = static_content_1_79_0/*/CXX11.html e.g. 
'static_content_1_90_0/doc/html/accumulators.html' + # pseudo-expected final path = doc/libs/latest/*/CXX11.html + has_equivalent = True + path_re = re.compile( + rf"{BOOST_VERSION_REGEX}/(?P(?P\S+))" + ) + + def generate_latest_s3_path(self, path: str, segments: PathSegments): + return "/".join([f"static_content_{self.latest_slug}", segments.content_path]) + + def generate_latest_url(self, path_data: PathSegments) -> str: + return os.path.sep.join( + ["doc", "libs", LATEST_RELEASE_URL_PATH_STR, path_data.content_path] + ) + + +class LibsPathToLatestDirectMatcher(BasePathMatcher): + # example 1_84_0/libs/algorithm/doc/html/algorithm/CXX11.html + # expected s3 dest = static_content_1_79_0/libs/algorithm/doc/html/algorithm/CXX11.html + # expected final path = doc/libs/latest/libs/algorithm/doc/html/algorithm/CXX11.html + has_equivalent = False + path_re = re.compile( + rf"{BOOST_VERSION_REGEX}/libs/(?P[\w]+)/(?P\S+)" + ) + + def generate_latest_s3_path(self, path: str, segments: PathSegments): + return "/".join( + [ + f"static_content_{self.latest_slug}", + "libs", + segments.library_name, + segments.content_path, + ] + ) + + def generate_latest_url(self, path_data: PathSegments) -> str: + return os.path.sep.join( + [ + "doc", + "libs", + LATEST_RELEASE_URL_PATH_STR, + "libs", + path_data.library_name, + path_data.content_path, + ] + ) + + +class LibsPathToLatestFallbackMatcher(BasePathMatcher): + # example 1_78_0/libs/algorithm/doc/html/header/boost/algorithm/string_regex_hpp.html + # expected s3 dest = static_content_1_79_0/libs/algorithm/index.html + # expected final path = doc/libs/latest/libs/algorithm/index.html + path_re = re.compile( + rf"{BOOST_VERSION_REGEX}/libs/(?P[\w]+)/(?P\S+)" + ) + is_index_fallback = True + + def generate_latest_s3_path(self, path, segments: PathSegments): + return "/".join( + [ + f"static_content_{self.latest_slug}", + "libs", + segments.library_name, + "index.html", + ] + ) + + def generate_latest_url(self, path_data: PathSegments) 
-> str: + return os.path.sep.join( + [ + "doc", + "libs", + LATEST_RELEASE_URL_PATH_STR, + "libs", + path_data.library_name, + "index.html", + ] + ) + + +class LibsToAntoraPathDirectMatcher(BasePathMatcher): + # example 1_85_0/libs/url/doc/html/url/urls/segments.html + # expected s3 dest = static_content_1_79_0/doc/antora/url/urls/segments.html + # expected final dest = doc/libs/latest/doc/antora/url/index.html + + # Only the boost urls library redirects to antora for now so the regex in use + # is tightly limited to that. The commented path_re will work when this is + # needed to be more generic, all other things being equal. + # path_re = re.compile(rf"{BOOST_VERSION_REGEX}/libs/(?P[\w]+)/(?P\S+)") + path_re = re.compile( + rf"{BOOST_VERSION_REGEX}/libs/(?Purl)/(?P\S+)" + ) + + def generate_latest_s3_path(self, path: str, segments: PathSegments) -> str: + # library name is in content_path + return "/".join( + [ + f"static_content_{self.latest_slug}", + "doc", + "antora", + segments.content_path.replace("doc/html/", ""), + ] + ) + + def generate_latest_url(self, path_data: PathSegments) -> str: + # library name is in content_path + return os.path.sep.join( + [ + "doc", + "libs", + LATEST_RELEASE_URL_PATH_STR, + "doc", + "antora", + path_data.content_path.replace("doc/html/", ""), + ] + ) + + +class DocHtmlBoostPathToFallbackMatcher(BasePathMatcher): + # example 1_64_0/doc/html/boost_process/acknowledgements.html + # expected s3 dest = static_content_1_79_0/libs/process/index.html + # expected final path = doc/libs/latest/libs/process/index.html + path_re = re.compile( + rf"{BOOST_VERSION_REGEX}/doc/html/boost_(?P[\w]+)/(?P\S+)" + ) + is_index_fallback = True + + def generate_latest_s3_path(self, path: str, segments: PathSegments) -> str: + return "/".join( + [ + f"static_content_{self.latest_slug}", + "libs", + segments.library_name, + "index.html", + ] + ) + + def generate_latest_url(self, path_data: PathSegments) -> str: + return os.path.sep.join( + [ + "doc", + 
"libs", + LATEST_RELEASE_URL_PATH_STR, + "libs", + path_data.library_name, + "index.html", + ] + ) + + +class DocHtmlPathToDirectMatcher(BasePathMatcher): + # example = 1_35_0/doc/html/interprocess.html + # expected s3 dest = static_content_1_79_0/doc/html/interprocess.html + # expected final path = doc/libs/latest/doc/html/interprocess.html + path_re = re.compile( + rf"{BOOST_VERSION_REGEX}/(?Pdoc/html/(?!boost_)(?P[\w]+.html))" + ) + + def generate_latest_s3_path(self, path: str, segments: PathSegments) -> str: + return "/".join([f"static_content_{self.latest_slug}", segments.content_path]) + + def generate_latest_url(self, path_data: PathSegments) -> str: + return os.path.sep.join( + ["doc", "libs", LATEST_RELEASE_URL_PATH_STR, path_data.content_path] + ) + + +class DocHtmlBoostHtmlFallbackPathMatcher(BasePathMatcher): + # example 1_34_0/doc/html/boost_math.html + # expected s3 dest = static_content_1_79_0/libs/math/doc/html/index.html + # expected final path = doc/libs/latest/libs/math/doc/html/index.html + path_re = re.compile( + rf"{BOOST_VERSION_REGEX}/(?Pdoc/html)/boost_(?P[\w]+).html" + ) + is_index_fallback = True + + def generate_latest_s3_path(self, path: str, segments: PathSegments) -> str: + return "/".join( + [ + f"static_content_{self.latest_slug}", + "libs", + segments.library_name, + segments.content_path, + "index.html", + ] + ) + + def generate_latest_url(self, path_data: PathSegments) -> str: + return os.path.sep.join( + [ + "doc", + "libs", + LATEST_RELEASE_URL_PATH_STR, + "libs", + path_data.library_name, + path_data.content_path, + "index.html", + ] + ) + + +class ToLibsLatestRootFallbackMatcher(BasePathMatcher): + # any other path not matched will arrive here, values inaccurate, set as needed + path_re = re.compile(r"(?P(?P\S+))") + is_index_fallback = True + + def generate_latest_s3_path(self, path: str, segments: PathSegments) -> str: + return "/".join([f"static_content_{self.latest_slug}", "libs"]) + + def generate_latest_url(self, 
path_data: PathSegments) -> str: + # trailing slash here to save a redirect + return f"libraries/{LATEST_RELEASE_URL_PATH_STR}/" diff --git a/libraries/path_matcher/utils.py b/libraries/path_matcher/utils.py new file mode 100644 index 00000000..7e663827 --- /dev/null +++ b/libraries/path_matcher/utils.py @@ -0,0 +1,42 @@ +from libraries.path_matcher.base_path_matcher import PathMatchResult +from libraries.path_matcher.matchers import ( + DirectMatcher, + LibsPathToLatestDirectMatcher, + LibsPathToLatestFallbackMatcher, + LibsToAntoraPathDirectMatcher, + DocHtmlBoostPathToFallbackMatcher, + DocHtmlPathToDirectMatcher, + DocHtmlBoostHtmlFallbackPathMatcher, + ToLibsLatestRootFallbackMatcher, +) +from libraries.utils import get_s3_client +from versions.models import Version + + +def get_path_match_from_chain(url: str, latest_version: Version) -> PathMatchResult: + s3_client = get_s3_client() + + # matcher chain in order + matcher_classes = [ + DirectMatcher, + LibsPathToLatestDirectMatcher, + LibsToAntoraPathDirectMatcher, + LibsPathToLatestFallbackMatcher, + DocHtmlBoostPathToFallbackMatcher, + DocHtmlPathToDirectMatcher, + DocHtmlBoostHtmlFallbackPathMatcher, + ToLibsLatestRootFallbackMatcher, + ] + + matchers = [ + matcher_class(latest_version, s3_client) for matcher_class in matcher_classes + ] + for current, next_matcher in zip(matchers, matchers[1:]): + current.set_next(next_matcher) + result = matchers[0].handle(test_path=url) + return result + + +def determine_latest_url(url: str, latest_version: Version) -> str: + match_result = get_path_match_from_chain(url, latest_version) + return match_result.latest_path diff --git a/libraries/tests/path_matcher/__init__.py b/libraries/tests/path_matcher/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/libraries/tests/path_matcher/test_matchers.py b/libraries/tests/path_matcher/test_matchers.py new file mode 100644 index 00000000..3857e5f3 --- /dev/null +++ 
b/libraries/tests/path_matcher/test_matchers.py @@ -0,0 +1,233 @@ +import pytest +from unittest.mock import MagicMock, patch + +from libraries.path_matcher import ( + BasePathMatcher, + DirectMatcher, + LibsPathToLatestDirectMatcher, + LibsPathToLatestFallbackMatcher, + LibsToAntoraPathDirectMatcher, + DocHtmlBoostPathToFallbackMatcher, + DocHtmlPathToDirectMatcher, + DocHtmlBoostHtmlFallbackPathMatcher, + ToLibsLatestRootFallbackMatcher, +) +from libraries.path_matcher.utils import get_path_match_from_chain, determine_latest_url + +test_params = [ + ( + DirectMatcher, + True, # confirm_path_exists result + False, # confirm s3 path exists result + "1_84_0/libs/algorithm/doc/html/algorithm/CXX11.html", # src path + "static_content_1_79_0/libs/algorithm/doc/html/algorithm/CXX11.html", # expected s3 key + True, # is direct equivalent + "doc/libs/latest/libs/algorithm/doc/html/algorithm/CXX11.html", # expected final path + ), + ( + LibsPathToLatestDirectMatcher, + True, # confirm_path_exists result + False, # confirm s3 path exists result + "1_84_0/libs/algorithm/doc/html/algorithm/CXX11.html", # src path + "static_content_1_79_0/libs/algorithm/doc/html/algorithm/CXX11.html", # expected s3 key + False, # is not a direct equivalent + "doc/libs/latest/libs/algorithm/doc/html/algorithm/CXX11.html", # expected final path + ), + ( + LibsPathToLatestFallbackMatcher, + False, # confirm_path_exists result + False, # confirm s3 path exists result + "1_78_0/libs/algorithm/doc/html/header/boost/algorithm/string_regex_hpp.html", # src path + "static_content_1_79_0/libs/algorithm/index.html", # expected s3 key + False, + "doc/libs/latest/libs/algorithm/index.html", # expected final path + ), + ( + LibsToAntoraPathDirectMatcher, + True, # confirm_path_exists result + False, # confirm s3 path exists result + "1_85_0/libs/url/doc/html/url/urls/segments.html", # src path + "static_content_1_79_0/doc/antora/url/urls/segments.html", # expected s3 key + False, + 
"doc/libs/latest/doc/antora/url/urls/segments.html", # expected final path + ), + ( + DocHtmlBoostPathToFallbackMatcher, + True, # confirm_path_exists result + False, # confirm s3 path exists result + "1_64_0/doc/html/boost_process/acknowledgements.html", # src path + "static_content_1_79_0/libs/process/index.html", # expected s3 key + False, + "doc/libs/latest/libs/process/index.html", # expected final path + ), + ( + DocHtmlPathToDirectMatcher, + False, # confirm_path_exists result + True, # confirm s3 path exists result + "1_35_0/doc/html/interprocess.html", # src path + "static_content_1_79_0/doc/html/interprocess.html", # expected s3 key + False, + "doc/libs/latest/doc/html/interprocess.html", # expected final path + ), + ( + DocHtmlBoostHtmlFallbackPathMatcher, + False, # confirm_path_exists result + True, # confirm s3 path exists result + "1_34_0/doc/html/boost_math.html", # src path + "static_content_1_79_0/libs/math/doc/html/index.html", # expected s3 key + False, + "doc/libs/latest/libs/math/doc/html/index.html", # expected final path + ), + ( + ToLibsLatestRootFallbackMatcher, + False, + False, + "1_33_1/doc/html/BOOST_VARIANT_LIMIT_TYPES.html", + "static_content_1_79_0/libs", + False, + "libraries/latest/", + ), +] + + +@pytest.mark.parametrize( + "matcher_class,db_path_result,s3_path_result,test_path,expected_s3_key,is_direct_equivalent,expected_final_path", + test_params, +) +def test_libs_path_to_latest_exact_db_path_exists( + matcher_class, + db_path_result, + s3_path_result, + test_path, + expected_s3_key, + is_direct_equivalent, + expected_final_path, + monkeypatch, + version, +): + monkeypatch.setattr( + BasePathMatcher, "confirm_db_path_exists", lambda x, y: db_path_result + ) + monkeypatch.setattr( + BasePathMatcher, "confirm_s3_path_exists", lambda x, y: s3_path_result + ) + + mock_s3_client = MagicMock() + matcher = matcher_class(version, mock_s3_client) + + with patch.object( + matcher, "confirm_db_path_exists", 
wraps=matcher.confirm_db_path_exists + ) as spy: + pm = matcher.determine_match(test_path) + spy.assert_called_once_with(expected_s3_key) + + assert pm.is_direct_equivalent == is_direct_equivalent + assert pm.latest_path == expected_final_path + + +chain_data = [ + ( + "1_84_0/libs/algorithm/doc/html/algorithm/CXX11.html", + "doc/libs/latest/libs/algorithm/doc/html/algorithm/CXX11.html", + True, + DirectMatcher, + ), + ( + "1_84_0/libs/algorithm/doc/html/algorithm/CXX11.html", + "doc/libs/latest/libs/algorithm/doc/html/algorithm/CXX11.html", + True, + LibsPathToLatestDirectMatcher, + ), + ( + "1_84_0/libs/algorithm/doc/html/algorithm/nope.html", + "doc/libs/latest/libs/algorithm/index.html", + False, + LibsPathToLatestFallbackMatcher, + ), + ( + "1_85_0/libs/url/doc/html/url/urls/segments.html", + "doc/libs/latest/doc/antora/url/urls/segments.html", + True, + LibsToAntoraPathDirectMatcher, + ), + ( + "1_35_0/doc/html/interprocess.html", + "doc/libs/latest/doc/html/interprocess.html", + True, + DocHtmlPathToDirectMatcher, + ), + ( + "1_64_0/doc/html/boost_process/acknowledgements.html", + "doc/libs/latest/libs/process/index.html", + True, + DocHtmlBoostPathToFallbackMatcher, + ), + ( + "1_34_0/doc/html/boost_math.html", + "doc/libs/latest/libs/math/doc/html/index.html", + False, + DocHtmlBoostHtmlFallbackPathMatcher, + ), + ( + "1_XX_Y/does/not/exist", + "libraries/latest/", + False, + ToLibsLatestRootFallbackMatcher, + ), +] + + +@pytest.mark.parametrize( + "test_url,expected_match,db_path_exists,matching_class", chain_data +) +def test_handoff( + test_url, expected_match, db_path_exists, matching_class, monkeypatch, version +): + # default deny + monkeypatch.setattr(BasePathMatcher, "confirm_db_path_exists", lambda x, y: False) + monkeypatch.setattr(BasePathMatcher, "confirm_s3_path_exists", lambda x, y: False) + # Using match_class here because for the likes of the antora case we want to have it match on + # LibsToAntoraPathDirectMatcher specifically, not an 
earlier matching regex where the key would not be in db/s3. + # Same reason we use match_class(version, mock_s3_client).get_class_name() below rather than a string name for the class + monkeypatch.setattr( + matching_class, "confirm_db_path_exists", lambda x, y: db_path_exists + ) + + match_result = get_path_match_from_chain(test_url, latest_version=version) + + mock_s3_client = MagicMock() + assert match_result.latest_path == expected_match + assert ( + match_result.matcher == matching_class(version, mock_s3_client).get_class_name() + ) + + +def test_determine_latest_url(monkeypatch, version): + monkeypatch.setattr( + DocHtmlBoostHtmlFallbackPathMatcher, "confirm_db_path_exists", lambda x, y: True + ) + + test_url = "1_34_0/doc/html/boost_math.html" + expected_latest_url = "doc/libs/latest/libs/math/doc/html/index.html" + + assert determine_latest_url(test_url, version) == expected_latest_url + + +def test_s3_archive_key_prefix(version): + """Test that the S3 archive key correctly contains the 'archives/boost_' prefix""" + mock_s3_client = MagicMock() + + test_path = "static_content_1_84_0/libs/algorithm/doc/html/algorithm/CXX11.html" + expected_archive_key = ( + "archives/boost_1_84_0/libs/algorithm/doc/html/algorithm/CXX11.html" + ) + + # Create a matcher instance with the mock s3 client + matcher = DirectMatcher(version, mock_s3_client) + matcher.confirm_s3_path_exists(test_path) + + mock_s3_client.head_object.assert_called_once() + call_kwargs = mock_s3_client.head_object.call_args[1] + assert call_kwargs["Key"] == expected_archive_key + assert call_kwargs["Key"].startswith("archives/") + assert "archives/boost_" in call_kwargs["Key"] diff --git a/libraries/utils.py b/libraries/utils.py index 3bc44311..b4e227ea 100644 --- a/libraries/utils.py +++ b/libraries/utils.py @@ -3,12 +3,16 @@ import string import re from itertools import islice +import boto3 import structlog import tempfile from datetime import datetime, timezone + +from botocore.client import 
BaseClient from dateutil.relativedelta import relativedelta from dateutil.parser import ParserError, parse +from django.conf import settings from django.utils.text import slugify from libraries.constants import ( @@ -377,3 +381,12 @@ def generate_release_report_filename(version_slug: str, published_format: bool = filename_data.append(datetime.now(timezone.utc).isoformat()) filename = f"{'-'.join(filename_data)}.pdf" return filename + + +def get_s3_client() -> BaseClient: + return boto3.client( + "s3", + aws_access_key_id=settings.STATIC_CONTENT_AWS_ACCESS_KEY_ID, + aws_secret_access_key=settings.STATIC_CONTENT_AWS_SECRET_ACCESS_KEY, + region_name=settings.STATIC_CONTENT_REGION, + ) diff --git a/requirements-dev.in b/requirements-dev.in index 572bea9e..95d1f1ad 100644 --- a/requirements-dev.in +++ b/requirements-dev.in @@ -1,3 +1,3 @@ -c requirements.txt django-debug-toolbar -pydevd-pycharm==253.29346.142 # pinned to appropriate version for current pycharm +pydevd-pycharm==253.29346.308 # pinned to appropriate version for current pycharm diff --git a/requirements-dev.txt b/requirements-dev.txt index 36c64e9f..8dc6e716 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -10,7 +10,7 @@ django==6.0.2 # django-debug-toolbar django-debug-toolbar==6.2.0 # via -r ./requirements-dev.in -pydevd-pycharm==253.29346.142 +pydevd-pycharm==253.29346.308 # via -r ./requirements-dev.in sqlparse==0.5.5 # via diff --git a/requirements.in b/requirements.in index 45b278c1..92457ec7 100644 --- a/requirements.in +++ b/requirements.in @@ -64,6 +64,7 @@ pre-commit pytest pytest-cov pytest-django +pytest-mock responses # Packaging diff --git a/requirements.txt b/requirements.txt index 5125448b..faa731a9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -433,10 +433,13 @@ pytest==9.0.2 # -r ./requirements.in # pytest-cov # pytest-django + # pytest-mock pytest-cov==7.0.0 # via -r ./requirements.in pytest-django==4.11.1 # via -r ./requirements.in +pytest-mock==3.15.1 + # via 
-r ./requirements.in python-dateutil==2.9.0.post0 # via # -r ./requirements.in diff --git a/scripts/load_production_data.sh b/scripts/load_production_data.sh index 1a02fdfc..ed973a05 100755 --- a/scripts/load_production_data.sh +++ b/scripts/load_production_data.sh @@ -184,13 +184,7 @@ download_latest_db_dump() { docker compose exec db bash -c "pg_restore -U $DB_USER -d $DB_NAME -v --no-owner --no-privileges /tmp/$DUMP_FILENAME" # apply any migrations newer than our dumped database docker compose exec web bash -c "./manage.py migrate" - # update the database to delete all rows from socialaccount_social app, which need to be configured differently locally - echo "Deleting all rows from socialaccount_socialapp table and setting fake passwords..." - docker compose exec web bash -c "./manage.py shell -c 'from allauth.socialaccount.models import SocialApp; SocialApp.objects.all().delete()'" - just manage "set_fake_passwords --password=test" - echo 'from django.contrib.auth import get_user_model; u=get_user_model().objects.get(email="superadmin@boost.org"); u.set_password("foobarone"); u.save()' | docker compose exec -T web python manage.py shell echo "Database restored successfully from $DUMP_FILENAME" - return 0 } @@ -199,6 +193,11 @@ if [ "${skip_web_option:-}" != "yes" ]; then echo "Failed to download and restore latest database dump"; exit 1; } + docker compose exec web bash -c "DJANGO_SUPERUSER_USERNAME=superadmin DJANGO_SUPERUSER_EMAIL=superadmin@boost.org DJANGO_SUPERUSER_PASSWORD=foobarone ./manage.py createsuperuser --noinput" || true + # update the database to delete all rows from socialaccount_social app, which need to be configured differently locally + echo "Deleting all rows from socialaccount_socialapp table and setting fake passwords..." 
+ docker compose exec web bash -c "./manage.py shell -c 'from allauth.socialaccount.models import SocialApp; SocialApp.objects.all().delete()'" + just manage "set_fake_passwords --password=test" fi if [ "${skip_lists_option:-}" != "yes" ]; then diff --git a/templates/admin/core/renderedcontent/change_list.html b/templates/admin/core/renderedcontent/change_list.html index a249788d..34ab85ba 100644 --- a/templates/admin/core/renderedcontent/change_list.html +++ b/templates/admin/core/renderedcontent/change_list.html @@ -3,6 +3,13 @@ {% block object-tools-items %} {{ block.super }} + {% if has_start_content_refresh %} +
  • + + {% trans "Start Content Refresh" %} + +
  • + {% endif %} {% if has_delete_all %}
  • diff --git a/templates/admin/core/renderedcontent/start_content_refresh_confirmation.html b/templates/admin/core/renderedcontent/start_content_refresh_confirmation.html new file mode 100644 index 00000000..c8aed29f --- /dev/null +++ b/templates/admin/core/renderedcontent/start_content_refresh_confirmation.html @@ -0,0 +1,33 @@ +{% extends "admin/base_site.html" %} +{% load i18n static %} + +{% block extrahead %} + {{ block.super }} + +{% endblock %} + +{% block bodyclass %}{{ block.super }} app-core model-renderedcontent delete-confirmation{% endblock %} + +{% block breadcrumbs %} + +{% endblock %} + +{% block content %} +

    Are you sure you want to mark the content refresh start time?

    +

    + This will set the rendered_content_replacement_start timestamp in Site Settings to the current time, after + which page loads will trigger a content refresh. Note: the first page view after that time will not yet show the refreshed + content.

    +
    {% csrf_token %} + +
    +{% endblock %}