Add block to direct docs path access (#2012) (#2026)

This commit is contained in:
daveoconnor
2025-12-02 14:55:00 -08:00
committed by GitHub
parent 591814fea0
commit 240eafa5f1
3 changed files with 181 additions and 5 deletions

View File

@@ -11,6 +11,7 @@ SLACK_URL = "https://cpplang.slack.com"
STATIC_CONTENT_EARLY_EXIT_PATH_PREFIXES = ("releases/",)
# possible library versions are: boost_1_53_0_beta1, 1_82_0, 1_55_0b1
BOOST_LIB_PATH_RE = re.compile(r"^(boost_){0,1}([0-9_]*[0-9]+[^/]*)/(.*)")
BOOST_VERSION_REGEX = r"(boost_){0,1}([0-9_]*[0-9]+[^/]*)"
NO_PROCESS_LIBS = [
# Do nothing with these - just render contents directly
"libs/filesystem",

View File

@@ -356,3 +356,169 @@ def test_docs_libs_gateway_200_html_transformed(rf, tp, mock_get_file_data):
def test_calendar(rf, tp):
response = tp.get("calendar")
tp.response_200(response)
@pytest.mark.django_db
@override_settings(
CACHES=TEST_CACHES,
)
def test_static_content_blocks_direct_doc_paths(request_factory):
"""Test that direct access to doc paths and library paths is blocked with 404."""
# Test cases for paths that should be blocked (return 404)
blocked_paths = [
# Original doc/html paths that should be blocked
"boost_1_53_0_beta1/doc/html/index.html",
"1_82_0/doc/html/tutorial.html",
"1_55_0b1/doc/html/reference/api.html",
"boost_1_86_0/doc/html/deep/nested/path.html",
"1_75_0/doc/html/simple.html",
# Edge cases with different boost version formats
"boost_1_53_0_beta1/doc/html/", # trailing slash
"1_82_0/doc/html/a", # single character file
# NEW: Library paths that should now be blocked
"boost_1_53_0_beta1/libs/algorithm/doc/index.html",
"1_82_0/libs/filesystem/doc/index.html",
"boost_1_86_0/libs/test/doc/reference.html",
"1_75_0/libs/wave/doc/tutorial.html",
"boost_1_82_0/libs/any_library/any_file.html",
"1_55_0b1/libs/serialization/index.html",
# Edge cases for libs paths
"boost_1_53_0_beta1/libs/", # just libs with trailing slash
"1_82_0/libs/a", # single character lib name
]
for content_path in blocked_paths:
request = request_factory.get(f"/{content_path}")
view = StaticContentTemplateView.as_view()
# Should raise Http404 without even trying to fetch from S3
with pytest.raises(Http404):
view(request, content_path=content_path)
@pytest.mark.django_db
@override_settings(
CACHES=TEST_CACHES,
)
def test_static_content_allows_non_direct_doc_paths(request_factory):
"""Test that non-direct doc paths are allowed and processed normally."""
# Test cases for paths that should NOT be blocked (normal processing)
allowed_paths = [
# Tools paths - should still be allowed (not libs)
"1_82_0/tools/build/doc/index.html",
"boost_1_82_0/tools/cmake/doc/reference.html",
# Paths with non-boost-version prefixes - should be allowed
"develop/libs/filesystem/doc/index.html", # develop prefix, not version
"master/libs/test/doc/reference.html", # master prefix, not version
# Paths without version prefixes
"doc/html/index.html", # No boost version prefix
"some/other/doc/html/file.html", # Different structure
"libs/algorithm/doc/index.html", # No version prefix
# Paths that don't match the exact patterns
"boost_1_82_0/doc/other/file.html", # not /doc/html/
"1_82_0/doc/htmls/file.html", # not exact /doc/html/
"1_82_0/documentation/html/file.html", # not /doc/html/
"boost_1_82_0/libraries/algorithm/doc/index.html", # libraries not libs
"some_other_prefix/libs/algorithm/doc/index.html", # no boost version
]
for content_path in allowed_paths:
# Mock S3 to return content so we can test the path isn't blocked
with patch(
"core.views.get_content_from_s3",
return_value={"content": b"test content", "content_type": "text/plain"},
):
response = call_view(request_factory, content_path)
# Should get 200 response, not 404 - the main thing is it's not blocked
assert (
response.status_code == 200
), f"Path should be allowed but got {response.status_code}: {content_path}"
def test_boost_version_regex_doc_html_pattern():
"""Test the BOOST_VERSION_REGEX doc/html pattern matches expected version formats."""
import re
from core.constants import BOOST_VERSION_REGEX
# Test the doc/html blocking pattern used in the view
doc_html_pattern = rf"^{BOOST_VERSION_REGEX}/doc/html/.+$"
# Test cases that should match the doc/html pattern
matching_cases = [
"boost_1_53_0_beta1/doc/html/index.html",
"1_82_0/doc/html/tutorial.html",
"1_55_0b1/doc/html/reference/api.html",
"boost_1_86_0/doc/html/test.html",
"1_75_0/doc/html/simple.html",
]
for test_path in matching_cases:
match = re.match(doc_html_pattern, test_path)
assert match is not None, f"Doc/html pattern should match: {test_path}"
# The captured groups should match the expected version parts
version_match = re.match(BOOST_VERSION_REGEX, test_path)
assert version_match is not None, f"Version pattern should match: {test_path}"
# Test cases that should NOT match the doc/html pattern
non_matching_cases = [
"1_82_0/tools/build/doc/index.html", # tools path
"develop/doc/html/index.html", # develop prefix, not version
"doc/html/index.html", # no version prefix
"boost_1_82_0/doc/other/file.html", # not /doc/html/
"1_82_0/doc/htmls/file.html", # not exact /doc/html/
"some/other/doc/html/file.html", # no boost version
"boost_1_82_0/doc/html/", # no file after /doc/html/
"1_82_0/doc/html", # no trailing slash or file
"boost_1_53_0_beta1/libs/algorithm/doc/index.html", # libs path
]
for test_path in non_matching_cases:
match = re.match(doc_html_pattern, test_path)
assert match is None, f"Doc/html pattern should NOT match: {test_path}"
def test_boost_version_regex_libs_pattern():
"""Test the BOOST_VERSION_REGEX libs pattern matches expected version formats."""
import re
from core.constants import BOOST_VERSION_REGEX
# Test the libs blocking pattern used in the view
libs_pattern = rf"^{BOOST_VERSION_REGEX}/libs/.+$"
# Test cases that should match the libs pattern
matching_cases = [
"boost_1_53_0_beta1/libs/algorithm/doc/index.html",
"1_82_0/libs/filesystem/doc/index.html",
"boost_1_86_0/libs/test/doc/reference.html",
"1_75_0/libs/wave/doc/tutorial.html",
"boost_1_82_0/libs/any_library/any_file.html",
"1_55_0b1/libs/serialization/index.html",
"1_82_0/libs/a", # single character lib name
"boost_1_53_0_beta1/libs/algorithm", # no trailing file extension
]
for test_path in matching_cases:
match = re.match(libs_pattern, test_path)
assert match is not None, f"Libs pattern should match: {test_path}"
# The captured groups should match the expected version parts
version_match = re.match(BOOST_VERSION_REGEX, test_path)
assert version_match is not None, f"Version pattern should match: {test_path}"
# Test cases that should NOT match the libs pattern
non_matching_cases = [
"1_82_0/tools/build/doc/index.html", # tools path
"develop/libs/filesystem/doc/index.html", # develop prefix, not version
"latest/libs/algorithm/doc/index.html", # latest prefix, not version
"libs/algorithm/doc/index.html", # no version prefix
"boost_1_82_0/libraries/algorithm/doc/index.html", # libraries not libs
"some/other/libs/algorithm/file.html", # no boost version
"boost_1_82_0/libs", # no trailing slash or file
"boost_1_53_0_beta1/libs/", # just libs with trailing slash (no content after)
"1_82_0/doc/html/index.html", # doc/html path
]
for test_path in non_matching_cases:
match = re.match(libs_pattern, test_path)
assert match is None, f"Libs pattern should NOT match: {test_path}"

View File

@@ -1,4 +1,5 @@
import os
import re
import requests
from django.utils import timezone
@@ -49,6 +50,7 @@ from .boostrenderer import (
from .constants import (
SourceDocType,
BOOST_LIB_PATH_RE,
BOOST_VERSION_REGEX,
STATIC_CONTENT_EARLY_EXIT_PATH_PREFIXES,
)
from .htmlhelper import (
@@ -287,11 +289,7 @@ class BaseStaticContentTemplateView(TemplateView):
return redirect(self.content_dict.get("redirect"))
except ContentNotFoundException:
logger.info(
"get_content_from_s3_view_not_in_cache",
content_path=content_path,
status_code=404,
)
logger.info(f"get_content_from_s3_view_not_in_cache {content_path} 404")
raise Http404("Content not found")
return super().get(request, *args, **kwargs)
@@ -465,6 +463,17 @@ class BaseStaticContentTemplateView(TemplateView):
class StaticContentTemplateView(BaseStaticContentTemplateView):
def get(self, request, content_path, *args, **kwargs):
# filter out direct access to the doc paths
path_regexes = [
re.compile(rf"^{BOOST_VERSION_REGEX}/doc/html/.+$"),
re.compile(rf"^{BOOST_VERSION_REGEX}/libs/.+$"),
]
path_match = any(regex.match(content_path) for regex in path_regexes)
if path_match:
raise Http404("Content not found")
return super().get(request, *args, **kwargs)
def process_content(self, content):
"""Process the content we receive from S3"""
content_html = self.content_dict.get("content")