Replace relative image URLs with absolute URLs in static content files

- Add function to replace relative image paths with absolute paths - Add docs on static content and adding shortcuts to the config file - Add url and view for rendering static image data
2026-01-19 04:42:17 +00:00 · 2024-01-18 13:03:03 -08:00
parent fdc5e6dfff
commit 6755f76753
7 changed files with 202 additions and 17 deletions
--- a/config/urls.py
+++ b/config/urls.py
@@ -17,6 +17,7 @@ from core.views import (
    CalendarView,
    ClearCacheView,
    DocLibsTemplateView,
+    ImageView,
    MarkdownTemplateView,
    StaticContentTemplateView,
    UserGuideTemplateView,
@@ -278,6 +279,12 @@ urlpatterns = (
            MarkdownTemplateView.as_view(),
            name="markdown-page",
        ),
+        # Images from static content
+        re_path(
+            r"^images/(?P<content_path>.+)/?",
+            ImageView.as_view(),
+            name="images-page",
+        ),
        # Static content
        re_path(
            r"^(?P<content_path>.+)/?",
--- a/core/boostrenderer.py
+++ b/core/boostrenderer.py
@@ -1,7 +1,6 @@
 import json
 import os
 import re
-
 import boto3
 import structlog
 from botocore.exceptions import ClientError
@@ -170,6 +169,54 @@ def get_s3_keys(content_path, config_filename=None):
    return s3_keys


+def convert_img_paths(html_content: str, s3_path: str = None):
+    """
+    Convert all relative image paths to absolute paths.
+
+    Args:
+    - html_content: The HTML content you want to convert
+    - s3_path: The key ultimately used to retrieve the HTML data.
+        If present, will be whatever key from get_s3_keys() that worked
+        to retrieve the HTML data
+
+    Returns:
+    - The re-serialized HTML with every relative `<img src>` prefixed by
+      `s3_path`. Empty content, or a call without an `s3_path`, is returned
+      unchanged.
+
+    Raises:
+    - ValueError: if `html_content` is non-empty and is not a string.
+
+    Explanation:
+
+    The config file allows us to add shortcut URLs to specific S3 keys. An example is
+    the /help/ page; see the config file for how it maps the site_path to the S3 key
+    that will retrieve the data.
+
+    However, most images in these files will be relative, and when the config file
+    masks the S3 keys, the image URLs can't be found in the browser.
+
+    This function retrieves all images and updates their URLs to be fully-qualified
+    by routing them through our `/images/` view, which will retrieve them from S3
+    directly.
+
+    Sources that are already resolvable are left alone: `http://` and `https://`
+    URLs, anything starting with `/` (root-relative and protocol-relative URLs),
+    and inline `data:` URIs. Skipping paths that start with `/` also means running
+    the function twice on the same content does not prefix the images twice.
+
+    NOTE: This hasn't been well-tested and it's possible it will need updates as
+    we encounter more special cases related to the static content.
+    """
+    if not html_content:
+        # Nothing to parse; hand the (empty) input back as-is.
+        return html_content
+
+    if not isinstance(html_content, str):
+        raise ValueError(
+            f"HTML content must be a string, and it is {type(html_content)}."
+        )
+
+    if s3_path is None:
+        # Without a base path there is nothing to resolve relative URLs against.
+        return html_content
+
+    # Drop trailing slashes so the join below never produces "//".
+    base_path = s3_path.rstrip("/")
+    already_resolved = ("http://", "https://", "/", "data:")
+
+    soup = BeautifulSoup(html_content, "html.parser")
+
+    for img in soup.find_all("img"):
+        original_src = img.get("src", "")
+        # An <img> without a src must not be given one pointing at a directory.
+        if not original_src or original_src.startswith(already_resolved):
+            continue
+
+        # Construct the new absolute URL for the image
+        new_src = f"{base_path}/{original_src}"
+        if not new_src.startswith("/"):
+            new_src = f"/{new_src}"
+        img["src"] = new_src
+
+    return str(soup)
+
+
 class Youtube(SpanToken):
    """
    Span token for Youtube shortcodes
--- a/core/tasks.py
+++ b/core/tasks.py
@@ -47,6 +47,7 @@ def refresh_content_from_s3(s3_key, cache_key):
    """Calls S3 with the s3_key, then saves the result to the
    RenderedContent object with the given cache_key."""
    content_dict = get_content_from_s3(key=s3_key)
+
    content = content_dict.get("content")
    if content_dict and content:
        content_type = content_dict.get("content_type")
--- a/core/tests/test_renderer.py
+++ b/core/tests/test_renderer.py
@@ -1,3 +1,4 @@
+from bs4 import BeautifulSoup
 from unittest.mock import Mock, patch
 import datetime
 from io import BytesIO
@@ -9,6 +10,7 @@ from ..boostrenderer import (
    get_content_type,
    get_file_data,
    get_s3_keys,
+    convert_img_paths,
 )


@@ -109,6 +111,8 @@ def test_get_file_data():

 def test_get_s3_keys():
    """
+    Test cases for get_s3_keys function.
+
    Test cases:

    - "/marshmallow/index.html" -> "site/develop/tools/auto_index/index.html"
@@ -116,7 +120,6 @@ def test_get_s3_keys():
    - "/rst.css" -> "site/develop/rst.css"
    - "/site/develop/doc/html/about.html" -> "site/develop/doc/html/about.html"
    """
-
    assert "/site-docs/develop/user-guide/index.html" in get_s3_keys(
        "/doc/user-guide/index.html"
    )
@@ -129,3 +132,30 @@ def test_get_s3_keys():
    assert "/site-docs/develop/release-process/index.html" in get_s3_keys(
        "/doc/release-process/index.html"
    )
+
+
+def test_convert_img_paths():
+    """A relative <img> src is prefixed with the s3_path passed in."""
+    prefix = "/images/site-pages/develop"
+
+    # Input document containing a single relative image
+    source_html = """
+        <html>
+            <body>
+                <img src="image1.png" alt="Image 1"/>
+            </body>
+        </html>
+    """
+
+    # The same document with the image routed through the prefix
+    wanted_html = """
+        <html>
+            <body>
+                <img src="/images/site-pages/develop/image1.png" alt="Image 1"/>
+            </body>
+        </html>
+    """  # noqa
+
+    converted = convert_img_paths(source_html, prefix)
+
+    # Compare parsed trees rather than raw strings so that serialization
+    # details do not affect the result.
+    assert BeautifulSoup(converted, "html.parser") == BeautifulSoup(
+        wanted_html, "html.parser"
+    )
--- a/core/views.py
+++ b/core/views.py
@@ -4,7 +4,6 @@ import structlog
 from dateutil.parser import parse

 from django.conf import settings
-
 from django.contrib.auth.mixins import UserPassesTestMixin
 from django.core.cache import caches
 from django.http import Http404, HttpResponse, HttpResponseNotFound
@@ -15,7 +14,12 @@ from django.views.generic import TemplateView


 from .asciidoc import process_adoc_to_html_content
-from .boostrenderer import get_content_from_s3
+from .boostrenderer import (
+    get_content_from_s3,
+    get_s3_client,
+    extract_file_data,
+    convert_img_paths,
+)
 from .htmlhelper import modernize_legacy_page
 from .markdown import process_md
 from .models import RenderedContent
@@ -153,7 +157,7 @@ class ContentNotFoundException(Exception):
    pass


-class StaticContentTemplateView(TemplateView):
+class BaseStaticContentTemplateView(TemplateView):
    template_name = "adoc_content.html"

    def get(self, request, *args, **kwargs):
@@ -274,6 +278,8 @@ class StaticContentTemplateView(TemplateView):
    def render_to_response(self, context, **response_kwargs):
        """Return the HTML response with a template, or just the content directly."""
        if self.get_template_names():
+            content = self.process_content(context["content"])
+            context["content"] = content
            return super().render_to_response(context, **response_kwargs)
        content = self.process_content(context["content"])
        return HttpResponse(content, content_type=context["content_type"])
@@ -304,7 +310,32 @@ class StaticContentTemplateView(TemplateView):
        return content


-class DocLibsTemplateView(StaticContentTemplateView):
+class StaticContentTemplateView(BaseStaticContentTemplateView):
+    """Static content view that rewrites relative image paths before rendering."""
+
+    def process_content(self, content):
+        """Process the content we receive from S3.
+
+        For HTML and asciidoc content, relative image paths are replaced with
+        fully-qualified ones so they will render. Any other content type is
+        returned untouched.
+
+        NOTE: for HTML/asciidoc the markup that gets converted is read from
+        `self.content_dict["content"]`, not from the `content` argument.
+        """
+        content_type = self.content_dict.get("content_type")
+        if content_type not in ("text/html", "text/asciidoc"):
+            return content
+
+        # Prefix the new URL path with "/images" so it routes through
+        # our ImageView class
+        s3_path = "/images"
+
+        content_key = self.content_dict.get("content_key")
+        if content_key:
+            # Strip the filename from the S3 key to get the directory the
+            # images are relative to
+            s3_path = f"{s3_path}/{os.path.dirname(content_key).lstrip('/')}"
+
+        # Process the HTML to replace the image paths
+        raw_html = self.content_dict.get("content")
+        return convert_img_paths(str(raw_html), s3_path)
+
+
+class DocLibsTemplateView(BaseStaticContentTemplateView):
    # possible library versions are: boost_1_53_0_beta1, 1_82_0, 1_55_0b1
    boost_lib_path_re = re.compile(r"^(boost_){0,1}([0-9_]*[0-9]+[^/]*)/(.*)")

@@ -347,7 +378,7 @@ class DocLibsTemplateView(StaticContentTemplateView):
        )


-class UserGuideTemplateView(StaticContentTemplateView):
+class UserGuideTemplateView(BaseStaticContentTemplateView):
    def get_from_s3(self, content_path):
        legacy_url = f"/doc/{content_path}"
        return super().get_from_s3(legacy_url)
@@ -375,3 +406,22 @@ class UserGuideTemplateView(StaticContentTemplateView):
        return modernize_legacy_page(
            content, base_html, insert_body=insert_body, head_selector=head_selector
        )
+
+
+class ImageView(View):
+    """Serve a single image straight from the static content S3 bucket.
+
+    The `content_path` URL kwarg is used verbatim as the S3 key.
+    """
+
+    def get(self, request, *args, **kwargs):
+        """Return the S3 object at `content_path`, or raise Http404 if missing."""
+        # TODO: Add caching logic
+        content_path = self.kwargs.get("content_path")
+
+        client = get_s3_client()
+        try:
+            response = client.get_object(
+                Bucket=settings.STATIC_CONTENT_BUCKET_NAME, Key=content_path
+            )
+            file_data = extract_file_data(response, content_path)
+        except (ContentNotFoundException, client.exceptions.NoSuchKey) as exc:
+            # get_object reports a missing key through the client's NoSuchKey
+            # error, not ContentNotFoundException; both mean "no such image".
+            # NOTE(review): assumes get_s3_client() returns a boto3 S3 client.
+            raise Http404("Content not found") from exc
+
+        return HttpResponse(
+            file_data["content"], content_type=file_data["content_type"]
+        )
--- a/docs/static_content.md
+++ b/docs/static_content.md
@@ -1,10 +1,47 @@
-# Retrieving Static Content from the Boost Amazon S3 Bucket
+# Boost Static Content

-The `StaticContentTemplateView` class (in the `core/` app) is a Django view that handles requests for static content.
+**Static Content** refers to content such as HTML files, markdown files, asciidoc files, etc. that is retrieved from Amazon S3 and rendered within the Boost site.
+
+We can add "shortcut" URL paths to specific directories or files within S3 by updating the file `stage_static_config.json`.
+
+## Quick Start
+
+### Adding a shortcut url to a static page
+
+1. Identify the URL pattern you would like to use. Example: `/style-guide/`
+2. Identify the S3 path to the file you would like that URL to load. Example: `/site-pages/develop/style-guides.adoc`
+3. Add an entry to `stage_static_config.json`. `site_path` is your URL route, with a `/` on either side. `s3_path` is the path to your desired file in S3, with a leading `/`:
+
+```javascript
+  ...
+  },
+  {
+    "site_path": "/style-guide/",
+    "s3_path": "/site-pages/develop/style-guides.adoc"
+  },
+  {
+  ...
+```
+
+4. Restart your server and load `/style-guide/` in your browser to confirm it works.
+
+## About Retrieving Static Content
+
+An example shortcut url is the `/help/` page. This is the route that the `/help/` URL takes to render that page:
+
+- The user enters `/help/` into the browser
+- There is no `help/` path in `config/urls.py`, so the route falls through to the view that handles static content, `StaticContentTemplateView`.
+- In this view, the `content_path` will be `help`. The view uses the `content_path` to try and retrieve the content for the `/help/` page from the Redis cache, the `RenderedContent` table, or from Amazon S3.
+- The logic for retrieving the content from Amazon S3 is stored in `core/boostrenderer.py::get_content_from_s3()`. See [Retrieving Static Content from the Boost Amazon S3 Bucket](#retrieving-static-content-from-the-boost-amazon-s3-bucket) and [How we decide which S3 keys to try](#how-we-decide-which-s3-keys-to-try) for more information.
+- Back in the view, if the view receives content from S3 (or the Redis cache or the database), it will return that. Otherwise, a 404 is raised.
+
+## Retrieving Static Content from the Boost Amazon S3 Bucket
+
+The `StaticContentTemplateView` class (in the `core/` app) is a Django view that handles requests for static content. It inherits from `BaseStaticContentTemplateView`, which is the class that contains the bulk of the logic.

 Its URL path is the very last path in our list of URL patterns (see `config/urls.py`) because it functions as the fallback URL pattern. If a user enters a URL that doesn't match anything else defined in our URL patterns, this view will attempt to retrieve the request as static content from S3 using the URL path.

-The `StaticContentTemplateView` calls S3 using the URL pattern and generates a list of potential keys to check. It then checks the specified S3 bucket for each of those keys and returns the first match it finds, along with the file content type. Passing the content type with the bucket contents allows the content to be delivered appropriately to the user (so HTML files will be rendered as HTML, etc.)
+`StaticContentTemplateView` calls S3 using the URL pattern. The S3 retrieval code in `core/boostrenderer.py::get_content_from_s3()` generates a list of potential keys to check. It then checks the specified S3 bucket for each of those keys and returns the first match it finds, along with the file content type. Passing the content type with the bucket contents allows the content to be delivered appropriately to the user (so HTML files will be rendered as HTML, etc.)

 Boost uses the AWS SDK for Python (boto3) to connect to an S3 bucket and retrieve the static content. If no bucket name is provided, our process uses the `STATIC_CONTENT_BUCKET_NAME` setting from the Django project settings.

@@ -32,6 +69,10 @@ Take a look at this sample `{env}_static_config.json` file:
        "site_path": "/develop/doc/",
        "s3_path": "/site/develop/doc/html/"
    },
+    {
+      "site_path": "/doc/_/",
+      "s3_path": "/site-docs/develop/_/"
+    },
    {
        "site_path": "/",
        "s3_path": "/site/develop/"
@@ -43,12 +84,21 @@ Take a look at this sample `{env}_static_config.json` file:

 - `/site/develop/libs/index.html`

+Note that the `site_path` and the `s3_path` don't have to be at the same depth; the `site_path` in this example is 2 levels deep, and the `s3_path` is 3 levels deep. It doesn't matter.
+
 **Example 2**: If the URL request is for `/develop/doc/index.html`, the S3 keys that the function would try are:

 - `/site/develop/doc/html/index.html`
 - `/site/develop/doc/index.html`

-**Example 3**: If the URL request is for `/index.html`, the S3 keys that the function would try are:
+**Example 3**: If the URL request is for `/doc/accumulators/`, the S3 keys that the function would try are:
+
+- `/site-docs/develop/accumulators/`
+- `/site-docs/develop/accumulators/index.html`
+
+In this example, the `_` functions as a wildcard, so `/doc/accumulators/` would shortcut to `/site-docs/develop/accumulators/`, and `/doc/algorithm/` would shortcut to `/site-docs/develop/algorithm/`, even though neither `accumulators` nor `algorithm` have their own entries in the config file.
+
+**Example 4**: If the URL request is for `/index.html`, the S3 keys that the function would try are:

 - `/site/develop/index.html`
 - `/site/index.html`
@@ -59,4 +109,4 @@ We first try to retrieve the static content using the exact S3 key specified in

 See [Caching and the `RenderedContent` model](./caching_rendered_content.md) for how Django-side caching is handled.

-Cacching is also handled via Fastly CDN.
+Caching is also handled via Fastly CDN.
--- a/libraries/tests/test_tasks.py
+++ b/libraries/tests/test_tasks.py
@@ -19,11 +19,11 @@ def test_get_and_store_library_version_documentation_urls_for_version(
    library_name = library.name.lower()
    mock_s3_response = {
        "content": f"""
-            <h2>Libraries Listed <a name="Alphabetically">Alphabetically</a></h2>
-            <ul>
-                <li><a href="{library_name}/index.html">{library_name}</a></li>
-            </ul>
-        """
+        <h2>Libraries Listed <a name="Alphabetically">Alphabetically</a></h2>
+        <ul>
+            <li><a href="{library_name}/index.html">{library_name}</a></li>
+        </ul>
+    """
    }

    # Mock the get_content_from_s3 function to return the mock S3 response