Replace relative image URLs with absolute URLs in static content files

- Add function to replace relative image paths with absolute paths
- Add docs on static content and adding shortcuts to the config file
- Add url and view for rendering static image data
This commit is contained in:
Lacey Williams Henschel
2024-01-18 13:03:03 -08:00
committed by Lacey Henschel
parent fdc5e6dfff
commit 6755f76753
7 changed files with 202 additions and 17 deletions

View File

@@ -17,6 +17,7 @@ from core.views import (
CalendarView,
ClearCacheView,
DocLibsTemplateView,
ImageView,
MarkdownTemplateView,
StaticContentTemplateView,
UserGuideTemplateView,
@@ -278,6 +279,12 @@ urlpatterns = (
MarkdownTemplateView.as_view(),
name="markdown-page",
),
# Images from static content
re_path(
r"^images/(?P<content_path>.+)/?",
ImageView.as_view(),
name="images-page",
),
# Static content
re_path(
r"^(?P<content_path>.+)/?",

View File

@@ -1,7 +1,6 @@
import json
import os
import re
import boto3
import structlog
from botocore.exceptions import ClientError
@@ -170,6 +169,54 @@ def get_s3_keys(content_path, config_filename=None):
return s3_keys
def convert_img_paths(html_content: str, s3_path: str = None):
    """
    Convert all relative image paths in an HTML document to absolute paths.

    Args:
        html_content: The HTML content you want to convert.
        s3_path: The path prefix placed in front of every relative image
            ``src``. Callers build it from the key ultimately used to
            retrieve the HTML data (whatever key from get_s3_keys() worked).
            When omitted, relative sources are only made root-relative.

    Returns:
        The converted HTML as a string. Empty input (``""`` or ``None``) is
        returned unchanged.

    Raises:
        ValueError: If ``html_content`` is non-empty and not a string.

    Explanation:
        The config file allows us to add shortcut URLs to specific S3 keys. An
        example is the /help/ page; see the config file for how it maps the
        site_path to the S3 key that will retrieve the data.

        However, most images in these files will be relative, and when the
        config file masks the S3 keys, the image URLs can't be found in the
        browser.

        This function retrieves all images and updates their URLs to be
        fully-qualified by routing them through our `/images/` view, which
        will retrieve them from S3 directly.

    NOTE: This hasn't been well-tested and it's possible it will need updates
    as we encounter more special cases related to the static content.
    """
    if not html_content:
        # Nothing to convert; hand the input back instead of swallowing it.
        return html_content

    # isinstance() rather than an exact type check so str subclasses pass.
    if not isinstance(html_content, str):
        raise ValueError(
            f"HTML content must be a string, and it is {type(html_content)}."
        )

    # Sources that are already absolute (or inline data) must not be
    # prefixed. "/" also covers protocol-relative "//host/..." URLs.
    already_absolute = ("http://", "https://", "data:", "/")

    # Drop a trailing slash so the join below never produces "//".
    prefix = (s3_path or "").rstrip("/")

    soup = BeautifulSoup(html_content, "html.parser")
    for img in soup.find_all("img"):
        original_src = img.get("src", "")
        if not original_src or original_src.lower().startswith(already_absolute):
            continue

        # Construct the new absolute URL for the image
        new_src = f"{prefix}/{original_src}"
        if not new_src.startswith("/"):
            new_src = f"/{new_src}"
        img["src"] = new_src

    return str(soup)
class Youtube(SpanToken):
"""
Span token for Youtube shortcodes

View File

@@ -47,6 +47,7 @@ def refresh_content_from_s3(s3_key, cache_key):
"""Calls S3 with the s3_key, then saves the result to the
RenderedContent object with the given cache_key."""
content_dict = get_content_from_s3(key=s3_key)
content = content_dict.get("content")
if content_dict and content:
content_type = content_dict.get("content_type")

View File

@@ -1,3 +1,4 @@
from bs4 import BeautifulSoup
from unittest.mock import Mock, patch
import datetime
from io import BytesIO
@@ -9,6 +10,7 @@ from ..boostrenderer import (
get_content_type,
get_file_data,
get_s3_keys,
convert_img_paths,
)
@@ -109,6 +111,8 @@ def test_get_file_data():
def test_get_s3_keys():
"""
Test cases for get_s3_keys function.
Test cases:
- "/marshmallow/index.html" -> "site/develop/tools/auto_index/index.html"
@@ -116,7 +120,6 @@ def test_get_s3_keys():
- "/rst.css" -> "site/develop/rst.css"
- "/site/develop/doc/html/about.html" -> "site/develop/doc/html/about.html"
"""
assert "/site-docs/develop/user-guide/index.html" in get_s3_keys(
"/doc/user-guide/index.html"
)
@@ -129,3 +132,30 @@ def test_get_s3_keys():
assert "/site-docs/develop/release-process/index.html" in get_s3_keys(
"/doc/release-process/index.html"
)
def test_convert_img_paths():
    """A relative ``<img>`` src is rewritten to sit under the given S3 path."""
    image_prefix = "/images/site-pages/develop"

    # Input document containing a single relative image reference
    source_markup = """
<html>
<body>
<img src="image1.png" alt="Image 1"/>
</body>
</html>
"""

    # The same document with the image routed through the prefix
    wanted_markup = """
<html>
<body>
<img src="/images/site-pages/develop/image1.png" alt="Image 1"/>
</body>
</html>
"""  # noqa

    converted_markup = convert_img_paths(source_markup, image_prefix)

    # Compare the parsed documents rather than the raw strings
    wanted_tree = BeautifulSoup(wanted_markup, "html.parser")
    converted_tree = BeautifulSoup(converted_markup, "html.parser")
    assert converted_tree == wanted_tree

View File

@@ -4,7 +4,6 @@ import structlog
from dateutil.parser import parse
from django.conf import settings
from django.contrib.auth.mixins import UserPassesTestMixin
from django.core.cache import caches
from django.http import Http404, HttpResponse, HttpResponseNotFound
@@ -15,7 +14,12 @@ from django.views.generic import TemplateView
from .asciidoc import process_adoc_to_html_content
from .boostrenderer import get_content_from_s3
from .boostrenderer import (
get_content_from_s3,
get_s3_client,
extract_file_data,
convert_img_paths,
)
from .htmlhelper import modernize_legacy_page
from .markdown import process_md
from .models import RenderedContent
@@ -153,7 +157,7 @@ class ContentNotFoundException(Exception):
pass
class StaticContentTemplateView(TemplateView):
class BaseStaticContentTemplateView(TemplateView):
template_name = "adoc_content.html"
def get(self, request, *args, **kwargs):
@@ -274,6 +278,8 @@ class StaticContentTemplateView(TemplateView):
def render_to_response(self, context, **response_kwargs):
"""Return the HTML response with a template, or just the content directly."""
if self.get_template_names():
content = self.process_content(context["content"])
context["content"] = content
return super().render_to_response(context, **response_kwargs)
content = self.process_content(context["content"])
return HttpResponse(content, content_type=context["content_type"])
@@ -304,7 +310,32 @@ class StaticContentTemplateView(TemplateView):
return content
class DocLibsTemplateView(StaticContentTemplateView):
class StaticContentTemplateView(BaseStaticContentTemplateView):
    """Static content view that rewrites relative image paths in HTML pages."""

    def process_content(self, content):
        """Process the content we receive from S3.

        For HTML and AsciiDoc content, relative image paths are replaced with
        paths under ``/images/<directory of the S3 key>`` so they route
        through the ImageView class. Any other content type, or missing
        content, is returned untouched.

        Args:
            content: The content handed in by the caller; returned as-is when
                no image rewriting applies.

        Returns:
            The HTML with converted image paths, or ``content`` unchanged.
        """
        # NOTE(review): the HTML is read from self.content_dict rather than
        # from the ``content`` argument -- confirm the two are always the same.
        content_html = self.content_dict.get("content")
        content_type = self.content_dict.get("content_type")
        content_key = self.content_dict.get("content_key")

        # Replace relative image paths with fully-qualified ones so they will
        # render. Skip when there is nothing to convert: str(None) would
        # otherwise render the literal text "None".
        if content_type not in ("text/html", "text/asciidoc") or not content_html:
            return content

        # Prefix the new URL path with "/images" so it routes through
        # our ImageView class
        url_parts = ["/images"]
        if content_key:
            # Get the path from the S3 key by stripping the filename
            directory = os.path.dirname(content_key).lstrip("/")
            if directory:
                # An empty directory would leave a dangling trailing slash
                url_parts.append(directory)

        # Generate the replacement path to the image
        s3_path = "/".join(url_parts)

        # str() on bytes yields the "b'...'" repr, so decode bytes explicitly
        if isinstance(content_html, bytes):
            content_html = content_html.decode("utf-8", errors="replace")

        # Process the HTML to replace the image paths
        converted = convert_img_paths(str(content_html), s3_path)
        return content if converted is None else converted
class DocLibsTemplateView(BaseStaticContentTemplateView):
# possible library versions are: boost_1_53_0_beta1, 1_82_0, 1_55_0b1
boost_lib_path_re = re.compile(r"^(boost_){0,1}([0-9_]*[0-9]+[^/]*)/(.*)")
@@ -347,7 +378,7 @@ class DocLibsTemplateView(StaticContentTemplateView):
)
class UserGuideTemplateView(StaticContentTemplateView):
class UserGuideTemplateView(BaseStaticContentTemplateView):
def get_from_s3(self, content_path):
legacy_url = f"/doc/{content_path}"
return super().get_from_s3(legacy_url)
@@ -375,3 +406,22 @@ class UserGuideTemplateView(StaticContentTemplateView):
return modernize_legacy_page(
content, base_html, insert_body=insert_body, head_selector=head_selector
)
class ImageView(View):
    """Serve a file from the static content S3 bucket at ``content_path``."""

    def get(self, request, *args, **kwargs):
        """Return the S3 object named by the ``content_path`` URL kwarg.

        Returns:
            An HttpResponse carrying the object's content and content type.

        Raises:
            Http404: If the object does not exist in the bucket.
        """
        # Imported here so the handler below does not rely on a module-level
        # import of botocore in this file.
        from botocore.exceptions import ClientError

        # TODO: Add caching logic
        content_path = self.kwargs.get("content_path")
        client = get_s3_client()

        try:
            response = client.get_object(
                Bucket=settings.STATIC_CONTENT_BUCKET_NAME, Key=content_path
            )
            file_data = extract_file_data(response, content_path)
        except ContentNotFoundException as err:
            raise Http404("Content not found") from err
        except ClientError as err:
            # boto3 reports a missing key as a ClientError; turn only that
            # case into a 404 and let every other S3 failure propagate.
            error_code = err.response.get("Error", {}).get("Code")
            if error_code in ("NoSuchKey", "404"):
                raise Http404("Content not found") from err
            raise

        return HttpResponse(
            file_data["content"], content_type=file_data["content_type"]
        )

View File

@@ -1,10 +1,47 @@
# Retrieving Static Content from the Boost Amazon S3 Bucket
# Boost Static Content
The `StaticContentTemplateView` class (in the `core/` app) is a Django view that handles requests for static content.
**Static Content** refers to content such as HTML files, markdown files, asciidoc files, etc. that is retrieved from Amazon S3 and rendered within the Boost site.
We can add "shortcut" URL paths to specific directories or files within S3 by updating the file `stage_static_config.json`.
## Quick Start
### Adding a shortcut url to a static page
1. Identify the URL pattern you would like to use. Example: `/style-guide/`
2. Identify the S3 path to the file you would like that URL to load. Example: `/site-pages/develop/style-guides.adoc`
3. Add an entry to `stage_static_config.json`. `site_path` is your URL route, with a `/` on either side. `s3_path` is the path to your desired file in S3, with a leading `/`:
```javascript
...
},
{
"site_path": "/style-guide/",
"s3_path": "/site-pages/develop/style-guides.adoc"
},
{
...
```
4. Restart your server and load `/style-guide/` in your browser to confirm it works.
## About Retrieving Static Content
An example shortcut url is the `/help/` page. This is the route that the `/help/` URL takes to render that page:
- The user enters `/help/` into the browser
- There is no `help/` path in `config/urls.py`, so the route falls through to the view that handles static content, `StaticContentTemplateView`.
- In this view, the `content_path` will be `help`. The view uses the `content_path` to try to retrieve the content for the `/help/` page from the Redis cache, the `RenderedContent` table, or Amazon S3.
- The logic for retrieving the content from Amazon S3 is stored in `core/boostrenderer.py::get_content_from_s3()`. See [Retrieving Static Content from the Boost Amazon S3 Bucket](#retrieving-static-content-from-the-boost-amazon-s3-bucket) and [How we decide which S3 keys to try](#how-we-decide-which-s3-keys-to-try) for more information.
- Back in the view, if the view receives content from S3 (or the Redis cache or the database), it will return that. Otherwise, a 404 is raised.
## Retrieving Static Content from the Boost Amazon S3 Bucket
The `StaticContentTemplateView` class (in the `core/` app) is a Django view that handles requests for static content. It inherits from `BaseStaticContentTemplateView`, which is the class that contains the bulk of the logic.
Its URL path is the very last path in our list of URL patterns (see `config/urls.py`) because it functions as the fallback URL pattern. If a user enters a URL that doesn't match anything else defined in our URL patterns, this view will attempt to retrieve the request as static content from S3 using the URL path.
The `StaticContentTemplateView` calls S3 using the URL pattern and generates a list of potential keys to check. It then checks the specified S3 bucket for each of those keys and returns the first match it finds, along with the file content type. Passing the content type with the bucket contents allows the content to be delivered appropriately to the user (so HTML files will be rendered as HTML, etc.)
`StaticContentTemplateView` calls S3 using the URL pattern. The S3 retrieval code in `core/boostrenderer.py::get_content_from_s3()` generates a list of potential keys to check. It then checks the specified S3 bucket for each of those keys and returns the first match it finds, along with the file content type. Passing the content type with the bucket contents allows the content to be delivered appropriately to the user (so HTML files will be rendered as HTML, etc.)
Boost uses the AWS SDK for Python (boto3) to connect to an S3 bucket and retrieve the static content. If no bucket name is provided, our process uses the `STATIC_CONTENT_BUCKET_NAME` setting from the Django project settings.
@@ -32,6 +69,10 @@ Take a look at this sample `{env}_static_config.json` file:
"site_path": "/develop/doc/",
"s3_path": "/site/develop/doc/html/"
},
{
"site_path": "/doc/_/",
"s3_path": "/site-docs/develop/_/"
},
{
"site_path": "/",
"s3_path": "/site/develop/"
@@ -43,12 +84,21 @@ Take a look at this sample `{env}_static_config.json` file:
- `/site/develop/libs/index.html`
Note that the `site_path` and the `s3_path` don't have to be to the same depth; the `site_path` in this example is 2 levels deep, and the `s3_path` is 3 levels deep. It doesn't matter.
**Example 2**: If the URL request is for `/develop/doc/index.html`, the S3 keys that the function would try are:
- `/site/develop/doc/html/index.html`
- `/site/develop/doc/index.html`
**Example 3**: If the URL request is for `/index.html`, the S3 keys that the function would try are:
**Example 3**: If the URL request is for `/doc/accumulators/`, the S3 keys that the function would try are:
- `/site-docs/develop/accumulators/`
- `/site-docs/develop/accumulators/index.html`
In this example, the `_` functions as a wildcard, so `/doc/accumulators/` would shortcut to `/site-docs/develop/accumulators/`, and `/doc/algorithm/` would shortcut to `/site-docs/develop/algorithm/`, even though neither `accumulators` nor `algorithm` has its own entry in the config file.
**Example 4**: If the URL request is for `/index.html`, the S3 keys that the function would try are:
- `/site/develop/index.html`
- `/site/index.html`
@@ -59,4 +109,4 @@ We first try to retrieve the static content using the exact S3 key specified in
See [Caching and the `RenderedContent` model](./caching_rendered_content.md) for how Django-side caching is handled.
Cacching is also handled via Fastly CDN.
Caching is also handled via Fastly CDN.

View File

@@ -19,11 +19,11 @@ def test_get_and_store_library_version_documentation_urls_for_version(
library_name = library.name.lower()
mock_s3_response = {
"content": f"""
<h2>Libraries Listed <a name="Alphabetically">Alphabetically</a></h2>
<ul>
<li><a href="{library_name}/index.html">{library_name}</a></li>
</ul>
"""
<h2>Libraries Listed <a name="Alphabetically">Alphabetically</a></h2>
<ul>
<li><a href="{library_name}/index.html">{library_name}</a></li>
</ul>
"""
}
# Mock the get_content_from_s3 function to return the mock S3 response