Cache asciidoc content in db (Part of #394)

- Add RenderedContent model and related helper methods
- Change StaticContentView `get()` logic to try the cache, then the db, then S3
- Change StaticContentView to update db appropriately
- Refactoring for readability/maintainability
This commit is contained in:
Lacey Williams Henschel
2023-06-07 14:37:05 -07:00
parent d648997ced
commit 825dbc019b
12 changed files with 415 additions and 149 deletions

View File

@@ -7,6 +7,7 @@ from django.core.files import File as DjangoFile
# Include the various pytest fixtures from all of our Django apps tests
# directories
pytest_plugins = [
"core.tests.fixtures",
"libraries.tests.fixtures",
"news.tests.fixtures",
"users.tests.fixtures",

View File

@@ -18,6 +18,19 @@ from pygments.formatters.html import HtmlFormatter
logger = structlog.get_logger()
def extract_file_data(response, s3_key):
    """Pull the pieces we care about out of an S3 ``get_object`` response.

    Returns a dict with the raw bytes ("content"), the resolved MIME type
    ("content_type"), and the S3 last-modified timestamp ("last_modified").
    """
    return {
        "content": response["Body"].read(),
        "content_type": get_content_type(s3_key, response["ContentType"]),
        "last_modified": response["LastModified"],
    }
def get_body_from_html(html_string: str) -> str:
"""Use BeautifulSoup to get the body content from an HTML document, without
the <body> tag.
@@ -49,84 +62,32 @@ def get_content_from_s3(key=None, bucket_name=None):
Get content from S3. Returns the decoded file contents if able
"""
if not key:
logger.info(
"get_content_from_s3_no_key_provided",
key=key,
bucket_name=bucket_name,
function_name="get_content_from_s3",
)
raise ValueError("No key provided.")
if not bucket_name:
bucket_name = settings.STATIC_CONTENT_BUCKET_NAME
s3_keys = get_s3_keys(key)
if not s3_keys:
s3_keys = [key]
client = boto3.client(
"s3",
aws_access_key_id=settings.STATIC_CONTENT_AWS_ACCESS_KEY_ID,
aws_secret_access_key=settings.STATIC_CONTENT_AWS_SECRET_ACCESS_KEY,
region_name="us-east-1",
)
bucket_name = bucket_name or settings.STATIC_CONTENT_BUCKET_NAME
s3_keys = get_s3_keys(key) or [key]
client = get_s3_client()
for s3_key in s3_keys:
try:
response = client.get_object(Bucket=bucket_name, Key=s3_key.lstrip("/"))
file_content = response["Body"].read()
content_type = get_content_type(s3_key, response["ContentType"])
logger.info(
"get_content_from_s3_success",
key=key,
bucket_name=bucket_name,
s3_key=s3_key,
function_name="get_content_from_s3",
)
return file_content, content_type
except ClientError as e:
# Log the error and continue with the next key in the list
logger.exception(
"get_content_from_s3_error",
key=key,
bucket_name=bucket_name,
s3_key=s3_key,
error=str(e),
function_name="get_content_from_s3",
)
pass
file_data = get_file_data(client, bucket_name, s3_key)
if file_data:
return file_data
# Handle URLs that are directories looking for `index.html` files
if s3_key.endswith("/"):
try:
original_key = s3_key.lstrip("/")
index_html_key = f"{original_key}index.html"
response = client.get_object(Bucket=bucket_name, Key=index_html_key)
file_content = response["Body"].read()
content_type = response["ContentType"]
return file_content, content_type
except ClientError as e:
# Log the error and continue with the next key in the list
logger.exception(
"get_content_from_s3_client_error",
key=key,
bucket_name=bucket_name,
s3_key=s3_key,
error=str(e),
function_name="get_content_from_s3",
)
pass
original_key = s3_key.lstrip("/")
index_html_key = f"{original_key}index.html"
file_data = get_file_data(client, bucket_name, index_html_key)
if file_data:
return file_data
# Return None if no valid object is found
logger.info(
"get_content_from_s3_no_valid_object",
key=key,
bucket_name=bucket_name,
function_name="get_content_from_s3",
)
return None
return {}
def get_content_type(s3_key, content_type):
@@ -148,6 +109,32 @@ def get_content_type(s3_key, content_type):
return content_type
def get_file_data(client, bucket_name, s3_key):
    """Fetch an object from S3 and return its extracted file data.

    Returns None when the object cannot be retrieved. A ClientError here is
    expected (the key may simply not exist), so it is logged, not re-raised.
    """
    try:
        s3_response = client.get_object(Bucket=bucket_name, Key=s3_key.lstrip("/"))
    except ClientError as e:
        # Log the exception but ignore it otherwise, since it's not necessarily an error
        logger.exception(
            "get_content_from_s3_error",
            s3_key=s3_key,
            error=str(e),
            function_name="get_content_from_s3",
        )
        return None
    return extract_file_data(s3_response, s3_key)
def get_s3_client():
    """Build a boto3 S3 client using the static-content credentials from settings."""
    credentials = {
        "aws_access_key_id": settings.STATIC_CONTENT_AWS_ACCESS_KEY_ID,
        "aws_secret_access_key": settings.STATIC_CONTENT_AWS_SECRET_ACCESS_KEY,
    }
    return boto3.client("s3", region_name="us-east-1", **credentials)
def get_s3_keys(content_path, config_filename="stage_static_config.json"):
"""
Get the S3 key for a given content path

View File

@@ -0,0 +1,69 @@
# Generated by Django 4.2.1 on 2023-06-09 04:20
from django.db import migrations, models
class Migration(migrations.Migration):
    """Initial migration: creates the RenderedContent table, used as a
    database-backed cache for rendered static content pulled from S3."""

    initial = True

    # No prior migrations in this app.
    dependencies = []

    operations = [
        migrations.CreateModel(
            name="RenderedContent",
            fields=[
                (
                    "id",
                    models.AutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                # Unique, indexed lookup key shared with the Django cache layer.
                (
                    "cache_key",
                    models.CharField(
                        db_index=True,
                        help_text="The cache key for the content.",
                        max_length=255,
                        unique=True,
                    ),
                ),
                (
                    "content_type",
                    models.CharField(
                        blank=True,
                        help_text="The content type/MIME type.",
                        max_length=255,
                        null=True,
                    ),
                ),
                (
                    "content_original",
                    models.TextField(
                        blank=True, help_text="The original content.", null=True
                    ),
                ),
                (
                    "content_html",
                    models.TextField(
                        blank=True, help_text="The rendered HTML content.", null=True
                    ),
                ),
                (
                    "last_updated_at",
                    models.DateTimeField(
                        blank=True,
                        help_text="The last time the content was updated in S3.",
                        null=True,
                    ),
                ),
            ],
            options={
                "verbose_name": "rendered content",
                "verbose_name_plural": "rendered contents",
            },
        ),
    ]

View File

52
core/models.py Normal file
View File

@@ -0,0 +1,52 @@
from django.db import models
from django.utils.translation import gettext_lazy as _
class RenderedContent(models.Model):
    """Database-backed cache for rendered static content.

    Content is normally fetched from the S3 buckets and, when needed,
    converted to HTML. A cache miss triggers a fetch from S3 and a write to
    this table; a hit is served straight from here.
    """

    # Lookup key; mirrors the key used in the Django cache layer.
    cache_key = models.CharField(
        max_length=255,
        unique=True,
        help_text=_("The cache key for the content."),
        db_index=True,
    )
    content_type = models.CharField(
        max_length=255,
        help_text=_("The content type/MIME type."),
        null=True,
        blank=True,
    )
    content_original = models.TextField(
        help_text=_("The original content."), null=True, blank=True
    )
    content_html = models.TextField(
        help_text=_("The rendered HTML content."), null=True, blank=True
    )
    last_updated_at = models.DateTimeField(
        help_text=_("The last time the content was updated in S3."),
        null=True,
        blank=True,
    )

    class Meta:
        verbose_name = _("rendered content")
        verbose_name_plural = _("rendered contents")

    def __str__(self):
        return self.cache_key

    def save(self, *args, **kwargs):
        """Decode any bytes values to str before persisting."""
        for field_name in ("content_original", "content_html", "content_type"):
            value = getattr(self, field_name)
            if isinstance(value, bytes):
                setattr(self, field_name, value.decode("utf-8"))
        super().save(*args, **kwargs)

View File

@@ -1,26 +1,27 @@
import os
import subprocess
from django.conf import settings
from django.core.cache import caches
from celery import shared_task
@shared_task
def adoc_to_html(file_path, cache_key, content_type, delete_file=True):
def adoc_to_html(file_path, delete_file=True):
"""
Converts an AsciiDoc file to HTML and stores the result in the cache.
Converts an AsciiDoc file to HTML.
If delete_file is True, the temporary file will be deleted after the
conversion is complete.
Note: This returns the full <html> document, including the <head> and
<body> tags.
The asciidoctor package is a Ruby gem, which is why we're using subprocess
to run the command.
https://docs.asciidoctor.org/asciidoctor/latest/
:param file_path: The path to the AsciiDoc file
:param cache_key: The key to use when storing the result in the cache
:param content_type: The content type of the file
:param delete_file: Whether or not to delete the temporary file after the
conversion is complete
"""
result = subprocess.run(
["asciidoctor", "-o", "-", file_path],
@@ -30,15 +31,6 @@ def adoc_to_html(file_path, cache_key, content_type, delete_file=True):
# Get the output from the command
converted_html = result.stdout
# Get the static content cache
static_content_cache = caches["static_content"]
# Store the HTML content in cache
static_content_cache.set(
cache_key,
(converted_html, content_type),
int(settings.CACHES["static_content"]["TIMEOUT"]),
)
# Delete the temporary file
if delete_file:

12
core/tests/fixtures.py Normal file
View File

@@ -0,0 +1,12 @@
import pytest
from model_bakery import baker
@pytest.fixture
def rendered_content(db):
    """A saved RenderedContent row with simple sample content."""
    attrs = {
        "cache_key": "cache-key",
        "content_original": "Sample content",
        "content_html": "<p>Sample content</p>",
    }
    return baker.make("core.RenderedContent", **attrs)

19
core/tests/test_models.py Normal file
View File

@@ -0,0 +1,19 @@
from model_bakery import baker
def test_rendered_content_creation(rendered_content):
    """The rendered_content fixture produces a row with a cache key set."""
    assert rendered_content.cache_key is not None
def test_rendered_content_save():
    """Bytes passed for the text fields are decoded to str on save."""
    obj = baker.make(
        "core.RenderedContent",
        content_original=b"Sample original content",
        content_html=b"<p>Sample HTML content</p>",
        content_type=b"text/html",
    )
    obj.save()
    obj.refresh_from_db()
    for field in ("content_original", "content_html", "content_type"):
        assert isinstance(getattr(obj, field), str)

View File

@@ -1,4 +1,39 @@
from ..boostrenderer import get_body_from_html, get_content_type, get_s3_keys
from unittest.mock import Mock, patch
import datetime
from io import BytesIO
import pytest
from ..boostrenderer import (
extract_file_data,
get_body_from_html,
get_content_type,
get_file_data,
get_s3_keys,
)
@pytest.fixture
def mock_s3_client():
    # NOTE(review): placeholder fixture — returns a plain string, not a real
    # or mocked boto3 client; confirm whether any test actually uses it.
    return "mock_s3_client"
def test_extract_file_data():
    """extract_file_data pulls content, MIME type, and last-modified out of
    an S3 get_object-style response dict."""
    modified = datetime.datetime(2023, 6, 8, 12, 0, 0)
    s3_response = {
        "Body": BytesIO(b"file content"),
        "ContentType": "text/plain",
        "LastModified": modified,
    }
    data = extract_file_data(s3_response, "example_key.txt")
    assert data == {
        "content": b"file content",
        "content_type": "text/plain",
        "last_modified": modified,
    }
def test_get_body_from_html():
@@ -41,6 +76,36 @@ def test_get_content_type():
), "application/javascript"
def test_get_file_data():
    """get_file_data fetches the object and delegates to extract_file_data."""
    bucket_name = "my-bucket"
    s3_key = "/path/to/file.txt"
    s3_client = Mock()
    s3_response = Mock()
    s3_client.get_object.return_value = s3_response
    fake_extract = Mock(return_value="mock_file_data")
    with patch("core.boostrenderer.extract_file_data", fake_extract), patch(
        "core.boostrenderer.logger"
    ) as fake_logger:
        result = get_file_data(s3_client, bucket_name, s3_key)
    assert result == "mock_file_data"
    # The key is passed without its leading slash, via extract_file_data,
    # and nothing was logged as an error.
    s3_client.get_object.assert_called_once_with(
        Bucket=bucket_name, Key=s3_key.lstrip("/")
    )
    fake_extract.assert_called_once_with(s3_response, s3_key)
    assert not fake_logger.exception.called
def test_get_s3_keys():
"""
Test cases:

View File

@@ -27,7 +27,7 @@ from core.tasks import adoc_to_html
)
def test_adoc_to_html():
# Get the static content cache
static_content_cache = caches["static_content"]
caches["static_content"]
# The content of the sample adoc file
sample_adoc_content = "= Document Title\n\nThis is a sample document.\n"
@@ -37,20 +37,10 @@ def test_adoc_to_html():
temp_file.write(sample_adoc_content.encode())
temp_file_path = temp_file.name
# The cache key to be used
cache_key = "sample_key"
# Execute the task
with patch("core.tasks.subprocess.run") as mock_run:
mock_run.return_value.stdout = "html_content".encode()
adoc_to_html(temp_file_path, cache_key, "text/asciidoc", delete_file=True)
# Verify that the content has been stored in the cache
cached_result = static_content_cache.get(cache_key)
assert cached_result is not None
assert cached_result[0] == b"html_content"
assert cached_result[1] == "text/asciidoc"
adoc_to_html(temp_file_path, delete_file=True)
# Verify that the temporary file has been deleted
with pytest.raises(FileNotFoundError):

View File

@@ -53,7 +53,8 @@ def test_content_found(request_factory):
"""Test that content is found and returned."""
content_path = "/develop/libs/rst.css"
with patch(
"core.views.get_content_from_s3", return_value=(b"fake content", "text/plain")
"core.views.get_content_from_s3",
return_value={"content": b"fake content", "content_type": "text/plain"},
):
response = call_view(request_factory, content_path)
assert response.status_code == 200
@@ -86,7 +87,11 @@ def test_cache_expiration(request_factory):
cache_key = f"static_content_{content_path}"
# Set the content in the cache with a 1-second timeout
cache.set(cache_key, (mock_content, mock_content_type), timeout=1)
cache.set(
cache_key,
{"content": mock_content, "content_type": mock_content_type},
timeout=1,
)
def test_markdown_view_top_level(tp):

View File

@@ -1,15 +1,16 @@
import os.path
import structlog
import tempfile
from dateutil.parser import parse
from django.conf import settings
from django.core.cache import caches
from django.http import Http404, HttpResponse, HttpResponseNotFound
from django.shortcuts import render
from django.views.generic import TemplateView, View
from django.views.generic import TemplateView
from .boostrenderer import get_body_from_html, get_content_from_s3
from .markdown import process_md
from .models import RenderedContent
from .tasks import adoc_to_html
logger = structlog.get_logger()
@@ -92,78 +93,151 @@ class MarkdownTemplateView(TemplateView):
return self.render_to_response(context)
class StaticContentTemplateView(View):
class ContentNotFoundException(Exception):
    """Raised when static content cannot be found in cache, database, or S3."""

    pass
class StaticContentTemplateView(TemplateView):
template_name = "adoc_content.html"
def get(self, request, *args, **kwargs):
"""Verifies the file and returns the raw static content from S3
mangling paths using the stage_static_config.json settings
"""Returns static content that originates in S3, but is cached in a couple of
different places.
Any valid S3 key to the S3 bucket specified in settings can be returned by
this view. Pages like the Help page are stored in S3 and rendered via
this view, for example.
See the *_static_config.json files for URL mappings to specific S3 keys.
"""
content_path = kwargs.get("content_path")
content_path = self.kwargs.get("content_path")
try:
self.content_dict = self.get_content(content_path)
except ContentNotFoundException:
logger.info(
"get_content_from_s3_view_not_in_cache",
content_path=content_path,
status_code=404,
)
return HttpResponseNotFound("Page not found")
return super().get(request, *args, **kwargs)
# Try to get content from cache, if it's not there then fetch from S3
content, content_type = self.get_content(content_path)
def get_template_names(self):
    """Use the adoc wrapper template only for asciidoc content; everything
    else is returned without a template."""
    if self.content_dict.get("content_type") == "text/asciidoc":
        return [self.template_name]
    return []
if content is None:
return HttpResponseNotFound("Page not found") # Return a 404 response
def get_context_data(self, **kwargs):
"""Returns the content and content type for the template. In some cases,
changes the content type."""
context = super().get_context_data(**kwargs)
content_type = self.content_dict.get("content_type")
content = self.content_dict.get("content")
if content_type == "text/asciidoc":
response = self.handle_adoc_content(request, content, content_type)
else:
response = HttpResponse(content, content_type=content_type)
content_type = "text/html"
context.update({"content": content, "content_type": content_type})
logger.info(
"get_content_from_s3_view_success",
key=kwargs.get("content_path"),
status_code=response.status_code,
"get_content_from_s3_view_success", key=self.kwargs.get("content_path")
)
return response
return context
def get_content(self, content_path):
"""Get content from cache or S3."""
static_content_cache = caches["static_content"]
cache_key = f"static_content_{content_path}"
cached_result = static_content_cache.get(cache_key)
if cached_result:
content, content_type = cached_result
def render_to_response(self, context, **response_kwargs):
"""Return the HTML response with a template, or just the content directly."""
if self.get_template_names():
return super().render_to_response(context, **response_kwargs)
else:
result = get_content_from_s3(key=content_path)
if not result:
logger.info(
"get_content_from_s3_view_no_valid_object",
key=content_path,
status_code=404,
)
return None, None # Return None values when content is not found
content, content_type = result
# Always store the original content and content_type in cache
static_content_cache.set(
cache_key,
(content, content_type),
int(settings.CACHES["static_content"]["TIMEOUT"]),
return HttpResponse(
context["content"], content_type=context["content_type"]
)
return content, content_type
def get_content(self, content_path):
"""Returns content from cache, database, or S3"""
static_content_cache = caches["static_content"]
cache_key = f"static_content_{content_path}"
result = self.get_from_cache(static_content_cache, cache_key)
def handle_adoc_content(self, request, content, content_path):
"""Convert AsciiDoc content to HTML and return the HTML response."""
if result is None:
result = self.get_from_database(cache_key)
if result is None:
result = self.get_from_s3(content_path, cache_key)
if result is None:
logger.info(
"get_content_from_s3_view_no_valid_object",
key=content_path,
status_code=404,
)
raise ContentNotFoundException("Content not found")
return result
def get_from_cache(self, static_content_cache, cache_key):
    """Return the cached entry for cache_key, or None when missing/falsey."""
    return static_content_cache.get(cache_key) or None
def get_from_database(self, cache_key):
    """Look the content up in the RenderedContent table.

    Returns a content dict shaped like the cache entries, or None when no
    row exists for this cache_key.
    """
    try:
        rendered = RenderedContent.objects.get(cache_key=cache_key)
    except RenderedContent.DoesNotExist:
        return None
    return {
        "content": rendered.content_html,
        "content_type": rendered.content_type,
    }
def get_from_s3(self, content_path, cache_key):
    """Fetch content from S3; on success, persist it to the db cache and
    return the content dict. Returns None when S3 has nothing usable."""
    s3_data = get_content_from_s3(key=content_path)
    if not (s3_data and s3_data.get("content")):
        return None
    self.update_or_create_content(s3_data, cache_key)
    return s3_data
def update_or_create_content(self, result, cache_key):
    """Persist S3 content to the RenderedContent db cache.

    Converts asciidoc content to HTML first, then upserts the row keyed by
    cache_key and updates ``result`` in place with the (possibly converted)
    content and content type.

    :param result: dict from get_content_from_s3 — "content", "content_type",
        and the S3 "last_modified" timestamp
    :param cache_key: the key under which the content is cached
    """
    content = result.get("content")
    content_type = result.get("content_type")
    # Fix: get_content_from_s3/extract_file_data store the timestamp under
    # "last_modified", not "last_updated_at" — read both so the value is no
    # longer silently dropped.
    last_updated_at_raw = result.get("last_updated_at") or result.get("last_modified")
    if content_type == "text/asciidoc":
        content = self.convert_adoc_to_html(content, cache_key)
    # S3 hands us a datetime object already; only parse string timestamps.
    if isinstance(last_updated_at_raw, str):
        last_updated_at = parse(last_updated_at_raw)
    else:
        last_updated_at = last_updated_at_raw
    defaults = {"content_html": content, "content_type": content_type}
    if last_updated_at:
        defaults["last_updated_at"] = last_updated_at
    content_obj, created = RenderedContent.objects.update_or_create(
        cache_key=cache_key, defaults=defaults
    )
    logger.info(
        "get_content_from_s3_view_saved_to_db",
        cache_key=cache_key,
        content_type=content_type,
        status_code=200,
        obj_id=content_obj.id,
        created=created,
    )
    result["content"] = content
    result["content_type"] = content_type
def convert_adoc_to_html(self, content, cache_key):
"""Renders asciidoc content to HTML."""
# Write the content to a temporary file
with tempfile.NamedTemporaryFile(delete=False) as temp_file:
if isinstance(content, str):
content = content.encode()
temp_file.write(content)
# Convert the AsciiDoc to HTML
# TODO: Put this back on a delay and return a response indicating that
# the content is being prepared
html_content = adoc_to_html(
temp_file.name, content_path, "text/asciidoc", delete_file=True
)
html_content = adoc_to_html(temp_file.name, delete_file=True)
if isinstance(html_content, bytes):
# Content is a byte string, decode it using UTF-8 encoding
html_content = html_content.decode("utf-8")
# Extract only the contents of the body tag from the HTML
content = get_body_from_html(html_content)
context = {"content": content, "content_type": "text/html"}
return render(request, "adoc_content.html", context)
# Extract only the contents of the body tag that we want from the HTML
return get_body_from_html(html_content)