Cache asciidoc content in db (Part of #394)

- Add RenderedContent model and related helper methods
- Change StaticContentView `get()` logic to try the cache, then the db, then S3
- Change StaticContentView to update db appropriately
- Refactoring for readability/maintainability
This commit is contained in:
Lacey Williams Henschel
2023-06-07 14:37:05 -07:00
parent d648997ced
commit 825dbc019b
12 changed files with 415 additions and 149 deletions

View File

@@ -7,6 +7,7 @@ from django.core.files import File as DjangoFile
# Include the various pytest fixtures from all of our Django apps tests
# directories
pytest_plugins = [
"core.tests.fixtures",
"libraries.tests.fixtures",
"news.tests.fixtures",
"users.tests.fixtures",

View File

@@ -18,6 +18,19 @@ from pygments.formatters.html import HtmlFormatter
logger = structlog.get_logger()
def extract_file_data(response, s3_key):
    """Pull the pieces we care about out of an S3 ``get_object`` response.

    Returns a dict with the raw bytes ("content"), the resolved MIME type
    ("content_type"), and the S3 last-modified timestamp ("last_modified").
    """
    return {
        "content": response["Body"].read(),
        "content_type": get_content_type(s3_key, response["ContentType"]),
        "last_modified": response["LastModified"],
    }
def get_body_from_html(html_string: str) -> str:
"""Use BeautifulSoup to get the body content from an HTML document, without
the <body> tag.
@@ -49,84 +62,32 @@ def get_content_from_s3(key=None, bucket_name=None):
Get content from S3. Returns the decoded file contents if able
"""
if not key:
logger.info(
"get_content_from_s3_no_key_provided",
key=key,
bucket_name=bucket_name,
function_name="get_content_from_s3",
)
raise ValueError("No key provided.")
if not bucket_name:
bucket_name = settings.STATIC_CONTENT_BUCKET_NAME
s3_keys = get_s3_keys(key)
if not s3_keys:
s3_keys = [key]
client = boto3.client(
"s3",
aws_access_key_id=settings.STATIC_CONTENT_AWS_ACCESS_KEY_ID,
aws_secret_access_key=settings.STATIC_CONTENT_AWS_SECRET_ACCESS_KEY,
region_name="us-east-1",
)
bucket_name = bucket_name or settings.STATIC_CONTENT_BUCKET_NAME
s3_keys = get_s3_keys(key) or [key]
client = get_s3_client()
for s3_key in s3_keys:
try:
response = client.get_object(Bucket=bucket_name, Key=s3_key.lstrip("/"))
file_content = response["Body"].read()
content_type = get_content_type(s3_key, response["ContentType"])
logger.info(
"get_content_from_s3_success",
key=key,
bucket_name=bucket_name,
s3_key=s3_key,
function_name="get_content_from_s3",
)
return file_content, content_type
except ClientError as e:
# Log the error and continue with the next key in the list
logger.exception(
"get_content_from_s3_error",
key=key,
bucket_name=bucket_name,
s3_key=s3_key,
error=str(e),
function_name="get_content_from_s3",
)
pass
file_data = get_file_data(client, bucket_name, s3_key)
if file_data:
return file_data
# Handle URLs that are directories looking for `index.html` files
if s3_key.endswith("/"):
try:
original_key = s3_key.lstrip("/")
index_html_key = f"{original_key}index.html"
response = client.get_object(Bucket=bucket_name, Key=index_html_key)
file_content = response["Body"].read()
content_type = response["ContentType"]
return file_content, content_type
except ClientError as e:
# Log the error and continue with the next key in the list
logger.exception(
"get_content_from_s3_client_error",
key=key,
bucket_name=bucket_name,
s3_key=s3_key,
error=str(e),
function_name="get_content_from_s3",
)
pass
original_key = s3_key.lstrip("/")
index_html_key = f"{original_key}index.html"
file_data = get_file_data(client, bucket_name, index_html_key)
if file_data:
return file_data
# Return None if no valid object is found
logger.info(
"get_content_from_s3_no_valid_object",
key=key,
bucket_name=bucket_name,
function_name="get_content_from_s3",
)
return None
return {}
def get_content_type(s3_key, content_type):
@@ -148,6 +109,32 @@ def get_content_type(s3_key, content_type):
return content_type
def get_file_data(client, bucket_name, s3_key):
    """Fetch an object from S3 and return its extracted file data.

    Returns None when the object cannot be retrieved. A ClientError here is
    expected (the key may simply not exist), so it is logged, not re-raised.
    """
    try:
        s3_response = client.get_object(Bucket=bucket_name, Key=s3_key.lstrip("/"))
    except ClientError as e:
        # Log the exception but ignore it otherwise, since it's not necessarily an error
        logger.exception(
            "get_content_from_s3_error",
            s3_key=s3_key,
            error=str(e),
            function_name="get_content_from_s3",
        )
        return None
    return extract_file_data(s3_response, s3_key)
def get_s3_client():
    """Build a boto3 S3 client using the static-content credentials from settings."""
    credentials = {
        "aws_access_key_id": settings.STATIC_CONTENT_AWS_ACCESS_KEY_ID,
        "aws_secret_access_key": settings.STATIC_CONTENT_AWS_SECRET_ACCESS_KEY,
    }
    return boto3.client("s3", region_name="us-east-1", **credentials)
def get_s3_keys(content_path, config_filename="stage_static_config.json"):
"""
Get the S3 key for a given content path

View File

@@ -0,0 +1,69 @@
# Generated by Django 4.2.1 on 2023-06-09 04:20
from django.db import migrations, models
class Migration(migrations.Migration):
    """Initial migration: creates the RenderedContent table, used as a
    database-backed cache for rendered static content pulled from S3."""

    initial = True

    # No prior migrations in this app.
    dependencies = []

    operations = [
        migrations.CreateModel(
            name="RenderedContent",
            fields=[
                (
                    "id",
                    models.AutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                # Unique, indexed lookup key shared with the Django cache layer.
                (
                    "cache_key",
                    models.CharField(
                        db_index=True,
                        help_text="The cache key for the content.",
                        max_length=255,
                        unique=True,
                    ),
                ),
                (
                    "content_type",
                    models.CharField(
                        blank=True,
                        help_text="The content type/MIME type.",
                        max_length=255,
                        null=True,
                    ),
                ),
                (
                    "content_original",
                    models.TextField(
                        blank=True, help_text="The original content.", null=True
                    ),
                ),
                (
                    "content_html",
                    models.TextField(
                        blank=True, help_text="The rendered HTML content.", null=True
                    ),
                ),
                (
                    "last_updated_at",
                    models.DateTimeField(
                        blank=True,
                        help_text="The last time the content was updated in S3.",
                        null=True,
                    ),
                ),
            ],
            options={
                "verbose_name": "rendered content",
                "verbose_name_plural": "rendered contents",
            },
        ),
    ]

View File

52
core/models.py Normal file
View File

@@ -0,0 +1,52 @@
from django.db import models
from django.utils.translation import gettext_lazy as _
class RenderedContent(models.Model):
    """Database-backed cache for rendered static content.

    Content is normally fetched from the S3 buckets and, when needed,
    converted to HTML. A cache miss triggers a fetch from S3 and a write to
    this table; a hit is served straight from here.
    """

    # Lookup key; mirrors the key used in the Django cache layer.
    cache_key = models.CharField(
        max_length=255,
        unique=True,
        help_text=_("The cache key for the content."),
        db_index=True,
    )
    content_type = models.CharField(
        max_length=255,
        help_text=_("The content type/MIME type."),
        null=True,
        blank=True,
    )
    content_original = models.TextField(
        help_text=_("The original content."), null=True, blank=True
    )
    content_html = models.TextField(
        help_text=_("The rendered HTML content."), null=True, blank=True
    )
    last_updated_at = models.DateTimeField(
        help_text=_("The last time the content was updated in S3."),
        null=True,
        blank=True,
    )

    class Meta:
        verbose_name = _("rendered content")
        verbose_name_plural = _("rendered contents")

    def __str__(self):
        return self.cache_key

    def save(self, *args, **kwargs):
        """Decode any bytes values to str before persisting."""
        for field_name in ("content_original", "content_html", "content_type"):
            value = getattr(self, field_name)
            if isinstance(value, bytes):
                setattr(self, field_name, value.decode("utf-8"))
        super().save(*args, **kwargs)

View File

@@ -1,26 +1,27 @@
import os
import subprocess
from django.conf import settings
from django.core.cache import caches
from celery import shared_task
@shared_task
def adoc_to_html(file_path, cache_key, content_type, delete_file=True):
def adoc_to_html(file_path, delete_file=True):
"""
Converts an AsciiDoc file to HTML and stores the result in the cache.
Converts an AsciiDoc file to HTML.
If delete_file is True, the temporary file will be deleted after the
conversion is complete.
Note: This returns the full <html> document, including the <head> and
<body> tags.
The asciidoctor package is a Ruby gem, which is why we're using subprocess
to run the command.
https://docs.asciidoctor.org/asciidoctor/latest/
:param file_path: The path to the AsciiDoc file
:param cache_key: The key to use when storing the result in the cache
:param content_type: The content type of the file
:param delete_file: Whether or not to delete the temporary file after the
conversion is complete
"""
result = subprocess.run(
["asciidoctor", "-o", "-", file_path],
@@ -30,15 +31,6 @@ def adoc_to_html(file_path, cache_key, content_type, delete_file=True):
# Get the output from the command
converted_html = result.stdout
# Get the static content cache
static_content_cache = caches["static_content"]
# Store the HTML content in cache
static_content_cache.set(
cache_key,
(converted_html, content_type),
int(settings.CACHES["static_content"]["TIMEOUT"]),
)
# Delete the temporary file
if delete_file:

12
core/tests/fixtures.py Normal file
View File

@@ -0,0 +1,12 @@
import pytest
from model_bakery import baker
@pytest.fixture
def rendered_content(db):
    """A saved RenderedContent row with simple sample content."""
    attrs = {
        "cache_key": "cache-key",
        "content_original": "Sample content",
        "content_html": "<p>Sample content</p>",
    }
    return baker.make("core.RenderedContent", **attrs)

19
core/tests/test_models.py Normal file
View File

@@ -0,0 +1,19 @@
from model_bakery import baker
def test_rendered_content_creation(rendered_content):
    """The rendered_content fixture produces a row with a cache key set."""
    assert rendered_content.cache_key is not None
def test_rendered_content_save():
    """Bytes passed for the text fields are decoded to str on save."""
    obj = baker.make(
        "core.RenderedContent",
        content_original=b"Sample original content",
        content_html=b"<p>Sample HTML content</p>",
        content_type=b"text/html",
    )
    obj.save()
    obj.refresh_from_db()
    for field in ("content_original", "content_html", "content_type"):
        assert isinstance(getattr(obj, field), str)

View File

@@ -1,4 +1,39 @@
from ..boostrenderer import get_body_from_html, get_content_type, get_s3_keys
from unittest.mock import Mock, patch
import datetime
from io import BytesIO
import pytest
from ..boostrenderer import (
extract_file_data,
get_body_from_html,
get_content_type,
get_file_data,
get_s3_keys,
)
@pytest.fixture
def mock_s3_client():
    # NOTE(review): placeholder fixture — returns a plain string, not a real
    # or mocked boto3 client; confirm whether any test actually uses it.
    return "mock_s3_client"
def test_extract_file_data():
    """extract_file_data pulls content, MIME type, and last-modified out of
    an S3 get_object-style response dict."""
    modified = datetime.datetime(2023, 6, 8, 12, 0, 0)
    s3_response = {
        "Body": BytesIO(b"file content"),
        "ContentType": "text/plain",
        "LastModified": modified,
    }
    data = extract_file_data(s3_response, "example_key.txt")
    assert data == {
        "content": b"file content",
        "content_type": "text/plain",
        "last_modified": modified,
    }
def test_get_body_from_html():
@@ -41,6 +76,36 @@ def test_get_content_type():
), "application/javascript"
def test_get_file_data():
    """get_file_data fetches the object and delegates to extract_file_data."""
    bucket_name = "my-bucket"
    s3_key = "/path/to/file.txt"
    s3_client = Mock()
    s3_response = Mock()
    s3_client.get_object.return_value = s3_response
    fake_extract = Mock(return_value="mock_file_data")
    with patch("core.boostrenderer.extract_file_data", fake_extract), patch(
        "core.boostrenderer.logger"
    ) as fake_logger:
        result = get_file_data(s3_client, bucket_name, s3_key)
    assert result == "mock_file_data"
    # The key is passed without its leading slash, via extract_file_data,
    # and nothing was logged as an error.
    s3_client.get_object.assert_called_once_with(
        Bucket=bucket_name, Key=s3_key.lstrip("/")
    )
    fake_extract.assert_called_once_with(s3_response, s3_key)
    assert not fake_logger.exception.called
def test_get_s3_keys():
"""
Test cases:

View File

@@ -27,7 +27,7 @@ from core.tasks import adoc_to_html
)
def test_adoc_to_html():
# Get the static content cache
static_content_cache = caches["static_content"]
caches["static_content"]
# The content of the sample adoc file
sample_adoc_content = "= Document Title\n\nThis is a sample document.\n"
@@ -37,20 +37,10 @@ def test_adoc_to_html():
temp_file.write(sample_adoc_content.encode())
temp_file_path = temp_file.name
# The cache key to be used
cache_key = "sample_key"
# Execute the task
with patch("core.tasks.subprocess.run") as mock_run:
mock_run.return_value.stdout = "html_content".encode()
adoc_to_html(temp_file_path, cache_key, "text/asciidoc", delete_file=True)
# Verify that the content has been stored in the cache
cached_result = static_content_cache.get(cache_key)
assert cached_result is not None
assert cached_result[0] == b"html_content"
assert cached_result[1] == "text/asciidoc"
adoc_to_html(temp_file_path, delete_file=True)
# Verify that the temporary file has been deleted
with pytest.raises(FileNotFoundError):

View File

@@ -53,7 +53,8 @@ def test_content_found(request_factory):
"""Test that content is found and returned."""
content_path = "/develop/libs/rst.css"
with patch(
"core.views.get_content_from_s3", return_value=(b"fake content", "text/plain")
"core.views.get_content_from_s3",
return_value={"content": b"fake content", "content_type": "text/plain"},
):
response = call_view(request_factory, content_path)
assert response.status_code == 200
@@ -86,7 +87,11 @@ def test_cache_expiration(request_factory):
cache_key = f"static_content_{content_path}"
# Set the content in the cache with a 1-second timeout
cache.set(cache_key, (mock_content, mock_content_type), timeout=1)
cache.set(
cache_key,
{"content": mock_content, "content_type": mock_content_type},
timeout=1,
)
def test_markdown_view_top_level(tp):

View File

@@ -1,15 +1,16 @@
import os.path
import structlog
import tempfile
from dateutil.parser import parse
from django.conf import settings
from django.core.cache import caches
from django.http import Http404, HttpResponse, HttpResponseNotFound
from django.shortcuts import render
from django.views.generic import TemplateView, View
from django.views.generic import TemplateView
from .boostrenderer import get_body_from_html, get_content_from_s3
from .markdown import process_md
from .models import RenderedContent
from .tasks import adoc_to_html
logger = structlog.get_logger()
@@ -92,78 +93,151 @@ class MarkdownTemplateView(TemplateView):
return self.render_to_response(context)
class StaticContentTemplateView(View):
class ContentNotFoundException(Exception):
    """Raised when static content cannot be found in cache, database, or S3."""

    pass
class StaticContentTemplateView(TemplateView):
template_name = "adoc_content.html"
def get(self, request, *args, **kwargs):
"""Verifies the file and returns the raw static content from S3
mangling paths using the stage_static_config.json settings
"""Returns static content that originates in S3, but is cached in a couple of
different places.
Any valid S3 key to the S3 bucket specified in settings can be returned by
this view. Pages like the Help page are stored in S3 and rendered via
this view, for example.
See the *_static_config.json files for URL mappings to specific S3 keys.
"""
content_path = kwargs.get("content_path")
content_path = self.kwargs.get("content_path")
try:
self.content_dict = self.get_content(content_path)
except ContentNotFoundException:
logger.info(
"get_content_from_s3_view_not_in_cache",
content_path=content_path,
status_code=404,
)
return HttpResponseNotFound("Page not found")
return super().get(request, *args, **kwargs)
# Try to get content from cache, if it's not there then fetch from S3
content, content_type = self.get_content(content_path)
def get_template_names(self):
    """Use the adoc wrapper template only for asciidoc content; everything
    else is returned without a template."""
    if self.content_dict.get("content_type") == "text/asciidoc":
        return [self.template_name]
    return []
if content is None:
return HttpResponseNotFound("Page not found") # Return a 404 response
def get_context_data(self, **kwargs):
"""Returns the content and content type for the template. In some cases,
changes the content type."""
context = super().get_context_data(**kwargs)
content_type = self.content_dict.get("content_type")
content = self.content_dict.get("content")
if content_type == "text/asciidoc":
response = self.handle_adoc_content(request, content, content_type)
else:
response = HttpResponse(content, content_type=content_type)
content_type = "text/html"
context.update({"content": content, "content_type": content_type})
logger.info(
"get_content_from_s3_view_success",
key=kwargs.get("content_path"),
status_code=response.status_code,
"get_content_from_s3_view_success", key=self.kwargs.get("content_path")
)
return response
return context
def get_content(self, content_path):
"""Get content from cache or S3."""
static_content_cache = caches["static_content"]
cache_key = f"static_content_{content_path}"
cached_result = static_content_cache.get(cache_key)
if cached_result:
content, content_type = cached_result
def render_to_response(self, context, **response_kwargs):
"""Return the HTML response with a template, or just the content directly."""
if self.get_template_names():
return super().render_to_response(context, **response_kwargs)
else:
result = get_content_from_s3(key=content_path)
if not result:
logger.info(
"get_content_from_s3_view_no_valid_object",
key=content_path,
status_code=404,
)
return None, None # Return None values when content is not found
content, content_type = result
# Always store the original content and content_type in cache
static_content_cache.set(
cache_key,
(content, content_type),
int(settings.CACHES["static_content"]["TIMEOUT"]),
return HttpResponse(
context["content"], content_type=context["content_type"]
)
return content, content_type
def get_content(self, content_path):
"""Returns content from cache, database, or S3"""
static_content_cache = caches["static_content"]
cache_key = f"static_content_{content_path}"
result = self.get_from_cache(static_content_cache, cache_key)
def handle_adoc_content(self, request, content, content_path):
"""Convert AsciiDoc content to HTML and return the HTML response."""
if result is None:
result = self.get_from_database(cache_key)
if result is None:
result = self.get_from_s3(content_path, cache_key)
if result is None:
logger.info(
"get_content_from_s3_view_no_valid_object",
key=content_path,
status_code=404,
)
raise ContentNotFoundException("Content not found")
return result
def get_from_cache(self, static_content_cache, cache_key):
    """Return the cached entry for cache_key, or None when missing/falsey."""
    return static_content_cache.get(cache_key) or None
def get_from_database(self, cache_key):
    """Look the content up in the RenderedContent table.

    Returns a content dict shaped like the cache entries, or None when no
    row exists for this cache_key.
    """
    try:
        rendered = RenderedContent.objects.get(cache_key=cache_key)
    except RenderedContent.DoesNotExist:
        return None
    return {
        "content": rendered.content_html,
        "content_type": rendered.content_type,
    }
def get_from_s3(self, content_path, cache_key):
    """Fetch content from S3; on success, persist it to the db cache and
    return the content dict. Returns None when S3 has nothing usable."""
    s3_data = get_content_from_s3(key=content_path)
    if not (s3_data and s3_data.get("content")):
        return None
    self.update_or_create_content(s3_data, cache_key)
    return s3_data
def update_or_create_content(self, result, cache_key):
    """Persist S3 content to the RenderedContent db cache.

    Converts asciidoc content to HTML first, then upserts the row keyed by
    cache_key and updates ``result`` in place with the (possibly converted)
    content and content type.

    :param result: dict from get_content_from_s3 — "content", "content_type",
        and the S3 "last_modified" timestamp
    :param cache_key: the key under which the content is cached
    """
    content = result.get("content")
    content_type = result.get("content_type")
    # Fix: get_content_from_s3/extract_file_data store the timestamp under
    # "last_modified", not "last_updated_at" — read both so the value is no
    # longer silently dropped.
    last_updated_at_raw = result.get("last_updated_at") or result.get("last_modified")
    if content_type == "text/asciidoc":
        content = self.convert_adoc_to_html(content, cache_key)
    # S3 hands us a datetime object already; only parse string timestamps.
    if isinstance(last_updated_at_raw, str):
        last_updated_at = parse(last_updated_at_raw)
    else:
        last_updated_at = last_updated_at_raw
    defaults = {"content_html": content, "content_type": content_type}
    if last_updated_at:
        defaults["last_updated_at"] = last_updated_at
    content_obj, created = RenderedContent.objects.update_or_create(
        cache_key=cache_key, defaults=defaults
    )
    logger.info(
        "get_content_from_s3_view_saved_to_db",
        cache_key=cache_key,
        content_type=content_type,
        status_code=200,
        obj_id=content_obj.id,
        created=created,
    )
    result["content"] = content
    result["content_type"] = content_type
def convert_adoc_to_html(self, content, cache_key):
"""Renders asciidoc content to HTML."""
# Write the content to a temporary file
with tempfile.NamedTemporaryFile(delete=False) as temp_file:
if isinstance(content, str):
content = content.encode()
temp_file.write(content)
# Convert the AsciiDoc to HTML
# TODO: Put this back on a delay and return a response indicating that
# the content is being prepared
html_content = adoc_to_html(
temp_file.name, content_path, "text/asciidoc", delete_file=True
)
html_content = adoc_to_html(temp_file.name, delete_file=True)
if isinstance(html_content, bytes):
# Content is a byte string, decode it using UTF-8 encoding
html_content = html_content.decode("utf-8")
# Extract only the contents of the body tag from the HTML
content = get_body_from_html(html_content)
context = {"content": content, "content_type": "text/html"}
return render(request, "adoc_content.html", context)
# Extract only the contents of the body tag that we want from the HTML
return get_body_from_html(html_content)