From 825dbc019b88a7d5c4e5ac9422aeebb279b376fe Mon Sep 17 00:00:00 2001 From: Lacey Williams Henschel Date: Wed, 7 Jun 2023 14:37:05 -0700 Subject: [PATCH] Cache asciidoc content in db (Part of #394) - Add RenderedContent model and related helper methods - Change StaticContentView `get()` logic to try the cache, then the db, then S3 - Change StaticContentView to update db appropriately - Refactoring for readability/maintainability --- conftest.py | 1 + core/boostrenderer.py | 115 +++++++++----------- core/migrations/0001_initial.py | 69 ++++++++++++ core/migrations/__init__.py | 0 core/models.py | 52 +++++++++ core/tasks.py | 22 ++-- core/tests/fixtures.py | 12 +++ core/tests/test_models.py | 19 ++++ core/tests/test_renderer.py | 67 +++++++++++- core/tests/test_tasks.py | 14 +-- core/tests/test_views.py | 9 +- core/views.py | 184 ++++++++++++++++++++++---------- 12 files changed, 415 insertions(+), 149 deletions(-) create mode 100644 core/migrations/0001_initial.py create mode 100644 core/migrations/__init__.py create mode 100644 core/models.py create mode 100644 core/tests/fixtures.py create mode 100644 core/tests/test_models.py diff --git a/conftest.py b/conftest.py index 74360270..e9402a98 100644 --- a/conftest.py +++ b/conftest.py @@ -7,6 +7,7 @@ from django.core.files import File as DjangoFile # Include the various pytest fixtures from all of our Django apps tests # directories pytest_plugins = [ + "core.tests.fixtures", "libraries.tests.fixtures", "news.tests.fixtures", "users.tests.fixtures", diff --git a/core/boostrenderer.py b/core/boostrenderer.py index 453a8343..4bea116b 100644 --- a/core/boostrenderer.py +++ b/core/boostrenderer.py @@ -18,6 +18,19 @@ from pygments.formatters.html import HtmlFormatter logger = structlog.get_logger() +def extract_file_data(response, s3_key): + """Extracts the file content, content type, and last modified date from an S3 + response object.""" + file_content = response["Body"].read() + content_type = get_content_type(s3_key, response["ContentType"]) + last_modified = response["LastModified"] + return { + "content": file_content, + "content_type": content_type, + "last_modified": last_modified, + } + + def get_body_from_html(html_string: str) -> str: """Use BeautifulSoup to get the body content from an HTML document, without the tag. @@ -49,84 +62,32 @@ def get_content_from_s3(key=None, bucket_name=None): Get content from S3. Returns the decoded file contents if able """ if not key: - logger.info( - "get_content_from_s3_no_key_provided", - key=key, - bucket_name=bucket_name, - function_name="get_content_from_s3", - ) raise ValueError("No key provided.") - if not bucket_name: - bucket_name = settings.STATIC_CONTENT_BUCKET_NAME - - s3_keys = get_s3_keys(key) - - if not s3_keys: - s3_keys = [key] - - client = boto3.client( - "s3", - aws_access_key_id=settings.STATIC_CONTENT_AWS_ACCESS_KEY_ID, - aws_secret_access_key=settings.STATIC_CONTENT_AWS_SECRET_ACCESS_KEY, - region_name="us-east-1", - ) + bucket_name = bucket_name or settings.STATIC_CONTENT_BUCKET_NAME + s3_keys = get_s3_keys(key) or [key] + client = get_s3_client() for s3_key in s3_keys: - try: - response = client.get_object(Bucket=bucket_name, Key=s3_key.lstrip("/")) - file_content = response["Body"].read() - content_type = get_content_type(s3_key, response["ContentType"]) - - logger.info( - "get_content_from_s3_success", - key=key, - bucket_name=bucket_name, - s3_key=s3_key, - function_name="get_content_from_s3", - ) - return file_content, content_type - except ClientError as e: - # Log the error and continue with the next key in the list - logger.exception( - "get_content_from_s3_error", - key=key, - bucket_name=bucket_name, - s3_key=s3_key, - error=str(e), - function_name="get_content_from_s3", - ) - pass + file_data = get_file_data(client, bucket_name, s3_key) + if file_data: + return file_data # Handle URLs that are directories looking for `index.html` files if s3_key.endswith("/"): - try: - original_key = s3_key.lstrip("/") - index_html_key = f"{original_key}index.html" - response = client.get_object(Bucket=bucket_name, Key=index_html_key) - file_content = response["Body"].read() - content_type = response["ContentType"] - return file_content, content_type - except ClientError as e: - # Log the error and continue with the next key in the list - logger.exception( - "get_content_from_s3_client_error", - key=key, - bucket_name=bucket_name, - s3_key=s3_key, - error=str(e), - function_name="get_content_from_s3", - ) - pass + original_key = s3_key.lstrip("/") + index_html_key = f"{original_key}index.html" + file_data = get_file_data(client, bucket_name, index_html_key) + if file_data: + return file_data - # Return None if no valid object is found logger.info( "get_content_from_s3_no_valid_object", key=key, bucket_name=bucket_name, function_name="get_content_from_s3", ) - return None + return {} def get_content_type(s3_key, content_type): @@ -148,6 +109,32 @@ def get_content_type(s3_key, content_type): return content_type +def get_file_data(client, bucket_name, s3_key): + """Get the file data from S3. Returns the decoded file contents if able.""" + try: + response = client.get_object(Bucket=bucket_name, Key=s3_key.lstrip("/")) + return extract_file_data(response, s3_key) + except ClientError as e: + # Log the exception but ignore it otherwise, since it's not necessaruly an error + logger.exception( + "get_content_from_s3_error", + s3_key=s3_key, + error=str(e), + function_name="get_content_from_s3", + ) + return + + +def get_s3_client(): + """Get an S3 client.""" + return boto3.client( + "s3", + aws_access_key_id=settings.STATIC_CONTENT_AWS_ACCESS_KEY_ID, + aws_secret_access_key=settings.STATIC_CONTENT_AWS_SECRET_ACCESS_KEY, + region_name="us-east-1", + ) + + def get_s3_keys(content_path, config_filename="stage_static_config.json"): """ Get the S3 key for a given content path diff --git a/core/migrations/0001_initial.py b/core/migrations/0001_initial.py new file mode 100644 index 00000000..8a076f56 --- /dev/null +++ b/core/migrations/0001_initial.py @@ -0,0 +1,69 @@ +# Generated by Django 4.2.1 on 2023-06-09 04:20 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + initial = True + + dependencies = [] + + operations = [ + migrations.CreateModel( + name="RenderedContent", + fields=[ + ( + "id", + models.AutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), + ( + "cache_key", + models.CharField( + db_index=True, + help_text="The cache key for the content.", + max_length=255, + unique=True, + ), + ), + ( + "content_type", + models.CharField( + blank=True, + help_text="The content type/MIME type.", + max_length=255, + null=True, + ), + ), + ( + "content_original", + models.TextField( + blank=True, help_text="The original content.", null=True + ), + ), + ( + "content_html", + models.TextField( + blank=True, help_text="The rendered HTML content.", null=True + ), + ), + ( + "last_updated_at", + models.DateTimeField( + blank=True, + help_text="The last time the content was updated in S3.", + null=True, + ), + ), + ], + options={ + "verbose_name": "rendered content", + "verbose_name_plural": "rendered contents", + }, + ), + ] diff --git a/core/migrations/__init__.py b/core/migrations/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/core/models.py b/core/models.py new file mode 100644 index 00000000..a874d672 --- /dev/null +++ b/core/models.py @@ -0,0 +1,52 @@ +from django.db import models +from django.utils.translation import gettext_lazy as _ + + +class RenderedContent(models.Model): + """Stores a copy of rendered content. Generally, this content is retrieved + from the S3 buckets and, if necessary, converted to HTML. + + This model is intended to be used as a cache. If the content is not found, + it will be retrieved from S3 and stored in this model. If the content is + found, it will be returned from this model.""" + + cache_key = models.CharField( + max_length=255, + unique=True, + help_text=_("The cache key for the content."), + db_index=True, + ) + content_type = models.CharField( + max_length=255, + help_text=_("The content type/MIME type."), + null=True, + blank=True, + ) + content_original = models.TextField( + help_text=_("The original content."), null=True, blank=True + ) + content_html = models.TextField( + help_text=_("The rendered HTML content."), null=True, blank=True + ) + last_updated_at = models.DateTimeField( + help_text=_("The last time the content was updated in S3."), + null=True, + blank=True, + ) + + class Meta: + verbose_name = _("rendered content") + verbose_name_plural = _("rendered contents") + + def __str__(self): + return self.cache_key + + def save(self, *args, **kwargs): + if isinstance(self.content_original, bytes): + self.content_original = self.content_original.decode("utf-8") + if isinstance(self.content_html, bytes): + self.content_html = self.content_html.decode("utf-8") + if isinstance(self.content_type, bytes): + self.content_type = self.content_type.decode("utf-8") + + super().save(*args, **kwargs) diff --git a/core/tasks.py b/core/tasks.py index 30f45c45..48274f73 100644 --- a/core/tasks.py +++ b/core/tasks.py @@ -1,26 +1,27 @@ import os + import subprocess -from django.conf import settings -from django.core.cache import caches from celery import shared_task @shared_task -def adoc_to_html(file_path, cache_key, content_type, delete_file=True): +def adoc_to_html(file_path, delete_file=True): """ - Converts an AsciiDoc file to HTML and stores the result in the cache. + Converts an AsciiDoc file to HTML. If delete_file is True, the temporary file will be deleted after the conversion is complete. + Note: This returns the full document, including the and + tags. + The asciidoctor package is a Ruby gem, which is why we're using subprocess to run the command. https://docs.asciidoctor.org/asciidoctor/latest/ :param file_path: The path to the AsciiDoc file - :param cache_key: The key to use when storing the result in the cache - :param content_type: The content type of the file :param delete_file: Whether or not to delete the temporary file after the + conversion is complete """ result = subprocess.run( ["asciidoctor", "-o", "-", file_path], @@ -30,15 +31,6 @@ def adoc_to_html(file_path, cache_key, content_type, delete_file=True): # Get the output from the command converted_html = result.stdout - # Get the static content cache - static_content_cache = caches["static_content"] - - # Store the HTML content in cache - static_content_cache.set( - cache_key, - (converted_html, content_type), - int(settings.CACHES["static_content"]["TIMEOUT"]), - ) # Delete the temporary file if delete_file: diff --git a/core/tests/fixtures.py b/core/tests/fixtures.py new file mode 100644 index 00000000..dd7132b1 --- /dev/null +++ b/core/tests/fixtures.py @@ -0,0 +1,12 @@ +import pytest +from model_bakery import baker + + +@pytest.fixture +def rendered_content(db): + return baker.make( + "core.RenderedContent", + cache_key="cache-key", + content_original="Sample content", + content_html="

Sample content

", + ) diff --git a/core/tests/test_models.py b/core/tests/test_models.py new file mode 100644 index 00000000..ecf5abb4 --- /dev/null +++ b/core/tests/test_models.py @@ -0,0 +1,19 @@ +from model_bakery import baker + + +def test_rendered_content_creation(rendered_content): + assert rendered_content.cache_key is not None + + +def test_rendered_content_save(): + content = baker.make( + "core.RenderedContent", + content_original=b"Sample original content", + content_html=b"

Sample HTML content

", + content_type=b"text/html", + ) + content.save() + content.refresh_from_db() + assert isinstance(content.content_original, str) + assert isinstance(content.content_html, str) + assert isinstance(content.content_type, str) diff --git a/core/tests/test_renderer.py b/core/tests/test_renderer.py index c380b7fe..819bb364 100644 --- a/core/tests/test_renderer.py +++ b/core/tests/test_renderer.py @@ -1,4 +1,39 @@ -from ..boostrenderer import get_body_from_html, get_content_type, get_s3_keys +from unittest.mock import Mock, patch +import datetime +from io import BytesIO +import pytest + +from ..boostrenderer import ( + extract_file_data, + get_body_from_html, + get_content_type, + get_file_data, + get_s3_keys, +) + + +@pytest.fixture +def mock_s3_client(): + return "mock_s3_client" + + +def test_extract_file_data(): + response = { + "Body": BytesIO(b"file content"), + "ContentType": "text/plain", + "LastModified": datetime.datetime(2023, 6, 8, 12, 0, 0), + } + s3_key = "example_key.txt" + + expected_result = { + "content": b"file content", + "content_type": "text/plain", + "last_modified": datetime.datetime(2023, 6, 8, 12, 0, 0), + } + + result = extract_file_data(response, s3_key) + + assert result == expected_result def test_get_body_from_html(): @@ -41,6 +76,36 @@ def test_get_content_type(): ), "application/javascript" +def test_get_file_data(): + # Mock the S3 client + mock_client = Mock() + mock_response = Mock() + mock_extract_file_data = Mock(return_value="mock_file_data") + + # Patch the necessary functions and objects + with patch("core.boostrenderer.extract_file_data", mock_extract_file_data), patch( + "core.boostrenderer.logger" + ) as mock_logger: + # Set up the mock response + mock_client.get_object.return_value = mock_response + + bucket_name = "my-bucket" + s3_key = "/path/to/file.txt" + + expected_result = "mock_file_data" + + # Call the function being tested + result = get_file_data(mock_client, bucket_name, s3_key) + + # Assert the expected behavior and result + assert result == expected_result + mock_client.get_object.assert_called_once_with( + Bucket=bucket_name, Key=s3_key.lstrip("/") + ) + mock_extract_file_data.assert_called_once_with(mock_response, s3_key) + assert not mock_logger.exception.called + + def test_get_s3_keys(): """ Test cases: diff --git a/core/tests/test_tasks.py b/core/tests/test_tasks.py index c997b231..bdb66aac 100644 --- a/core/tests/test_tasks.py +++ b/core/tests/test_tasks.py @@ -27,7 +27,7 @@ from core.tasks import adoc_to_html ) def test_adoc_to_html(): # Get the static content cache - static_content_cache = caches["static_content"] + caches["static_content"] # The content of the sample adoc file sample_adoc_content = "= Document Title\n\nThis is a sample document.\n" @@ -37,20 +37,10 @@ def test_adoc_to_html(): temp_file.write(sample_adoc_content.encode()) temp_file_path = temp_file.name - # The cache key to be used - cache_key = "sample_key" - # Execute the task with patch("core.tasks.subprocess.run") as mock_run: mock_run.return_value.stdout = "html_content".encode() - adoc_to_html(temp_file_path, cache_key, "text/asciidoc", delete_file=True) - - # Verify that the content has been stored in the cache - cached_result = static_content_cache.get(cache_key) - - assert cached_result is not None - assert cached_result[0] == b"html_content" - assert cached_result[1] == "text/asciidoc" + adoc_to_html(temp_file_path, delete_file=True) # Verify that the temporary file has been deleted with pytest.raises(FileNotFoundError): diff --git a/core/tests/test_views.py b/core/tests/test_views.py index 5b736f85..2a3c2249 100644 --- a/core/tests/test_views.py +++ b/core/tests/test_views.py @@ -53,7 +53,8 @@ def test_content_found(request_factory): """Test that content is found and returned.""" content_path = "/develop/libs/rst.css" with patch( - "core.views.get_content_from_s3", return_value=(b"fake content", "text/plain") + "core.views.get_content_from_s3", + return_value={"content": b"fake content", "content_type": "text/plain"}, ): response = call_view(request_factory, content_path) assert response.status_code == 200 @@ -86,7 +87,11 @@ def test_cache_expiration(request_factory): cache_key = f"static_content_{content_path}" # Set the content in the cache with a 1-second timeout - cache.set(cache_key, (mock_content, mock_content_type), timeout=1) + cache.set( + cache_key, + {"content": mock_content, "content_type": mock_content_type}, + timeout=1, + ) def test_markdown_view_top_level(tp): diff --git a/core/views.py b/core/views.py index a74eea68..b7ced2c1 100644 --- a/core/views.py +++ b/core/views.py @@ -1,15 +1,16 @@ import os.path import structlog import tempfile +from dateutil.parser import parse from django.conf import settings from django.core.cache import caches from django.http import Http404, HttpResponse, HttpResponseNotFound -from django.shortcuts import render -from django.views.generic import TemplateView, View +from django.views.generic import TemplateView from .boostrenderer import get_body_from_html, get_content_from_s3 from .markdown import process_md +from .models import RenderedContent from .tasks import adoc_to_html logger = structlog.get_logger() @@ -92,78 +93,151 @@ class MarkdownTemplateView(TemplateView): return self.render_to_response(context) -class StaticContentTemplateView(View): +class ContentNotFoundException(Exception): + pass + + +class StaticContentTemplateView(TemplateView): + template_name = "adoc_content.html" + def get(self, request, *args, **kwargs): - """Verifies the file and returns the raw static content from S3 - mangling paths using the stage_static_config.json settings + """Returns static content that originates in S3, but is cached in a couple of + different places. + + Any valid S3 key to the S3 bucket apecified in settings can be returned by + this view. Pages like the Help page are stored in S3 and rendered via + this view, for example. + + See the *_static_config.json files for URL mappings to specific S3 keys. """ - content_path = kwargs.get("content_path") + content_path = self.kwargs.get("content_path") + try: + self.content_dict = self.get_content(content_path) + except ContentNotFoundException: + logger.info( + "get_content_from_s3_view_not_in_cache", + content_path=content_path, + status_code=404, + ) + return HttpResponseNotFound("Page not found") + return super().get(request, *args, **kwargs) - # Try to get content from cache, if it's not there then fetch from S3 - content, content_type = self.get_content(content_path) + def get_template_names(self): + """Returns the template name.""" + content_type = self.content_dict.get("content_type") + if content_type == "text/asciidoc": + return [self.template_name] + return [] - if content is None: - return HttpResponseNotFound("Page not found") # Return a 404 response + def get_context_data(self, **kwargs): + """Returns the content and content type for the template. In some cases, + changes the content type.""" + context = super().get_context_data(**kwargs) + content_type = self.content_dict.get("content_type") + content = self.content_dict.get("content") if content_type == "text/asciidoc": - response = self.handle_adoc_content(request, content, content_type) - else: - response = HttpResponse(content, content_type=content_type) + content_type = "text/html" + + context.update({"content": content, "content_type": content_type}) logger.info( - "get_content_from_s3_view_success", - key=kwargs.get("content_path"), - status_code=response.status_code, + "get_content_from_s3_view_success", key=self.kwargs.get("content_path") ) - return response + return context - def get_content(self, content_path): - """Get content from cache or S3.""" - static_content_cache = caches["static_content"] - cache_key = f"static_content_{content_path}" - cached_result = static_content_cache.get(cache_key) - - if cached_result: - content, content_type = cached_result + def render_to_response(self, context, **response_kwargs): + """Return the HTML response with a template, or just the content directly.""" + if self.get_template_names(): + return super().render_to_response(context, **response_kwargs) else: - result = get_content_from_s3(key=content_path) - if not result: - logger.info( - "get_content_from_s3_view_no_valid_object", - key=content_path, - status_code=404, - ) - return None, None # Return None values when content is not found - - content, content_type = result - - # Always store the original content and content_type in cache - static_content_cache.set( - cache_key, - (content, content_type), - int(settings.CACHES["static_content"]["TIMEOUT"]), + return HttpResponse( + context["content"], content_type=context["content_type"] ) - return content, content_type + def get_content(self, content_path): + """Returns content from cache, database, or S3""" + static_content_cache = caches["static_content"] + cache_key = f"static_content_{content_path}" + result = self.get_from_cache(static_content_cache, cache_key) - def handle_adoc_content(self, request, content, content_path): - """Convert AsciiDoc content to HTML and return the HTML response.""" + if result is None: + result = self.get_from_database(cache_key) + + if result is None: + result = self.get_from_s3(content_path, cache_key) + + if result is None: + logger.info( + "get_content_from_s3_view_no_valid_object", + key=content_path, + status_code=404, + ) + raise ContentNotFoundException("Content not found") + + return result + + def get_from_cache(self, static_content_cache, cache_key): + cached_result = static_content_cache.get(cache_key) + return cached_result if cached_result else None + + def get_from_database(self, cache_key): + try: + content_obj = RenderedContent.objects.get(cache_key=cache_key) + return { + "content": content_obj.content_html, + "content_type": content_obj.content_type, + } + except RenderedContent.DoesNotExist: + return None + + def get_from_s3(self, content_path, cache_key): + result = get_content_from_s3(key=content_path) + if result and result.get("content"): + self.update_or_create_content(result, cache_key) + return result + return + + def update_or_create_content(self, result, cache_key): + content = result.get("content") + content_type = result.get("content_type") + last_updated_at_raw = result.get("last_updated_at") + + if content_type == "text/asciidoc": + content = self.convert_adoc_to_html(content, cache_key) + last_updated_at = ( + parse(last_updated_at_raw) if last_updated_at_raw else None + ) + + defaults = {"content_html": content, "content_type": content_type} + if last_updated_at: + defaults["last_updated_at"] = last_updated_at + content_obj, created = RenderedContent.objects.update_or_create( + cache_key=cache_key, defaults=defaults + ) + logger.info( + "get_content_from_s3_view_saved_to_db", + cache_key=cache_key, + content_type=content_type, + status_code=200, + obj_id=content_obj.id, + created=created, + ) + result["content"] = content + result["content_type"] = content_type + + def convert_adoc_to_html(self, content, cache_key): + """Renders asciidoc content to HTML.""" # Write the content to a temporary file with tempfile.NamedTemporaryFile(delete=False) as temp_file: + if isinstance(content, str): + content = content.encode() temp_file.write(content) - # Convert the AsciiDoc to HTML - # TODO: Put this back on a delay and return a response indicating that - # the content is being prepared - html_content = adoc_to_html( - temp_file.name, content_path, "text/asciidoc", delete_file=True - ) + html_content = adoc_to_html(temp_file.name, delete_file=True) if isinstance(html_content, bytes): - # Content is a byte string, decode it using UTF-8 encoding html_content = html_content.decode("utf-8") - # Extract only the contents of the body tag from the HTML - content = get_body_from_html(html_content) - context = {"content": content, "content_type": "text/html"} - return render(request, "adoc_content.html", context) + # Extract only the contents of the body tag that we want from the HTML + return get_body_from_html(html_content)