From 0a9e0a2a7f59fc452b99fed5c8ecf2a0e9b585f5 Mon Sep 17 00:00:00 2001 From: Lacey Williams Henschel Date: Fri, 5 Jan 2024 14:29:36 -0800 Subject: [PATCH] Clear the static content database cache nightly of old files - Add `created` and `modified` fields to `RenderedContent` models - Add caching and RenderedContent docs - Change cache key for library description rendered content - Add `CLEAR_STATIC_CONTENT_CACHE_DAYS` setting - Add manager method and task to clear static content cache - Move task scheduler to main app - Add daily task to clear rendered content cache - Use created date and not updated date --- config/celery.py | 17 ++++++++++ config/settings.py | 3 ++ core/managers.py | 19 ++++++++++++ ...ontent_created_renderedcontent_modified.py | 31 +++++++++++++++++++ core/models.py | 9 ++++-- core/tasks.py | 8 +++++ core/tests/test_managers.py | 21 +++++++++++++ docs/README.md | 1 + docs/caching_rendered_content.md | 12 +++++++ docs/static_content.md | 6 ++++ libraries/models.py | 2 +- libraries/tasks.py | 10 ------ 12 files changed, 126 insertions(+), 13 deletions(-) create mode 100644 core/migrations/0002_renderedcontent_created_renderedcontent_modified.py create mode 100644 docs/caching_rendered_content.md diff --git a/config/celery.py b/config/celery.py index 70cf751b..233f985e 100644 --- a/config/celery.py +++ b/config/celery.py @@ -1,6 +1,7 @@ import os from celery import Celery +from celery.schedules import crontab # set the default Django settings module for the 'celery' program. @@ -21,3 +22,19 @@ app.autodiscover_tasks() @app.task(bind=True) def debug_task(self): print(f"Request: {self.request!r}") + + +# Schedule Celery tasks +@app.on_after_configure.connect +def setup_periodic_tasks(sender, **kwargs): + # Update library data from GitHub. Executes daily at 7:05 AM + sender.add_periodic_task( + crontab(hour=7, minute=5), + "libraries.tasks.update_libraries", + ) + + # Clear the static content database cache. Executes daily at 4:05 AM. 
+ sender.add_periodic_task( + crontab(hour=4, minute=5), + "core.tasks.clear_static_content_cache", + ) diff --git a/config/settings.py b/config/settings.py index 72cb40d1..c9d486a5 100755 --- a/config/settings.py +++ b/config/settings.py @@ -264,6 +264,9 @@ CACHES = { }, } +# Default interval by which to clear the static content cache +CLEAR_STATIC_CONTENT_CACHE_DAYS = 7 + # Mailman API credentials MAILMAN_REST_API_URL = env("MAILMAN_REST_API_URL", default="http://localhost:8001") MAILMAN_REST_API_USER = env("MAILMAN_REST_API_USER", default="restadmin") diff --git a/core/managers.py b/core/managers.py index a2e4e1aa..41ed89e0 100644 --- a/core/managers.py +++ b/core/managers.py @@ -3,10 +3,29 @@ import structlog from django.core.cache import caches from django.db import models +from django.utils import timezone +import datetime +from django.conf import settings + logger = structlog.get_logger() class RenderedContentManager(models.Manager): + def clear_cache_by_cache_type_and_date( + self, + cache_type="static_content_", + older_than_days=settings.CLEAR_STATIC_CONTENT_CACHE_DAYS, + ): + older_than = timezone.now() - datetime.timedelta(days=older_than_days) + deleted_count, _ = self.filter( + cache_key__startswith=cache_type, created__lte=older_than + ).delete() + logger.info( + "rendered_content_manager_clear_cache_by_cache_type_and_date", + cache_type=cache_type, + count=deleted_count, + ) + def clear_cache_by_content_type(self, content_type): """Clears the static content cache of all rendered content of a given type.""" cache = caches["static_content"] diff --git a/core/migrations/0002_renderedcontent_created_renderedcontent_modified.py b/core/migrations/0002_renderedcontent_created_renderedcontent_modified.py new file mode 100644 index 00000000..262d703b --- /dev/null +++ b/core/migrations/0002_renderedcontent_created_renderedcontent_modified.py @@ -0,0 +1,31 @@ +# Generated by Django 4.2.2 on 2024-01-05 22:28 + +from django.db import migrations +import 
django.utils.timezone +import django_extensions.db.fields + + +class Migration(migrations.Migration): + dependencies = [ + ("core", "0001_initial"), + ] + + operations = [ + migrations.AddField( + model_name="renderedcontent", + name="created", + field=django_extensions.db.fields.CreationDateTimeField( + auto_now_add=True, + default=django.utils.timezone.now, + verbose_name="created", + ), + preserve_default=False, + ), + migrations.AddField( + model_name="renderedcontent", + name="modified", + field=django_extensions.db.fields.ModificationDateTimeField( + auto_now=True, verbose_name="modified" + ), + ), + ] diff --git a/core/models.py b/core/models.py index 75f6d86d..3e48f6ad 100644 --- a/core/models.py +++ b/core/models.py @@ -1,16 +1,21 @@ from django.db import models from django.utils.translation import gettext_lazy as _ +from django_extensions.db.models import TimeStampedModel from .managers import RenderedContentManager -class RenderedContent(models.Model): +class RenderedContent(TimeStampedModel): """Stores a copy of rendered content. Generally, this content is retrieved from the S3 buckets and, if necessary, converted to HTML. This model is intended to be used as a cache. If the content is not found, it will be retrieved from S3 and stored in this model. If the content is - found, it will be returned from this model.""" + found, it will be returned from this model. 
+ + TimeStampedModel adds `created` and `modified` fields: + https://django-extensions.readthedocs.io/en/latest/model_extensions.html + """ cache_key = models.CharField( max_length=255, diff --git a/core/tasks.py b/core/tasks.py index 4fc96813..906a3a9c 100644 --- a/core/tasks.py +++ b/core/tasks.py @@ -34,6 +34,14 @@ def clear_rendered_content_cache_by_content_type(content_type): RenderedContent.objects.delete_by_content_type(content_type) +@shared_task +def clear_static_content_cache(): + """Runs the manager method to clear the static content cache""" + RenderedContent.objects.clear_cache_by_cache_type_and_date( + cache_type="static_content_" + ) + + @shared_task def refresh_content_from_s3(s3_key, cache_key): """Calls S3 with the s3_key, then saves the result to the diff --git a/core/tests/test_managers.py b/core/tests/test_managers.py index 070faacc..cb03246e 100644 --- a/core/tests/test_managers.py +++ b/core/tests/test_managers.py @@ -1,7 +1,10 @@ +import datetime from model_bakery import baker +from django.conf import settings from django.core.cache import caches from django.test import override_settings +from django.utils import timezone from ..models import RenderedContent @@ -57,3 +60,21 @@ def test_delete_by_cache_key(): assert RenderedContent.objects.filter(cache_key="keep").exists() assert not RenderedContent.objects.filter(cache_key="clear").exists() + + +def test_clear_cache_by_cache_type_and_date(rendered_content): + cache_type = "cache-key" + older_than_days = settings.CLEAR_STATIC_CONTENT_CACHE_DAYS + + # Create old cache entry + old_date = timezone.now() - datetime.timedelta(days=older_than_days + 1) + old_content = baker.make("core.RenderedContent", cache_key=f"{cache_type}_old") + old_content.created = old_date + old_content.save() + + initial_count = RenderedContent.objects.count() + RenderedContent.objects.clear_cache_by_cache_type_and_date(cache_type=cache_type) + final_count = RenderedContent.objects.count() + assert final_count == 
initial_count - 1 + assert not RenderedContent.objects.filter(cache_key=f"{cache_type}_old").exists() + assert RenderedContent.objects.filter(cache_key=rendered_content.cache_key).exists() diff --git a/docs/README.md b/docs/README.md index 2ffa778b..19fc9234 100644 --- a/docs/README.md +++ b/docs/README.md @@ -1,6 +1,7 @@ # Documentation for the Boost Website - [API Documentation](./api.md) - We don't have many API endpoints, but the ones we do have are documented here +- [Caching and the `RenderedContent` model](./caching_rendered_content.md) - [Dependency Management](./dependencies.md) - [Development Setup Notes](./development_setup_notes.md) - [Environment Variables](./env_vars.md) diff --git a/docs/caching_rendered_content.md b/docs/caching_rendered_content.md new file mode 100644 index 00000000..6f98d377 --- /dev/null +++ b/docs/caching_rendered_content.md @@ -0,0 +1,12 @@ +# Caching and the `RenderedContent` model + +This model is mostly used as a database cache or backup for data that is retrieved from GitHub or from the S3 buckets. + +See [Static Content](./static_content.md) for more information about retrieving static content from S3. + +Usage: + +- Cache static content (like asciidoc content, library documentation, the help pages, anything that is rendered from S3). The `cache_key` field will be prefixed with `static_content_`. + - There is a Celery task to clear this database cache for all rows older than 7 days, which is set up to run daily. +- Cache a copy of the library description (from the library asciidoc or other readme file). This enables us to load a library description even if the GitHub API goes down. The `cache_key` field will be prefixed with `library_description_`. Because these descriptions are primarily for past versions, they will not update, they will not be deleted from the database cache, and there is no need to retrieve them from GitHub fresh every time. +- Store a copy of the release notes for each Boost version. 
Because the release notes are for past versions, they will not update, they will not be deleted from the database cache, and there is no need to retrieve them from GitHub fresh every time. The `cache_key` field will be prefixed with `release_notes_`. diff --git a/docs/static_content.md b/docs/static_content.md index 0a73fdfa..f930ee93 100644 --- a/docs/static_content.md +++ b/docs/static_content.md @@ -54,3 +54,9 @@ Take a look at this sample `{env}_static_config.json` file: - `/site/index.html` We first try to retrieve the static content using the exact S3 key specified in the site-to-S3 mapping. If we can't find the content using that key, we will try alternative S3 keys based on the `site_path` and `s3_path` properties in the `{env}_static_config.json` file. + +## Caching + +See [Caching and the `RenderedContent` model](./caching_rendered_content.md) for how Django-side caching is handled. + +Caching is also handled via Fastly CDN. diff --git a/libraries/models.py b/libraries/models.py index 36c5b4a0..70434a4c 100644 --- a/libraries/models.py +++ b/libraries/models.py @@ -148,7 +148,7 @@ class Library(models.Model): # Try to get the content from the cache first static_content_cache = caches["static_content"] - cache_key = f"static_content_{self.github_repo}_{tag}" + cache_key = f"library_description_{self.github_repo}_{tag}" cached_result = static_content_cache.get(cache_key) if cached_result: return cached_result diff --git a/libraries/tasks.py b/libraries/tasks.py index 61ee5bc9..49f25b38 100644 --- a/libraries/tasks.py +++ b/libraries/tasks.py @@ -1,5 +1,4 @@ import structlog -from celery.schedules import crontab from config.celery import app from core.boostrenderer import get_content_from_s3 @@ -66,15 +65,6 @@ def get_and_store_library_version_documentation_urls_for_version(version_pk): continue -@app.on_after_configure.connect -def setup_periodic_tasks(sender, **kwargs): - # Executes daily at 7:05 AM - sender.add_periodic_task( - crontab(hour=7, minute=5), 
- update_libraries.s(), - ) - - @app.task def update_libraries(update_all=False): """Update local libraries from GitHub Boost libraries.