Clear the static content database cache nightly of old files

- Add `created` and `modified` fields to `RenderedContent` models
- Add caching and RenderedContent docs
- Change cache key for library description rendered content
- Add `CLEAR_STATIC_CONTENT_CACHE_DAYS` setting
- Add manager method and task to clear static content cache
- Move task scheduler to main app
- Add daily task to clear rendered content cache
- Use created date and not updated date
This commit is contained in:
Lacey Williams Henschel
2024-01-05 14:29:36 -08:00
committed by Lacey Henschel
parent 330d53426d
commit 0a9e0a2a7f
12 changed files with 126 additions and 13 deletions

View File

@@ -1,6 +1,7 @@
import os
from celery import Celery
from celery.schedules import crontab
# set the default Django settings module for the 'celery' program.
@@ -21,3 +22,19 @@ app.autodiscover_tasks()
@app.task(bind=True)
def debug_task(self):
    """Print the repr of the bound task's request (debugging aid)."""
    message = f"Request: {self.request!r}"
    print(message)
# Schedule Celery tasks
@app.on_after_configure.connect
def setup_periodic_tasks(sender, **kwargs):
    """Register the project's periodic tasks with Celery beat.

    Connected to the app's ``on_after_configure`` signal; ``sender`` is the
    object the schedule entries are added to.

    The tasks are referenced by name so this module does not have to import
    the task modules. ``add_periodic_task`` expects a signature rather than a
    bare task-name string, so each name is wrapped with ``sender.signature``.
    """
    # Update library data from GitHub. Executes daily at 7:05 AM.
    sender.add_periodic_task(
        crontab(hour=7, minute=5),
        sender.signature("libraries.tasks.update_libraries"),
    )

    # Clear the static content database cache. Executes daily at 4:05 AM.
    sender.add_periodic_task(
        crontab(hour=4, minute=5),
        sender.signature("core.tasks.clear_static_content_cache"),
    )

View File

@@ -264,6 +264,9 @@ CACHES = {
},
}
# Default interval by which to clear the static content cache, in days.
# Used as the default `older_than_days` cutoff by
# RenderedContentManager.clear_cache_by_cache_type_and_date.
CLEAR_STATIC_CONTENT_CACHE_DAYS = 7
# Mailman API credentials
MAILMAN_REST_API_URL = env("MAILMAN_REST_API_URL", default="http://localhost:8001")
MAILMAN_REST_API_USER = env("MAILMAN_REST_API_USER", default="restadmin")

View File

@@ -3,10 +3,29 @@ import structlog
from django.core.cache import caches
from django.db import models
from django.utils import timezone
import datetime
from django.conf import settings
logger = structlog.get_logger()
class RenderedContentManager(models.Manager):
def clear_cache_by_cache_type_and_date(
    self,
    cache_type="static_content_",
    older_than_days=None,
):
    """Delete rows of one cache type that were created before a cutoff.

    Args:
        cache_type: Prefix that the ``cache_key`` of a row must start with
            for the row to be deleted.
        older_than_days: Rows whose ``created`` timestamp is at least this
            many days in the past are deleted. When ``None``, falls back to
            ``settings.CLEAR_STATIC_CONTENT_CACHE_DAYS``.
    """
    if older_than_days is None:
        # Read the setting at call time instead of in the signature: a
        # signature default is evaluated once at import, which would freeze
        # the value and ignore later settings overrides (e.g. in tests).
        older_than_days = settings.CLEAR_STATIC_CONTENT_CACHE_DAYS

    older_than = timezone.now() - datetime.timedelta(days=older_than_days)
    deleted_count, _ = self.filter(
        cache_key__startswith=cache_type, created__lte=older_than
    ).delete()
    logger.info(
        "rendered_content_manager_clear_cache_by_cache_type_and_date",
        cache_type=cache_type,
        count=deleted_count,
    )
def clear_cache_by_content_type(self, content_type):
"""Clears the static content cache of all rendered content of a given type."""
cache = caches["static_content"]

View File

@@ -0,0 +1,31 @@
# Generated by Django 4.2.2 on 2024-01-05 22:28
from django.db import migrations
import django.utils.timezone
import django_extensions.db.fields
# Schema migration for the `core` app: adds the `created` and `modified`
# timestamp fields to the `renderedcontent` model.
class Migration(migrations.Migration):
# Applied on top of the app's initial migration.
dependencies = [
("core", "0001_initial"),
]
operations = [
# `created`: preserve_default=False means the `timezone.now` default is a
# one-off used while this migration is applied; it is not kept on the field.
migrations.AddField(
model_name="renderedcontent",
name="created",
field=django_extensions.db.fields.CreationDateTimeField(
auto_now_add=True,
default=django.utils.timezone.now,
verbose_name="created",
),
preserve_default=False,
),
# `modified`: declared with auto_now=True.
migrations.AddField(
model_name="renderedcontent",
name="modified",
field=django_extensions.db.fields.ModificationDateTimeField(
auto_now=True, verbose_name="modified"
),
),
]

View File

@@ -1,16 +1,21 @@
from django.db import models
from django.utils.translation import gettext_lazy as _
from django_extensions.db.models import TimeStampedModel
from .managers import RenderedContentManager
class RenderedContent(models.Model):
class RenderedContent(TimeStampedModel):
"""Stores a copy of rendered content. Generally, this content is retrieved
from the S3 buckets and, if necessary, converted to HTML.
This model is intended to be used as a cache. If the content is not found,
it will be retrieved from S3 and stored in this model. If the content is
found, it will be returned from this model."""
found, it will be returned from this model.
TimeStampedModel adds `created` and `modified` fields:
https://django-extensions.readthedocs.io/en/latest/model_extensions.html
"""
cache_key = models.CharField(
max_length=255,

View File

@@ -34,6 +34,14 @@ def clear_rendered_content_cache_by_content_type(content_type):
RenderedContent.objects.delete_by_content_type(content_type)
@shared_task
def clear_static_content_cache():
    """Ask the RenderedContent manager to clear rows keyed `static_content_*`."""
    manager = RenderedContent.objects
    manager.clear_cache_by_cache_type_and_date(cache_type="static_content_")
@shared_task
def refresh_content_from_s3(s3_key, cache_key):
"""Calls S3 with the s3_key, then saves the result to the

View File

@@ -1,7 +1,10 @@
import datetime
from model_bakery import baker
from django.conf import settings
from django.core.cache import caches
from django.test import override_settings
from django.utils import timezone
from ..models import RenderedContent
@@ -57,3 +60,21 @@ def test_delete_by_cache_key():
assert RenderedContent.objects.filter(cache_key="keep").exists()
assert not RenderedContent.objects.filter(cache_key="clear").exists()
def test_clear_cache_by_cache_type_and_date(rendered_content):
    """A backdated entry is removed while the fixture entry is kept."""
    cache_type = "cache-key"
    stale_key = f"{cache_type}_old"
    max_age_days = settings.CLEAR_STATIC_CONTENT_CACHE_DAYS

    # Create an entry and backdate it to one day past the configured cutoff.
    stale_created = timezone.now() - datetime.timedelta(days=max_age_days + 1)
    stale_entry = baker.make("core.RenderedContent", cache_key=stale_key)
    stale_entry.created = stale_created
    stale_entry.save()

    count_before = RenderedContent.objects.count()
    RenderedContent.objects.clear_cache_by_cache_type_and_date(cache_type=cache_type)
    count_after = RenderedContent.objects.count()

    # Exactly one row (the backdated one) should have been deleted.
    assert count_after == count_before - 1
    assert not RenderedContent.objects.filter(cache_key=stale_key).exists()
    assert RenderedContent.objects.filter(cache_key=rendered_content.cache_key).exists()

View File

@@ -1,6 +1,7 @@
# Documentation for the Boost Website
- [API Documentation](./api.md) - We don't have many API endpoints, but the ones we do have are documented here
- [Caching and the `RenderedContent` model](./caching_rendered_content.md)
- [Dependency Management](./dependencies.md)
- [Development Setup Notes](./development_setup_notes.md)
- [Environment Variables](./env_vars.md)

View File

@@ -0,0 +1,12 @@
# Caching and the `RenderedContent` model
This model is mostly used as a database cache or backup for data that is retrieved from GitHub or from the S3 buckets.
See [Static Content](./static_content.md) for more information about retrieving static content from S3.
Usage:
- Cache static content (like asciidoc content, library documentation, the help pages, anything that is rendered from S3). The `cache_key` field will be prefixed with `static_content_`.
- There is a Celery task to clear this database cache for all rows older than 7 days, which is set up to run daily.
- Cache a copy of the library description (from the library asciidoc or other readme file). This enables us to load a library description even if the GitHub API goes down. The `cache_key` field will be prefixed with `library_description_`. Because these descriptions are primarily for past versions, they will not update, they will not be deleted from the database cache, and there is no need to retrieve them from GitHub fresh every time.
- Store a copy of the release notes for each Boost version. Because the release notes are for past versions, they will not update, they will not be deleted from the database cache, and there is no need to retrieve them from GitHub fresh every time. The `cache_key` field will be prefixed with `release_notes_`.

View File

@@ -54,3 +54,9 @@ Take a look at this sample `{env}_static_config.json` file:
- `/site/index.html`
We first try to retrieve the static content using the exact S3 key specified in the site-to-S3 mapping. If we can't find the content using that key, we will try alternative S3 keys based on the `site_path` and `s3_path` properties in the `{env}_static_config.json` file.
## Caching
See [Caching and the `RenderedContent` model](./caching_rendered_content.md) for how Django-side caching is handled.
Caching is also handled via Fastly CDN.

View File

@@ -148,7 +148,7 @@ class Library(models.Model):
# Try to get the content from the cache first
static_content_cache = caches["static_content"]
cache_key = f"static_content_{self.github_repo}_{tag}"
cache_key = f"library_description_{self.github_repo}_{tag}"
cached_result = static_content_cache.get(cache_key)
if cached_result:
return cached_result

View File

@@ -1,5 +1,4 @@
import structlog
from celery.schedules import crontab
from config.celery import app
from core.boostrenderer import get_content_from_s3
@@ -66,15 +65,6 @@ def get_and_store_library_version_documentation_urls_for_version(version_pk):
continue
@app.on_after_configure.connect
def setup_periodic_tasks(sender, **kwargs):
    """Schedule the `update_libraries` task to run once a day."""
    # Executes daily at 7:05 AM
    daily_schedule = crontab(hour=7, minute=5)
    sender.add_periodic_task(daily_schedule, update_libraries.s())
@app.task
def update_libraries(update_all=False):
"""Update local libraries from GitHub Boost libraries.