Files
website-v2/libraries/tasks.py
Lacey Williams Henschel eeb615ceff Load docs urls for library-versions from S3
- Add documentation_url to LibraryVersion table
- Retrieve the documentation_url in the view
- If the documentation_url is not present, show the Version documentation_url
- Hide the "https://" in the template
- Add Celery task to retrieve the documentation_urls from the libraries page in S3 and match them to import_library_versions
- Add HTML helper function to extract the libraries and docs URLs from the HTML
- Add one-time command to import docs urls
- Load the documentation_urls in the command that loads all library-versions
- Add a min-release option to the command that loads library-versions
- Document the new command and changes to the existing command
2023-08-04 08:40:00 -07:00

94 lines
3.3 KiB
Python

import structlog
from celery.schedules import crontab
from config.celery import app
from core.boostrenderer import get_content_from_s3
from core.htmlhelper import get_library_documentation_urls
from libraries.github import LibraryUpdater
from libraries.models import LibraryVersion
from libraries.utils import get_first_last_day_last_month
from versions.models import Version
# Module-level structured logger, named after this module's import path.
logger = structlog.getLogger(__name__)
@app.task
def get_and_store_library_version_documentation_urls_for_version(version_pk):
    """
    Store the url paths to the documentation for each library in a given version.

    The url paths are retrieved from the `libraries.htm` file in the docs stored in
    S3 for the given version.

    Retrieve the libraries from the "Libraries Listed Alphabetically" section of the
    HTML file. Loop through the unordered list of libraries and save the url path
    to the docs for that library to the database.

    Background: There are enough small exceptions to how the paths to the docs for each
    library are generated, so the easiest thing to do is to access the list of libraries
    for a particular release, scrape the url paths to their docs, and save those to the
    database.

    :param version_pk: Primary key of the ``Version`` whose library docs urls to load.
    :raises Version.DoesNotExist: When no ``Version`` with ``version_pk`` exists.
    :raises ValueError: When the ``libraries.htm`` content cannot be fetched from S3.
    """
    try:
        version = Version.objects.get(pk=version_pk)
    except Version.DoesNotExist:
        # Log before re-raising so the failure is visible in structured logs,
        # consistent with the other logging in this task.
        logger.info(
            "get_library_version_documentation_urls_version_not_found",
            version_pk=version_pk,
        )
        raise

    base_path = f"doc/libs/{version.boost_url_slug}/libs/"
    key = f"{base_path}libraries.htm"
    result = get_content_from_s3(key)

    if not result:
        raise ValueError(f"Could not get content from S3 for {key}")

    content = result["content"]
    library_tags = get_library_documentation_urls(content)
    library_versions = LibraryVersion.objects.filter(version=version)

    for library_name, url_path in library_tags:
        try:
            # In most cases, the name matches close enough to get the correct object
            library_version = library_versions.get(library__name__iexact=library_name)
            library_version.documentation_url = f"/{base_path}{url_path}"
            library_version.save()
        except LibraryVersion.DoesNotExist:
            logger.info(
                "get_library_version_documentation_urls_version_does_not_exist",
                library_name=library_name,
                version_slug=version.slug,
            )
            continue
        except LibraryVersion.MultipleObjectsReturned:
            logger.info(
                "get_library_version_documentation_urls_multiple_objects_returned",
                library_name=library_name,
                version_slug=version.slug,
            )
            continue
@app.on_after_configure.connect
def setup_periodic_tasks(sender, **kwargs):
    """Register this module's periodic Celery tasks once the app is configured."""
    # Run the monthly library refresh on the 5th of each month at 7:30 a.m.
    monthly_refresh_schedule = crontab(hour=7, minute=30, day_of_month=5)
    sender.add_periodic_task(monthly_refresh_schedule, update_libraries.s())
@app.task
def update_libraries():
    """Update local libraries from GitHub Boost libraries.

    Use the LibraryUpdater, which retrieves the active boost libraries from the
    Boost GitHub repo, to update the models with the latest information on that
    library (repo) along with its issues, pull requests, and related objects
    from GitHub.
    """
    # Restrict the GitHub sync window to the previous calendar month.
    window_start, window_end = get_first_last_day_last_month()
    LibraryUpdater().update_libraries(since=window_start, until=window_end)
    logger.info("libraries_tasks_update_libraries_finished")