# website-v2/libraries/utils.py
import random
import string
import re
from itertools import islice
import tempfile
from datetime import datetime, timezone

import structlog
from dateutil.relativedelta import relativedelta
from dateutil.parser import ParserError, parse
from django.utils.text import slugify

from libraries.constants import (
    DEFAULT_LIBRARIES_LANDING_VIEW,
    SELECTED_BOOST_VERSION_COOKIE_NAME,
    SELECTED_LIBRARY_VIEW_COOKIE_NAME,
    LATEST_RELEASE_URL_PATH_STR,
    LEGACY_LATEST_RELEASE_URL_PATH_STR,
    DEVELOP_RELEASE_URL_PATH_STR,
    MASTER_RELEASE_URL_PATH_STR,
)
from versions.models import Version

logger = structlog.get_logger()


def decode_content(content):
    """Decode bytes to string."""
    if isinstance(content, bytes):
        return content.decode("utf-8")
    return content


def generate_fake_email(val: str) -> str:
    """Slugify a string to make a fake email.

    Would not necessarily be unique -- this is a lazy way for us to avoid creating
    multiple new user records for one contributor who contributes to multiple
    libraries.
    """
    slug = slugify(val)
    local_email = slug.replace("-", "_")[:50]
    return f"{local_email}@example.com"


def generate_random_string(length=4):
    characters = string.ascii_letters
    random_string = "".join(random.choice(characters) for _ in range(length))
    return random_string


def version_within_range(
    version: str, min_version: str = None, max_version: str = None
):
    """Direct string comparison, assuming 'version', 'min_version', and 'max_version'
    follow the same format.

    Expects format `boost-1.84.0`. Comparison is lexicographic, not numeric.
    """
    if min_version and version < min_version:
        return False
    if max_version and version > max_version:
        return False
    return True
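# Illustrative comparisons (format per the docstring):
#   version_within_range("boost-1.84.0", min_version="boost-1.80.0")  # True
#   version_within_range("boost-1.84.0", max_version="boost-1.82.0")  # False
# Caveat: as strings, "boost-1.100.0" < "boost-1.84.0", so hypothetical
# three-digit minor versions would compare out of numeric order.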


def get_first_last_day_last_month():
    now = datetime.now()
    first_day_this_month = now.replace(day=1)
    last_day_last_month = first_day_this_month - relativedelta(days=1)
    first_day_last_month = last_day_last_month.replace(day=1)
    return first_day_last_month, last_day_last_month


def parse_date(date_str):
    """Parses a date string to a datetime. Returns None instead of raising on an
    invalid date."""
    try:
        return parse(date_str)
    except ParserError:
        logger.info("parse_date_invalid_date", date_str=date_str)
        return None


def write_content_to_tempfile(content):
    """Accepts string or bytes content, writes it to a temporary file, and returns the
    file object."""
    # NamedTemporaryFile defaults to binary mode; open in text mode for strings
    # so both input types can be written without a TypeError.
    mode = "wb" if isinstance(content, bytes) else "w"
    with tempfile.NamedTemporaryFile(mode=mode, delete=False) as temp_file:
        temp_file.write(content)
    return temp_file
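# Usage note: because delete=False, the file survives after close; callers can
# read it back via temp_file.name and should remove it themselves (e.g. with
# os.unlink) when finished.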


def get_version_from_url(request):
    return request.GET.get("version")


def get_version_from_cookie(request):
    return request.COOKIES.get(SELECTED_BOOST_VERSION_COOKIE_NAME)


def get_view_from_url(request):
    return request.resolver_match.kwargs.get("library_view_str")


def get_view_from_cookie(request):
    return request.COOKIES.get(SELECTED_LIBRARY_VIEW_COOKIE_NAME)


def set_view_in_cookie(response, view):
    allowed_views = {"grid", "list", "categorized"}
    if view not in allowed_views:
        return
    response.set_cookie(SELECTED_LIBRARY_VIEW_COOKIE_NAME, view)


def get_prioritized_version(request):
    """
    Version Priorities:
    1. URL parameter
    2. Cookie
    3. Default to latest version
    """
    url_version = get_version_from_url(request)
    cookie_version = get_version_from_cookie(request)
    # None is interpreted downstream as "use the latest release".
    default_version = None
    return url_version or cookie_version or default_version


def get_prioritized_library_view(request):
    """
    View Priorities:
    1. URL parameter
    2. Cookie
    3. Default to grid view
    """
    url_view = get_view_from_url(request)
    cookie_view = get_view_from_cookie(request)
    return url_view or cookie_view or DEFAULT_LIBRARIES_LANDING_VIEW


def get_category(request):
    return request.GET.get("category", "")


def determine_selected_boost_version(request_value, request):
    # Use the versions in the request if they are available; otherwise fall back
    # to the DB.
    version_slug = request_value or get_version_from_cookie(request)
    version_args = {}
    if version_slug in (DEVELOP_RELEASE_URL_PATH_STR, MASTER_RELEASE_URL_PATH_STR):
        version_args = {f"allow_{version_slug}": True}
    valid_versions = getattr(request, "extra_context", {}).get(
        "versions", Version.objects.get_dropdown_versions(**version_args)
    )
    if version_slug in [v.slug for v in valid_versions] + [LATEST_RELEASE_URL_PATH_STR]:
        return version_slug
    logger.warning(f"Invalid version slug in cookies: {version_slug}")
    return None


def set_selected_boost_version(version_slug: str, response) -> None:
    """Update the selected version in the cookies."""
    versions_kwargs = {}
    if version_slug in [MASTER_RELEASE_URL_PATH_STR, DEVELOP_RELEASE_URL_PATH_STR]:
        versions_kwargs[f"allow_{version_slug}"] = True
    valid_versions = Version.objects.get_dropdown_versions(**versions_kwargs)
    if version_slug in [v.slug for v in valid_versions]:
        response.set_cookie(SELECTED_BOOST_VERSION_COOKIE_NAME, version_slug)
    elif version_slug == LATEST_RELEASE_URL_PATH_STR:
        response.delete_cookie(SELECTED_BOOST_VERSION_COOKIE_NAME)
    else:
        logger.warning(f"Attempted to set invalid version slug: {version_slug}")


def library_doc_latest_transform(url):
    p = re.compile(r"^(/doc/libs/)[0-9_]+(/\S+)$")
    if p.match(url):
        url = p.sub(rf"\1{LATEST_RELEASE_URL_PATH_STR}\2", url)
    return url
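# Illustrative transform (assuming LATEST_RELEASE_URL_PATH_STR == "latest"):
#   "/doc/libs/1_84_0/doc/html/any.html" → "/doc/libs/latest/doc/html/any.html"
# URLs that do not match the /doc/libs/<version>/ shape are returned unchanged.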


def generate_canonical_library_uri(uri):
    matches = re.match(
        r"https?://(?P<domainpath>[^/]+(?:/[^/]+){2}/?)(?P<version>[^/]+)(?P<docpath>/[\S]+)",
        uri,
    )
    if not matches:
        # Not a versioned docs URI; return it unchanged rather than raising.
        return uri
    if matches.group("version") == LATEST_RELEASE_URL_PATH_STR:
        return uri
    return f"https://{matches.group('domainpath')}{LATEST_RELEASE_URL_PATH_STR}{matches.group('docpath')}"


def get_documentation_url(library_version, latest):
    url = library_version.documentation_url
    if url and latest:
        url = library_doc_latest_transform(url)
    return url


def get_documentation_url_redirect(library_version, latest):
    """Get the documentation URL for the current library."""

    def find_documentation_url(library_version):
        # If we know the library-version docs are missing, return the version docs
        if library_version.missing_docs:
            return library_version.version.documentation_url
        # If we have the library-version docs and they are valid, return those
        elif library_version.documentation_url:
            return library_version.documentation_url
        # If we wind up here, return the version docs
        else:
            return library_version.version.documentation_url

    # Get the URL for the version.
    url = find_documentation_url(library_version)
    # Remove the "boost_" prefix from the URL.
    url = url.replace("boost_", "")
    if latest:
        url = library_doc_latest_transform(url)
    return url


def batched(iterable, n, *, strict=False):
    # batched('ABCDEFG', 3) → ABC DEF G
    # On Python 3.12+ this can be replaced by itertools.batched (its strict
    # flag was added in 3.13).
    if n < 1:
        raise ValueError("n must be at least one")
    iterator = iter(iterable)
    while batch := tuple(islice(iterator, n)):
        if strict and len(batch) != n:
            raise ValueError("batched(): incomplete batch")
        yield batch
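# Illustrative strict behavior: list(batched("ABCDEFG", 3, strict=True)) yields
# ("A", "B", "C") and ("D", "E", "F"), then raises ValueError when it reaches
# the short final batch.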


def conditional_batched(iterable, n: int, condition: callable, *, strict=False):
    """
    Batch items that pass a condition together; yield items that fail individually.

    Args:
        iterable: Items to process
        n: Batch size for items that pass the condition
        condition: Function that returns True if an item should be batched
        strict: If True, raise an error for an incomplete final batch

    Yields:
        Tuples of batched items, or single-item tuples for items that fail the
        condition
    """
    if n < 1:
        raise ValueError("n must be at least one")
    batch = []
    for item in iterable:
        if condition(item):
            # item passes the condition - add it to the batch
            batch.append(item)
            if len(batch) == n:
                # batch is full - yield it and start a new one
                yield tuple(batch)
                batch = []
        else:
            # item fails the condition - yield any pending batch first, then
            # the item alone
            if batch:
                yield tuple(batch)
                batch = []
            yield (item,)
    # handle any remaining items in the batch
    if strict and batch and len(batch) != n:
        raise ValueError("conditional_batched(): incomplete batch")
    if batch:
        yield tuple(batch)
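# Illustrative run (hypothetical predicate): batch even numbers in pairs and
# pass odd numbers through individually, preserving order:
#   list(conditional_batched([2, 4, 1, 6, 8], 2, lambda x: x % 2 == 0))
#   → [(2, 4), (1,), (6, 8)]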


def legacy_path_transform(content_path):
    if content_path and content_path.startswith(LEGACY_LATEST_RELEASE_URL_PATH_STR):
        content_path = re.sub(r"([a-zA-Z0-9\.]+)/(\S+)", r"latest/\2", content_path)
    return content_path
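# Illustrative rewrite (hypothetical path; whether it applies depends on the
# value of LEGACY_LATEST_RELEASE_URL_PATH_STR): a leading version-like segment
# such as "1.84.0/libs/any/index.html" becomes "latest/libs/any/index.html".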


def parse_boostdep_artifact(content: str):
    """Parse and return a generator which yields libraries and their dependencies.

    - `content` is a string of the artifact content given by the dependency_report
      GH action.
    - Iterate through the file and yield a tuple of
      (library_version: LibraryVersion, dependencies: list[Library])
    - Some library keys in the output do not match the names in our database exactly,
      so transform names when necessary.
    - The boost database may not contain every library version found in this file;
      if we find a definition of dependencies for a library version we are not
      tracking, ignore it and continue to the next line.
    - Example content can be found in
      libraries/tests/fixtures.py -> github_action_boostdep_output_artifact
    """
    from libraries.models import Library, LibraryVersion

    libraries = {x.key: x for x in Library.objects.all()}
    # these libraries do not exist in the DB, ignore them.
    ignore_libraries = ["disjoint_sets", "tr1"]

    def fix_library_key(name):
        """Transforms library key in boostdep report to match our library keys"""
        if name == "logic":
            return "logic/tribool"
        return name.replace("~", "/")

    def parse_line(line: str):
        parts = line.split("->")
        if len(parts) == 2:
            library_key, dependencies_string = [x.strip() for x in parts]
            library_key = fix_library_key(library_key)
            dependency_names = [fix_library_key(x) for x in dependencies_string.split()]
            dependencies = [
                libraries[x] for x in dependency_names if x not in ignore_libraries
            ]
        else:
            library_key = fix_library_key(parts[0].strip())
            dependencies = []
        return library_key, dependencies

    library_versions = {}
    version_name = ""
    skipped_library_versions = 0
    for line in content.splitlines():
        # skip blank lines so they are not miscounted as unknown libraries
        if not line.strip():
            continue
        # each section is headed with 'Dependencies for version boost-x.x.0'
        if line.startswith("Dependencies for version"):
            version_name = line.split()[-1]
            library_versions = {
                x.library.key: x
                for x in LibraryVersion.objects.filter(
                    version__name=version_name
                ).select_related("library")
            }
        else:
            library_key, dependencies = parse_line(line)
            if library_key in ignore_libraries:
                continue
            library_version = library_versions.get(library_key, None)
            if not library_version:
                skipped_library_versions += 1
                logger.info(
                    f"LibraryVersion with {library_key=} {version_name=} not found."
                )
                continue
            yield library_version, dependencies
    if skipped_library_versions:
        logger.info(
            "Some library versions were skipped during artifact parsing.",
            skipped_library_versions=skipped_library_versions,
        )


def update_base_tag(html: str, base_uri: str):
    """
    Replace the base tag href with the new base_uri
    """
    pattern = r'<base\s+href="[^"]*">'
    replacement = f'<base href="{base_uri}">'
    return re.sub(pattern, replacement, html)
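# Illustrative rewrite:
#   update_base_tag('<head><base href="/old/"></head>', "/doc/libs/latest/")
#   → '<head><base href="/doc/libs/latest/"></head>'
# Only double-quoted href attributes match the pattern above.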


def generate_release_report_filename(version_slug: str, published_format: bool = False):
    filename_data = ["release-report", version_slug]
    if not published_format:
        filename_data.append(datetime.now(timezone.utc).isoformat())
    filename = f"{'-'.join(filename_data)}.pdf"
    return filename
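# Illustrative filenames (hypothetical slug):
#   generate_release_report_filename("boost-1-87-0", published_format=True)
#   → "release-report-boost-1-87-0.pdf"
# Without published_format, a UTC ISO timestamp is appended before ".pdf".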