website-v2/libraries/github.py

import re
import time
import tempfile
from dataclasses import dataclass
from pathlib import Path
from typing import assert_never
from dateutil.relativedelta import relativedelta
import subprocess

import structlog
from ghapi.core import HTTP404NotFoundError
from fastcore.xtras import obj2dict

from django.db.models import Exists, OuterRef
from django.contrib.auth import get_user_model
from django.db import transaction
from django.utils import dateparse, timezone

from versions.models import Version
from .constants import CATEGORY_OVERRIDES
from .models import (
    Category,
    Commit,
    CommitAuthor,
    CommitAuthorEmail,
    Issue,
    Library,
    LibraryVersion,
    PullRequest,
)
from core.githubhelper import GithubAPIClient, GithubDataParser

from .utils import generate_fake_email, parse_boostdep_artifact, parse_date

logger = structlog.get_logger()

User = get_user_model()


now = timezone.now()
FIRST_OF_MONTH_ONE_YEAR_AGO = timezone.make_aware(
    timezone.datetime(year=now.year - 1, month=now.month, day=1)
)
FIRST_OF_CURRENT_MONTH = timezone.make_aware(
    timezone.datetime(year=now.year, month=now.month, day=1)
) - relativedelta(days=1)


@dataclass
class ParsedCommit:
    email: str
    name: str
    message: str
    sha: str
    version: str
    is_merge: bool
    committed_at: timezone.datetime
    avatar_url: str | None = None


@dataclass
class VersionDiffStat:
    version: str
    files_changed: int
    insertions: int
    deletions: int


def get_commit_data_for_repo_versions(key):
    """Fetch commit data between minor versions (ignore patches).

    Get commits from one x.x.0 release to the next x.x.0 release. Commits
    to and from patches or beta versions are ignored.

    """
    library = Library.objects.get(key=key)
    parser = re.compile(
        r"^commit (?P<sha>\w+)(?:\n(?P<merge>Merge).*)?\nAuthor: (?P<name>[^\<]+)"
        r"\s+\<(?P<email>[^\>]+)\>\nDate:\s+(?P<date>.*)\n(?P<message>(.|\n)+?)"
        r"(?=(commit|\Z))",
        flags=re.MULTILINE,
    )
    re.compile(
        r"(?:(?P<files_changed>\d+) files changed)?.*?"
        r"(?:(?P<insertions>\d+) insertions)?.*?(?:(?P<deletions>\d+) deletions)?",
    )

    retry_count = 0
    with tempfile.TemporaryDirectory() as temp_dir:
        git_dir = Path(temp_dir) / f"{library.key}.git"
        is_clone_successful = False
        while retry_count < 5:
            retry_count += 1
            completed = subprocess.run(
                ["git", "clone", f"{library.github_url}.git", "--bare", str(git_dir)],
                capture_output=True,
            )
            message = completed.stdout.decode()
            error = completed.stderr.decode()
            # successful output always startswith 'Cloning into bare repository'
            # Clone message comes out from stderr, not stdout
            if not error.startswith("Cloning into bare repository"):
                logger.warning(
                    f"{completed.args} failed. Retrying git clone. Retry {retry_count}."
                )
                time.sleep(2**retry_count)
                continue
            else:
                is_clone_successful = True
                break
        if not is_clone_successful:
            logger.error(f"Clone failed for {library.key}. {message=} {error=}")
            return
        versions = [""] + list(
            Version.objects.minor_versions()
            .filter(library_version__library__key=library.key)
            .order_by("version_array")
            .values_list("name", flat=True)
        )
        for a, b in zip(versions, versions[1:]):
            shortstat = subprocess.run(
                ["git", "--git-dir", str(git_dir), "diff", f"{a}..{b}", "--shortstat"],
                capture_output=True,
            )
            stat_output = shortstat.stdout.decode()
            files_changed = insertions = deletions = 0
            if m := re.search(r"(\d+) files? changed", stat_output):
                files_changed = int(m.group(1))
            if m := re.search(r"(\d+) insertions?", stat_output):
                insertions = int(m.group(1))
            if m := re.search(r"(\d+) deletions?", stat_output):
                deletions = int(m.group(1))
            yield VersionDiffStat(
                version=b,
                insertions=insertions,
                deletions=deletions,
                files_changed=files_changed,
            )

            log_output = subprocess.run(
                ["git", "--git-dir", str(git_dir), "log", f"{a}..{b}", "--date", "iso"],
                capture_output=True,
            )
            commits = log_output.stdout.decode()
            for match in parser.finditer(commits):
                groups = match.groupdict()
                name = groups["name"].strip()
                email = groups["email"].strip()
                sha = groups["sha"].strip()
                is_merge = bool(groups.get("merge", False))
                message = groups["message"].strip("\n")
                message = "\n".join(
                    [m[4:] if m.startswith("    ") else m for m in message.split("\n")]
                )
                committed_at = dateparse.parse_datetime(groups["date"])
                assert committed_at  # should always exist
                yield ParsedCommit(
                    email=email,
                    name=name,
                    message=message,
                    sha=sha,
                    committed_at=committed_at,
                    is_merge=is_merge,
                    version=b,
                )


class LibraryUpdater:
    """
    This class is used to sync Libraries from the list of git submodules
    and their `libraries.json` file metadata.
    """

    def __init__(self, client=None, token=None):
        self.client = client or GithubAPIClient(token=token)
        self.parser = GithubDataParser()
        self.logger = structlog.get_logger()

        # Modules we need to skip as they are not really Boost Libraries
        self.skip_modules = [
            "inspect",
            "boostbook",
            "bcp",
            "build",
            "quickbook",
            "litre",
            "auto_index",
            "boostdep",
            "check_build",
            "headers",
            "boost_install",
            "docca",
            "cmake",
            "more",
        ]
        # Libraries to skip that are not "modules", but appear as child-libraries
        # of other modules. Identified by the key used in the libraries.json file.
        self.skip_libraries = ["chrono/stopwatch"]

    def get_library_list(self, gitmodules=None):
        """
        Retrieve the full list of library data for Boost libraries from their Github
        repos.

        Included libraries are retrieved from the list of modules in .gitmodules in the
        main Boost repo. The libraries.json file is retrieved from each module and
        parsed to get the library metadata. Most libraries.json files contain info
        about individual libraries, but a few such as "system", "functional", etc.
        contain multiple libraries.
        """
        libraries = []
        for gitmodule in gitmodules:
            if gitmodule["module"] in self.skip_modules:
                continue

            libraries_json = self.client.get_libraries_json(
                repo_slug=gitmodule["module"]
            )
            github_data = self.client.get_repo(repo_slug=gitmodule["module"])
            extra_data = {
                "github_url": github_data.get("html_url", ""),
            }

            if type(libraries_json) is list:
                for library in libraries_json:
                    data = self.parser.parse_libraries_json(library)
                    if data["key"] in self.skip_libraries:
                        continue
                    libraries.append({**data, **extra_data})

            elif type(libraries_json) is dict:
                data = self.parser.parse_libraries_json(libraries_json)
                if data["key"] in self.skip_libraries:
                    continue
                libraries.append({**data, **extra_data})

        return libraries

    def update_libraries(self):
        """
        Update all libraries with the metadata from their libraries.json file.
        """
        raw_gitmodules = self.client.get_gitmodules()
        gitmodules = self.parser.parse_gitmodules(raw_gitmodules.decode("utf-8"))
        library_data = self.get_library_list(gitmodules=gitmodules)

        self.logger.info(
            "update_all_libraries_metadata", library_count=len(library_data)
        )

        for lib in library_data:
            obj = self.update_library(lib)
            if not obj:
                continue

            self.update_categories(obj, categories=lib["category"])
            # self.update_authors(obj, authors=lib["authors"])

    def update_library(self, library_data: dict) -> Library:
        """Update an individual library"""
        logger = self.logger.bind(library=library_data)
        try:
            obj, created = Library.objects.update_or_create(
                key=library_data["key"],
                defaults={
                    "name": library_data["name"],
                    "github_url": library_data["github_url"],
                    "description": library_data["description"],
                    "data": library_data,
                },
            )

            logger = logger.bind(obj_created=created)
            obj.save()
            logger.info("library_udpated")
            return obj

        except Exception:
            logger.exception("library_update_failed")

    def update_categories(self, obj, categories):
        """Update all of the categories for an object"""

        obj.categories.clear()
        for cat_name in categories:
            cat_name = CATEGORY_OVERRIDES.get(cat_name, cat_name)
            cat, _ = Category.objects.get_or_create(name=cat_name)
            obj.categories.add(cat)

    def update_authors(self, obj: Library | LibraryVersion, authors=None):
        """
        Receives a list of strings from the libraries.json of a Boost library
        or library_version, and an object with an "authors" attribute.

        Processes that string into a User object that is added as an
        Author to the Library or LibraryVersion.
        """
        if not authors:
            return obj

        if isinstance(authors, str):
            authors = [authors]

        for author in authors:
            person_data = self.parser.extract_contributor_data(author)
            email = person_data["email"]
            user = User.objects.find_contributor(
                email=person_data["email"],
                first_name=person_data["first_name"],
                last_name=person_data["last_name"],
            )

            if not user:
                email = person_data.pop("email")
                if not email:
                    email = generate_fake_email(
                        f"{person_data['first_name']} {person_data['last_name']}"
                    )
                    # With a new email, we may have a user record
                    user = User.objects.find_contributor(email=email)

            # If still no user, generate a fake one
            if not user:
                user = User.objects.create_stub_user(email.lower(), **person_data)
                self.logger.info(f"User {user.email} created.")

            obj.authors.add(user)

        return obj

    def update_maintainers(self, obj, maintainers=None):
        """
        Receives a list of strings from the libraries.json of a Boost library, and
        an object with a M2M "maintainers" attribute.

        Processes the list of strings into User objects and adds them as Maintainers
        to the object.
        """
        if not maintainers:
            return

        for maintainer in maintainers:
            person_data = self.parser.extract_contributor_data(maintainer)
            user = User.objects.find_contributor(
                email=person_data["email"],
                first_name=person_data["first_name"],
                last_name=person_data["last_name"],
            )

            if not user:
                email = person_data.pop("email")
                if not email:
                    email = generate_fake_email(
                        f"{person_data['first_name']} {person_data['last_name']}"
                    )
                user = User.objects.create_stub_user(email.lower(), **person_data)
                self.logger.info(f"User {user.email} created.")

            obj.maintainers.add(user)
            self.logger.info(f"User {user.email} added as a maintainer of {obj}")

    def update_issues(self, obj):
        """Import GitHub issues for the library and update the database"""
        self.logger.info("updating_repo_issues")

        issues_data = self.client.get_repo_issues(
            self.client.owner, obj.github_repo, state="all", issues_only=True
        )
        for issue_dict in issues_data:
            # Get the date information
            closed_at = None
            created_at = None
            modified_at = None

            if issue_dict.get("closed_at"):
                closed_at = parse_date(issue_dict["closed_at"])

            if issue_dict.get("created_at"):
                created_at = parse_date(issue_dict["created_at"])

            if issue_dict.get("updated_at"):
                modified_at = parse_date(issue_dict["updated_at"])

            # Create or update the Issue object
            try:
                issue, created = Issue.objects.update_or_create(
                    library=obj,
                    github_id=issue_dict["id"],
                    defaults={
                        "title": issue_dict["title"][:255],
                        "number": issue_dict["number"],
                        "is_open": issue_dict["state"] == "open",
                        "closed": closed_at,
                        "created": created_at,
                        "modified": modified_at,
                        "data": obj2dict(issue_dict),
                    },
                )
                self.logger.info(
                    "issue_updated_successfully",
                    issue_id=issue.id,
                    created_issue=created,
                    issue_github_id=issue.github_id,
                )
            except Exception as e:
                self.logger.exception(
                    "update_issues_error_skipped_issue",
                    issue_github_id=issue_dict.get("id"),
                    exc_msg=str(e),
                )
                continue

    def update_prs(self, obj):
        """Update all PRs for a library"""
        self.logger.info("updating_repo_prs")

        prs_data = self.client.get_repo_prs(obj.github_repo, state="all")

        for pr_dict in prs_data:
            # Get the date information
            closed_at = None
            merged_at = None
            created_at = None
            modified_at = None

            if pr_dict.get("closed_at"):
                closed_at = parse_date(pr_dict["closed_at"])

            if pr_dict.get("merged_at"):
                merged_at = parse_date(pr_dict["merged_at"])

            if pr_dict.get("created_at"):
                created_at = parse_date(pr_dict["created_at"])

            if pr_dict.get("updated_at"):
                modified_at = parse_date(pr_dict["updated_at"])

            try:
                pull_request, created = PullRequest.objects.update_or_create(
                    library=obj,
                    github_id=pr_dict["id"],
                    defaults={
                        "title": pr_dict["title"][:255],
                        "number": pr_dict["number"],
                        "is_open": pr_dict["state"] == "open",
                        "closed": closed_at,
                        "merged": merged_at,
                        "created": created_at,
                        "modified": modified_at,
                        "data": obj2dict(pr_dict),
                    },
                )
                self.logger.info(
                    "pull_request_updated_successfully",
                    pr_id=pull_request.id,
                    created_pr=created,
                    pr_github_id=pull_request.github_id,
                )
            except Exception as e:
                self.logger.exception(
                    "update_prs_error_skipped_pr",
                    pr_github_id=pr_dict.get("id"),
                    exc_msg=str(e),
                )

    def update_commits(self, obj: Library, clean=False):
        """Import a record of all commits between LibraryVersions."""
        authors = {}
        commits = []
        library_versions = {
            x.version.name: x
            for x in LibraryVersion.objects.filter(library=obj).select_related(
                "version"
            )
        }
        library_version_updates = []

        def handle_commit(commit: ParsedCommit):
            author = authors.get(commit.email, None)
            if not author:
                if (
                    commit_author_email := CommitAuthorEmail.objects.filter(
                        email=commit.email,
                    )
                    .select_related("author")
                    .first()
                ):
                    author = commit_author_email.author
                else:
                    author = CommitAuthor.objects.create(
                        name=commit.name, avatar_url=commit.avatar_url
                    )
                    CommitAuthorEmail.objects.create(email=commit.email, author=author)
                authors[commit.email] = author
            return Commit(
                author=author,
                library_version=library_versions[commit.version],
                sha=commit.sha,
                message=commit.message,
                committed_at=commit.committed_at,
                is_merge=commit.is_merge,
            )

        def handle_version_diff_stat(diff: VersionDiffStat):
            lv = library_versions[diff.version]
            lv.insertions = diff.insertions
            lv.deletions = diff.deletions
            lv.files_changed = diff.files_changed
            return lv

        commits_handled = 0
        for item in get_commit_data_for_repo_versions(obj.key):
            match item:
                case ParsedCommit():
                    commits_handled += 1
                    commits.append(handle_commit(item))
                case VersionDiffStat():
                    library_version_updates.append(handle_version_diff_stat(item))
                case _:
                    assert_never()

        with transaction.atomic():
            if clean:
                Commit.objects.filter(library_version__library=obj).delete()
            Commit.objects.bulk_create(
                commits,
                update_conflicts=True,
                update_fields=["author", "message", "committed_at", "is_merge"],
                unique_fields=["library_version", "sha"],
            )
            LibraryVersion.objects.bulk_update(
                library_version_updates,
                ["insertions", "deletions", "files_changed"],
            )
        return commits_handled

    def update_commit_author_github_data(self, obj=None, email=None, overwrite=False):
        """Update CommitAuthor data by parsing data on their most recent commit."""
        if email:
            authors = CommitAuthor.objects.filter(
                Exists(CommitAuthorEmail.objects.filter(id=OuterRef("pk"), email=email))
            )
        elif obj:
            authors = CommitAuthor.objects.filter(
                Exists(
                    Library.objects.filter(
                        library_version__commit__author=OuterRef("id"),
                        pk=obj.pk,
                    )
                )
            )
        else:
            authors = CommitAuthor.objects.all()

        if not overwrite:
            authors = authors.filter(avatar_url=None)

        authors = (
            authors.annotate(
                most_recent_commit_sha=Commit.objects.filter(author=OuterRef("pk"))
                .order_by("-committed_at")
                .values("sha")[:1]
            )
            .annotate(
                most_recent_library_key=Library.objects.filter(
                    library_version__commit__sha=OuterRef("most_recent_commit_sha")
                ).values("key")[:1]
            )
            .filter(most_recent_library_key__isnull=False)
        )
        libraries = Library.objects.filter(
            key__in=[x.most_recent_library_key for x in authors]
        )
        repos = {x.key: x for x in libraries}
        for author in authors:
            try:
                commit = self.client.get_repo_ref(
                    repo_slug=repos[author.most_recent_library_key].github_repo,
                    ref=author.most_recent_commit_sha,
                )
            except HTTP404NotFoundError:
                self.logger.info(
                    f"Commit not found. Skipping avatar update for {author}."
                )
                continue
            if gh_author := commit["author"]:
                if gh_author["avatar_url"]:
                    author.avatar_url = gh_author["avatar_url"]
                if gh_author["html_url"]:
                    author.github_profile_url = gh_author["html_url"]
                author.save(update_fields=["avatar_url", "github_profile_url"])

    def fetch_most_recent_boost_dep_artifact_content(self, owner=""):
        # get artifacts with the name "boost-dep-artifact"
        artifacts = self.client.get_artifacts(
            owner=owner,
            repo_slug="website-v2",
            name="boost-dep-artifact",
        )
        if not artifacts or not artifacts.get("artifacts", None):
            logger.warning("No artifacts found.")
            return
        # get the most recent artifact
        artifact = artifacts["artifacts"][0]
        if artifact["expired"]:
            logger.error("The most recent boost-dep-artifact is expired.")
            return
        return self.client.get_artifact_content(artifact["archive_download_url"])

    def update_library_version_dependencies(self, owner="", clean=False):
        """Update LibraryVersion dependencies M2M via a github action artifact.

        owner: The repo owner. Defaults to `boostorg` in self.client.
        clean: Clear the M2M before adding dependencies.

        """
        saved_dependencies = 0
        saved_library_versions = 0

        content = self.fetch_most_recent_boost_dep_artifact_content(owner=owner)
        if not content:
            return
        for library_version, dependencies in parse_boostdep_artifact(content):
            if clean:
                library_version.dependencies.set(dependencies, clear=True)
            else:
                library_version.dependencies.add(*dependencies)
            saved_library_versions += 1
            saved_dependencies += len(dependencies)
        logger.info(
            "update_library_version_dependencies finished",
            saved_dependencies=saved_dependencies,
            saved_library_versions=saved_library_versions,
        )