From 7e847cc17f92b74e2ffbf1259b12442ddb536656 Mon Sep 17 00:00:00 2001 From: Lacey Williams Henschel Date: Wed, 30 Aug 2023 11:44:02 -0700 Subject: [PATCH] - Move GitHub retrieval and parser classes to the `core/` app from the `libraries/` app; move tests, too, and update import statements - Add `verbose` flag to `import_versions` to allow user to silence the output - Remove some unused options from `import_versions` - Move the exclusion logic in `import_versions` to its own function - Stop getting the `github_url` from the GitHub API. Instead, generate it from the tag. (reason: in older tags, the best URL GitHub gives us is a less-useful one to a _commit_ and not a tag. We can generate the url, though.) - Move the retrieval of the `release_date` to its own task, so it can be loaded async. (reason: it's a separate API call per version to get the date) - Make `release_date` optional on the `Version` model (reason: make the field easier to load async) - Simplify logic to retrieve `release_date` and just always retrieve it from the commit - Stop loading the version `description` from GitHub. We don't use it on the frontend anyway, and in all but the most recent couple of versions, the `description` is the comment from the commit, which is usually not useful. 
--- core/githubhelper.py | 558 ++++++++++++++++++ core/tests/test_githubhelper.py | 274 +++++++++ docs/commands.md | 19 +- libraries/github.py | 554 +---------------- .../commands/import_commit_counts.py | 3 +- .../commands/import_first_release_dates.py | 3 +- .../commands/import_library_versions.py | 3 +- libraries/tests/test_github.py | 345 +---------- libraries/views.py | 3 +- users/tasks.py | 2 +- .../management/commands/import_versions.py | 129 ++-- .../0009_alter_version_release_date.py | 17 + versions/models.py | 2 +- versions/tasks.py | 53 ++ versions/tests/test_tasks.py | 33 ++ 15 files changed, 1001 insertions(+), 997 deletions(-) create mode 100644 core/githubhelper.py create mode 100644 core/tests/test_githubhelper.py create mode 100644 versions/migrations/0009_alter_version_release_date.py create mode 100644 versions/tasks.py create mode 100644 versions/tests/test_tasks.py diff --git a/core/githubhelper.py b/core/githubhelper.py new file mode 100644 index 00000000..770f32d8 --- /dev/null +++ b/core/githubhelper.py @@ -0,0 +1,558 @@ +import base64 +import os +import re +from collections import defaultdict +from datetime import datetime +from dateutil.parser import parse +import requests +import structlog +from django.core.exceptions import ValidationError +from django.core.validators import validate_email +from fastcore.net import HTTP404NotFoundError, HTTP422UnprocessableEntityError +from fastcore.xtras import obj2dict +from ghapi.all import GhApi, paged + + +logger = structlog.get_logger() + + +class GithubAPIClient: + """A class to interact with the GitHub API.""" + + def __init__( + self, + owner: str = "boostorg", + ref: str = "heads/master", + repo_slug: str = "boost", + token: str = None, + ) -> None: + """ + Initialize the GitHubAPIClient. 
+ + :param owner: str, the repository owner + :param ref: str, the Git reference + :param repo_slug: str, the repository slug + """ + self.api = self.initialize_api(token=token) + self.owner = owner + self.ref = ref + self.repo_slug = repo_slug + self.logger = structlog.get_logger() + + # Modules we need to skip as they are not really Boost Libraries + self.skip_modules = [ + "inspect", + "boostbook", + "bcp", + "build", + "quickbook", + "litre", + "auto_index", + "boostdep", + "check_build", + "headers", + "boost_install", + "docca", + "cmake", + "more", + ] + + def initialize_api(self, token=None) -> GhApi: + """ + Initialize the GitHub API with the token from the environment variable. + + :return: GhApi, the GitHub API + """ + if token is None: + token = os.environ.get("GITHUB_TOKEN", None) + return GhApi(token=token) + + def get_blob(self, repo_slug: str = None, file_sha: str = None) -> dict: + """ + Get the blob from the GitHub API. + + :param repo_slug: str, the repository slug + :param file_sha: str, the file sha + :return: dict, the blob + """ + if not repo_slug: + repo_slug = self.repo_slug + return self.api.git.get_blob( + owner=self.owner, repo=repo_slug, file_sha=file_sha + ) + + def get_commit_by_sha(self, repo_slug: str = None, commit_sha: str = None) -> dict: + """Get a commit by its SHA.""" + if not repo_slug: + repo_slug = self.repo_slug + return self.api.git.get_commit( + owner=self.owner, repo=repo_slug, commit_sha=commit_sha + ) + + def get_commits( + self, + repo_slug: str = None, + branch: str = "master", + since: datetime = None, + until: datetime = None, + ) -> list: + """Get all commits to the specified branch of a repo. + + :param repo_slug: str, the repository slug. If not provided, the class + instance's repo_slug will be used. + :param branch: str, the branch name. Defaults to 'master'. + :param since: datetime, only return commits after this date. + :param until: datetime, only return commits before this date. 
+ :return: List[Commit], list of all commits in the branch. + """ + repo_slug = repo_slug or self.repo_slug + + # Get the commits + try: + pages = list( + paged( + self.api.repos.list_commits, + owner=self.owner, + repo=repo_slug, + sha=branch, + since=since, + until=until, + per_page=100, + ) + ) + all_commits = [] + for page in pages: + all_commits.extend(page) + + except Exception as e: + self.logger.exception( + "get_all_commits_failed", repo=repo_slug, exc_msg=str(e) + ) + return [] + + return all_commits + + def get_first_tag(self, repo_slug: str = None): + """ + Retrieves the earliest tag in the repo. + + :param repo_slug: str, the repository slug + :return: tuple with GitHub tag object, commit date. + - See https://docs.github.com/en/rest/git/tags for tag object format. + """ + if not repo_slug: + repo_slug = self.repo_slug + + try: + per_page = 100 + page = 1 + all_tags = [] + + while True: + tags = self.api.repos.list_tags( + owner=self.owner, repo=repo_slug, per_page=per_page, page=page + ) + all_tags.extend(tags) + if len(tags) < per_page: # End of results + break + + page += 1 # Go to the next page + + # Sort the tags by the commit date. The first tag will be the earliest. + # The Github API doesn't return the commit date with the tag, so we have to + # retrieve each one individually. This is slow, but it's the only way to get + # the commit date. + def get_tag_commit_date(tag): + """Get the commit date for a tag. 
+ + For commit format, see + https://docs.github.com/en/rest/commits/commits.""" + commit_sha = tag["commit"]["sha"] + commit = self.get_commit_by_sha(repo_slug, commit_sha) + return commit["committer"]["date"] + + annotated_tags = [(tag, get_tag_commit_date(tag)) for tag in all_tags] + sorted_tags = sorted(annotated_tags, key=lambda x: x[1]) + + # Return the first (earliest) tag + return sorted_tags[0] + + except Exception: + self.logger.exception("get_first_tag_and_date_failed", repo=repo_slug) + return None + + def get_gitmodules(self, repo_slug: str = None, ref: str = None) -> str: + """ + Get the .gitmodules file for the repo from the GitHub API. + + :param repo_slug: str, the repository slug + :param ref: dict, the Git reference object (the commit hash). + See https://docs.github.com/en/rest/git/refs for expected format. + :return: str, the .gitmodules file from the repo + """ + if not repo_slug: + repo_slug = self.repo_slug + + if not ref: + ref = self.get_ref() + tree_sha = ref["object"]["sha"] + + try: + tree = self.get_tree(tree_sha=tree_sha) + except HTTP422UnprocessableEntityError as e: + # Only happens for version 1.61.0; uncertain why. + self.logger.exception( + "get_gitmodules_failed", repo=repo_slug, exc_msg=str(e) + ) + return None + + for item in tree["tree"]: + if item["path"] == ".gitmodules": + file_sha = item["sha"] + blob = self.get_blob(repo_slug=repo_slug, file_sha=file_sha) + return base64.b64decode(blob["content"]) + + def get_libraries_json(self, repo_slug: str, tag: str = "master"): + """ + Retrieve library metadata from 'meta/libraries.json' + Each Boost library will have a `meta` directory with a `libraries.json` file. 
+ Example: + https://github.com/boostorg/align/blob/5ad7df63cd792fbdb801d600b93cad1a432f0151/meta/libraries.json + """ + url = f"https://raw.githubusercontent.com/{self.owner}/{repo_slug}/{tag}/meta/libraries.json" # noqa + + try: + response = requests.get(url) + response.raise_for_status() + # This usually happens because the library does not have a `meta/libraries.json` + # in the requested tag. More likely to happen with older versions of libraries. + except requests.exceptions.HTTPError: + self.logger.exception( + "get_library_metadata_failed", repo=repo_slug, url=url + ) + return None + else: + return response.json() + + def get_file_content( + self, + repo_slug: str = None, + tag: str = "master", + file_path: str = "library-detail.adoc", + ) -> str: + """ + Get the specified file for the repo from the GitHub API, if it exists. + + :param repo_slug: str, the repository slug + :param tag: str, the Git tag + :param file_name: str, the name of the file to fetch. Should be + "library-detail.adoc" or "README.md". + :return: str, the specified file content from the repo + """ + url = f"https://raw.githubusercontent.com/{self.owner}/{repo_slug}/{tag}/{file_path}" # noqa + + response = requests.get(url) + + if not response.status_code == 200: + logger.exception( + "get_file_content_failed", repo=repo_slug, url=url, file=file_path + ) + return None + + return response.content + + def get_ref(self, repo_slug: str = None, ref: str = None) -> dict: + """ + Get the ref from the GitHub API. + + :param repo_slug: str, the repository slug + :param ref: str, the Git reference + :return: dict, the ref + """ + if not repo_slug: + repo_slug = self.repo_slug + if not ref: + ref = self.ref + return self.api.git.get_ref(owner=self.owner, repo=repo_slug, ref=ref) + + def get_repo(self, repo_slug: str = None) -> dict: + """ + Get the repository from the GitHub API. 
+ + :param repo_slug: str, the repository slug + :return: dict, the repository + """ + if not repo_slug: + repo_slug = self.repo_slug + + try: + return self.api.repos.get(owner=self.owner, repo=repo_slug) + except HTTP404NotFoundError as e: + logger.info("repo_not_found", repo_slug=repo_slug, exc_msg=str(e)) + return + + def get_repo_issues( + self, owner: str, repo_slug: str, state: str = "all", issues_only: bool = True + ): + """ + Get all issues for a repo. + Note: The GitHub API considers both PRs and Issues to be "Issues" and does not + support filtering in the request, so to exclude PRs from the list of issues, we + do some manual filtering of the results + + Note: GhApi() returns results as AttrDict objects: + https://fastcore.fast.ai/basics.html#attrdict + """ + pages = list( + paged( + self.api.issues.list_for_repo, + owner=self.owner, + repo=repo_slug, + state=state, + per_page=100, + ) + ) + # Concatenate all pages into a single list + all_results = [] + for page in pages: + all_results.extend(page) + + # Filter results + results = [] + if issues_only: + results = [ + result for result in all_results if not result.get("pull_request") + ] + else: + results = all_results + + return results + + def get_repo_prs(self, repo_slug, state="all"): + """ + Get all PRs for a repo + Note: GhApi() returns results as AttrDict objects: + https://fastcore.fast.ai/basics.html#attrdict + """ + pages = list( + paged( + self.api.pulls.list, + owner=self.owner, + repo=repo_slug, + state=state, + per_page=100, + ) + ) + # Concatenate all pages into a single list + results = [] + for p in pages: + results.extend(p) + + return results + + def get_release_by_tag(self, tag_name: str, repo_slug: str = None) -> dict: + """Get a tag by name from the GitHub API.""" + if not repo_slug: + repo_slug = self.repo_slug + try: + return self.api.repos.get_release_by_tag( + owner=self.owner, repo=repo_slug, tag=tag_name + ) + except Exception: + # Not necessarily an error, so log it but don't 
raise. + logger.info( + "release_by_tag_not_found", tag_name=tag_name, repo_slug=repo_slug + ) + return + + def get_tags(self, repo_slug: str = None) -> dict: + """Get all the tags from the GitHub API.""" + if not repo_slug: + repo_slug = self.repo_slug + + per_page = 50 + page = 1 + tags = [] + + while True: + new_tags = self.api.repos.list_tags( + owner=self.owner, repo=repo_slug, per_page=per_page, page=page + ) + tags.extend(new_tags) + + # Check if we reached the last page + if len(new_tags) < per_page: + break + + page += 1 + + return tags + + def get_tree(self, repo_slug: str = None, tree_sha: str = None) -> dict: + """ + Get the tree from the GitHub API. + + :param repo_slug: str, the repository slug + :param tree_sha: str, the tree sha + :return: dict, the tree + """ + if not repo_slug: + repo_slug = self.repo_slug + return self.api.git.get_tree( + owner=self.owner, repo=repo_slug, tree_sha=tree_sha + ) + + def get_user_by_username(self, username: str) -> dict: + """Return the response from GitHub's /users/{username}/""" + return self.api.users.get_by_username(username=username) + + +class GithubDataParser: + def get_commits_per_month(self, commits: list[dict]): + """Get the number of commits per month from a list of commits. + + :param commits: List[Commit], list of commits. + :return: Dict[str, datetime], dictionary mapping month-year dates to commit + counts. 
+ """ + commit_counts = defaultdict(int) + for commit in commits: + date = parse(commit.commit.author.date) + month_year = datetime(date.year, date.month, 1).date() + commit_counts[month_year] += 1 + + return dict(commit_counts) + + def parse_commit(self, commit_data: dict) -> dict: + """Parse the commit data from Github and return a dict of the data we want.""" + published_at = commit_data["committer"]["date"] + description = commit_data.get("message", "") + github_url = commit_data["html_url"] + release_date = datetime.strptime(published_at, "%Y-%m-%dT%H:%M:%SZ").date() + return { + "release_date": release_date, + "description": description, + "github_url": github_url, + "data": obj2dict(commit_data), + } + + def parse_gitmodules(self, gitmodules: str) -> dict: + """Parse the .gitmodules file. + Expects the multiline contents of https://github.com/boostorg/boost/.gitmodules + to be passed in + + :param gitmodules: str, the .gitmodules file + :return: dict, the parsed .gitmodules file + """ + modules = [] + current_submodule = None + + submodule_re = re.compile(r"^\[submodule \"(.*)\"\]$") + url_re = re.compile(r"^\s*url\s*\=\s*\.\.\/(.*)\.git\s*$") + + for line in gitmodules.split("\n"): + sub_m = submodule_re.match(line) + if sub_m: + current_submodule = {"module": sub_m.group(1)} + continue + + url_m = url_re.match(line) + if url_m: + name = url_m.group(1) + current_submodule["url"] = name + modules.append(current_submodule) + current_submodule = None + + return modules + + def parse_libraries_json(self, libraries_json: dict) -> dict: + """Parse the individual library metadata from 'meta/libraries.json'.""" + return { + "name": libraries_json["name"], + "key": libraries_json["key"], + "authors": libraries_json.get("authors", []), + "description": libraries_json.get("description", ""), + "category": libraries_json.get("category", []), + "maintainers": libraries_json.get("maintainers", []), + "cxxstd": libraries_json.get("cxxstd"), + } + + def parse_tag(self, 
tag_data: dict) -> dict: + """Parse the tag data from Github and return a dict of the data we want.""" + published_at = tag_data.get("published_at", "") + description = tag_data.get("body", "") + github_url = tag_data.get("html_url", "") + release_date = datetime.strptime(published_at, "%Y-%m-%dT%H:%M:%SZ").date() + return { + "release_date": release_date, + "description": description, + "github_url": github_url, + "data": obj2dict(tag_data), + } + + def extract_contributor_data(self, contributor: str) -> dict: + """Takes an author/maintainer string and returns a dict with their data.""" + data = {} + + email = self.extract_email(contributor) + if bool(email): + data["email"] = email + data["valid_email"] = True + else: + data["email"] = None + data["valid_email"] = False + + first_name, last_name = self.extract_names(contributor) + data["first_name"], data["last_name"] = first_name[:30], last_name[:30] + + return data + + def extract_email(self, val: str) -> str: + """ + Finds an email address in a string, reformats it, and returns it. + Assumes the email address is in this format: + + + Does not raise errors. + + Includes as many catches for variants in the formatting as I found in a first + pass. + """ + result = re.search("<.+>", val) + if result: + raw_email = result.group() + email = ( + raw_email.replace("-at-", "@") + .replace("- at -", "@") + .replace("-dot-", ".") + .replace("<", "") + .replace(">", "") + .replace(" ", "") + .replace("-underscore-", "_") + ) + try: + validate_email(email) + except ValidationError as e: + logger.info("Could not extract valid email", value=val, exc_msg=str(e)) + return + return email + + def extract_names(self, val: str) -> list: + """ + Returns a list of first, last names for the val argument. + + NOTE: This is an overly simplistic solution to importing names. + Names that don't conform neatly to "First Last" formats will need + to be cleaned up manually. 
+ """ + # Strip the email, if present + email = re.search("<.+>", val) + if email: + val = val.replace(email.group(), "") + + names = val.strip().rsplit(" ", 1) + + if len(names) == 1: + names.append("") + + return names diff --git a/core/tests/test_githubhelper.py b/core/tests/test_githubhelper.py new file mode 100644 index 00000000..2b5673a9 --- /dev/null +++ b/core/tests/test_githubhelper.py @@ -0,0 +1,274 @@ +import datetime +from unittest.mock import MagicMock, Mock + +import pytest +import responses +from ghapi.all import GhApi + +from core.githubhelper import GithubAPIClient, GithubDataParser + +"""GithubAPIClient Tests""" + + +@pytest.fixture +def github_api_client(): + return GithubAPIClient() + + +@pytest.fixture +def github_api_client_mock(): + """ """ + mock = MagicMock() + return mock + + +def test_initialize_api(): + """Test the initialize_api method of GitHubAPIClient.""" + api = GithubAPIClient().initialize_api() + assert isinstance(api, GhApi) + + +def test_get_blob(github_api_client): + """Test the get_blob method of GitHubAPIClient.""" + github_api_client.api.git.get_blob = MagicMock( + return_value={"sha": "12345", "content": "example content", "encoding": "utf-8"} + ) + result = github_api_client.get_blob(repo_slug="sample_repo", file_sha="12345") + assert result == {"sha": "12345", "content": "example content", "encoding": "utf-8"} + github_api_client.api.git.get_blob.assert_called_with( + owner=github_api_client.owner, repo="sample_repo", file_sha="12345" + ) + + +@responses.activate +def test_get_libraries_json(github_api_client): + """Test the get_libraries_json method of GitHubAPIClient.""" + repo_slug = "sample_repo" + url = f"https://raw.githubusercontent.com/{github_api_client.owner}/{repo_slug}/master/meta/libraries.json" + sample_json = {"key": "math", "name": "Math"} + responses.add( + responses.GET, + url, + json=sample_json, + status=200, + content_type="application/json", + ) + result = 
github_api_client.get_libraries_json(repo_slug=repo_slug) + assert result == {"key": "math", "name": "Math"} + assert len(responses.calls) == 1 + assert responses.calls[0].request.url == url + + +def test_get_ref(github_api_client): + """Test the get_ref method of GitHubAPIClient.""" + github_api_client.api.git.get_ref = MagicMock( + return_value={"content": "example content"} + ) + result = github_api_client.get_ref(repo_slug="sample_repo", ref="head/main") + assert result == {"content": "example content"} + + +def test_get_repo(github_api_client): + """Test the get_repo method of GitHubAPIClient.""" + github_api_client.api.repos.get = MagicMock( + return_value={"content": "example content"} + ) + result = github_api_client.get_repo(repo_slug="sample_repo") + assert result == {"content": "example content"} + + +"""Parser Tests""" + + +def create_mock_commit(date): + """Create a mock commit with the given date.""" + commit = Mock() + commit.commit.author.date = date + return commit + + +def test_get_commits_per_month(): + # Construct the mock commits. + commits = [ + create_mock_commit(datetime.datetime(2023, 1, 15).isoformat()), + create_mock_commit(datetime.datetime(2022, 1, 10).isoformat()), + create_mock_commit(datetime.datetime(2022, 2, 1).isoformat()), + create_mock_commit(datetime.datetime(2023, 1, 16).isoformat()), + ] + + # Construct the object and call the method. + parser = GithubDataParser() + results = parser.get_commits_per_month(commits) + + # Check the result. + expected = { + datetime.datetime(2022, 1, 1).date(): 1, + datetime.datetime(2022, 2, 1).date(): 1, + datetime.datetime(2023, 1, 1).date(): 2, + } + assert expected == results + + +def test_parse_gitmodules(): + sample_gitmodules = """ +[submodule "system"] + path = libs/system + url = ../system.git + fetchRecurseSubmodules = on-demand + branch = . +[submodule "multi_array"] + path = libs/multi_array + url = ../multi_array.git + fetchRecurseSubmodules = on-demand + branch = . 
+""" + + parser = GithubDataParser() + parsed_data = parser.parse_gitmodules(sample_gitmodules) + + expected_output = [ + { + "module": "system", + "url": "system", + }, + { + "module": "multi_array", + "url": "multi_array", + }, + ] + + assert parsed_data == expected_output + + +def test_parse_libraries_json(): + sample_libraries_json = { + "key": "math", + "name": "Math", + "authors": [], + "description": "Sample Description", + "category": ["Math"], + "maintainers": [], + "cxxstd": "14", + } + + parser = GithubDataParser() + parser.parse_libraries_json(sample_libraries_json) + + +def test_parse_commit(): + commit_data = { + "committer": {"date": "2023-05-10T00:00:00Z"}, + "message": "This is a sample description for a commit", + "html_url": "http://example.com/commit/12345", + } + expected = { + "release_date": datetime.date(2023, 5, 10), + "description": commit_data["message"], + "github_url": "http://example.com/commit/12345", + "data": commit_data, + } + result = GithubDataParser().parse_commit(commit_data) + assert result == expected + + +def test_parse_tag(): + tag_data = { + "published_at": "2023-05-10T00:00:00Z", + "body": "This is a sample description for a tag", + "html_url": "http://example.com/commit/12345", + } + expected = { + "release_date": datetime.date(2023, 5, 10), + "description": "This is a sample description for a tag", + "github_url": "http://example.com/commit/12345", + "data": tag_data, + } + result = GithubDataParser().parse_tag(tag_data) + assert result == expected + + +def test_extract_names(): + sample = "Tester Testerson " + expected = ["Tester", "Testerson"] + result = GithubDataParser().extract_names(sample) + assert expected == result + + sample = "Tester Testerson" + expected = ["Tester", "Testerson"] + result = GithubDataParser().extract_names(sample) + assert expected == result + + sample = "Tester de Testerson " + expected = ["Tester de", "Testerson"] + result = GithubDataParser().extract_names(sample) + assert expected == 
result + + sample = "Tester de Testerson" + expected = ["Tester de", "Testerson"] + result = GithubDataParser().extract_names(sample) + assert expected == result + + sample = "Various" + expected = ["Various", ""] + result = GithubDataParser().extract_names(sample) + assert expected == result + + +def test_extract_email(): + expected = "t_testerson@example.com" + result = GithubDataParser().extract_email( + "Tester Testerston " + ) + assert expected == result + + expected = "t.t.testerson@example.com" + result = GithubDataParser().extract_email( + "Tester Testerston " + ) + assert expected == result + + expected = "t.t.testerson@example.sample.com" + result = GithubDataParser().extract_email( + "Tester Testerston " + ) + assert expected == result + + expected = None + result = GithubDataParser().extract_email("Tester Testeron") + assert expected == result + + expected = "t_tester@example.com" + result = GithubDataParser().extract_email( + "Tester Testerston " + ) + assert expected == result + + expected = "tester@example.com" + result = GithubDataParser().extract_email( + "Tester Testerston " + ) + assert expected == result + + +def test_extract_contributor_data(): + sample = "Tester Testerson " + expected = { + "valid_email": True, + "email": "tester@gmail.com", + "first_name": "Tester", + "last_name": "Testerson", + } + result = GithubDataParser().extract_contributor_data(sample) + assert expected == result + + sample = "Tester Testerson" + expected = { + "valid_email": False, + "first_name": "Tester", + "last_name": "Testerson", + } + result = GithubDataParser().extract_contributor_data(sample) + assert expected["valid_email"] is False + assert expected["first_name"] == result["first_name"] + assert expected["last_name"] == result["last_name"] + assert "email" in result diff --git a/docs/commands.md b/docs/commands.md index 71bb2d30..4ef11438 100644 --- a/docs/commands.md +++ b/docs/commands.md @@ -95,7 +95,7 @@ Import Boost version (AKA "release") information 
from the Boost GitHub repo. Fun - **Retrieves Boost tags**: It collects all the Boost tags from the main Github repo, excluding beta releases and release candidates. For each tag, it gathers the associated data. If it's a full release, the data is in the tag; otherwise, the data is in the commit. - **Updates local database**: For each tag, it creates or updates a Version instance in the local database. -- **Options for managing versions and library versions**: The command provides options to delete existing versions and library versions, and to create new library versions for the most recent Boost version. +- Adds the download links from Artifactory for the release downloads - Idempotent. **Options** @@ -103,10 +103,8 @@ Import Boost version (AKA "release") information from the Boost GitHub repo. Fun Here are the options you can use: - `--delete-versions`: Deletes all existing Version instances in the database before importing new ones. -- `--delete-library-versions`: Deletes all existing LibraryVersion instances in the database before importing new ones. -- `--create-recent-library-versions`: Creates a LibraryVersion for each active Boost library and the most recent Boost version. -- `--skip-existing-versions`: If a Version exists in the database (by name), skip calling the GitHub API for more information on it. - `--token`: Pass a GitHub API token. If not passed, will use the value in `settings.GITHUB_TOKEN`. +- `--verbose`: Print output information ### Example: @@ -119,20 +117,7 @@ Output: Skipping boost-1.82.0.beta1, not a full release Saved version boost-1.81.0. Created: True Skipping boost-1.81.0.beta1, not a full release - release_by_tag_not_found - {"message": "release_by_tag_not_found", "tag_name": "boost-1.80.0", "repo_slug": "boost", "logger": "libraries.github", "level": "info", "timestamp": "2023-05-12T22:14:08.721270Z"} ... - Saved library version Math (boost-1.82.0). Created: True - Saved library version Xpressive (boost-1.82.0). 
Created: True - Saved library version Dynamic Bitset (boost-1.82.0). Created: True - Saved library version Multi-Index (boost-1.82.0). Created: True - ... - -**What does the `release_by_tag_not_found` error mean?** - -When importing the Boost releases, we first get a list of all the tags for the Boost repo. Then, we call the GitHub API's `get_release_by_tag` API to get metadata about the release. But not all Boost tags are also full releases. In this case, we call the `get_commit` endpoint with the commit SHA of the tag to get this information. We could call `get_commit` in every case, but the data in `get_release_by_tag` is more accurate, particularly for the release date. - -When we don't find the release in `get_release_by_tag`, we log the error (which is what you may see in the output), but we follow up by getting the data from the commit endpoint. ## `update_libraries` diff --git a/libraries/github.py b/libraries/github.py index 8d9f5c09..7f4b3fb8 100644 --- a/libraries/github.py +++ b/libraries/github.py @@ -1,17 +1,9 @@ -import base64 -import os -import re -from collections import defaultdict from datetime import datetime -from dateutil.parser import parse -import requests import structlog from django.contrib.auth import get_user_model -from django.core.exceptions import ValidationError -from django.core.validators import validate_email -from fastcore.net import HTTP404NotFoundError, HTTP422UnprocessableEntityError from fastcore.xtras import obj2dict -from ghapi.all import GhApi, paged + +from core.githubhelper import GithubAPIClient, GithubDataParser from .models import Category, Issue, Library, PullRequest @@ -22,548 +14,6 @@ logger = structlog.get_logger() User = get_user_model() -class GithubAPIClient: - """A class to interact with the GitHub API.""" - - def __init__( - self, - owner: str = "boostorg", - ref: str = "heads/master", - repo_slug: str = "boost", - token: str = None, - ) -> None: - """ - Initialize the GitHubAPIClient. 
- - :param owner: str, the repository owner - :param ref: str, the Git reference - :param repo_slug: str, the repository slug - """ - self.api = self.initialize_api(token=token) - self.owner = owner - self.ref = ref - self.repo_slug = repo_slug - self.logger = structlog.get_logger() - - # Modules we need to skip as they are not really Boost Libraries - self.skip_modules = [ - "inspect", - "boostbook", - "bcp", - "build", - "quickbook", - "litre", - "auto_index", - "boostdep", - "check_build", - "headers", - "boost_install", - "docca", - "cmake", - "more", - ] - - def initialize_api(self, token=None) -> GhApi: - """ - Initialize the GitHub API with the token from the environment variable. - - :return: GhApi, the GitHub API - """ - if token is None: - token = os.environ.get("GITHUB_TOKEN", None) - return GhApi(token=token) - - def get_blob(self, repo_slug: str = None, file_sha: str = None) -> dict: - """ - Get the blob from the GitHub API. - - :param repo_slug: str, the repository slug - :param file_sha: str, the file sha - :return: dict, the blob - """ - if not repo_slug: - repo_slug = self.repo_slug - return self.api.git.get_blob( - owner=self.owner, repo=repo_slug, file_sha=file_sha - ) - - def get_commit_by_sha(self, repo_slug: str = None, commit_sha: str = None) -> dict: - """Get a commit by its SHA.""" - if not repo_slug: - repo_slug = self.repo_slug - return self.api.git.get_commit( - owner=self.owner, repo=repo_slug, commit_sha=commit_sha - ) - - def get_commits( - self, - repo_slug: str = None, - branch: str = "master", - since: datetime = None, - until: datetime = None, - ) -> list: - """Get all commits to the specified branch of a repo. - - :param repo_slug: str, the repository slug. If not provided, the class - instance's repo_slug will be used. - :param branch: str, the branch name. Defaults to 'master'. - :param since: datetime, only return commits after this date. - :param until: datetime, only return commits before this date. 
- :return: List[Commit], list of all commits in the branch. - """ - repo_slug = repo_slug or self.repo_slug - - # Get the commits - try: - pages = list( - paged( - self.api.repos.list_commits, - owner=self.owner, - repo=repo_slug, - sha=branch, - since=since, - until=until, - per_page=100, - ) - ) - all_commits = [] - for page in pages: - all_commits.extend(page) - - except Exception as e: - self.logger.exception( - "get_all_commits_failed", repo=repo_slug, exc_msg=str(e) - ) - return [] - - return all_commits - - def get_first_tag(self, repo_slug: str = None): - """ - Retrieves the earliest tag in the repo. - - :param repo_slug: str, the repository slug - :return: tuple with GitHub tag object, commit date. - - See https://docs.github.com/en/rest/git/tags for tag object format. - """ - if not repo_slug: - repo_slug = self.repo_slug - - try: - per_page = 100 - page = 1 - all_tags = [] - - while True: - tags = self.api.repos.list_tags( - owner=self.owner, repo=repo_slug, per_page=per_page, page=page - ) - all_tags.extend(tags) - if len(tags) < per_page: # End of results - break - - page += 1 # Go to the next page - - # Sort the tags by the commit date. The first tag will be the earliest. - # The Github API doesn't return the commit date with the tag, so we have to - # retrieve each one individually. This is slow, but it's the only way to get - # the commit date. - def get_tag_commit_date(tag): - """Get the commit date for a tag. 
- - For commit format, see - https://docs.github.com/en/rest/commits/commits.""" - commit_sha = tag["commit"]["sha"] - commit = self.get_commit_by_sha(repo_slug, commit_sha) - return commit["committer"]["date"] - - annotated_tags = [(tag, get_tag_commit_date(tag)) for tag in all_tags] - sorted_tags = sorted(annotated_tags, key=lambda x: x[1]) - - # Return the first (earliest) tag - return sorted_tags[0] - - except Exception: - self.logger.exception("get_first_tag_and_date_failed", repo=repo_slug) - return None - - def get_gitmodules(self, repo_slug: str = None, ref: str = None) -> str: - """ - Get the .gitmodules file for the repo from the GitHub API. - - :param repo_slug: str, the repository slug - :param ref: dict, the Git reference object (the commit hash). - See https://docs.github.com/en/rest/git/refs for expected format. - :return: str, the .gitmodules file from the repo - """ - if not repo_slug: - repo_slug = self.repo_slug - - if not ref: - ref = self.get_ref() - tree_sha = ref["object"]["sha"] - - try: - tree = self.get_tree(tree_sha=tree_sha) - except HTTP422UnprocessableEntityError as e: - # Only happens for version 1.61.0; uncertain why. - self.logger.exception( - "get_gitmodules_failed", repo=repo_slug, exc_msg=str(e) - ) - return None - - for item in tree["tree"]: - if item["path"] == ".gitmodules": - file_sha = item["sha"] - blob = self.get_blob(repo_slug=repo_slug, file_sha=file_sha) - return base64.b64decode(blob["content"]) - - def get_libraries_json(self, repo_slug: str, tag: str = "master"): - """ - Retrieve library metadata from 'meta/libraries.json' - Each Boost library will have a `meta` directory with a `libraries.json` file. 
- Example: - https://github.com/boostorg/align/blob/5ad7df63cd792fbdb801d600b93cad1a432f0151/meta/libraries.json - """ - url = f"https://raw.githubusercontent.com/{self.owner}/{repo_slug}/{tag}/meta/libraries.json" # noqa - - try: - response = requests.get(url) - response.raise_for_status() - # This usually happens because the library does not have a `meta/libraries.json` - # in the requested tag. More likely to happen with older versions of libraries. - except requests.exceptions.HTTPError: - self.logger.exception( - "get_library_metadata_failed", repo=repo_slug, url=url - ) - return None - else: - return response.json() - - def get_file_content( - self, - repo_slug: str = None, - tag: str = "master", - file_path: str = "library-detail.adoc", - ) -> str: - """ - Get the specified file for the repo from the GitHub API, if it exists. - - :param repo_slug: str, the repository slug - :param tag: str, the Git tag - :param file_name: str, the name of the file to fetch. Should be - "library-detail.adoc" or "README.md". - :return: str, the specified file content from the repo - """ - url = f"https://raw.githubusercontent.com/{self.owner}/{repo_slug}/{tag}/{file_path}" # noqa - - response = requests.get(url) - - if not response.status_code == 200: - logger.exception( - "get_file_content_failed", repo=repo_slug, url=url, file=file_path - ) - return None - - return response.content - - def get_ref(self, repo_slug: str = None, ref: str = None) -> dict: - """ - Get the ref from the GitHub API. - - :param repo_slug: str, the repository slug - :param ref: str, the Git reference - :return: dict, the ref - """ - if not repo_slug: - repo_slug = self.repo_slug - if not ref: - ref = self.ref - return self.api.git.get_ref(owner=self.owner, repo=repo_slug, ref=ref) - - def get_repo(self, repo_slug: str = None) -> dict: - """ - Get the repository from the GitHub API. 
- - :param repo_slug: str, the repository slug - :return: dict, the repository - """ - if not repo_slug: - repo_slug = self.repo_slug - - try: - return self.api.repos.get(owner=self.owner, repo=repo_slug) - except HTTP404NotFoundError as e: - logger.info("repo_not_found", repo_slug=repo_slug, exc_msg=str(e)) - return - - def get_repo_issues( - self, owner: str, repo_slug: str, state: str = "all", issues_only: bool = True - ): - """ - Get all issues for a repo. - Note: The GitHub API considers both PRs and Issues to be "Issues" and does not - support filtering in the request, so to exclude PRs from the list of issues, we - do some manual filtering of the results - - Note: GhApi() returns results as AttrDict objects: - https://fastcore.fast.ai/basics.html#attrdict - """ - pages = list( - paged( - self.api.issues.list_for_repo, - owner=self.owner, - repo=repo_slug, - state=state, - per_page=100, - ) - ) - # Concatenate all pages into a single list - all_results = [] - for page in pages: - all_results.extend(page) - - # Filter results - results = [] - if issues_only: - results = [ - result for result in all_results if not result.get("pull_request") - ] - else: - results = all_results - - return results - - def get_repo_prs(self, repo_slug, state="all"): - """ - Get all PRs for a repo - Note: GhApi() returns results as AttrDict objects: - https://fastcore.fast.ai/basics.html#attrdict - """ - pages = list( - paged( - self.api.pulls.list, - owner=self.owner, - repo=repo_slug, - state=state, - per_page=100, - ) - ) - # Concatenate all pages into a single list - results = [] - for p in pages: - results.extend(p) - - return results - - def get_release_by_tag(self, tag_name: str, repo_slug: str = None) -> dict: - """Get a tag by name from the GitHub API.""" - if not repo_slug: - repo_slug = self.repo_slug - try: - return self.api.repos.get_release_by_tag( - owner=self.owner, repo=repo_slug, tag=tag_name - ) - except Exception: - # Not necessarily an error, so log it but don't 
raise. - logger.info( - "release_by_tag_not_found", tag_name=tag_name, repo_slug=repo_slug - ) - return - - def get_tags(self, repo_slug: str = None) -> dict: - """Get all the tags from the GitHub API.""" - if not repo_slug: - repo_slug = self.repo_slug - - per_page = 50 - page = 1 - tags = [] - - while True: - new_tags = self.api.repos.list_tags( - owner=self.owner, repo=repo_slug, per_page=per_page, page=page - ) - tags.extend(new_tags) - - # Check if we reached the last page - if len(new_tags) < per_page: - break - - page += 1 - - return tags - - def get_tree(self, repo_slug: str = None, tree_sha: str = None) -> dict: - """ - Get the tree from the GitHub API. - - :param repo_slug: str, the repository slug - :param tree_sha: str, the tree sha - :return: dict, the tree - """ - if not repo_slug: - repo_slug = self.repo_slug - return self.api.git.get_tree( - owner=self.owner, repo=repo_slug, tree_sha=tree_sha - ) - - def get_user_by_username(self, username: str) -> dict: - """Return the response from GitHub's /users/{username}/""" - return self.api.users.get_by_username(username=username) - - -class GithubDataParser: - def get_commits_per_month(self, commits: list[dict]): - """Get the number of commits per month from a list of commits. - - :param commits: List[Commit], list of commits. - :return: Dict[str, datetime], dictionary mapping month-year dates to commit - counts. 
- """ - commit_counts = defaultdict(int) - for commit in commits: - date = parse(commit.commit.author.date) - month_year = datetime(date.year, date.month, 1).date() - commit_counts[month_year] += 1 - - return dict(commit_counts) - - def parse_commit(self, commit_data: dict) -> dict: - """Parse the commit data from Github and return a dict of the data we want.""" - published_at = commit_data["committer"]["date"] - description = commit_data.get("message", "") - github_url = commit_data["html_url"] - release_date = datetime.strptime(published_at, "%Y-%m-%dT%H:%M:%SZ").date() - return { - "release_date": release_date, - "description": description, - "github_url": github_url, - "data": obj2dict(commit_data), - } - - def parse_gitmodules(self, gitmodules: str) -> dict: - """Parse the .gitmodules file. - Expects the multiline contents of https://github.com/boostorg/boost/.gitmodules - to be passed in - - :param gitmodules: str, the .gitmodules file - :return: dict, the parsed .gitmodules file - """ - modules = [] - current_submodule = None - - submodule_re = re.compile(r"^\[submodule \"(.*)\"\]$") - url_re = re.compile(r"^\s*url\s*\=\s*\.\.\/(.*)\.git\s*$") - - for line in gitmodules.split("\n"): - sub_m = submodule_re.match(line) - if sub_m: - current_submodule = {"module": sub_m.group(1)} - continue - - url_m = url_re.match(line) - if url_m: - name = url_m.group(1) - current_submodule["url"] = name - modules.append(current_submodule) - current_submodule = None - - return modules - - def parse_libraries_json(self, libraries_json: dict) -> dict: - """Parse the individual library metadata from 'meta/libraries.json'.""" - return { - "name": libraries_json["name"], - "key": libraries_json["key"], - "authors": libraries_json.get("authors", []), - "description": libraries_json.get("description", ""), - "category": libraries_json.get("category", []), - "maintainers": libraries_json.get("maintainers", []), - "cxxstd": libraries_json.get("cxxstd"), - } - - def parse_tag(self, 
tag_data: dict) -> dict: - """Parse the tag data from Github and return a dict of the data we want.""" - published_at = tag_data.get("published_at", "") - description = tag_data.get("body", "") - github_url = tag_data.get("html_url", "") - release_date = datetime.strptime(published_at, "%Y-%m-%dT%H:%M:%SZ").date() - return { - "release_date": release_date, - "description": description, - "github_url": github_url, - "data": obj2dict(tag_data), - } - - def extract_contributor_data(self, contributor: str) -> dict: - """Takes an author/maintainer string and returns a dict with their data.""" - data = {} - - email = self.extract_email(contributor) - if bool(email): - data["email"] = email - data["valid_email"] = True - else: - data["email"] = None - data["valid_email"] = False - - first_name, last_name = self.extract_names(contributor) - data["first_name"], data["last_name"] = first_name[:30], last_name[:30] - - return data - - def extract_email(self, val: str) -> str: - """ - Finds an email address in a string, reformats it, and returns it. - Assumes the email address is in this format: - - - Does not raise errors. - - Includes as many catches for variants in the formatting as I found in a first - pass. - """ - result = re.search("<.+>", val) - if result: - raw_email = result.group() - email = ( - raw_email.replace("-at-", "@") - .replace("- at -", "@") - .replace("-dot-", ".") - .replace("<", "") - .replace(">", "") - .replace(" ", "") - .replace("-underscore-", "_") - ) - try: - validate_email(email) - except ValidationError as e: - logger.info("Could not extract valid email", value=val, exc_msg=str(e)) - return - return email - - def extract_names(self, val: str) -> list: - """ - Returns a list of first, last names for the val argument. - - NOTE: This is an overly simplistic solution to importing names. - Names that don't conform neatly to "First Last" formats will need - to be cleaned up manually. 
- """ - # Strip the email, if present - email = re.search("<.+>", val) - if email: - val = val.replace(email.group(), "") - - names = val.strip().rsplit(" ", 1) - - if len(names) == 1: - names.append("") - - return names - - class LibraryUpdater: """ This class is used to sync Libraries from the list of git submodules diff --git a/libraries/management/commands/import_commit_counts.py b/libraries/management/commands/import_commit_counts.py index 7cbffa0d..f183ffae 100644 --- a/libraries/management/commands/import_commit_counts.py +++ b/libraries/management/commands/import_commit_counts.py @@ -1,6 +1,7 @@ import djclick as click -from libraries.github import GithubAPIClient, LibraryUpdater +from libraries.github import LibraryUpdater +from core.githubhelper import GithubAPIClient from libraries.models import Library diff --git a/libraries/management/commands/import_first_release_dates.py b/libraries/management/commands/import_first_release_dates.py index 3b3c8076..22b7c6c2 100644 --- a/libraries/management/commands/import_first_release_dates.py +++ b/libraries/management/commands/import_first_release_dates.py @@ -1,6 +1,7 @@ import djclick as click -from libraries.github import GithubAPIClient, LibraryUpdater +from libraries.github import LibraryUpdater +from core.githubhelper import GithubAPIClient from libraries.models import Library diff --git a/libraries/management/commands/import_library_versions.py b/libraries/management/commands/import_library_versions.py index e2bdb1c2..f9d75717 100644 --- a/libraries/management/commands/import_library_versions.py +++ b/libraries/management/commands/import_library_versions.py @@ -1,6 +1,7 @@ import djclick as click -from libraries.github import GithubAPIClient, GithubDataParser, LibraryUpdater +from libraries.github import LibraryUpdater +from core.githubhelper import GithubAPIClient, GithubDataParser from libraries.models import Library, LibraryVersion from libraries.tasks import 
get_and_store_library_version_documentation_urls_for_version from libraries.utils import parse_date diff --git a/libraries/tests/test_github.py b/libraries/tests/test_github.py index b3a68764..55d140f1 100644 --- a/libraries/tests/test_github.py +++ b/libraries/tests/test_github.py @@ -1,16 +1,14 @@ import datetime -from unittest.mock import MagicMock, Mock, patch +from unittest.mock import MagicMock, patch import pytest -import responses from ghapi.all import GhApi from model_bakery import baker -from libraries.github import GithubAPIClient, GithubDataParser, LibraryUpdater +from libraries.github import LibraryUpdater +from core.githubhelper import GithubAPIClient from libraries.models import Category, Issue, Library, PullRequest -"""GithubAPIClient Tests""" - @pytest.fixture def github_api_client(): @@ -31,343 +29,6 @@ def github_api_client_mock(): return mock -def test_initialize_api(): - """Test the initialize_api method of GitHubAPIClient.""" - api = GithubAPIClient().initialize_api() - assert isinstance(api, GhApi) - - -def test_get_blob(github_api_client): - """Test the get_blob method of GitHubAPIClient.""" - github_api_client.api.git.get_blob = MagicMock( - return_value={"sha": "12345", "content": "example content", "encoding": "utf-8"} - ) - result = github_api_client.get_blob(repo_slug="sample_repo", file_sha="12345") - assert result == {"sha": "12345", "content": "example content", "encoding": "utf-8"} - github_api_client.api.git.get_blob.assert_called_with( - owner=github_api_client.owner, repo="sample_repo", file_sha="12345" - ) - - -########################################################################### -# Something is up with this test, it causes Pytest to fail spectacularly -# using Python 3.11. Commenting it out for now. 
- Frank -########################################################################### -# @pytest.mark.xfail(reason="Something up with bytes") -# @responses.activate -# def test_get_gitmodules(github_api_client): -# """Test the get_gitmodules method of GitHubAPIClient.""" -# sample_ref_response = { -# "object": { -# "sha": "12345", -# } -# } -# sample_tree_response = { -# "tree": [ -# { -# "path": ".gitmodules", -# "sha": "67890", -# } -# ] -# } - -# sample_content = "sample content" -# sample_blob_response = { -# "content": base64.b64encode(sample_content.encode("utf-8")).decode("utf-8") -# } - -# # Set up the mocked API responses -# ref_url = f"https://api.github.com/repos/{github_api_client.owner}/{github_api_client.repo_slug}/git/ref/{github_api_client.ref}" -# tree_url = f"https://api.github.com/repos/{github_api_client.owner}/{github_api_client.repo_slug}/git/trees/12345" - -# responses.add(responses.GET, ref_url, json=sample_ref_response, status=200) -# responses.add(responses.GET, tree_url, json=sample_tree_response, status=200) - -# # Mock the get_blob method -# github_api_client.get_blob = MagicMock(return_value=sample_blob_response) - -# # Call the get_gitmodules method -# result = github_api_client.get_gitmodules(repo_slug="sample_repo") - -# # Assert the expected result -# assert result == sample_content - -# # Check if the API calls were made with the correct arguments -# assert len(responses.calls) == 2 -# assert responses.calls[0].request.url == ref_url -# assert responses.calls[1].request.url == tree_url -# github_api_client.get_blob.assert_called_with( -# repo_slug="sample_repo", file_sha="67890" -# ) - - -@pytest.mark.skip(reason="Mocking the API is not working") -def test_get_first_tag(github_api_client, mock_api): - """Test the get_first_tag method of GithubAPIClient.""" - - # Mock tags from the GitHub API - mock_tags = [ - {"name": "tag2", "commit": {"sha": "2"}}, - {"name": "tag1", "commit": {"sha": "1"}}, - ] - - # Mock the commit data from 
the GitHub API - mock_commits = [ - {"sha": "2", "committer": {"date": "2023-05-12T00:00:00Z"}}, - {"sha": "1", "committer": {"date": "2023-05-11T00:00:00Z"}}, - ] - - # Setup the mock API to return the mock tags and commits - github_api_client.api.repos.list_tags.side_effect = MagicMock( - return_value=mock_tags - ) - github_api_client.api.git.get_commit.side_effect = MagicMock( - return_value=mock_commits - ) - repo_slug = "sample_repo" - tag = github_api_client.get_first_tag(repo_slug=repo_slug) - - # Assert that the earliest tag was returned - assert tag == (mock_tags[1], "2000-01-01T00:00:00Z") - - -@responses.activate -def test_get_libraries_json(github_api_client): - """Test the get_libraries_json method of GitHubAPIClient.""" - repo_slug = "sample_repo" - url = f"https://raw.githubusercontent.com/{github_api_client.owner}/{repo_slug}/master/meta/libraries.json" - sample_json = {"key": "math", "name": "Math"} - responses.add( - responses.GET, - url, - json=sample_json, - status=200, - content_type="application/json", - ) - result = github_api_client.get_libraries_json(repo_slug=repo_slug) - assert result == {"key": "math", "name": "Math"} - assert len(responses.calls) == 1 - assert responses.calls[0].request.url == url - - -def test_get_ref(github_api_client): - """Test the get_ref method of GitHubAPIClient.""" - github_api_client.api.git.get_ref = MagicMock( - return_value={"content": "example content"} - ) - result = github_api_client.get_ref(repo_slug="sample_repo", ref="head/main") - assert result == {"content": "example content"} - - -def test_get_repo(github_api_client): - """Test the get_repo method of GitHubAPIClient.""" - github_api_client.api.repos.get = MagicMock( - return_value={"content": "example content"} - ) - result = github_api_client.get_repo(repo_slug="sample_repo") - assert result == {"content": "example content"} - - -"""Parser Tests""" - - -def create_mock_commit(date): - """Create a mock commit with the given date.""" - commit = 
Mock() - commit.commit.author.date = date - return commit - - -def test_get_commits_per_month(): - # Construct the mock commits. - commits = [ - create_mock_commit(datetime.datetime(2023, 1, 15).isoformat()), - create_mock_commit(datetime.datetime(2022, 1, 10).isoformat()), - create_mock_commit(datetime.datetime(2022, 2, 1).isoformat()), - create_mock_commit(datetime.datetime(2023, 1, 16).isoformat()), - ] - - # Construct the object and call the method. - parser = GithubDataParser() - results = parser.get_commits_per_month(commits) - - # Check the result. - expected = { - datetime.datetime(2022, 1, 1).date(): 1, - datetime.datetime(2022, 2, 1).date(): 1, - datetime.datetime(2023, 1, 1).date(): 2, - } - assert expected == results - - -def test_parse_gitmodules(): - sample_gitmodules = """ -[submodule "system"] - path = libs/system - url = ../system.git - fetchRecurseSubmodules = on-demand - branch = . -[submodule "multi_array"] - path = libs/multi_array - url = ../multi_array.git - fetchRecurseSubmodules = on-demand - branch = . 
-""" - - parser = GithubDataParser() - parsed_data = parser.parse_gitmodules(sample_gitmodules) - - expected_output = [ - { - "module": "system", - "url": "system", - }, - { - "module": "multi_array", - "url": "multi_array", - }, - ] - - assert parsed_data == expected_output - - -def test_parse_libraries_json(): - sample_libraries_json = { - "key": "math", - "name": "Math", - "authors": [], - "description": "Sample Description", - "category": ["Math"], - "maintainers": [], - "cxxstd": "14", - } - - parser = GithubDataParser() - parser.parse_libraries_json(sample_libraries_json) - - -def test_parse_commit(): - commit_data = { - "committer": {"date": "2023-05-10T00:00:00Z"}, - "message": "This is a sample description for a commit", - "html_url": "http://example.com/commit/12345", - } - expected = { - "release_date": datetime.date(2023, 5, 10), - "description": commit_data["message"], - "github_url": "http://example.com/commit/12345", - "data": commit_data, - } - result = GithubDataParser().parse_commit(commit_data) - assert result == expected - - -def test_parse_tag(): - tag_data = { - "published_at": "2023-05-10T00:00:00Z", - "body": "This is a sample description for a tag", - "html_url": "http://example.com/commit/12345", - } - expected = { - "release_date": datetime.date(2023, 5, 10), - "description": "This is a sample description for a tag", - "github_url": "http://example.com/commit/12345", - "data": tag_data, - } - result = GithubDataParser().parse_tag(tag_data) - assert result == expected - - -def test_extract_names(): - sample = "Tester Testerson " - expected = ["Tester", "Testerson"] - result = GithubDataParser().extract_names(sample) - assert expected == result - - sample = "Tester Testerson" - expected = ["Tester", "Testerson"] - result = GithubDataParser().extract_names(sample) - assert expected == result - - sample = "Tester de Testerson " - expected = ["Tester de", "Testerson"] - result = GithubDataParser().extract_names(sample) - assert expected == 
result - - sample = "Tester de Testerson" - expected = ["Tester de", "Testerson"] - result = GithubDataParser().extract_names(sample) - assert expected == result - - sample = "Various" - expected = ["Various", ""] - result = GithubDataParser().extract_names(sample) - assert expected == result - - -def test_extract_email(): - expected = "t_testerson@example.com" - result = GithubDataParser().extract_email( - "Tester Testerston " - ) - assert expected == result - - expected = "t.t.testerson@example.com" - result = GithubDataParser().extract_email( - "Tester Testerston " - ) - assert expected == result - - expected = "t.t.testerson@example.sample.com" - result = GithubDataParser().extract_email( - "Tester Testerston " - ) - assert expected == result - - expected = None - result = GithubDataParser().extract_email("Tester Testeron") - assert expected == result - - expected = "t_tester@example.com" - result = GithubDataParser().extract_email( - "Tester Testerston " - ) - assert expected == result - - expected = "tester@example.com" - result = GithubDataParser().extract_email( - "Tester Testerston " - ) - assert expected == result - - -def test_extract_contributor_data(): - sample = "Tester Testerson " - expected = { - "valid_email": True, - "email": "tester@gmail.com", - "first_name": "Tester", - "last_name": "Testerson", - } - result = GithubDataParser().extract_contributor_data(sample) - assert expected == result - - sample = "Tester Testerson" - expected = { - "valid_email": False, - "first_name": "Tester", - "last_name": "Testerson", - } - result = GithubDataParser().extract_contributor_data(sample) - assert expected["valid_email"] is False - assert expected["first_name"] == result["first_name"] - assert expected["last_name"] == result["last_name"] - assert "email" in result - - -"""LibraryUpdater Tests""" - - @pytest.fixture def mock_gh_api_client(): client = GithubAPIClient() diff --git a/libraries/views.py b/libraries/views.py index 6a2449b0..e777e08f 100644 --- 
a/libraries/views.py +++ b/libraries/views.py @@ -10,7 +10,8 @@ from django.views.generic.edit import FormMixin from versions.models import Version from .forms import VersionSelectionForm -from .github import GithubAPIClient + +from core.githubhelper import GithubAPIClient from .mixins import VersionAlertMixin from .models import Category, CommitData, Library, LibraryVersion diff --git a/users/tasks.py b/users/tasks.py index f29f114c..85e12150 100644 --- a/users/tasks.py +++ b/users/tasks.py @@ -3,7 +3,7 @@ import structlog from django.contrib.auth import get_user_model from config.celery import app -from libraries.github import GithubAPIClient +from core.githubhelper import GithubAPIClient logger = structlog.getLogger(__name__) diff --git a/versions/management/commands/import_versions.py b/versions/management/commands/import_versions.py index 11aed4a9..90637c5a 100644 --- a/versions/management/commands/import_versions.py +++ b/versions/management/commands/import_versions.py @@ -1,130 +1,99 @@ import djclick as click from django.core.management import call_command +from fastcore.xtras import obj2dict -from libraries.github import GithubAPIClient, GithubDataParser -from libraries.models import Library, LibraryVersion +from core.githubhelper import GithubAPIClient from versions.models import Version +from versions.tasks import get_release_date_for_version + +# Minimum Boost version to import +MIN_BOOST_VERSION = "1.10.3" + +# Skip beta releases, release candidates, and pre-1.0 versions +EXCLUSIONS = ["beta", "-rc"] + +# Base url to generate the GitHub release URL +BASE_GITHUB_URL = "https://github.com/boostorg/boost/releases/tag/" @click.command() +@click.option("--verbose", is_flag=True, help="Enable verbose output.") @click.option("--delete-versions", is_flag=True, help="Delete all existing versions") -@click.option( - "--skip-existing-versions", - is_flag=True, - help="Skip versions that already exist in our database", -) -@click.option( - 
"--delete-library-versions", - is_flag=True, - help="Delete all existing library versions", -) -@click.option( - "--create-recent-library-versions", - is_flag=True, - help=( - "Create library-versions for the most recent Boost version and each active " - "Boost library" - ), -) @click.option("--token", is_flag=False, help="Github API token") def command( + verbose, delete_versions, - skip_existing_versions, - delete_library_versions, - create_recent_library_versions, token, ): """Imports Boost release information from Github and updates the local database. The function retrieves Boost tags from the main Github repo, excluding beta releases - and release candidates. For each tag, it fetches the associated data based on - whether it's a full release (data in the tag) or not (data in the commit). + and release candidates. It then creates or updates a Version instance in the local database for each tag. - Depending on the options provided, it can also delete existing versions and library - versions, and create new library versions for the most recent Boost version. Args: + verbose (bool): Enable verbose output (show logging statements) delete_versions (bool): If True, deletes all existing Version instances before importing. - skip-existing-versions (bool): If True, skips versions that already exist in - the database. - delete_library_versions (bool): If True, deletes all existing LibraryVersion - instances before importing. - create_recent_library_versions (bool): If True, creates a LibraryVersion for - each active Boost library and the most recent Boost version. token (str): Github API token, if you need to use something other than the setting. 
""" - # Delete Versions and LibraryVersions based on options if delete_versions: Version.objects.all().delete() click.echo("Deleted all existing versions.") - if delete_library_versions: - LibraryVersion.objects.all().delete() - click.echo("Deleted all existing library versions.") - # Get all Boost tags from Github client = GithubAPIClient(token=token) tags = client.get_tags() + for tag in tags: name = tag["name"] + if verbose: + click.secho(f"Importing {name}...", fg="yellow") - # If we already have this version, skip importing it - if skip_existing_versions and Version.objects.filter(name=name).exists(): - click.echo(f"Skipping {name}, already exists in database") + if skip_tag(name): continue - # Skip beta releases, release candidates, and pre-1.0 versions - if any( - ["beta" in name.lower(), "-rc" in name.lower(), "boost-0" in name.lower()] - ): - click.echo(f"Skipping {name}, not a full release") - continue + # Save the Version object + version, _ = Version.objects.update_or_create( + name=name, + defaults={"github_url": f"{BASE_GITHUB_URL}/{name}", "data": obj2dict(tag)}, + ) - # Get the metadata about the release from Github - - version_data = None - parser = GithubDataParser() - - # Try to get the metadata about the release from the tag - tag_data = client.get_release_by_tag(name) - if tag_data: - # This is a tag and a release, so the metadata is in the tag itself and - # we can parse the data we already have - version_data = parser.parse_tag(tag_data) - else: - # This is a tag, but not a release, so the metadata is in the commit - commit_data = client.get_commit_by_sha(commit_sha=tag["commit"]["sha"]) - version_data = parser.parse_commit(commit_data) - - if not version_data: - click.echo(f"Skipping {name}, no version data found") - continue - - version, _ = Version.objects.update_or_create(name=name, defaults=version_data) - click.echo(f"Saved version {version.name}. 
Created: {_}") + # Load the release date if needed + if not version.release_date: + try: + get_release_date_for_version.delay( + version.id, tag["commit"]["sha"], token=token + ) + except Exception as e: + click.secho(f"Failed to load release date for {name}: {e}", fg="red") + # Load the release downloads add_release_downloads(version) - if create_recent_library_versions: - # Associate existing Libraries with the most recent LibraryVersion - version = Version.objects.most_recent() - for library in Library.objects.all(): - library_version, _ = LibraryVersion.objects.get_or_create( - library=library, version=version - ) - click.echo(f"Saved library version {library_version}. Created: {_}") + click.secho(f"Saved version {version.name}. Created: {_}", fg="green") def add_release_downloads(version): version_num = version.name.replace("boost-", "") if version_num < "1.63.0": - print( - "Cannot get release downloads from Artifactory for versions before 1.63.0" - ) return call_command("import_artifactory_release_data", release=version_num) + + +def skip_tag(name): + """Returns True if the given tag should be skipped.""" + # If this version falls in our exclusion list, skip it + if any(pattern in name.lower() for pattern in EXCLUSIONS): + return True + + # If this version is too old, skip it + version_num = name.replace("boost-", "") + if version_num < MIN_BOOST_VERSION: + return True + + return False diff --git a/versions/migrations/0009_alter_version_release_date.py b/versions/migrations/0009_alter_version_release_date.py new file mode 100644 index 00000000..0a286acd --- /dev/null +++ b/versions/migrations/0009_alter_version_release_date.py @@ -0,0 +1,17 @@ +# Generated by Django 4.2.2 on 2023-08-31 18:41 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("versions", "0008_remove_versionfile_file_versionfile_display_name_and_more"), + ] + + operations = [ + migrations.AlterField( + model_name="version", + 
name="release_date", + field=models.DateField(null=True), + ), + ] diff --git a/versions/models.py b/versions/models.py index 5547139d..4fa8e108 100755 --- a/versions/models.py +++ b/versions/models.py @@ -10,7 +10,7 @@ class Version(models.Model): max_length=256, null=False, blank=False, help_text="Version name" ) slug = models.SlugField(blank=True, null=True) - release_date = models.DateField(auto_now=False, auto_now_add=False) + release_date = models.DateField(auto_now=False, auto_now_add=False, null=True) description = models.TextField(blank=True) active = models.BooleanField( default=True, diff --git a/versions/tasks.py b/versions/tasks.py new file mode 100644 index 00000000..3af78f82 --- /dev/null +++ b/versions/tasks.py @@ -0,0 +1,53 @@ +import structlog + +from config.celery import app +from django.conf import settings +from core.githubhelper import GithubAPIClient, GithubDataParser +from versions.models import Version + + +logger = structlog.getLogger(__name__) + + +@app.task +def get_release_date_for_version(version_pk, commit_sha, token=None): + """ + Gets and stores the release date for a Boost version using the given commit SHA. + + :param version_pk: The primary key of the version to get the release date for. + :param commit_sha: The SHA of the commit to get the release date for. 
+ """ + try: + version = Version.objects.get(pk=version_pk) + except Version.DoesNotExist: + logger.error( + "get_release_date_for_version_no_version_found", version_pk=version_pk + ) + return + + if not token: + token = settings.GITHUB_TOKEN + + parser = GithubDataParser() + client = GithubAPIClient(token=token) + + try: + commit = client.get_commit_by_sha(commit_sha=commit_sha) + except Exception as e: + logger.error( + "get_release_date_for_version_failed", + version_pk=version_pk, + commit_sha=commit_sha, + e=str(e), + ) + return + + commit_data = parser.parse_commit(commit) + release_date = commit_data.get("release_date") + + if release_date: + version.release_date = release_date + version.save() + logger.info("get_release_date_for_version_success", version_pk=version_pk) + else: + logger.error("get_release_date_for_version_error", version_pk=version_pk) diff --git a/versions/tests/test_tasks.py b/versions/tests/test_tasks.py new file mode 100644 index 00000000..b3a580e4 --- /dev/null +++ b/versions/tests/test_tasks.py @@ -0,0 +1,33 @@ +from datetime import datetime +from unittest.mock import MagicMock, patch +from versions.tasks import get_release_date_for_version + +import pytest + + +@pytest.fixture +def github_api_client(): + return MagicMock() + + +@pytest.mark.django_db +def test_get_release_date_for_version(version): + """ + Test that the `get_release_date_for_version` task fetches and updates + the release date. + """ + commit_url = "https://api.github.com/repos/boostorg/boost/git/commits/some_sha" + expected = datetime(2023, 1, 1).date() + + with patch( + "core.githubhelper.GithubAPIClient.get_commit_by_sha" + ) as mock_get_commit_by_sha: + mock_get_commit_by_sha.return_value = { + "committer": {"date": "2023-01-01T00:00:00Z"}, + "message": "some_message", + "html_url": "some_url", + } + get_release_date_for_version(version.pk, commit_url) + + version.refresh_from_db() + assert version.release_date == expected