Release reports refactor (#1996) (#1999)

Co-authored-by: Greg Kaleka <greg@gregkaleka.com>
This commit is contained in:
daveoconnor
2025-12-01 10:27:05 -08:00
committed by GitHub
parent 61c651c665
commit 69a652d066
19 changed files with 1160 additions and 740 deletions

View File

@@ -3,22 +3,35 @@ import io
import json
import logging
import random
from datetime import datetime, timedelta, date
from dataclasses import dataclass, field
from datetime import timedelta, date
from functools import cached_property
from itertools import chain, groupby
from operator import attrgetter
import psycopg2
from django.conf import settings
from django.contrib.staticfiles import finders
from django.db.models import Count
from django.db.models.functions import ExtractWeek, ExtractIsoYear
from django.db.models import OuterRef, Q, F, Case, When, Value, Sum, Count
from matplotlib import pyplot as plt
from wordcloud import WordCloud, STOPWORDS
from algoliasearch.analytics.client import AnalyticsClientSync
from core.models import SiteSettings
from libraries.constants import RELEASE_REPORT_SEARCH_TOP_COUNTRIES_LIMIT
from libraries.models import WordcloudMergeWord # TODO: move model to this app
from mailing_list.models import PostingData, SubscriptionData
from libraries.models import (
WordcloudMergeWord, # TODO: move model to this app
CommitAuthor,
LibraryVersion,
Issue,
Commit,
Library,
)
from libraries.utils import batched
from mailing_list.models import EmailData
from reports.constants import WORDCLOUD_FONT
from slack.models import Channel, SlackActivityBucket, SlackUser
from versions.exceptions import BoostImportedDataException
from versions.models import Version
logger = logging.getLogger(__name__)
@@ -48,9 +61,13 @@ def generate_algolia_words(
"index": version.stripped_boost_url_slug,
"limit": 100,
}
search_results = client.get_top_searches(**args).to_json()
search_data = json.loads(search_results)
return {r["search"]: r["count"] for r in search_data["searches"] if r["count"] > 1}
try:
search_results = client.get_top_searches(**args).to_json()
search_data = json.loads(search_results)
searches = search_data.get("searches") or []
return {r["search"]: r["count"] for r in searches if r["count"] > 1}
except ValueError:
return {}
def generate_wordcloud(
@@ -147,73 +164,27 @@ def get_mail_content(version: Version, prior_version: Version):
yield content
def get_mailing_list_post_stats(start_date: datetime, end_date: datetime):
data = (
PostingData.objects.filter(post_time__gt=start_date, post_time__lte=end_date)
.annotate(week=ExtractWeek("post_time"), iso_year=ExtractIsoYear("post_time"))
.values("iso_year", "week")
.annotate(count=Count("id"))
.order_by("iso_year", "week")
def get_mailinglist_counts(version: Version):
return (
EmailData.objects.filter(version=version)
.with_total_counts()
.order_by("-total_count")[:10]
)
chart_data = []
for row in data:
week_number = row["week"]
year_number = str(row["iso_year"])[2:] # e.g. 25
x = f"{week_number} ({year_number})" # e.g., "51 (24)", "1 (25)"
y = row["count"]
chart_data.append({"x": x, "y": y})
return chart_data
def get_new_subscribers_stats(start_date: datetime, end_date: datetime):
data = (
SubscriptionData.objects.filter(
subscription_dt__gte=start_date,
subscription_dt__lte=end_date,
list="boost",
)
.annotate(
week=ExtractWeek("subscription_dt"),
iso_year=ExtractIsoYear("subscription_dt"),
)
.values("iso_year", "week")
.annotate(count=Count("id"))
.order_by("iso_year", "week")
)
# Convert data into a dict for easy lookup
counts_by_week = {(row["iso_year"], row["week"]): row["count"] for row in data}
# Iterate through every ISO week in the date range
current = start_date
seen = set()
chart_data = []
while current <= end_date:
iso_year, iso_week, _ = current.isocalendar()
key = (iso_year, iso_week)
if key not in seen: # skip duplicate weeks in the same loop
seen.add(key)
year_suffix = str(iso_year)[2:]
label = f"{iso_week} ({year_suffix})"
count = counts_by_week.get(key, 0)
chart_data.append({"x": label, "y": count})
current += timedelta(days=7) # hop by weeks
return chart_data
def get_algolia_search_stats(client: AnalyticsClientSync, version: Version) -> dict:
default_args = {"index": version.stripped_boost_url_slug}
# search data
search_response = client.get_searches_count(**default_args).to_json()
search_data = json.loads(search_response)
# country data
country_results = client.get_top_countries(**default_args, limit=100).to_json()
country_data = json.loads(country_results)
country_stats = {r["country"]: r["count"] for r in country_data["countries"]}
try:
# country data
country_results = client.get_top_countries(**default_args, limit=100).to_json()
country_data = json.loads(country_results)
countries = country_data.get("countries") or []
country_stats = {r["country"]: r["count"] for r in countries}
except ValueError:
country_stats = {}
return {
"total_searches": search_data.get("count"),
"country_stats": country_stats,
@@ -221,3 +192,524 @@ def get_algolia_search_stats(client: AnalyticsClientSync, version: Version) -> d
:RELEASE_REPORT_SEARCH_TOP_COUNTRIES_LIMIT
],
}
def determine_versions(report_configuration_name: str) -> tuple[bool, Version, Version]:
    """Resolve which version pair a report should be generated for.

    :param report_configuration_name: name chosen in the report form; may or
        may not match a live ``Version``.
    :returns: ``(report_before_release, prior_version, version)``.
        ``report_before_release`` is True when the name did not match a live
        version (the report runs against "master" instead).
    """
    version = Version.objects.filter(name=report_configuration_name).first()
    # Direct truth test instead of the original `False if version else True`.
    report_before_release = version is None
    prior_version = None
    if report_before_release:
        # if the version is not set then the user has chosen a report configuration
        # that's not matching a live version, so we use the most recent version
        version = Version.objects.filter(name="master").first()
        prior_version = Version.objects.most_recent()
    if not prior_version:
        # Fall back to the newest minor version released before `version`.
        prior_version = (
            Version.objects.minor_versions()
            .filter(version_array__lt=version.cleaned_version_parts_int)
            .order_by("-version_array")
            .first()
        )
    return report_before_release, prior_version, version
def get_dependency_data(library_order, version):
    """Return per-library dependency diffs ordered to match ``library_order``.

    Libraries with no diff data are represented by an empty dict so the
    result always lines up index-for-index with ``library_order``.
    """
    try:
        diff_rows = version.get_dependency_diffs().values()
    except BoostImportedDataException as e:
        logger.warning(f"Could not get dependency diffs for version {version}: {e}")
        diff_rows = {}
    indexed = {row["library_id"]: row for row in diff_rows}
    return [indexed.get(library_id, {}) for library_id in library_order]
def global_new_contributors(version):
    """Return ids of commit authors whose first contribution landed in ``version``.

    Computed as (authors through this release) minus (authors before it),
    considering all minor versions older than ``version``.
    """
    earlier_version_ids = list(
        Version.objects.minor_versions()
        .filter(version_array__lt=version.cleaned_version_parts_int)
        .order_by("id")
        .values_list("id", flat=True)
    )
    authors_before = (
        CommitAuthor.objects.filter(
            commit__library_version__version__in=earlier_version_ids
        )
        .distinct()
        .values_list("id", flat=True)
    )
    authors_through = (
        CommitAuthor.objects.filter(
            commit__library_version__version__in=earlier_version_ids + [version.id]
        )
        .distinct()
        .values_list("id", flat=True)
    )
    return set(authors_through) - set(authors_before)
def get_library_queryset_by_version(version: Version, annotate_commit_count=False):
    """Return reportable libraries that have a ``LibraryVersion`` for ``version``.

    :param version: release to filter by; a falsy value yields an empty queryset.
    :param annotate_commit_count: when True, annotate each row with
        ``commit_count`` (commits across its library versions).
    """
    from libraries.forms import CreateReportFullForm

    base = CreateReportFullForm.library_queryset
    if version:
        # Match libraries via a correlated one-row subquery on LibraryVersion.
        latest_lv = LibraryVersion.objects.filter(
            library=OuterRef("id"), version=version
        )[:1]
        qs = base.filter(library_version=latest_lv)
    else:
        qs = base.none()
    if annotate_commit_count:
        qs = qs.annotate(commit_count=Count("library_version__commit"))
    return qs
def get_top_libraries_for_version(version):
    """Libraries in ``version``, ordered by release commit count, highest first."""
    return get_library_queryset_by_version(
        version, annotate_commit_count=True
    ).order_by("-commit_count")
def get_libraries_by_name(version):
    """Libraries in ``version`` with commit counts, ordered alphabetically."""
    return get_library_queryset_by_version(
        version, annotate_commit_count=True
    ).order_by("name")
def get_libraries_by_quality(version):
    # returns "great", "good", and "standard" libraries in that order
    library_qs = get_library_queryset_by_version(version)
    tiers = (
        library_qs.filter(graphic__isnull=False),
        library_qs.filter(graphic__isnull=True, is_good=True),
        library_qs.filter(graphic__isnull=True, is_good=False),
    )
    ordered = []
    for tier in tiers:
        ordered.extend(tier)
    return ordered
def get_library_version_counts(library_order, version):
    """Release commit counts per library, sorted to match ``library_order``."""
    rows = get_library_queryset_by_version(
        version, annotate_commit_count=True
    ).values("commit_count", "id")
    return sorted(rows, key=lambda row: library_order.index(row["id"]))
def get_library_full_counts(libraries, library_order):
    """All-time commit counts per library, sorted to match ``library_order``."""
    annotated = libraries.annotate(commit_count=Count("library_version__commit"))
    rows = annotated.values("commit_count", "id")
    return sorted(rows, key=lambda row: library_order.index(row["id"]))
def get_top_contributors_for_library_version(library_order, version):
    """Top ten commit authors for each library's ``LibraryVersion`` in ``version``.

    Returns one queryset per entry of ``library_order``, in the same order.
    ``LibraryVersion.objects.get`` will raise if a pair is missing — callers
    are expected to pass ids that exist for this version.
    """
    return [
        CommitAuthor.objects.filter(
            commit__library_version=LibraryVersion.objects.get(
                version=version, library_id=library_id
            )
        )
        .annotate(commit_count=Count("commit"))
        .order_by("-commit_count")[:10]
        for library_id in library_order
    ]
def count_new_contributors(libraries, library_order, version):
    """Count, per library, the authors whose first commit landed in ``version``.

    Returns a list of ``{"id", "count"}`` dicts ordered like ``library_order``.
    The count is (distinct authors through this release) minus (distinct
    authors before it) for each library.
    """
    # Ids of all minor versions released before `version`.
    version_lt = list(
        Version.objects.minor_versions()
        .filter(version_array__lt=version.cleaned_version_parts_int)
        .values_list("id", flat=True)
    )
    version_lte = version_lt + [version.id]
    # Correlated subqueries: this library's LibraryVersion rows in each window.
    lt_subquery = LibraryVersion.objects.filter(
        version__in=version_lt,
        library=OuterRef("id"),
    ).values("id")
    lte_subquery = LibraryVersion.objects.filter(
        version__in=version_lte,
        library=OuterRef("id"),
    ).values("id")
    return sorted(
        list(
            libraries.annotate(
                # Distinct authors with commits strictly before this release.
                authors_before_release_count=Count(
                    "library_version__commit__author",
                    filter=Q(library_version__in=lt_subquery),
                    distinct=True,
                ),
                # Distinct authors with commits up to and including this release.
                authors_through_release_count=Count(
                    "library_version__commit__author",
                    filter=Q(library_version__in=lte_subquery),
                    distinct=True,
                ),
            )
            .annotate(
                # New contributors = through - before.
                count=F("authors_through_release_count")
                - F("authors_before_release_count")
            )
            .values("id", "count")
        ),
        key=lambda x: library_order.index(x["id"]),
    )
def count_issues(libraries, library_order, version, prior_version):
    """Opened/closed issue counts per library over the release window.

    Result is ordered like ``library_order``; libraries with no issue data
    get a zeroed placeholder dict.
    """
    counts_by_library = {
        row["library_id"]: row
        for row in Issue.objects.count_opened_closed_during_release(
            version, prior_version
        ).filter(library_id__in=[library.id for library in libraries])
    }
    return [
        counts_by_library.get(
            library_id, {"opened": 0, "closed": 0, "library_id": library_id}
        )
        for library_id in library_order
    ]
def get_library_versions(library_order, version):
    """LibraryVersion rows for ``version``, sorted to match ``library_order``.

    Every returned row's library_id is guaranteed to be in ``library_order``
    by the ``library_id__in`` filter, so a position map is safe here.
    """
    position = {library_id: idx for idx, library_id in enumerate(library_order)}
    library_versions = LibraryVersion.objects.filter(
        version=version, library_id__in=library_order
    )
    return sorted(library_versions, key=lambda lv: position[lv.library_id])
def get_top_contributors_for_version(version):
    """Top ten commit authors for ``version``, counting only commits to
    reportable libraries."""
    from libraries.forms import CreateReportFullForm

    reportable_commits = Q(
        commit__library_version__library__in=CreateReportFullForm.library_queryset
    )
    authors = CommitAuthor.objects.filter(commit__library_version__version=version)
    return authors.annotate(
        commit_count=Count("commit", filter=reportable_commits)
    ).order_by("-commit_count")[:10]
def get_git_graph_data(prior_version: Version | None, version: Version):
    """Fetch commit count data for a release and return an instance of Graph.
    Returns data in a format to easily create a github style green box commit graph.
    """
    # Without a prior version there is no date range to graph.
    if prior_version is None:
        return None
    @dataclass
    class Day:
        # One cell: a calendar date, its commit count, and the hex color
        # assigned later by Graph.apply_colors().
        date: date
        count: int
        color: str = ""
    @dataclass
    class Week:
        # One column of up to seven Day cells.
        days: list[Day] = field(default_factory=list)
        @cached_property
        def max(self):
            """The max number of commits this week."""
            return max(x.count for x in self.days)
    @dataclass
    class Graph:
        weeks: list[Week] = field(default_factory=list)
        # Light-to-dark green palette in increasing commit-intensity order.
        colors: list[str] = field(
            default_factory=lambda: [
                "#E8F5E9",
                "#C8E6C9",
                "#A5D6A7",
                "#81C784",
                "#66BB6A",
                "#4CAF50",
                "#43A047",
                "#388E3C",
                "#2E7D32",
                "#1B5E20",
            ],
        )
        @cached_property
        def graph_start(self):
            # Closes over `start`/`end`, assigned later in the enclosing
            # function; only evaluated after they exist.
            return start.strftime("%B '%y")
        @cached_property
        def graph_end(self):
            return end.strftime("%B '%y")
        @cached_property
        def max(self):
            """The max number of commits in all weeks."""
            return max(x.max for x in self.weeks)
        def append_day(self, day: Day):
            """Append a day into the last week of self.weeks.
            - Automatically create a new week if there are already 7 days in the
            last week.
            """
            if len(self.weeks) == 0 or len(self.weeks[-1].days) == 7:
                self.weeks.append(Week())
            self.weeks[-1].days.append(day)
        def apply_colors(self):
            """Iterate through each day and apply a color.
            - The color is selected based on the number of commits made on
            that day, relative to the highest number of commits in all days in
            Graph.weeks.days.
            """
            if not (high := self.max):
                # No commits this release
                # TODO: we may want a more elegant solution
                # than just not graphing this library
                return
            for week in self.weeks:
                for day in week.days:
                    # Fraction of the busiest day, bucketed into the palette;
                    # the exact-1.0 case is pinned to the darkest color.
                    decimal = day.count / high
                    if decimal == 1:
                        day.color = self.colors[-1]
                    else:
                        idx = int(decimal * len(self.colors))
                        day.color = self.colors[idx]
    # Commit totals per calendar day for this release.
    count_query = (
        Commit.objects.filter(library_version__version=version)
        .values("committed_at__date")
        .annotate(count=Count("id"))
    )
    counts_by_date = {x["committed_at__date"]: x["count"] for x in count_query}
    graph = Graph()
    # The start date is the release date of the previous version
    # The end date is one day before the release date of the current version
    start: date = prior_version.release_date
    end: date = (version.release_date or date.today()) - timedelta(days=1)
    # if the release started on a Thursday, we want to add Sun -> Wed to the data
    # with empty counts, even if they aren't part of the release.
    # NOTE(review): date.weekday() is Monday-indexed, so this actually pads
    # back to the previous Monday (Mon -> Wed in the Thursday example) — confirm
    # which week-start the rendered graph expects.
    for i in range(start.weekday(), 0, -1):
        day = Day(date=start - timedelta(days=i), count=0)
        graph.append_day(day)
    # Fill one Day per calendar date in the release window, zero when no commits.
    current_date = start
    while current_date <= end:
        day = Day(date=current_date, count=counts_by_date.get(current_date, 0))
        graph.append_day(day)
        current_date = current_date + timedelta(days=1)
    graph.apply_colors()
    return graph
def get_libraries(library_order: list[int]):
    """Return Library rows for the given ids, preserving the list's order."""
    ordering = Case(
        *(
            When(id=library_id, then=Value(index))
            for index, library_id in enumerate(library_order)
        )
    )
    return Library.objects.filter(id__in=library_order).order_by(ordering)
def get_library_data(library_order: list[int], prior_version_id: int, version_id: int):
    """Assemble the per-library report rows for a release.

    Each row is a dict combining the library with its commit counts, top
    contributors, new-contributor count, issue counts, library version and
    dependency diffs. Libraries with no commits in the release are dropped.
    """
    prior_version = Version.objects.get(pk=prior_version_id)
    version = Version.objects.get(pk=version_id)
    libraries = get_libraries(library_order)
    keys = (
        "library",
        "full_count",
        "version_count",
        "top_contributors_release",
        "new_contributors_count",
        "issues",
        "library_version",
        "deps",
    )
    # All helpers return sequences aligned with library_order, so zip pairs
    # each library with its own stats.
    rows = zip(
        libraries,
        get_library_full_counts(libraries, library_order),
        get_library_version_counts(library_order, version),
        get_top_contributors_for_library_version(library_order, version),
        count_new_contributors(libraries, library_order, version),
        count_issues(libraries, library_order, version, prior_version),
        get_library_versions(library_order, version),
        get_dependency_data(library_order, version),
    )
    library_data = [dict(zip(keys, row)) for row in rows]
    return [row for row in library_data if row["version_count"]["commit_count"] > 0]
def get_top_libraries():
    """Five reportable libraries with the most all-time commits."""
    from libraries.forms import CreateReportFullForm

    annotated = CreateReportFullForm.library_queryset.annotate(
        commit_count=Count("library_version__commit")
    )
    return annotated.order_by("-commit_count")[:5]
def lines_changes_count(version: Version):
    """Return ``(lines_added, lines_removed)`` across reportable libraries
    for a release.

    Either value may be ``None`` when no matching ``LibraryVersion`` rows
    exist (``Sum`` over an empty queryset yields ``None``).
    """
    from libraries.forms import CreateReportFullForm

    # The original built the same filtered queryset twice and ran two
    # aggregate queries; a single aggregate() call returns both sums at once.
    totals = LibraryVersion.objects.filter(
        version=version,
        library__in=CreateReportFullForm.library_queryset,
    ).aggregate(lines_added=Sum("insertions"), lines_removed=Sum("deletions"))
    return totals["lines_added"], totals["lines_removed"]
def get_commit_counts(version: Version):
    """Return ``(commit_count, version_commit_count)`` for reportable libraries:
    commits in all versions up to this one, and commits in this version only.
    """
    from libraries.forms import CreateReportFullForm
    # NOTE(review): name__lte compares version names lexicographically as
    # strings — confirm the naming scheme keeps this ordering correct (e.g.
    # "boost-1.9.0" sorts *after* "boost-1.10.0" as text).
    commit_count = Commit.objects.filter(
        library_version__version__name__lte=version.name,
        library_version__library__in=CreateReportFullForm.library_queryset,
    ).count()
    # Commits made in this release only.
    version_commit_count = Commit.objects.filter(
        library_version__version=version,
        library_version__library__in=CreateReportFullForm.library_queryset,
    ).count()
    return commit_count, version_commit_count
def get_issues_counts(prior_version: Version, version: Version):
    """Return ``(opened, closed)`` issue counts for reportable libraries
    during the release window between ``prior_version`` and ``version``."""
    from libraries.forms import CreateReportFullForm

    # Querysets are lazy and immutable, so the filtered base can be reused.
    reportable_issues = Issue.objects.filter(
        library__in=CreateReportFullForm.library_queryset
    )
    opened_issues_count = reportable_issues.opened_during_release(
        version, prior_version
    ).count()
    closed_issues_count = reportable_issues.closed_during_release(
        version, prior_version
    ).count()
    return opened_issues_count, closed_issues_count
def get_download_links(version: Version):
return {
k: list(v)
for k, v in groupby(
version.downloads.all().order_by("operating_system"),
key=attrgetter("operating_system"),
)
}
def get_mailinglist_msg_counts(version: Version) -> tuple[int, int]:
    """Return the total mailing-list message count for ``version`` and the
    top-ten senders by total count.

    The total may be ``None`` when no ``EmailData`` exists for the version
    (``Sum`` over an empty queryset). NOTE(review): the second element is a
    queryset, not an int — the declared return annotation looks stale.
    """
    total_mailinglist_count = EmailData.objects.filter(version=version).aggregate(
        total=Sum("count")
    )["total"]
    # Reuse the shared helper instead of duplicating its query inline.
    mailinglist_counts = get_mailinglist_counts(version)
    return total_mailinglist_count, mailinglist_counts
def get_slack_channels():
    """Boost-prefixed slack channels, alphabetical, in batches of ten."""
    channels = Channel.objects.filter(name__istartswith="boost").order_by("name")
    return batched(channels, 10)
def get_libraries_for_index(library_data, version: Version):
    """Pair every quality-ordered library with whether it appears in
    ``library_data``.

    :param library_data: rows from ``get_library_data`` (dicts with a
        ``"library"`` key).
    :returns: list of ``(library, included)`` tuples.
    """
    # Build the membership collection once — the original rebuilt the list
    # comprehension on every loop iteration (quadratic). Saved Django model
    # instances hash and compare by pk, so a set preserves `in` semantics.
    included = {entry["library"] for entry in library_data}
    return [
        (library, library in included)
        for library in get_libraries_by_quality(version)
    ]
def get_slack_stats_for_channels(
    prior_version, version, channels: list[Channel] | None = None
):
    """Get slack stats for specific channels, or all channels.

    Returns a dict with the top-ten posters ("users"), the number of active
    posters ("user_count"), the total message count ("total") and the count
    of first-time posters in the window ("new_user_count").
    """
    # Window: previous release date through the day before this release, or
    # through today when the release has no date yet.
    start = prior_version.release_date
    end = date.today()
    if version.release_date:
        end = version.release_date - timedelta(days=1)
    # count of all messages in the date range
    # NOTE(review): an empty `channels` list behaves like "all channels"
    # because the truthiness checks skip the channel filter — confirm intended.
    q = Q(day__range=[start, end])
    if channels:
        q &= Q(channel__in=channels)
    total = SlackActivityBucket.objects.filter(q).aggregate(total=Sum("count"))["total"]
    # message counts per user in the date range
    q = Q(slackactivitybucket__day__range=[start, end])
    if channels:
        q &= Q(slackactivitybucket__channel__in=channels)
    per_user = (
        SlackUser.objects.annotate(
            total=Sum(
                "slackactivitybucket__count",
                filter=q,
            )
        )
        .filter(total__gt=0)
        .order_by("-total")
    )
    # Distinct users with any activity in the selected channels.
    q = Q()
    if channels:
        q &= Q(channel__in=channels)
    distinct_users = (
        SlackActivityBucket.objects.filter(q).order_by("user_id").distinct("user_id")
    )
    # New users = everyone active by `end` minus everyone already active
    # before `start`; users active in both windows cancel out.
    new_user_count = (
        distinct_users.filter(day__lte=end).count()
        - distinct_users.filter(day__lt=start).count()
    )
    return {
        "users": per_user[:10],
        "user_count": per_user.count(),
        "total": total,
        "new_user_count": new_user_count,
    }
def get_slack_stats(prior_version: Version, version: Version):
    """Returns all slack related stats.
    Only returns channels with activity.
    """
    stats = []
    for channel in Channel.objects.filter(name__istartswith="boost"):
        channel_stats = get_slack_stats_for_channels(
            prior_version, version, channels=[channel]
        )
        if not channel_stats["user_count"]:
            # Skip channels with no posters in the release window.
            continue
        channel_stats["channel"] = channel
        stats.append(channel_stats)
    # Busiest channels first; a None total sorts as zero.
    stats.sort(key=lambda s: s["total"] or 0, reverse=True)
    # we want 2 channels per pdf page, use batched to get groups of 2
    return batched(stats, 2)