Release reports refactor (#1996) (#1999)

Co-authored-by: Greg Kaleka <greg@gregkaleka.com>
This commit is contained in:
daveoconnor
2025-12-01 10:27:05 -08:00
committed by GitHub
parent 61c651c665
commit 69a652d066
19 changed files with 1160 additions and 740 deletions

View File

@@ -3,22 +3,35 @@ import io
import json
import logging
import random
from datetime import datetime, timedelta, date
from dataclasses import dataclass, field
from datetime import timedelta, date
from functools import cached_property
from itertools import chain, groupby
from operator import attrgetter
import psycopg2
from django.conf import settings
from django.contrib.staticfiles import finders
from django.db.models import Count
from django.db.models.functions import ExtractWeek, ExtractIsoYear
from django.db.models import OuterRef, Q, F, Case, When, Value, Sum, Count
from matplotlib import pyplot as plt
from wordcloud import WordCloud, STOPWORDS
from algoliasearch.analytics.client import AnalyticsClientSync
from core.models import SiteSettings
from libraries.constants import RELEASE_REPORT_SEARCH_TOP_COUNTRIES_LIMIT
from libraries.models import WordcloudMergeWord # TODO: move model to this app
from mailing_list.models import PostingData, SubscriptionData
from libraries.models import (
WordcloudMergeWord, # TODO: move model to this app
CommitAuthor,
LibraryVersion,
Issue,
Commit,
Library,
)
from libraries.utils import batched
from mailing_list.models import EmailData
from reports.constants import WORDCLOUD_FONT
from slack.models import Channel, SlackActivityBucket, SlackUser
from versions.exceptions import BoostImportedDataException
from versions.models import Version
logger = logging.getLogger(__name__)
@@ -48,9 +61,13 @@ def generate_algolia_words(
"index": version.stripped_boost_url_slug,
"limit": 100,
}
search_results = client.get_top_searches(**args).to_json()
search_data = json.loads(search_results)
return {r["search"]: r["count"] for r in search_data["searches"] if r["count"] > 1}
try:
search_results = client.get_top_searches(**args).to_json()
search_data = json.loads(search_results)
searches = search_data.get("searches") or []
return {r["search"]: r["count"] for r in searches if r["count"] > 1}
except ValueError:
return {}
def generate_wordcloud(
@@ -147,73 +164,27 @@ def get_mail_content(version: Version, prior_version: Version):
yield content
def get_mailing_list_post_stats(start_date: datetime, end_date: datetime):
data = (
PostingData.objects.filter(post_time__gt=start_date, post_time__lte=end_date)
.annotate(week=ExtractWeek("post_time"), iso_year=ExtractIsoYear("post_time"))
.values("iso_year", "week")
.annotate(count=Count("id"))
.order_by("iso_year", "week")
def get_mailinglist_counts(version: Version):
return (
EmailData.objects.filter(version=version)
.with_total_counts()
.order_by("-total_count")[:10]
)
chart_data = []
for row in data:
week_number = row["week"]
year_number = str(row["iso_year"])[2:] # e.g. 25
x = f"{week_number} ({year_number})" # e.g., "51 (24)", "1 (25)"
y = row["count"]
chart_data.append({"x": x, "y": y})
return chart_data
def get_new_subscribers_stats(start_date: datetime, end_date: datetime):
data = (
SubscriptionData.objects.filter(
subscription_dt__gte=start_date,
subscription_dt__lte=end_date,
list="boost",
)
.annotate(
week=ExtractWeek("subscription_dt"),
iso_year=ExtractIsoYear("subscription_dt"),
)
.values("iso_year", "week")
.annotate(count=Count("id"))
.order_by("iso_year", "week")
)
# Convert data into a dict for easy lookup
counts_by_week = {(row["iso_year"], row["week"]): row["count"] for row in data}
# Iterate through every ISO week in the date range
current = start_date
seen = set()
chart_data = []
while current <= end_date:
iso_year, iso_week, _ = current.isocalendar()
key = (iso_year, iso_week)
if key not in seen: # skip duplicate weeks in the same loop
seen.add(key)
year_suffix = str(iso_year)[2:]
label = f"{iso_week} ({year_suffix})"
count = counts_by_week.get(key, 0)
chart_data.append({"x": label, "y": count})
current += timedelta(days=7) # hop by weeks
return chart_data
def get_algolia_search_stats(client: AnalyticsClientSync, version: Version) -> dict:
default_args = {"index": version.stripped_boost_url_slug}
# search data
search_response = client.get_searches_count(**default_args).to_json()
search_data = json.loads(search_response)
# country data
country_results = client.get_top_countries(**default_args, limit=100).to_json()
country_data = json.loads(country_results)
country_stats = {r["country"]: r["count"] for r in country_data["countries"]}
try:
# country data
country_results = client.get_top_countries(**default_args, limit=100).to_json()
country_data = json.loads(country_results)
countries = country_data.get("countries") or []
country_stats = {r["country"]: r["count"] for r in countries}
except ValueError:
country_stats = {}
return {
"total_searches": search_data.get("count"),
"country_stats": country_stats,
@@ -221,3 +192,524 @@ def get_algolia_search_stats(client: AnalyticsClientSync, version: Version) -> d
:RELEASE_REPORT_SEARCH_TOP_COUNTRIES_LIMIT
],
}
def determine_versions(report_configuration_name: str) -> tuple[bool, Version, Version]:
    """Resolve which version pair a report should be generated for.

    :param report_configuration_name: name chosen in the report form; may or
        may not match a live ``Version``.
    :returns: ``(report_before_release, prior_version, version)``.
        ``report_before_release`` is True when the name did not match a live
        version (the report runs against "master" instead).
    """
    version = Version.objects.filter(name=report_configuration_name).first()
    # Direct truth test instead of the original `False if version else True`.
    report_before_release = version is None
    prior_version = None
    if report_before_release:
        # if the version is not set then the user has chosen a report configuration
        # that's not matching a live version, so we use the most recent version
        version = Version.objects.filter(name="master").first()
        prior_version = Version.objects.most_recent()
    if not prior_version:
        # Fall back to the newest minor version released before `version`.
        prior_version = (
            Version.objects.minor_versions()
            .filter(version_array__lt=version.cleaned_version_parts_int)
            .order_by("-version_array")
            .first()
        )
    return report_before_release, prior_version, version
def get_dependency_data(library_order, version):
    """Return per-library dependency diffs ordered to match ``library_order``.

    Libraries with no diff data are represented by an empty dict so the
    result always lines up index-for-index with ``library_order``.
    """
    try:
        diff_rows = version.get_dependency_diffs().values()
    except BoostImportedDataException as e:
        logger.warning(f"Could not get dependency diffs for version {version}: {e}")
        diff_rows = {}
    indexed = {row["library_id"]: row for row in diff_rows}
    return [indexed.get(library_id, {}) for library_id in library_order]
def global_new_contributors(version):
    """Return ids of commit authors whose first contribution landed in ``version``.

    Computed as (authors through this release) minus (authors before it),
    considering all minor versions older than ``version``.
    """
    earlier_version_ids = list(
        Version.objects.minor_versions()
        .filter(version_array__lt=version.cleaned_version_parts_int)
        .order_by("id")
        .values_list("id", flat=True)
    )
    authors_before = (
        CommitAuthor.objects.filter(
            commit__library_version__version__in=earlier_version_ids
        )
        .distinct()
        .values_list("id", flat=True)
    )
    authors_through = (
        CommitAuthor.objects.filter(
            commit__library_version__version__in=earlier_version_ids + [version.id]
        )
        .distinct()
        .values_list("id", flat=True)
    )
    return set(authors_through) - set(authors_before)
def get_library_queryset_by_version(version: Version, annotate_commit_count=False):
    """Return reportable libraries that have a ``LibraryVersion`` for ``version``.

    :param version: release to filter by; a falsy value yields an empty queryset.
    :param annotate_commit_count: when True, annotate each row with
        ``commit_count`` (commits across its library versions).
    """
    from libraries.forms import CreateReportFullForm

    base = CreateReportFullForm.library_queryset
    if version:
        # Match libraries via a correlated one-row subquery on LibraryVersion.
        latest_lv = LibraryVersion.objects.filter(
            library=OuterRef("id"), version=version
        )[:1]
        qs = base.filter(library_version=latest_lv)
    else:
        qs = base.none()
    if annotate_commit_count:
        qs = qs.annotate(commit_count=Count("library_version__commit"))
    return qs
def get_top_libraries_for_version(version):
    """Libraries in ``version``, ordered by release commit count, highest first."""
    return get_library_queryset_by_version(
        version, annotate_commit_count=True
    ).order_by("-commit_count")
def get_libraries_by_name(version):
    """Libraries in ``version`` with commit counts, ordered alphabetically."""
    return get_library_queryset_by_version(
        version, annotate_commit_count=True
    ).order_by("name")
def get_libraries_by_quality(version):
    # returns "great", "good", and "standard" libraries in that order
    library_qs = get_library_queryset_by_version(version)
    tiers = (
        library_qs.filter(graphic__isnull=False),
        library_qs.filter(graphic__isnull=True, is_good=True),
        library_qs.filter(graphic__isnull=True, is_good=False),
    )
    ordered = []
    for tier in tiers:
        ordered.extend(tier)
    return ordered
def get_library_version_counts(library_order, version):
    """Release commit counts per library, sorted to match ``library_order``."""
    rows = get_library_queryset_by_version(
        version, annotate_commit_count=True
    ).values("commit_count", "id")
    return sorted(rows, key=lambda row: library_order.index(row["id"]))
def get_library_full_counts(libraries, library_order):
    """All-time commit counts per library, sorted to match ``library_order``."""
    annotated = libraries.annotate(commit_count=Count("library_version__commit"))
    rows = annotated.values("commit_count", "id")
    return sorted(rows, key=lambda row: library_order.index(row["id"]))
def get_top_contributors_for_library_version(library_order, version):
    """Top ten commit authors for each library's ``LibraryVersion`` in ``version``.

    Returns one queryset per entry of ``library_order``, in the same order.
    ``LibraryVersion.objects.get`` will raise if a pair is missing — callers
    are expected to pass ids that exist for this version.
    """
    return [
        CommitAuthor.objects.filter(
            commit__library_version=LibraryVersion.objects.get(
                version=version, library_id=library_id
            )
        )
        .annotate(commit_count=Count("commit"))
        .order_by("-commit_count")[:10]
        for library_id in library_order
    ]
def count_new_contributors(libraries, library_order, version):
    """Count, per library, the authors whose first commit landed in ``version``.

    Returns a list of ``{"id", "count"}`` dicts ordered like ``library_order``.
    The count is (distinct authors through this release) minus (distinct
    authors before it) for each library.
    """
    # Ids of all minor versions released before `version`.
    version_lt = list(
        Version.objects.minor_versions()
        .filter(version_array__lt=version.cleaned_version_parts_int)
        .values_list("id", flat=True)
    )
    version_lte = version_lt + [version.id]
    # Correlated subqueries: this library's LibraryVersion rows in each window.
    lt_subquery = LibraryVersion.objects.filter(
        version__in=version_lt,
        library=OuterRef("id"),
    ).values("id")
    lte_subquery = LibraryVersion.objects.filter(
        version__in=version_lte,
        library=OuterRef("id"),
    ).values("id")
    return sorted(
        list(
            libraries.annotate(
                # Distinct authors with commits strictly before this release.
                authors_before_release_count=Count(
                    "library_version__commit__author",
                    filter=Q(library_version__in=lt_subquery),
                    distinct=True,
                ),
                # Distinct authors with commits up to and including this release.
                authors_through_release_count=Count(
                    "library_version__commit__author",
                    filter=Q(library_version__in=lte_subquery),
                    distinct=True,
                ),
            )
            .annotate(
                # New contributors = through - before.
                count=F("authors_through_release_count")
                - F("authors_before_release_count")
            )
            .values("id", "count")
        ),
        key=lambda x: library_order.index(x["id"]),
    )
def count_issues(libraries, library_order, version, prior_version):
    """Opened/closed issue counts per library over the release window.

    Result is ordered like ``library_order``; libraries with no issue data
    get a zeroed placeholder dict.
    """
    counts_by_library = {
        row["library_id"]: row
        for row in Issue.objects.count_opened_closed_during_release(
            version, prior_version
        ).filter(library_id__in=[library.id for library in libraries])
    }
    return [
        counts_by_library.get(
            library_id, {"opened": 0, "closed": 0, "library_id": library_id}
        )
        for library_id in library_order
    ]
def get_library_versions(library_order, version):
    """LibraryVersion rows for ``version``, sorted to match ``library_order``.

    Every returned row's library_id is guaranteed to be in ``library_order``
    by the ``library_id__in`` filter, so a position map is safe here.
    """
    position = {library_id: idx for idx, library_id in enumerate(library_order)}
    library_versions = LibraryVersion.objects.filter(
        version=version, library_id__in=library_order
    )
    return sorted(library_versions, key=lambda lv: position[lv.library_id])
def get_top_contributors_for_version(version):
    """Top ten commit authors for ``version``, counting only commits to
    reportable libraries."""
    from libraries.forms import CreateReportFullForm

    reportable_commits = Q(
        commit__library_version__library__in=CreateReportFullForm.library_queryset
    )
    authors = CommitAuthor.objects.filter(commit__library_version__version=version)
    return authors.annotate(
        commit_count=Count("commit", filter=reportable_commits)
    ).order_by("-commit_count")[:10]
def get_git_graph_data(prior_version: Version | None, version: Version):
    """Fetch commit count data for a release and return an instance of Graph.
    Returns data in a format to easily create a github style green box commit graph.
    """
    # Without a prior version there is no date range to graph.
    if prior_version is None:
        return None
    @dataclass
    class Day:
        # One cell: a calendar date, its commit count, and the hex color
        # assigned later by Graph.apply_colors().
        date: date
        count: int
        color: str = ""
    @dataclass
    class Week:
        # One column of up to seven Day cells.
        days: list[Day] = field(default_factory=list)
        @cached_property
        def max(self):
            """The max number of commits this week."""
            return max(x.count for x in self.days)
    @dataclass
    class Graph:
        weeks: list[Week] = field(default_factory=list)
        # Light-to-dark green palette in increasing commit-intensity order.
        colors: list[str] = field(
            default_factory=lambda: [
                "#E8F5E9",
                "#C8E6C9",
                "#A5D6A7",
                "#81C784",
                "#66BB6A",
                "#4CAF50",
                "#43A047",
                "#388E3C",
                "#2E7D32",
                "#1B5E20",
            ],
        )
        @cached_property
        def graph_start(self):
            # Closes over `start`/`end`, assigned later in the enclosing
            # function; only evaluated after they exist.
            return start.strftime("%B '%y")
        @cached_property
        def graph_end(self):
            return end.strftime("%B '%y")
        @cached_property
        def max(self):
            """The max number of commits in all weeks."""
            return max(x.max for x in self.weeks)
        def append_day(self, day: Day):
            """Append a day into the last week of self.weeks.
            - Automatically create a new week if there are already 7 days in the
            last week.
            """
            if len(self.weeks) == 0 or len(self.weeks[-1].days) == 7:
                self.weeks.append(Week())
            self.weeks[-1].days.append(day)
        def apply_colors(self):
            """Iterate through each day and apply a color.
            - The color is selected based on the number of commits made on
            that day, relative to the highest number of commits in all days in
            Graph.weeks.days.
            """
            if not (high := self.max):
                # No commits this release
                # TODO: we may want a more elegant solution
                # than just not graphing this library
                return
            for week in self.weeks:
                for day in week.days:
                    # Fraction of the busiest day, bucketed into the palette;
                    # the exact-1.0 case is pinned to the darkest color.
                    decimal = day.count / high
                    if decimal == 1:
                        day.color = self.colors[-1]
                    else:
                        idx = int(decimal * len(self.colors))
                        day.color = self.colors[idx]
    # Commit totals per calendar day for this release.
    count_query = (
        Commit.objects.filter(library_version__version=version)
        .values("committed_at__date")
        .annotate(count=Count("id"))
    )
    counts_by_date = {x["committed_at__date"]: x["count"] for x in count_query}
    graph = Graph()
    # The start date is the release date of the previous version
    # The end date is one day before the release date of the current version
    start: date = prior_version.release_date
    end: date = (version.release_date or date.today()) - timedelta(days=1)
    # if the release started on a Thursday, we want to add Sun -> Wed to the data
    # with empty counts, even if they aren't part of the release.
    # NOTE(review): date.weekday() is Monday-indexed, so this actually pads
    # back to the previous Monday (Mon -> Wed in the Thursday example) — confirm
    # which week-start the rendered graph expects.
    for i in range(start.weekday(), 0, -1):
        day = Day(date=start - timedelta(days=i), count=0)
        graph.append_day(day)
    # Fill one Day per calendar date in the release window, zero when no commits.
    current_date = start
    while current_date <= end:
        day = Day(date=current_date, count=counts_by_date.get(current_date, 0))
        graph.append_day(day)
        current_date = current_date + timedelta(days=1)
    graph.apply_colors()
    return graph
def get_libraries(library_order: list[int]):
    """Return Library rows for the given ids, preserving the list's order."""
    ordering = Case(
        *(
            When(id=library_id, then=Value(index))
            for index, library_id in enumerate(library_order)
        )
    )
    return Library.objects.filter(id__in=library_order).order_by(ordering)
def get_library_data(library_order: list[int], prior_version_id: int, version_id: int):
    """Assemble the per-library report rows for a release.

    Each row is a dict combining the library with its commit counts, top
    contributors, new-contributor count, issue counts, library version and
    dependency diffs. Libraries with no commits in the release are dropped.
    """
    prior_version = Version.objects.get(pk=prior_version_id)
    version = Version.objects.get(pk=version_id)
    libraries = get_libraries(library_order)
    keys = (
        "library",
        "full_count",
        "version_count",
        "top_contributors_release",
        "new_contributors_count",
        "issues",
        "library_version",
        "deps",
    )
    # All helpers return sequences aligned with library_order, so zip pairs
    # each library with its own stats.
    rows = zip(
        libraries,
        get_library_full_counts(libraries, library_order),
        get_library_version_counts(library_order, version),
        get_top_contributors_for_library_version(library_order, version),
        count_new_contributors(libraries, library_order, version),
        count_issues(libraries, library_order, version, prior_version),
        get_library_versions(library_order, version),
        get_dependency_data(library_order, version),
    )
    library_data = [dict(zip(keys, row)) for row in rows]
    return [row for row in library_data if row["version_count"]["commit_count"] > 0]
def get_top_libraries():
    """Five reportable libraries with the most all-time commits."""
    from libraries.forms import CreateReportFullForm

    annotated = CreateReportFullForm.library_queryset.annotate(
        commit_count=Count("library_version__commit")
    )
    return annotated.order_by("-commit_count")[:5]
def lines_changes_count(version: Version):
    """Return ``(lines_added, lines_removed)`` across reportable libraries
    for a release.

    Either value may be ``None`` when no matching ``LibraryVersion`` rows
    exist (``Sum`` over an empty queryset yields ``None``).
    """
    from libraries.forms import CreateReportFullForm

    # The original built the same filtered queryset twice and ran two
    # aggregate queries; a single aggregate() call returns both sums at once.
    totals = LibraryVersion.objects.filter(
        version=version,
        library__in=CreateReportFullForm.library_queryset,
    ).aggregate(lines_added=Sum("insertions"), lines_removed=Sum("deletions"))
    return totals["lines_added"], totals["lines_removed"]
def get_commit_counts(version: Version):
    """Return ``(commit_count, version_commit_count)`` for reportable libraries:
    commits in all versions up to this one, and commits in this version only.
    """
    from libraries.forms import CreateReportFullForm
    # NOTE(review): name__lte compares version names lexicographically as
    # strings — confirm the naming scheme keeps this ordering correct (e.g.
    # "boost-1.9.0" sorts *after* "boost-1.10.0" as text).
    commit_count = Commit.objects.filter(
        library_version__version__name__lte=version.name,
        library_version__library__in=CreateReportFullForm.library_queryset,
    ).count()
    # Commits made in this release only.
    version_commit_count = Commit.objects.filter(
        library_version__version=version,
        library_version__library__in=CreateReportFullForm.library_queryset,
    ).count()
    return commit_count, version_commit_count
def get_issues_counts(prior_version: Version, version: Version):
    """Return ``(opened, closed)`` issue counts for reportable libraries
    during the release window between ``prior_version`` and ``version``."""
    from libraries.forms import CreateReportFullForm

    # Querysets are lazy and immutable, so the filtered base can be reused.
    reportable_issues = Issue.objects.filter(
        library__in=CreateReportFullForm.library_queryset
    )
    opened_issues_count = reportable_issues.opened_during_release(
        version, prior_version
    ).count()
    closed_issues_count = reportable_issues.closed_during_release(
        version, prior_version
    ).count()
    return opened_issues_count, closed_issues_count
def get_download_links(version: Version):
return {
k: list(v)
for k, v in groupby(
version.downloads.all().order_by("operating_system"),
key=attrgetter("operating_system"),
)
}
def get_mailinglist_msg_counts(version: Version) -> tuple[int, int]:
    """Return the total mailing-list message count for ``version`` and the
    top-ten senders by total count.

    The total may be ``None`` when no ``EmailData`` exists for the version
    (``Sum`` over an empty queryset). NOTE(review): the second element is a
    queryset, not an int — the declared return annotation looks stale.
    """
    total_mailinglist_count = EmailData.objects.filter(version=version).aggregate(
        total=Sum("count")
    )["total"]
    # Reuse the shared helper instead of duplicating its query inline.
    mailinglist_counts = get_mailinglist_counts(version)
    return total_mailinglist_count, mailinglist_counts
def get_slack_channels():
    """Boost-prefixed slack channels, alphabetical, in batches of ten."""
    channels = Channel.objects.filter(name__istartswith="boost").order_by("name")
    return batched(channels, 10)
def get_libraries_for_index(library_data, version: Version):
    """Pair every quality-ordered library with whether it appears in
    ``library_data``.

    :param library_data: rows from ``get_library_data`` (dicts with a
        ``"library"`` key).
    :returns: list of ``(library, included)`` tuples.
    """
    # Build the membership collection once — the original rebuilt the list
    # comprehension on every loop iteration (quadratic). Saved Django model
    # instances hash and compare by pk, so a set preserves `in` semantics.
    included = {entry["library"] for entry in library_data}
    return [
        (library, library in included)
        for library in get_libraries_by_quality(version)
    ]
def get_slack_stats_for_channels(
    prior_version, version, channels: list[Channel] | None = None
):
    """Get slack stats for specific channels, or all channels.

    Returns a dict with the top-ten posters ("users"), the number of active
    posters ("user_count"), the total message count ("total") and the count
    of first-time posters in the window ("new_user_count").
    """
    # Window: previous release date through the day before this release, or
    # through today when the release has no date yet.
    start = prior_version.release_date
    end = date.today()
    if version.release_date:
        end = version.release_date - timedelta(days=1)
    # count of all messages in the date range
    # NOTE(review): an empty `channels` list behaves like "all channels"
    # because the truthiness checks skip the channel filter — confirm intended.
    q = Q(day__range=[start, end])
    if channels:
        q &= Q(channel__in=channels)
    total = SlackActivityBucket.objects.filter(q).aggregate(total=Sum("count"))["total"]
    # message counts per user in the date range
    q = Q(slackactivitybucket__day__range=[start, end])
    if channels:
        q &= Q(slackactivitybucket__channel__in=channels)
    per_user = (
        SlackUser.objects.annotate(
            total=Sum(
                "slackactivitybucket__count",
                filter=q,
            )
        )
        .filter(total__gt=0)
        .order_by("-total")
    )
    # Distinct users with any activity in the selected channels.
    q = Q()
    if channels:
        q &= Q(channel__in=channels)
    distinct_users = (
        SlackActivityBucket.objects.filter(q).order_by("user_id").distinct("user_id")
    )
    # New users = everyone active by `end` minus everyone already active
    # before `start`; users active in both windows cancel out.
    new_user_count = (
        distinct_users.filter(day__lte=end).count()
        - distinct_users.filter(day__lt=start).count()
    )
    return {
        "users": per_user[:10],
        "user_count": per_user.count(),
        "total": total,
        "new_user_count": new_user_count,
    }
def get_slack_stats(prior_version: Version, version: Version):
    """Returns all slack related stats.
    Only returns channels with activity.
    """
    stats = []
    for channel in Channel.objects.filter(name__istartswith="boost"):
        channel_stats = get_slack_stats_for_channels(
            prior_version, version, channels=[channel]
        )
        if not channel_stats["user_count"]:
            # Skip channels with no posters in the release window.
            continue
        channel_stats["channel"] = channel
        stats.append(channel_stats)
    # Busiest channels first; a None total sorts as zero.
    stats.sort(key=lambda s: s["total"] or 0, reverse=True)
    # we want 2 channels per pdf page, use batched to get groups of 2
    return batched(stats, 2)