Files
website-v2/libraries/forms.py
2024-11-21 12:15:10 -08:00

747 lines
27 KiB
Python

import io
import base64
from functools import cached_property
from itertools import groupby
from operator import attrgetter
from dataclasses import dataclass, field
from datetime import date, timedelta
import psycopg2
from wordcloud import WordCloud, STOPWORDS
from matplotlib import pyplot as plt
from django.template.loader import render_to_string
from django.db.models import F, Q, Count, OuterRef, Sum
from django.forms import Form, ModelChoiceField, ModelForm, BooleanField
from django.conf import settings
from core.models import RenderedContent, SiteSettings
from versions.models import Version
from .models import Commit, CommitAuthor, Issue, Library, LibraryVersion
from libraries.constants import SUB_LIBRARIES
from mailing_list.models import EmailData
class LibraryForm(ModelForm):
    """Model form for a Library that exposes only its ``categories`` field."""

    class Meta:
        # Only "categories" is editable through this form.
        fields = ["categories"]
        model = Library
class VersionSelectionForm(Form):
    """Form with a single dropdown used to pick a version."""

    # Choices come from Version.objects.active() with the "data" column
    # deferred; the develop/master/head entries are left out of the dropdown.
    queryset = (
        Version.objects.active()
        .defer("data")
        .exclude(name__in=["develop", "master", "head"])
    )
    version = ModelChoiceField(
        label="Select a version",
        empty_label="Choose a version...",
        queryset=queryset,
    )
class CreateReportFullForm(Form):
    """Form for creating a report over all releases."""

    # Template rendered by cache_html().
    html_template_name = "admin/library_report_full_detail.html"

    # Libraries offered in every slot: all libraries whose key is not listed
    # in SUB_LIBRARIES, sorted by name.
    library_queryset = Library.objects.exclude(key__in=SUB_LIBRARIES).order_by("name")

    # Eight optional library slots. The slot number defines the order in which
    # the chosen libraries are reported.
    library_1 = ModelChoiceField(
        help_text="If none are selected, the top 5 will be auto-selected.",
        required=False,
        queryset=library_queryset,
    )
    library_2 = ModelChoiceField(required=False, queryset=library_queryset)
    library_3 = ModelChoiceField(required=False, queryset=library_queryset)
    library_4 = ModelChoiceField(required=False, queryset=library_queryset)
    library_5 = ModelChoiceField(required=False, queryset=library_queryset)
    library_6 = ModelChoiceField(required=False, queryset=library_queryset)
    library_7 = ModelChoiceField(required=False, queryset=library_queryset)
    library_8 = ModelChoiceField(required=False, queryset=library_queryset)

    no_cache = BooleanField(
        help_text="Force the page to be regenerated, do not use cache.",
        initial=False,
        required=False,
    )
@property
def cache_key(self):
    """Return the cache key for the report described by this form.

    The key is ``full-report-`` followed by the ids of the eight library
    slots joined with commas; an empty slot contributes an empty string.
    Requires ``cleaned_data`` to be populated.
    """
    slots = [self.cleaned_data[f"library_{number}"] for number in range(1, 9)]
    id_parts = []
    for library in slots:
        id_parts.append(str(library.id) if library else "")
    lib_string = ",".join(id_parts)
    return f"full-report-{lib_string}"
def _get_top_libraries(self):
    """Return the five libraries from ``library_queryset`` with the most commits.

    Each library is annotated with ``commit_count`` (commits reachable
    through its library versions) and the result is ordered by that count,
    highest first.
    """
    ranked = self.library_queryset.annotate(
        commit_count=Count("library_version__commit")
    ).order_by("-commit_count")
    return ranked[:5]
def _get_library_order(self, top_libraries):
library_order = [
x.id
for x in [
self.cleaned_data["library_1"],
self.cleaned_data["library_2"],
self.cleaned_data["library_3"],
self.cleaned_data["library_4"],
self.cleaned_data["library_5"],
self.cleaned_data["library_6"],
self.cleaned_data["library_7"],
self.cleaned_data["library_8"],
]
if x is not None
]
if not library_order:
library_order = [x.id for x in top_libraries]
return library_order
def _get_library_full_counts(self, libraries, library_order):
    """Return ``{"commit_count", "id"}`` dicts for ``libraries``.

    ``commit_count`` counts the commits reachable through each library's
    library versions. The rows are sorted by the position of their id in
    ``library_order``.
    """
    counted = libraries.annotate(
        commit_count=Count("library_version__commit")
    ).values("commit_count", "id")
    rows = list(counted)
    rows.sort(key=lambda row: library_order.index(row["id"]))
    return rows
def _get_top_contributors_overall(self):
    """Return the ten commit authors with the most commits.

    Only commits whose library is in ``library_queryset`` are counted. Each
    row is a dict with ``name``, ``avatar_url``, ``commit_count`` and
    ``github_profile_url``.
    """
    in_reported_libraries = Q(
        commit__library_version__library__in=self.library_queryset
    )
    authors = CommitAuthor.objects.all().annotate(
        commit_count=Count("commit", filter=in_reported_libraries)
    )
    ranked = authors.values(
        "name", "avatar_url", "commit_count", "github_profile_url"
    ).order_by("-commit_count")
    return ranked[:10]
def _get_top_contributors_for_library(self, library_order):
    """Return one top-ten contributor queryset per id in ``library_order``.

    The result is a list parallel to ``library_order``. Each entry is a
    sliced, still unevaluated queryset of dicts ordered by ``commit_count``,
    highest first.
    """

    def top_ten_for(library_id):
        authors = CommitAuthor.objects.filter(
            commit__library_version__library_id=library_id
        ).annotate(commit_count=Count("commit"))
        ranked = authors.values(
            "name",
            "avatar_url",
            "github_profile_url",
            "commit_count",
            "commit__library_version__library_id",
        ).order_by("-commit_count")
        return ranked[:10]

    return [top_ten_for(library_id) for library_id in library_order]
def get_stats(self):
    """Collect the template context for the all-releases report.

    Requires ``cleaned_data`` to be populated (see ``cache_html``).

    Returns:
        dict with the keys ``mailinglist_counts``, ``mailinglist_total``,
        ``first_version``, ``commit_count``, ``top_contributors``,
        ``library_data``, ``top_libraries`` and ``library_count``.
    """
    commit_count = Commit.objects.filter(
        library_version__library__in=self.library_queryset
    ).count()

    top_libraries = self._get_top_libraries()
    library_order = self._get_library_order(top_libraries)
    libraries = Library.objects.filter(id__in=library_order)

    # The three sequences below are each arranged in library_order, so the
    # zip lines up the data belonging to the same library.
    ordered_libraries = sorted(
        libraries, key=lambda library: library_order.index(library.id)
    )
    library_data = [
        {
            "library": library,
            "full_count": full_count,
            "top_contributors": contributors,
        }
        for library, full_count, contributors in zip(
            ordered_libraries,
            self._get_library_full_counts(libraries, library_order),
            self._get_top_contributors_for_library(library_order),
        )
    ]

    top_contributors = self._get_top_contributors_overall()

    # Sum() aggregates to None when there are no rows; report 0 instead, as
    # the per-release report does.
    mailinglist_total = (
        EmailData.objects.all().aggregate(total=Sum("count"))["total"] or 0
    )
    first_version = Version.objects.order_by("release_date").first()

    return {
        "mailinglist_counts": EmailData.objects.with_total_counts().order_by(
            "-total_count"
        )[:10],
        "mailinglist_total": mailinglist_total,
        "first_version": first_version,
        "commit_count": commit_count,
        "top_contributors": top_contributors,
        "library_data": library_data,
        "top_libraries": top_libraries,
        "library_count": self.library_queryset.count(),
    }
def cache_html(self):
    """Render the report, store it in the cache and return the html.

    Returns an empty string, without rendering or caching anything, when
    the form does not validate.
    """
    # Validation is what populates "cleaned_data".
    if self.is_valid():
        rendered = render_to_string(self.html_template_name, self.get_stats())
        self.cache_set(rendered)
        return rendered
    return ""
def cache_get(self) -> RenderedContent | None:
    """Return the cached RenderedContent for this report, or None if absent."""
    cached_rows = RenderedContent.objects.filter(cache_key=self.cache_key)
    return cached_rows.first()
def cache_clear(self):
    """Delete the cached content for this report and return ``delete()``'s result."""
    cached_rows = RenderedContent.objects.filter(cache_key=self.cache_key)
    return cached_rows.delete()
def cache_set(self, content_html):
    """Store ``content_html`` under this report's cache key.

    Creates the RenderedContent row or updates the existing one, and returns
    the result of ``update_or_create``.
    """
    stored_fields = {
        "content_html": content_html,
        "content_type": "text/html",
    }
    return RenderedContent.objects.update_or_create(
        cache_key=self.cache_key, defaults=stored_fields
    )
class CreateReportForm(CreateReportFullForm):
    """Form for creating a report for a specific release."""

    # Template rendered by cache_html() for the per-release report.
    html_template_name = "admin/release_report_detail.html"

    # The release to report on; choices are ordered by descending version_array.
    version = ModelChoiceField(
        queryset=Version.objects.minor_versions().order_by("-version_array"),
    )
def __init__(self, *args, **kwargs):
    """Build the form and replace the help text of the first library slot."""
    super().__init__(*args, **kwargs)
    first_slot = self.fields["library_1"]
    first_slot.help_text = "If none are selected, all libraries will be selected."
@property
def cache_key(self):
    """Return the cache key for the release report described by this form.

    The key is ``release-report-``, the comma-joined ids of the eight
    library slots (empty string for an empty slot), a dash and the name of
    the selected version. Requires ``cleaned_data`` to be populated.
    """
    slots = [self.cleaned_data[f"library_{number}"] for number in range(1, 9)]
    id_parts = []
    for library in slots:
        id_parts.append(str(library.id) if library else "")
    lib_string = ",".join(id_parts)
    version = self.cleaned_data["version"]
    return f"release-report-{lib_string}-{version.name}"
def _get_top_contributors_for_version(self):
    """Return the ten authors with the most commits for the selected version.

    Authors are limited to those with a commit in the selected version, and
    ``commit_count`` only counts commits whose library is in
    ``library_queryset``.
    """
    release_authors = CommitAuthor.objects.filter(
        commit__library_version__version=self.cleaned_data["version"]
    )
    in_reported_libraries = Q(
        commit__library_version__library__in=self.library_queryset
    )
    counted = release_authors.annotate(
        commit_count=Count("commit", filter=in_reported_libraries)
    )
    ranked = counted.values(
        "name", "avatar_url", "commit_count", "github_profile_url"
    ).order_by("-commit_count")
    return ranked[:10]
def _get_top_libraries_for_version(self):
    """Return the libraries that have a LibraryVersion for the selected version.

    The queryset is annotated with ``commit_count`` and ordered by it,
    highest first. It is not sliced.
    """
    # Correlated subquery: this library's LibraryVersion row for the release.
    release_library_version = LibraryVersion.objects.filter(
        library=OuterRef("id"), version=self.cleaned_data["version"]
    )[:1]
    in_release = self.library_queryset.filter(
        library_version=release_library_version,
    )
    counted = in_release.annotate(commit_count=Count("library_version__commit"))
    return counted.order_by("-commit_count")
def _get_library_version_counts(self, libraries, library_order):
    """Return ``{"commit_count", "id"}`` dicts for the selected version.

    Only libraries with a LibraryVersion for the selected version are
    included. The rows are sorted by the position of their id in
    ``library_order``.
    """
    # Correlated subquery: this library's LibraryVersion row for the release.
    release_library_version = LibraryVersion.objects.filter(
        library=OuterRef("id"), version=self.cleaned_data["version"]
    )[:1]
    counted = (
        libraries.filter(library_version=release_library_version)
        .annotate(commit_count=Count("library_version__commit"))
        .values("commit_count", "id")
    )
    rows = list(counted)
    rows.sort(key=lambda row: library_order.index(row["id"]))
    return rows
def _count_new_contributors(self, libraries, library_order):
    """Return, per library, how many commit authors are new in this release.

    For every library the distinct commit authors over all earlier minor
    versions are subtracted from the distinct authors over those versions
    plus the selected one. The result is a list of ``{"id", "count"}`` dicts
    sorted by the position of the id in ``library_order``.
    """
    version = self.cleaned_data["version"]
    earlier_version_ids = list(
        Version.objects.minor_versions()
        .filter(version_array__lt=version.cleaned_version_parts_int)
        .values_list("id", flat=True)
    )
    through_version_ids = earlier_version_ids + [version.id]

    def distinct_authors(version_ids):
        # Count distinct authors over this library's versions in version_ids.
        library_versions = LibraryVersion.objects.filter(
            version__in=version_ids,
            library=OuterRef("id"),
        ).values("id")
        return Count(
            "library_version__commit__author",
            filter=Q(library_version__in=library_versions),
            distinct=True,
        )

    counted = libraries.annotate(
        authors_before_release_count=distinct_authors(earlier_version_ids),
        authors_through_release_count=distinct_authors(through_version_ids),
    ).annotate(
        count=F("authors_through_release_count") - F("authors_before_release_count")
    )
    rows = list(counted.values("id", "count"))
    rows.sort(key=lambda row: library_order.index(row["id"]))
    return rows
def _count_issues(self, libraries, library_order, version):
    """Return opened/closed issue counts per library, in ``library_order``.

    Rows come from ``Issue.objects.count_opened_closed_during_release``.
    A library without a row gets a zeroed placeholder so the result always
    has one entry per id in ``library_order``.
    """
    library_ids = [library.id for library in libraries]
    counted = Issue.objects.count_opened_closed_during_release(version).filter(
        library_id__in=library_ids
    )
    rows_by_library = {row["library_id"]: row for row in counted}
    return [
        rows_by_library.get(
            lib_id, {"opened": 0, "closed": 0, "library_id": lib_id}
        )
        for lib_id in library_order
    ]
def _count_commit_contributors_totals(self, version):
    """Return ``(contributors_this_release, new_contributors)``.

    Both numbers are distinct commit-author counts aggregated over
    ``library_queryset``. The new-contributor count is the authors through
    this release minus the authors over all earlier minor versions.
    """
    earlier_version_ids = list(
        Version.objects.minor_versions()
        .filter(version_array__lt=version.cleaned_version_parts_int)
        .values_list("id", flat=True)
    )
    through_version_ids = earlier_version_ids + [version.id]

    def distinct_authors(condition):
        # Distinct commit authors among the library versions matching condition.
        return Count(
            "library_version__commit__author",
            filter=condition,
            distinct=True,
        )

    def library_versions_in(version_ids):
        return LibraryVersion.objects.filter(
            version__in=version_ids,
            library=OuterRef("id"),
        ).values("id")

    totals = self.library_queryset.aggregate(
        this_release_count=distinct_authors(Q(library_version__version=version)),
        authors_before_release_count=distinct_authors(
            Q(library_version__in=library_versions_in(earlier_version_ids))
        ),
        authors_through_release_count=distinct_authors(
            Q(library_version__in=library_versions_in(through_version_ids))
        ),
    )
    before = totals["authors_before_release_count"]
    through = totals["authors_through_release_count"]
    return totals["this_release_count"], through - before
def _get_top_contributors_for_library_version(self, library_order):
    """Return one top-ten contributor queryset per id in ``library_order``.

    Each entry covers the commits of that library's LibraryVersion for the
    selected version. ``LibraryVersion.objects.get`` raises when a library
    has no LibraryVersion for that version.
    """
    version = self.cleaned_data["version"]

    def top_ten_for(library_id):
        library_version = LibraryVersion.objects.get(
            version=version, library_id=library_id
        )
        authors = CommitAuthor.objects.filter(
            commit__library_version=library_version
        ).annotate(commit_count=Count("commit"))
        ranked = authors.values(
            "name",
            "avatar_url",
            "github_profile_url",
            "commit_count",
            "commit__library_version__library_id",
        ).order_by("-commit_count")
        return ranked[:10]

    return [top_ten_for(library_id) for library_id in library_order]
def _get_mail_content(self, version):
    """Yield the content of each hyperkitty email dated within this release.

    The window starts at the release date of the prior minor version
    (inclusive) and ends at the release date of ``version`` (exclusive).
    Nothing is yielded when there is no prior version or when
    ``settings.HYPERKITTY_DATABASE_NAME`` is not set.

    This is a generator. The database connection is opened on first
    iteration and closed when the generator finishes or is closed.
    """
    prior_version = (
        Version.objects.minor_versions()
        .filter(version_array__lt=version.cleaned_version_parts_int)
        .order_by("-release_date")
        .first()
    )
    if not prior_version or not settings.HYPERKITTY_DATABASE_NAME:
        return
    conn = psycopg2.connect(settings.HYPERKITTY_DATABASE_URL)
    try:
        # A named cursor streams rows instead of loading every email at once.
        with conn.cursor(name="fetch-mail-content") as cursor:
            cursor.execute(
                """
                SELECT content FROM hyperkitty_email
                WHERE date >= %(start)s AND date < %(end)s;
                """,
                {"start": prior_version.release_date, "end": version.release_date},
            )
            for [content] in cursor:
                yield content
    finally:
        # The cursor context manager does not close the connection itself.
        conn.close()
def _generate_hyperkitty_word_cloud(self, version):
    """Generates a wordcloud png and returns it as a base64 string.

    Word frequencies are accumulated over the emails yielded by
    ``_get_mail_content(version)``. Returns None when no words were found.
    """
    wc = WordCloud(
        width=1400,
        height=700,
        stopwords=STOPWORDS | SiteSettings.load().wordcloud_ignore_set,
        font_path=settings.BASE_DIR / "static" / "font" / "notosans_mono.woff",
    )
    frequencies = {}
    for content in self._get_mail_content(version):
        for word, weight in wc.process_text(content).items():
            frequencies[word] = frequencies.get(word, 0) + weight
    if not frequencies:
        return None
    wc.generate_from_frequencies(frequencies)

    figure = plt.figure(figsize=(14, 7))
    try:
        plt.imshow(wc, interpolation="bilinear")
        plt.axis("off")
        image_bytes = io.BytesIO()
        plt.savefig(
            image_bytes,
            format="png",
            dpi=100,
            bbox_inches="tight",
            pad_inches=0,
        )
    finally:
        # pyplot keeps every figure alive until it is closed explicitly;
        # without this each generated report leaks a figure.
        plt.close(figure)
    return base64.b64encode(image_bytes.getvalue()).decode()
def _count_mailinglist_contributors(self, version):
    """Return ``(release_count, new_count)`` for mailing list contributors.

    ``release_count`` is the number of EmailData rows for ``version``.
    ``new_count`` is the distinct ``author_id`` count through this release
    minus the distinct count over all earlier minor versions.
    """
    earlier_version_ids = list(
        Version.objects.minor_versions()
        .filter(version_array__lt=version.cleaned_version_parts_int)
        .values_list("id", flat=True)
    )
    through_version_ids = earlier_version_ids + [version.id]

    def distinct_authors(version_ids):
        rows = EmailData.objects.filter(version__in=version_ids)
        return rows.distinct("author_id").count()

    current = distinct_authors(through_version_ids)
    prior = distinct_authors(earlier_version_ids)
    release = EmailData.objects.filter(version=version).count()
    return release, current - prior
def _get_library_versions(self, library_order, version):
    """Return the LibraryVersion rows of ``version`` for the given libraries.

    The rows are sorted by the position of their ``library_id`` in
    ``library_order``.
    """
    matching = LibraryVersion.objects.filter(
        version=version, library_id__in=library_order
    )
    library_versions = list(matching)
    library_versions.sort(key=lambda lv: library_order.index(lv.library_id))
    return library_versions
def _get_git_graph_data(self, prior_version: Version | None, version: Version):
    """Fetch commit count data for a release and return an instance of Graph.

    Returns data in a format to easily create a github style green box commit graph.

    The graph covers the prior version's release date up to the day before
    this version's release date, padded at the front back to the preceding
    Monday. Returns None when there is no prior version or when either
    release date is missing.
    """
    if prior_version is None:
        return None
    if prior_version.release_date is None or version.release_date is None:
        return None

    @dataclass
    class Day:
        date: date
        count: int
        color: str = ""

    @dataclass
    class Week:
        days: list[Day] = field(default_factory=list)

        @cached_property
        def max(self):
            """The max number of commits this week (0 for an empty week)."""
            return max((x.count for x in self.days), default=0)

    @dataclass
    class Graph:
        weeks: list[Week] = field(default_factory=list)
        # Lightest to darkest; darker means more commits.
        colors: list[str] = field(
            default_factory=lambda: [
                "#E8F5E9",
                "#C8E6C9",
                "#A5D6A7",
                "#81C784",
                "#66BB6A",
                "#4CAF50",
                "#43A047",
                "#388E3C",
                "#2E7D32",
                "#1B5E20",
            ],
        )

        @cached_property
        def max(self):
            """The max number of commits in all weeks (0 when there are none)."""
            return max((x.max for x in self.weeks), default=0)

        def append_day(self, day: Day):
            """Append a day into the last week of self.weeks.

            - Automatically create a new week if there are already 7 days in the
            last week.
            """
            if len(self.weeks) == 0 or len(self.weeks[-1].days) == 7:
                self.weeks.append(Week())
            self.weeks[-1].days.append(day)

        def apply_colors(self):
            """Iterate through each day and apply a color.

            - The color is selected based on the number of commits made on
            that day, relative to the highest number of commits in all days in
            Graph.weeks.days.
            - When no day has any commits, every day gets the lightest color.
            """
            high = self.max
            last_idx = len(self.colors) - 1
            for week in self.weeks:
                for day in week.days:
                    if high <= 0:
                        # Nothing to scale against; avoid dividing by zero.
                        day.color = self.colors[0]
                        continue
                    decimal = day.count / high
                    # A ratio of exactly 1 would index one past the end, so
                    # clamp it onto the darkest color.
                    idx = min(int(decimal * len(self.colors)), last_idx)
                    day.color = self.colors[idx]

    count_query = (
        Commit.objects.filter(library_version__version=version)
        .values("committed_at__date")
        .annotate(count=Count("id"))
    )
    counts_by_date = {x["committed_at__date"]: x["count"] for x in count_query}

    graph = Graph()
    # The start date is the release date of the previous version
    # The end date is one day before the release date of the current version
    start: date = prior_version.release_date
    end: date = version.release_date - timedelta(days=1)

    # date.weekday() is 0 for Monday, so if the release started on a Thursday
    # we add Mon -> Wed to the data with empty counts, even if they aren't
    # part of the release. This keeps every week starting on a Monday.
    for i in range(start.weekday(), 0, -1):
        day = Day(date=start - timedelta(days=i), count=0)
        graph.append_day(day)

    current_date = start
    while current_date <= end:
        day = Day(date=current_date, count=counts_by_date.get(current_date, 0))
        graph.append_day(day)
        current_date = current_date + timedelta(days=1)

    graph.apply_colors()
    return graph
def get_stats(self):
    """Collect the template context for the single-release report.

    Requires ``cleaned_data`` to be populated (see ``cache_html``).

    The per-library section only covers libraries that have a
    LibraryVersion for the selected version. An explicitly selected library
    without one is skipped: it would otherwise raise in
    ``_get_top_contributors_for_library_version`` and misalign the zipped
    per-library lists.

    Returns:
        dict of template context values; see the return statement for the
        full list of keys.
    """
    version = self.cleaned_data["version"]

    # groupby only merges adjacent items, hence the matching order_by.
    downloads = {
        operating_system: list(files)
        for operating_system, files in groupby(
            version.downloads.all().order_by("operating_system"),
            key=attrgetter("operating_system"),
        )
    }
    prior_version = (
        Version.objects.minor_versions()
        .filter(version_array__lt=version.cleaned_version_parts_int)
        .order_by("-version_array")
        .first()
    )

    commit_count = Commit.objects.filter(
        library_version__version__name__lte=version.name,
        library_version__library__in=self.library_queryset,
    ).count()
    version_commit_count = Commit.objects.filter(
        library_version__version=version,
        library_version__library__in=self.library_queryset,
    ).count()

    top_libraries_for_version = self._get_top_libraries_for_version()
    requested_order = self._get_library_order(top_libraries_for_version)
    # Keep only libraries that are part of this release, each at most once,
    # so every per-library helper below yields exactly one entry per id.
    released_library_ids = set(
        LibraryVersion.objects.filter(
            version=version, library_id__in=requested_order
        ).values_list("library_id", flat=True)
    )
    library_order = [
        library_id
        for library_id in dict.fromkeys(requested_order)
        if library_id in released_library_ids
    ]
    libraries = Library.objects.filter(id__in=library_order)

    # LibraryVersion rows of this release for the reported libraries.
    release_library_versions = LibraryVersion.objects.filter(
        version=version,
        library__in=self.library_queryset,
    )
    library_names = (
        release_library_versions.annotate(name=F("library__name"))
        .order_by("name")
        .values_list("name", flat=True)
    )

    # Every sequence below is arranged in library_order, so the zip lines up
    # the data belonging to the same library.
    library_data = [
        {
            "library": library,
            "full_count": full_count,
            "version_count": version_count,
            "top_contributors_release": contributors,
            "new_contributors_count": new_contributors,
            "issues": issues,
            "library_version": library_version,
        }
        for (
            library,
            full_count,
            version_count,
            contributors,
            new_contributors,
            issues,
            library_version,
        ) in zip(
            sorted(libraries, key=lambda lib: library_order.index(lib.id)),
            self._get_library_full_counts(libraries, library_order),
            self._get_library_version_counts(libraries, library_order),
            self._get_top_contributors_for_library_version(library_order),
            self._count_new_contributors(libraries, library_order),
            self._count_issues(libraries, library_order, version),
            self._get_library_versions(library_order, version),
        )
    ]
    # Libraries without commits in this release are not shown.
    library_data = [
        entry for entry in library_data if entry["version_count"]["commit_count"] > 0
    ]

    top_contributors = self._get_top_contributors_for_version()

    # total messages sent during this release (version)
    total_mailinglist_count = EmailData.objects.filter(version=version).aggregate(
        total=Sum("count")
    )["total"]
    mailinglist_counts = (
        EmailData.objects.filter(version=version)
        .with_total_counts()
        .order_by("-total_count")[:10]
    )
    (
        mailinglist_contributor_release_count,
        mailinglist_contributor_new_count,
    ) = self._count_mailinglist_contributors(version)
    (
        commit_contributors_release_count,
        commit_contributors_new_count,
    ) = self._count_commit_contributors_totals(version)

    library_count = release_library_versions.count()
    if prior_version:
        library_count_prior = LibraryVersion.objects.filter(
            version=prior_version,
            library__in=self.library_queryset,
        ).count()
    else:
        library_count_prior = 0
    added_library_count = max(0, library_count - library_count_prior)
    removed_library_count = max(0, library_count_prior - library_count)

    # Both sums come from the same rows, so fetch them in a single query.
    line_totals = release_library_versions.aggregate(
        added=Sum("insertions"),
        removed=Sum("deletions"),
    )

    return {
        "lines_added": line_totals["added"],
        "lines_removed": line_totals["removed"],
        "wordcloud_base64": self._generate_hyperkitty_word_cloud(version),
        "version": version,
        "prior_version": prior_version,
        "opened_issues_count": Issue.objects.filter(
            library__in=self.library_queryset
        )
        .opened_during_release(version)
        .count(),
        "closed_issues_count": Issue.objects.filter(
            library__in=self.library_queryset
        )
        .closed_during_release(version)
        .count(),
        "mailinglist_counts": mailinglist_counts,
        "mailinglist_total": total_mailinglist_count or 0,
        "mailinglist_contributor_release_count": mailinglist_contributor_release_count,  # noqa: E501
        "mailinglist_contributor_new_count": mailinglist_contributor_new_count,
        "commit_contributors_release_count": commit_contributors_release_count,
        "commit_contributors_new_count": commit_contributors_new_count,
        "commit_count": commit_count,
        "version_commit_count": version_commit_count,
        "top_contributors_release_overall": top_contributors,
        "library_data": library_data,
        "top_libraries_for_version": top_libraries_for_version,
        "library_count": library_count,
        "library_count_prior": library_count_prior,
        "library_names": library_names,
        "added_library_count": added_library_count,
        "removed_library_count": removed_library_count,
        "downloads": downloads,
        "contribution_box_graph": self._get_git_graph_data(prior_version, version),
    }