Website traffic stats gathering (#1679)

This commit is contained in:
Greg Kaleka
2025-03-23 16:11:21 -04:00
committed by GitHub
parent 4d5bf61dc6
commit 8b43040399
16 changed files with 474 additions and 166 deletions

View File

@@ -96,6 +96,7 @@ INSTALLED_APPS += [
"libraries",
"mailing_list",
"news",
"reports",
"core",
"slack",
]

View File

@@ -1,22 +1,16 @@
import io
import base64
from functools import cached_property
from itertools import groupby, chain
from operator import attrgetter
from dataclasses import dataclass, field
from datetime import date, timedelta
import psycopg2
from wordcloud import WordCloud, STOPWORDS
from matplotlib import pyplot as plt
from django.template.loader import render_to_string
from django.db.models import F, Q, Count, OuterRef, Sum, When, Value, Case
from django.forms import Form, ModelChoiceField, ModelForm, BooleanField
from django.conf import settings
from core.models import RenderedContent, SiteSettings
from libraries.utils import batched, boost_normalize_words, grey_color_func
from core.models import RenderedContent
from reports.generation import generate_wordcloud
from slack.models import Channel, SlackActivityBucket, SlackUser
from versions.models import Version
from .models import (
@@ -25,10 +19,10 @@ from .models import (
Issue,
Library,
LibraryVersion,
WordcloudMergeWord,
)
from libraries.constants import SUB_LIBRARIES
from mailing_list.models import EmailData
from .utils import batched
class LibraryForm(ModelForm):
@@ -448,73 +442,6 @@ class CreateReportForm(CreateReportFullForm):
)
return top_contributors_release
def _get_mail_content(self, version):
    """Yield the body of each mailing-list email sent before ``version``.

    The date window starts at the release date of the most recently
    released entry of ``Version.objects.minor_versions()`` whose
    ``version_array`` is lower than ``version``'s (inclusive) and ends at
    ``version.release_date`` (exclusive).

    Yields nothing when no such prior version exists or when
    ``settings.HYPERKITTY_DATABASE_NAME`` is falsy.
    """
    prior_version = (
        Version.objects.minor_versions()
        .filter(version_array__lt=version.cleaned_version_parts_int)
        .order_by("-release_date")
        .first()
    )
    if not prior_version or not settings.HYPERKITTY_DATABASE_NAME:
        # This function is a generator, so a returned value would never
        # reach the caller; a bare return simply ends the iteration.
        return
    conn = psycopg2.connect(settings.HYPERKITTY_DATABASE_URL)
    try:
        # A *named* cursor is server-side in psycopg2, so rows are streamed
        # instead of being loaded into memory all at once.
        with conn.cursor(name="fetch-mail-content") as cursor:
            cursor.execute(
                """
                SELECT content FROM hyperkitty_email
                WHERE date >= %(start)s AND date < %(end)s;
                """,
                {"start": prior_version.release_date, "end": version.release_date},
            )
            for [content] in cursor:
                yield content
    finally:
        # The connection was previously never closed; release it even when
        # the consumer stops iterating early or the query fails.
        conn.close()
def _generate_hyperkitty_word_cloud(self, version):
    """Generate a wordcloud PNG from the mail content for ``version``.

    Returns:
        A ``(base64_png, word_frequencies)`` tuple where ``base64_png`` is
        the rendered image as a base64 string and ``word_frequencies`` maps
        each lower-cased word to its accumulated count. Returns
        ``(None, {})`` when no words were collected.
    """
    wc = WordCloud(
        mode="RGBA",
        background_color=None,
        width=1400,
        height=700,
        stopwords=STOPWORDS | SiteSettings.load().wordcloud_ignore_set,
        font_path=settings.BASE_DIR / "static" / "font" / "notosans_mono.woff",
    )
    word_frequencies = {}
    for content in self._get_mail_content(version):
        for key, val in wc.process_text(content).items():
            # Single-character tokens are not worth plotting.
            if len(key) < 2:
                continue
            key_lower = key.lower()
            word_frequencies[key_lower] = word_frequencies.get(key_lower, 0) + val
    if not word_frequencies:
        return None, {}

    word_frequencies = boost_normalize_words(
        word_frequencies,
        {x.from_word: x.to_word for x in WordcloudMergeWord.objects.all()},
    )
    wc.generate_from_frequencies(word_frequencies)

    fig = plt.figure(figsize=(14, 7), facecolor=None)
    try:
        plt.imshow(
            wc.recolor(color_func=grey_color_func, random_state=3),
            interpolation="bilinear",
        )
        plt.axis("off")
        image_bytes = io.BytesIO()
        plt.savefig(
            image_bytes,
            format="png",
            dpi=100,
            bbox_inches="tight",
            pad_inches=0,
            transparent=True,
        )
    finally:
        # pyplot keeps every figure alive in a global registry until it is
        # closed explicitly; without this each call leaks a figure.
        plt.close(fig)
    return base64.b64encode(image_bytes.getvalue()).decode(), word_frequencies
def _count_mailinglist_contributors(self, version):
version_lt = list(
Version.objects.minor_versions()
@@ -838,19 +765,6 @@ class CreateReportForm(CreateReportFullForm):
Channel.objects.filter(name__istartswith="boost").order_by("name"), 10
)
committee_members = version.financial_committee_members.all()
wordcloud_base64, word_frequencies = self._generate_hyperkitty_word_cloud(
version
)
# first sort by number, then sort the top 200 alphabetically
word_frequencies = {
key: val
for key, val in sorted(
word_frequencies.items(),
key=lambda x: x[1],
reverse=True,
)
}
wordcloud_top_words = sorted(list(word_frequencies.keys())[:200])
library_index_library_data = []
for library in self._get_libraries_by_quality():
library_index_library_data.append(
@@ -859,6 +773,7 @@ class CreateReportForm(CreateReportFullForm):
library in [lib["library"] for lib in library_data],
)
)
wordcloud_base64, wordcloud_top_words = generate_wordcloud(version)
return {
"committee_members": committee_members,

View File

@@ -1,5 +1,7 @@
import traceback
from contextlib import suppress
from dataclasses import dataclass
from typing import Callable
import djclick as click
@@ -13,6 +15,7 @@ from slack_sdk.errors import SlackApiError
from core.githubhelper import GithubAPIClient
from libraries.forms import CreateReportForm
from libraries.tasks import update_commits
from reports.models import WebsiteStatReport
from slack.management.commands.fetch_slack_activity import get_my_channels, locked
from versions.models import Version
@@ -34,67 +37,94 @@ def progress_message(message: str):
return f"{timezone.now()}: {message}"
@dataclass
class ReleaseTask:
"""
A distinct task to be completed.
Action can be a callable or a list of string arguments to pass to `call_command`
"""
description: str
action: Callable | list[str]
def run(self):
if isinstance(self.action, Callable):
self.action()
else:
call_command(*self.action)
class ReleaseTasksManager:
    """Runs the ordered list of release tasks and records progress messages.

    Attributes:
        latest_version: set by `import_versions`; later tasks read it.
        progress_messages: timestamped start/finish messages, one pair per task.
        handled_commits: result of `update_commits`, set by `handle_commits`.
    """

    latest_version: Version | None = None
    # Only annotated here; the containers are created per instance in
    # __init__. As class-level defaults they were shared by every instance,
    # so messages appended by one manager leaked into the next.
    progress_messages: list[str]
    handled_commits: dict[str, int]

    def __init__(self, should_generate_report: bool = False):
        self.should_generate_report = should_generate_report
        self.progress_messages = []
        self.handled_commits = {}
        # Order matters: `import_versions` sets `latest_version`, which the
        # later callable tasks read.
        self.tasks = [
            ReleaseTask("Importing versions", self.import_versions),
            ReleaseTask(
                "Importing most recent beta version",
                ["import_beta_release", "--delete-versions"],
            ),
            ReleaseTask("Importing libraries", ["update_libraries"]),
            ReleaseTask(
                "Saving library-version relationships", self.import_library_versions
            ),
            ReleaseTask("Adding library maintainers", ["update_maintainers"]),
            ReleaseTask("Adding library authors", ["update_authors"]),
            ReleaseTask(
                "Adding library version authors", ["update_library_version_authors"]
            ),
            ReleaseTask("Importing git commits", self.handle_commits),
            ReleaseTask("Syncing mailinglist statistics", ["sync_mailinglist_stats"]),
            ReleaseTask("Updating github issues", ["update_issues"]),
            ReleaseTask("Updating slack activity buckets", ["fetch_slack_activity"]),
            ReleaseTask("Updating website statistics", self.update_website_statistics),
            ReleaseTask("Generating report", self.generate_report),
        ]

    def update_release_data(self) -> dict[str, int]:
        """Run every task in order and return the handled-commits mapping."""
        for task in self.tasks:
            self.progress_messages.append(progress_message(f"{task.description}..."))
            task.run()
            self.progress_messages.append(
                progress_message(f"Finished {task.description.lower()}")
            )
        return self.handled_commits

    def import_versions(self):
        """Import new versions and remember the most recent one."""
        call_command("import_versions", "--new")
        self.latest_version = Version.objects.most_recent()

    def import_library_versions(self):
        """Import library versions starting at the latest release."""
        # removeprefix drops exactly the "boost-" prefix; lstrip("boost-")
        # strips any leading run of the characters b, o, s, t and "-".
        latest_version_number = self.latest_version.name.removeprefix("boost-")
        call_command("import_library_versions", min_release=latest_version_number)

    def handle_commits(self):
        """Update commits from the latest version onward and store the result."""
        self.handled_commits = update_commits(min_version=self.latest_version.name)

    def update_website_statistics(self):
        """Create (if needed) and refresh the website stat report for the latest version."""
        report, _ = WebsiteStatReport.objects.get_or_create(version=self.latest_version)
        report.populate_from_api()

    def generate_report(self):
        """Render and cache the release report, unless generation was not requested."""
        if not self.should_generate_report:
            self.progress_messages.append(
                progress_message("Skipped - report generation not requested")
            )
            return
        form = CreateReportForm({"version": self.latest_version.id})
        form.cache_html()
# NOTE(review): `locked` is imported from fetch_slack_activity; presumably it
# serialises concurrent runs under the given lock id - confirm there.
@locked(1138692)
def run_commands(progress: list[str], generate_report: bool = False):
    """Run all release tasks and report what happened.

    Args:
        progress: list that receives the timestamped progress messages
            (mutated in place).
        generate_report: when True, the release report is generated as the
            final task.

    Returns:
        The handled-commits mapping produced by the commit import task.

    Raises:
        ValueError: if ``settings.SLACK_BOT_TOKEN`` is not set.
    """
    if not settings.SLACK_BOT_TOKEN:
        raise ValueError("SLACK_BOT_TOKEN is not set.")

    # All tasks are delegated to ReleaseTasksManager. The legacy inline
    # sequence of call_command() invocations that followed duplicated every
    # task the manager had already run, so it has been dropped.
    manager = ReleaseTasksManager(should_generate_report=generate_report)
    try:
        handled_commits = manager.update_release_data()
    finally:
        # Hand over whatever was recorded even when a task raises, so the
        # caller can still show how far the run got.
        progress.extend(manager.progress_messages)
    return handled_commits
@@ -196,4 +226,5 @@ def command(user_id=None, generate_report=False):
send_notification(
user,
"\n\n".join(message),
subject="Task Complete: release_tasks",
)

View File

@@ -1,7 +1,7 @@
from itertools import islice
import random
import string
import re
from itertools import islice
import structlog
import tempfile
@@ -296,21 +296,3 @@ def parse_boostdep_artifact(content: str):
"Some library versions were skipped during artifact parsing.",
skipped_library_versions=skipped_library_versions,
)
def boost_normalize_words(frequencies, word_map):
    """Fold the counts of alias words into their replacement words.

    Args:
        frequencies: dict of word -> count. Modified in place.
        word_map: dict of from-word -> to-word.

    Returns:
        The same ``frequencies`` dict, with each from-word that had a
        non-zero count removed and its count added to the to-word.
    """
    for from_word, to_word in word_map.items():
        if from_word == to_word:
            # Merging a word into itself must be a no-op; previously it
            # doubled the count and then deleted the word entirely.
            continue
        from_count = frequencies.get(from_word, 0)
        if not from_count:
            continue
        frequencies[to_word] = frequencies.get(to_word, 0) + from_count
        del frequencies[from_word]
    return frequencies
def grey_color_func(
    word, font_size, position, orientation, random_state=None, **kwargs
):
    """Return a random grey ``hsl(...)`` colour string for a wordcloud word.

    Lightness is drawn from 10-80 (inclusive). When ``random_state`` is
    given it is used as the source of randomness, so a seeded recolor
    produces the same shades every time; otherwise the module-level
    ``random`` generator is used. The remaining arguments are accepted for
    the colour-function call signature and ignored.
    """
    rng = random if random_state is None else random_state
    return "hsl(0, 0%%, %d%%)" % rng.randint(10, 80)

0
reports/__init__.py Normal file
View File

32
reports/admin.py Normal file
View File

@@ -0,0 +1,32 @@
from django.contrib import admin
from reports.models import WebsiteStatReport, WebsiteStatItem
class StatInline(admin.StackedInline):
    """Read-only stacked inline showing the value of each WebsiteStatItem."""

    model = WebsiteStatItem
    # Only the value is shown, and it cannot be edited.
    fields = ("value",)
    readonly_fields = ("value",)
    # No blank extra forms and no delete checkbox.
    extra = 0
    can_delete = False
@admin.register(WebsiteStatReport)
class WebsiteStatReportAdmin(admin.ModelAdmin):
    """Admin for website stat reports, with headline stats in the list view."""

    inlines = (StatInline,)
    list_display = ("version", "pageviews", "unique_visitors", "period")
    ordering = ("-version",)

    # NOTE: prefetch_related("stats") would not help these columns, because
    # filtering the related manager always issues a fresh query.

    @staticmethod
    def _formatted_stat(obj, code_name):
        """Return the stat with ``code_name`` as a thousands-separated integer.

        Returns None when the report has no such stat (e.g. it has not been
        populated yet), which the admin renders as its empty-value
        placeholder instead of failing with DoesNotExist.
        """
        stat = obj.stats.filter(code_name=code_name).first()
        if stat is None:
            return None
        return f"{int(stat.value):,}"

    def pageviews(self, obj):
        """List column: total pageviews for the report."""
        return self._formatted_stat(obj, "pageviews")

    def unique_visitors(self, obj):
        """List column: unique visitors for the report."""
        return self._formatted_stat(obj, "visitors")
# Registers WebsiteStatItem with the admin site; the class body is empty, so
# no ModelAdmin options are overridden.
@admin.register(WebsiteStatItem)
class WebsiteStatItemAdmin(admin.ModelAdmin): ...

6
reports/apps.py Normal file
View File

@@ -0,0 +1,6 @@
from django.apps import AppConfig
class ReportsConfig(AppConfig):
    """Django application configuration for the ``reports`` app."""

    name = "reports"
    # Models without an explicit primary key get a BigAutoField.
    default_auto_field = "django.db.models.BigAutoField"

6
reports/constants.py Normal file
View File

@@ -0,0 +1,6 @@
# File name of the font used to render wordclouds.
WORDCLOUD_FONT = "notosans_mono.woff"

# Site whose traffic statistics are requested from the analytics API.
WEB_ANALYTICS_DOMAIN = "preview.boost.org"

# URL template: the two `{:%Y-%m-%d}` placeholders are filled later via
# str.format() with the start and end dates of the reporting period.
WEB_ANALYTICS_API_URL = (
    "https://plausible.io/api/stats/"
    + WEB_ANALYTICS_DOMAIN
    + "/top-stats/?period=custom"
    + "&from={:%Y-%m-%d}&to={:%Y-%m-%d}"
)

10
reports/forms.py Normal file
View File

@@ -0,0 +1,10 @@
from django import forms
from versions.models import Version
class ImportWebAnalyticsForm(forms.Form):
    """Form with a single dropdown for picking a Version.

    The choices come from ``Version.objects.get_dropdown_versions()``.
    """

    version = forms.ModelChoiceField(
        queryset=Version.objects.get_dropdown_versions(),
        widget=forms.Select(
            attrs={"class": "dropdown !mb-0 h-[38px]"},
        ),
    )

112
reports/generation.py Normal file
View File

@@ -0,0 +1,112 @@
import base64
import io
import random
import psycopg2
from django.conf import settings
from matplotlib import pyplot as plt
from wordcloud import WordCloud, STOPWORDS
from core.models import SiteSettings
from libraries.models import WordcloudMergeWord # TODO: move model to this app
from reports.constants import WORDCLOUD_FONT
from versions.models import Version
def generate_wordcloud(version: Version) -> tuple[str | None, list]:
    """Generate a wordcloud PNG from the mail content for ``version``.

    Words shorter than two characters are dropped, words are lower-cased,
    and counts are merged according to the ``WordcloudMergeWord`` table
    before rendering.

    Returns:
        Tuple of (base64_encoded_png_string, wordcloud_top_words), where
        ``wordcloud_top_words`` is the 200 most frequent words sorted
        alphabetically. Returns ``(None, [])`` when no words were collected.
    """
    wc = WordCloud(
        mode="RGBA",
        background_color=None,
        width=1400,
        height=700,
        stopwords=STOPWORDS | SiteSettings.load().wordcloud_ignore_set,
        font_path=settings.STATIC_ROOT / "font" / WORDCLOUD_FONT,
    )
    word_frequencies = {}
    for content in get_mail_content(version):
        for key, val in wc.process_text(content).items():
            if len(key) < 2:
                continue
            key_lower = key.lower()
            word_frequencies[key_lower] = word_frequencies.get(key_lower, 0) + val
    if not word_frequencies:
        return None, []

    word_frequencies = boost_normalize_words(
        word_frequencies,
        {x.from_word: x.to_word for x in WordcloudMergeWord.objects.all()},
    )

    # first sort by number, then sort the top 200 alphabetically
    word_frequencies = dict(
        sorted(word_frequencies.items(), key=lambda item: item[1], reverse=True)
    )
    wordcloud_top_words = sorted(list(word_frequencies)[:200])

    wc.generate_from_frequencies(word_frequencies)
    fig = plt.figure(figsize=(14, 7), facecolor=None)
    try:
        plt.imshow(
            wc.recolor(color_func=grey_color_func, random_state=3),
            interpolation="bilinear",
        )
        plt.axis("off")
        image_bytes = io.BytesIO()
        plt.savefig(
            image_bytes,
            format="png",
            dpi=100,
            bbox_inches="tight",
            pad_inches=0,
            transparent=True,
        )
    finally:
        # pyplot keeps every figure alive in a global registry until it is
        # closed explicitly; without this each call leaks a figure.
        plt.close(fig)
    return base64.b64encode(image_bytes.getvalue()).decode(), wordcloud_top_words
def boost_normalize_words(frequencies, word_map):
    """Merge word counts according to ``word_map`` (from-word -> to-word).

    ``frequencies`` (word -> count) is updated in place and also returned.
    For every from-word with a non-zero count, that count is added to the
    to-word and the from-word entry is removed.
    """
    for source, target in word_map.items():
        if source == target:
            # A word mapped onto itself needs no merging. Without this
            # guard its count was doubled and the entry then deleted.
            continue
        source_count = frequencies.get(source, 0)
        if not source_count:
            continue
        frequencies[target] = source_count + frequencies.get(target, 0)
        del frequencies[source]
    return frequencies
def grey_color_func(*args, **kwargs):
    """Return a random grey ``hsl(...)`` colour string for a wordcloud word.

    Lightness is drawn from 10-80 (inclusive). If a ``random_state`` keyword
    argument is supplied and not None, it is used as the source of
    randomness so that a seeded recolor is reproducible; otherwise the
    module-level ``random`` generator is used. All other arguments are
    ignored.
    """
    rng = kwargs.get("random_state")
    if rng is None:
        rng = random
    return "hsl(0, 0%%, %d%%)" % rng.randint(10, 80)
def get_mail_content(version: Version):
    """Yield the body of each mailing-list email sent before ``version``.

    The date window starts at the release date of the most recently
    released entry of ``Version.objects.minor_versions()`` whose
    ``version_array`` is lower than ``version``'s (inclusive) and ends at
    ``version.release_date`` (exclusive).

    Yields nothing when no such prior version exists or when
    ``settings.HYPERKITTY_DATABASE_NAME`` is falsy.
    """
    prior_version = (
        Version.objects.minor_versions()
        .filter(version_array__lt=version.cleaned_version_parts_int)
        .order_by("-release_date")
        .first()
    )
    if not prior_version or not settings.HYPERKITTY_DATABASE_NAME:
        # This function is a generator, so a returned value would never
        # reach the caller; a bare return simply ends the iteration.
        return
    conn = psycopg2.connect(settings.HYPERKITTY_DATABASE_URL)
    try:
        # A *named* cursor is server-side in psycopg2, so rows are streamed
        # instead of being loaded into memory all at once.
        with conn.cursor(name="fetch-mail-content") as cursor:
            cursor.execute(
                """
                SELECT content FROM hyperkitty_email
                WHERE date >= %(start)s AND date < %(end)s;
                """,
                {"start": prior_version.release_date, "end": version.release_date},
            )
            for [content] in cursor:
                yield content
    finally:
        # The connection was previously never closed; release it even when
        # the consumer stops iterating early or the query fails.
        conn.close()

View File

@@ -0,0 +1,99 @@
# Generated by Django 4.2.18 on 2025-02-26 19:02
import django.contrib.postgres.fields.ranges
from django.db import migrations, models
import django.db.models.deletion
import django_extensions.db.fields
# Initial schema migration: creates the WebsiteStatReport and WebsiteStatItem
# models and adds a uniqueness constraint on (report, code_name) for stat items.
class Migration(migrations.Migration):
initial = True
# Depends on the versions app migration below because WebsiteStatReport
# has a one-to-one relation to versions.version.
dependencies = [
("versions", "0018_version_financial_committee_members"),
]
operations = [
# One report per version, covering a date range ("period").
migrations.CreateModel(
name="WebsiteStatReport",
fields=[
(
"id",
models.BigAutoField(
auto_created=True,
primary_key=True,
serialize=False,
verbose_name="ID",
),
),
(
"created",
django_extensions.db.fields.CreationDateTimeField(
auto_now_add=True, verbose_name="created"
),
),
(
"modified",
django_extensions.db.fields.ModificationDateTimeField(
auto_now=True, verbose_name="modified"
),
),
("period", django.contrib.postgres.fields.ranges.DateRangeField()),
(
"version",
models.OneToOneField(
on_delete=django.db.models.deletion.CASCADE,
to="versions.version",
),
),
],
options={
"get_latest_by": "modified",
"abstract": False,
},
),
# Individual named metric values belonging to a report (related_name "stats").
migrations.CreateModel(
name="WebsiteStatItem",
fields=[
(
"id",
models.BigAutoField(
auto_created=True,
primary_key=True,
serialize=False,
verbose_name="ID",
),
),
(
"created",
django_extensions.db.fields.CreationDateTimeField(
auto_now_add=True, verbose_name="created"
),
),
(
"modified",
django_extensions.db.fields.ModificationDateTimeField(
auto_now=True, verbose_name="modified"
),
),
# NOTE(review): CharField without max_length - presumably relies on
# Django 4.2+ with a PostgreSQL backend; confirm before switching databases.
("name", models.CharField()),
("code_name", models.CharField()),
("value", models.FloatField()),
(
"report",
models.ForeignKey(
on_delete=django.db.models.deletion.CASCADE,
related_name="stats",
to="reports.websitestatreport",
),
),
],
),
# A report may hold at most one item per code_name.
migrations.AddConstraint(
model_name="websitestatitem",
constraint=models.UniqueConstraint(
fields=("report", "code_name"), name="unique_report_code_name"
),
),
]

View File

96
reports/models.py Normal file
View File

@@ -0,0 +1,96 @@
from datetime import timedelta
import requests
from django.contrib.postgres.fields import DateRangeField
from django.db import models
from django.db.backends.postgresql.psycopg_any import DateRange
from django_extensions.db.models import TimeStampedModel
from reports.constants import WEB_ANALYTICS_API_URL
from versions.models import Version
# Bounds flag passed to DateRange: "[]" makes both the lower and the upper
# bound of the range inclusive.
INCLUSIVE = "[]"
class WebsiteStatReport(TimeStampedModel):
    """Website traffic statistics for the period leading up to one Version.

    The individual metrics are stored as WebsiteStatItem rows, reachable
    through the ``stats`` related name.
    """

    version = models.OneToOneField(Version, on_delete=models.CASCADE)
    period = DateRangeField()

    def __str__(self):
        return f"Stat report for {self.version}"

    def save(self, *args, **kwargs):
        """Allow creation of reports while omitting period and/or version.

        A missing version defaults to ``Version.objects.most_recent()``. A
        missing period defaults to the inclusive range from the day after
        the previous non-beta release up to this version's release date.

        Raises:
            ValueError: if the period has to be derived but no earlier
                non-beta release exists.
        """
        if self.version_id is None:
            self.version = Version.objects.most_recent()
        if not self.period:
            previous_version = (
                Version.objects.filter(
                    beta=False, release_date__lt=self.version.release_date
                )
                .order_by("-release_date")
                .first()
            )
            if previous_version is None:
                # Previously this surfaced as an opaque AttributeError on None.
                raise ValueError(
                    f"Cannot derive a reporting period for {self.version}: "
                    "no earlier non-beta release exists"
                )
            start_date = previous_version.release_date + timedelta(days=1)
            self.period = DateRange(start_date, self.version.release_date, INCLUSIVE)
        # Forward positional arguments too, matching Model.save()'s signature.
        super().save(*args, **kwargs)

    @property
    def analytics_api_url(self) -> str:
        """Analytics API URL with this report's period bounds filled in."""
        return WEB_ANALYTICS_API_URL.format(self.period.lower, self.period.upper)

    def populate_from_api(self, timeout: float = 30):
        """Fetch stats from API and generate child WebsiteStatItem instances.

        Args:
            timeout: seconds to wait for the API before giving up; without a
                timeout ``requests`` can block indefinitely.

        Raises:
            ValueError: if the response has no ``top_stats`` entry.
        """
        # Local import: only this method needs transaction support.
        from django.db import transaction

        response = requests.get(self.analytics_api_url, timeout=timeout)
        data = response.json()
        if not data or "top_stats" not in data:
            raise ValueError(f"Invalid Plausible API response: {data}")

        # Build the new items first, so malformed data cannot leave the
        # report with its old items already deleted.
        stat_items = [
            WebsiteStatItem(
                report=self,
                name=stat_data["name"],
                value=stat_data["value"],
                code_name=stat_data["graph_metric"],
            )
            for stat_data in data["top_stats"]
        ]

        # Replace the existing items atomically: either the new set is
        # stored or the old set is kept.
        with transaction.atomic():
            WebsiteStatItem.objects.filter(report=self).delete()
            WebsiteStatItem.objects.bulk_create(stat_items)
class WebsiteStatItem(TimeStampedModel):
    """A single metric (e.g. unique visitors) belonging to a stat report."""

    report = models.ForeignKey(
        WebsiteStatReport, on_delete=models.CASCADE, related_name="stats"
    )
    name = models.CharField()
    code_name = models.CharField()
    value = models.FloatField()

    class Meta:
        # A report holds at most one item per code_name.
        constraints = [
            models.UniqueConstraint(
                fields=["report", "code_name"], name="unique_report_code_name"
            )
        ]

    def __str__(self):
        return f"{self.report.version} {self.name}"

    @property
    def formatted_value(self) -> str:
        """Display string for the value, chosen by ``code_name``."""
        metric = self.code_name
        if metric == "bounce_rate":
            return f"{self.value}%"
        if metric != "visit_duration":
            return str(self.value)
        # Visit duration: whole seconds split into minutes and seconds.
        whole_minutes, leftover_seconds = divmod(int(self.value), 60)
        return f"{whole_minutes}m {leftover_seconds}s"

View File

@@ -0,0 +1,16 @@
{% extends "base.html" %}
{% block content %}
<main class="content">
<div class="py-3 px-3 md:mt-3 md:px-0">
<h3 class="mb-4">Import Web Statistics</h3>
<form method="post">
{% csrf_token %}
{{ form }}
<div class="flex flex-row my-4">
<input class="py-2 px-3 text-sm text-white rounded bg-orange cursor-pointer" type="submit" value="Import">
</div>
</form>
</div>
</main>
{% endblock %}

1
reports/tests.py Normal file
View File

@@ -0,0 +1 @@
# Create your tests here.

1
reports/views.py Normal file
View File

@@ -0,0 +1 @@
# Create your views here.