mirror of
https://github.com/boostorg/website-v2.git
synced 2026-01-19 04:42:17 +00:00
Sync EmailData from hyperkitty. (#1366)
- This work is a prerequisite for #1290 and #1289 - Create an `EmailData` model which holds email counts per (CommitAuthor, Version). - Add management command `sync_mailinglist_stats` to query data from the hyperkitty database and build `EmailData` objects from it - EmailData.count is calculated between x.x.0 release_dates - Add button in EmailData admin to trigger task - Update the `merge_author` method to update EmailData with the Summed counts per version and delete EmailData which pointed to the `CommitAuthor` being merged from.
This commit is contained in:
@@ -141,6 +141,12 @@ Then as a superuser log into the admin interface, go to "Versions" and click on
|
||||
|
||||
---
|
||||
|
||||
## Syncing EmailData Locally
|
||||
|
||||
To work with mailing list data locally, the Django application expects to be
|
||||
able to query a copy of the hyperkitty database at HYPERKITTY_DATABASE_URL.
|
||||
Then, the `sync_mailinglist_stats` management command can be run.
|
||||
|
||||
## Deploying
|
||||
|
||||
TBD
|
||||
|
||||
@@ -277,6 +277,9 @@ CACHES = {
|
||||
# Default interval by which to clear the static content cache
|
||||
CLEAR_STATIC_CONTENT_CACHE_DAYS = 7
|
||||
|
||||
# Hyperkitty
|
||||
HYPERKITTY_DATABASE_URL = env("HYPERKITTY_DATABASE_URL", default="")
|
||||
|
||||
# Mailman API credentials
|
||||
MAILMAN_REST_API_URL = env("MAILMAN_REST_API_URL", default="http://localhost:8001")
|
||||
MAILMAN_REST_API_USER = env("MAILMAN_REST_API_USER", default="restadmin")
|
||||
|
||||
@@ -243,3 +243,20 @@ If both the `--release` and the `--library-name` are passed, the command will lo
|
||||
|----------------------|--------|--------------------------------------------------------------|
|
||||
| `--token` | string | Pass a GitHub API token. If not passed, will use the value in `settings.GITHUB_TOKEN`. |
|
||||
| `--delete-versions` | bool | If passed, all existing beta Version records will be deleted before the new beta release is imported. |
|
||||
|
||||
|
||||
## `sync_mailinglist_stats`
|
||||
|
||||
**Purpose**: Build EmailData objects from the hyperkitty email archive database.
|
||||
|
||||
**Example**
|
||||
|
||||
```bash
|
||||
./manage.py sync_mailinglist_stats
|
||||
```
|
||||
|
||||
**Options**
|
||||
|
||||
| Options | Format | Description |
|
||||
|----------------------|--------|--------------------------------------------------------------|
|
||||
| `--clean` | bool | If passed, all existing EmailData records will be deleted before running the sync. |
|
||||
|
||||
@@ -27,6 +27,7 @@ STATIC_CONTENT_REGION="us-east-2"
|
||||
STATIC_CONTENT_AWS_S3_ENDPOINT_URL="https://s3.us-east-2.amazonaws.com"
|
||||
|
||||
# Mailman database settings
|
||||
HYPERKITTY_DATABASE_URL="postgresql://postgres:postgres@db:5432/hyperkitty"
|
||||
DATABASE_URL="postgresql://postgres@db:5432/postgres"
|
||||
DATABASE_TYPE="postgres"
|
||||
DATABASE_CLASS="mailman.database.postgresql.PostgreSQLDatabase"
|
||||
|
||||
@@ -4,12 +4,14 @@ from urllib.parse import urlparse
|
||||
|
||||
from django.core.cache import caches
|
||||
from django.db import models, transaction
|
||||
from django.db.models import Sum
|
||||
from django.utils.functional import cached_property
|
||||
from django.utils.text import slugify
|
||||
|
||||
from core.markdown import process_md
|
||||
from core.models import RenderedContent
|
||||
from core.asciidoc import convert_adoc_to_html
|
||||
from mailing_list.models import EmailData
|
||||
|
||||
from .utils import generate_random_string, write_content_to_tempfile
|
||||
|
||||
@@ -53,8 +55,9 @@ class CommitAuthor(models.Model):
|
||||
"""
|
||||
if self.pk == other.pk:
|
||||
return
|
||||
Commit.objects.filter(author=other).update(author=self)
|
||||
other.commitauthoremail_set.update(author=self)
|
||||
other.commit_set.update(author=self)
|
||||
self.merge_author_email_data(other)
|
||||
if not self.avatar_url:
|
||||
self.avatar_url = other.avatar_url
|
||||
if not self.github_profile_url:
|
||||
@@ -62,6 +65,29 @@ class CommitAuthor(models.Model):
|
||||
self.save(update_fields=["avatar_url", "github_profile_url"])
|
||||
other.delete()
|
||||
|
||||
@transaction.atomic
def merge_author_email_data(self, other: Self):
    """Fold the EmailData rows of ``other`` into the rows of ``self``.

    For every version on which either author has an EmailData row, the
    summed ``count`` of both authors is written to the row belonging to
    ``self`` (the row is created when it does not exist yet). Afterwards
    every EmailData row that still points at ``other`` is deleted.
    """
    per_version_totals = (
        EmailData.objects.filter(author__in=[self, other])
        .values("version_id")
        .annotate(total_count=Sum("count"))
    )
    for entry in per_version_totals:
        version_id = entry["version_id"]
        merged_count = entry["total_count"]
        EmailData.objects.update_or_create(
            author=self,
            version_id=version_id,
            defaults={"count": merged_count},
        )
    # The combined totals now live on ``self``; drop the merged-from rows.
    EmailData.objects.filter(author=other).delete()
|
||||
|
||||
|
||||
class CommitAuthorEmail(models.Model):
|
||||
author = models.ForeignKey(CommitAuthor, on_delete=models.CASCADE)
|
||||
|
||||
@@ -1,6 +1,10 @@
|
||||
import datetime
|
||||
from django.db.models import Sum
|
||||
from model_bakery import baker
|
||||
|
||||
from libraries.models import CommitAuthor
|
||||
from mailing_list.models import EmailData
|
||||
|
||||
|
||||
def test_get_cpp_standard_minimum_display(library):
|
||||
library.cpp_standard_minimum = "11"
|
||||
@@ -121,3 +125,56 @@ def test_library_version_first_boost_version_property(library):
|
||||
version_3.save()
|
||||
del library.first_boost_version
|
||||
assert library.first_boost_version == version_3
|
||||
|
||||
|
||||
def test_merge_author_deletes_author():
    """Merging leaves only the surviving author, who then owns both emails."""
    survivor = baker.make("libraries.CommitAuthorEmail").author
    absorbed = baker.make("libraries.CommitAuthorEmail").author
    assert CommitAuthor.objects.count() == 2

    survivor.merge_author(absorbed)

    # ``get()`` raises unless exactly one CommitAuthor is left.
    only_author = CommitAuthor.objects.all().get()
    assert only_author == survivor
    assert survivor.commitauthoremail_set.count() == 2
|
||||
|
||||
|
||||
def test_merge_author_reassigns_commits():
    """Commits of the merged-from author are counted on the surviving author."""
    library_version = baker.make("libraries.LibraryVersion")
    survivor, absorbed, bystander = (
        baker.make("libraries.CommitAuthor") for _ in range(3)
    )
    # Ten commits for each of the three authors.
    for committer in (survivor, absorbed, bystander):
        baker.make(
            "libraries.Commit",
            author=committer,
            library_version=library_version,
            _quantity=10,
        )
    assert survivor.commit_set.count() == 10

    survivor.merge_author(absorbed)

    assert survivor.commit_set.count() == 20
|
||||
|
||||
|
||||
def test_merge_author_reassigns_emaildata():
    """EmailData counts move to the surviving author; the grand total is kept."""
    versions = [
        baker.make(
            "versions.Version", name=f"0.{i}.0", release_date=f"{2000 + i}-01-01"
        )
        for i in range(10)
    ]
    authors = baker.make("libraries.CommitAuthor", _quantity=10)
    # One EmailData row of count 10 for every (author, version) pair.
    for author in authors:
        for version in versions:
            baker.make(
                "mailing_list.EmailData", author=author, version=version, count=10
            )

    def grand_total():
        return EmailData.objects.all().aggregate(total=Sum("count"))["total"]

    def total_for(author):
        return sum(author.emaildata_set.all().values_list("count", flat=True))

    survivor, absorbed = authors[0], authors[1]
    assert grand_total() == 1000
    assert total_for(survivor) == 100

    survivor.merge_author(absorbed)

    # all of author[1]'s emaildata counts should go to author[0]
    assert total_for(survivor) == 200
    # total should stay the same
    assert grand_total() == 1000
|
||||
|
||||
@@ -1 +1,45 @@
|
||||
# Register your models here.
|
||||
from django.urls import path
|
||||
from django.http import HttpResponseRedirect
|
||||
from django.contrib import admin, messages
|
||||
from django.conf import settings
|
||||
|
||||
from mailing_list.models import EmailData
|
||||
from mailing_list.tasks import sync_mailinglist_stats
|
||||
|
||||
|
||||
@admin.register(EmailData)
class EmailDataAdmin(admin.ModelAdmin):
    """Admin for EmailData with an extra view that queues a stats sync."""

    change_list_template = "admin/mailinglist_change_list.html"
    list_display = ["author", "version", "count"]
    list_filter = ["version"]
    raw_id_fields = ["author"]
    search_fields = [
        "author__commitauthoremail__email",
        "author__name",
    ]

    def has_add_permission(self, request):
        """Always refuse the "add" permission for EmailData in the admin."""
        return False

    def get_urls(self):
        """Return the stock admin URLs preceded by the sync route."""
        inherited_urls = super().get_urls()
        sync_route = path(
            "sync_mailinglist_stats/",
            self.admin_site.admin_view(self.sync_mailinglist_stats),
            name="sync_mailinglist_stats",
        )
        return [sync_route, *inherited_urls]

    def sync_mailinglist_stats(self, request):
        """Queue the sync task (when configured) and redirect one level up."""
        if not settings.HYPERKITTY_DATABASE_URL:
            self.message_user(
                request,
                "HYPERKITTY_DATABASE_URL setting not configured.",
                level=messages.WARNING,
            )
        else:
            # Inside a method body this name resolves to the task imported at
            # module level, not to this method of the same name.
            sync_mailinglist_stats.delay()
            self.message_user(request, "Syncing EmailData.")
        return HttpResponseRedirect("../")
|
||||
|
||||
127
mailing_list/management/commands/sync_mailinglist_stats.py
Normal file
127
mailing_list/management/commands/sync_mailinglist_stats.py
Normal file
@@ -0,0 +1,127 @@
|
||||
from itertools import pairwise
|
||||
import djclick as click
|
||||
import psycopg2
|
||||
from psycopg2._psycopg import connection as Connection
|
||||
|
||||
from django.db.models.functions import Lower
|
||||
from django.conf import settings
|
||||
from django.db import transaction
|
||||
|
||||
from libraries.models import CommitAuthor, CommitAuthorEmail
|
||||
from mailing_list.models import EmailData
|
||||
from versions.models import Version
|
||||
|
||||
|
||||
@click.command()
@click.option(
    "--clean",
    is_flag=True,
    help="Delete all EmailData objects before importing.",
)
def command(clean):
    """Sync CommitAuthor and EmailData records from the hyperkitty database.

    Connects to the database at ``settings.HYPERKITTY_DATABASE_URL``. When
    that setting is empty a message is echoed and nothing is synced. The
    optional deletion and both import steps run inside a single
    ``transaction.atomic()`` block.

    Args:
        clean: When true, delete every existing EmailData object before
            importing.
    """
    if not settings.HYPERKITTY_DATABASE_URL:
        click.echo("HYPERKITTY_DATABASE_URL setting is empty. Not syncing.")
        return

    conn = psycopg2.connect(settings.HYPERKITTY_DATABASE_URL)
    try:
        with transaction.atomic():
            if clean:
                click.echo("Deleting all EmailData objects.")
                EmailData.objects.all().delete()
            click.echo("Creating CommitAuthors for emails.")
            create_commitauthors(conn)
            click.echo("Creating EmailData aggregated stats.")
            create_emaildata(conn)
    finally:
        # Close explicitly on success and on failure: using a psycopg2
        # connection as a context manager only ends the transaction, it does
        # not close the connection, so without this the connection is leaked.
        conn.close()
|
||||
|
||||
|
||||
def create_emaildata(conn: Connection):
    """Build EmailData rows from hyperkitty email counts per release window.

    Walks consecutive pairs of ``Version.objects.minor_versions()`` ordered
    by ``version_array``. For each pair, emails dated from the earlier
    release date (inclusive) up to the later release date (exclusive) are
    counted per lower-cased sender and stored against the later version.

    Raises:
        ValueError: if either version of a pair has no release date.
    """

    def store_counts(rows, version):
        # Look up the CommitAuthorEmail (and its author) for every sender.
        sender_emails = [row["email"] for row in rows]
        matches = (
            CommitAuthorEmail.objects.annotate(lower_email=Lower("email"))
            .filter(lower_email__in=sender_emails)
            .select_related("author")
        )
        author_emails = {match.lower_email: match for match in matches}

        # group EmailData by CommitAuthor
        per_author = {}
        for row in rows:
            author = author_emails[row["email"]].author
            entry = per_author.get(author)
            if entry is None:
                entry = EmailData(version=version, author=author, count=0)
                per_author[author] = entry
            entry.count += row["count"]

        # Existing (author, version) rows get their count overwritten.
        EmailData.objects.bulk_create(
            per_author.values(),
            update_conflicts=True,
            unique_fields=["author", "version"],
            update_fields=["count"],
        )

    columns = ["email", "name", "count"]
    minor_versions = Version.objects.minor_versions().order_by("version_array")
    for previous, current in pairwise(minor_versions):
        start, end = previous.release_date, current.release_date
        if not start or not end:
            raise ValueError("All x.x.0 versions must have a release date.")
        with conn.cursor(name=f"emaildata_sync_{current.name}") as cursor:
            cursor.execute(
                """
                SELECT
                    LOWER(sender_id) AS email
                    , (ARRAY_AGG(distinct(sender_name)))[1] as name
                    , count(*) AS count
                FROM hyperkitty_email
                WHERE date >= %(start)s AND date < %(end)s
                GROUP BY LOWER(sender_id);
                """,
                {"start": start, "end": end},
            )
            # The query selects exactly the three columns named above.
            rows = [dict(zip(columns, record)) for record in cursor]
            store_counts(rows, current)
|
||||
|
||||
|
||||
def create_commitauthors(conn: Connection):
    """Create CommitAuthor and CommitAuthorEmail objects for
    all emails in hyperkitty.

    Senders are read grouped by lower-cased ``sender_id``; a sender whose
    lower-cased email already matches a CommitAuthorEmail is left alone.
    """

    def create_missing(rows):
        by_email = {row["email"]: row for row in rows}
        known_emails = {
            match.lower_email
            for match in CommitAuthorEmail.objects.annotate(
                lower_email=Lower("email")
            ).filter(lower_email__in=by_email)
        }
        new_authors = []
        new_author_emails = []
        for email_lower, row in by_email.items():
            if email_lower in known_emails:
                continue
            author = CommitAuthor(name=row["name"])
            new_authors.append(author)
            new_author_emails.append(
                CommitAuthorEmail(email=row["email"], author=author)
            )
        # Authors are created first because the email rows reference them.
        CommitAuthor.objects.bulk_create(new_authors)
        CommitAuthorEmail.objects.bulk_create(new_author_emails)

    columns = ["email", "name"]
    # Uses a named cursor to use a serverside postgres cursor
    with conn.cursor(name="commitauthor_sync") as cursor:
        cursor.execute(
            """
            SELECT
                LOWER(sender_id) AS email
                , (ARRAY_AGG(distinct(sender_name)))[1] as name
            FROM hyperkitty_email
            GROUP BY LOWER(sender_id);
            """
        )
        pending = []
        for index, record in enumerate(cursor):
            pending.append(dict(zip(columns, record)))
            # Flush at indexes 2000, 4000, ... (index 0 never flushes).
            if index != 0 and index % 2000 == 0:
                create_missing(pending)
                pending = []
        if pending:
            create_missing(pending)
|
||||
54
mailing_list/migrations/0004_initial.py
Normal file
54
mailing_list/migrations/0004_initial.py
Normal file
@@ -0,0 +1,54 @@
|
||||
# Generated by Django 4.2.15 on 2024-10-16 21:34
|
||||
|
||||
from django.db import migrations, models
|
||||
import django.db.models.deletion
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
    """Create the EmailData table and its (author, version) unique constraint."""

    initial = True

    dependencies = [
        ("libraries", "0023_libraryversion_authors"),
        ("versions", "0011_version_full_release"),
        ("mailing_list", "0003_delete_mailinglistmessage"),
    ]

    operations = [
        migrations.CreateModel(
            name="EmailData",
            fields=[
                (
                    "id",
                    models.BigAutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                ("count", models.IntegerField()),
                (
                    "author",
                    models.ForeignKey(
                        on_delete=django.db.models.deletion.CASCADE,
                        to="libraries.commitauthor",
                    ),
                ),
                (
                    "version",
                    models.ForeignKey(
                        on_delete=django.db.models.deletion.CASCADE,
                        to="versions.version",
                    ),
                ),
            ],
        ),
        # At most one EmailData row per (author, version) pair.
        migrations.AddConstraint(
            model_name="emaildata",
            constraint=models.UniqueConstraint(
                fields=("author", "version"),
                name="mailing_list_emaildata_author_version_unique",
            ),
        ),
    ]
|
||||
@@ -0,0 +1,18 @@
|
||||
from django.db import models
|
||||
|
||||
|
||||
class EmailData(models.Model):
    """An email ``count`` stored for one CommitAuthor and one Version."""

    author = models.ForeignKey("libraries.CommitAuthor", on_delete=models.CASCADE)
    version = models.ForeignKey("versions.Version", on_delete=models.CASCADE)
    count = models.IntegerField()

    class Meta:
        # At most one row per (author, version) pair.
        constraints = [
            models.UniqueConstraint(
                name="%(app_label)s_%(class)s_author_version_unique",
                fields=["author", "version"],
            ),
        ]

    def __str__(self):
        author_name = self.author.name
        return author_name
|
||||
|
||||
18
mailing_list/tasks.py
Normal file
18
mailing_list/tasks.py
Normal file
@@ -0,0 +1,18 @@
|
||||
import structlog
|
||||
|
||||
from django.core.management import call_command
|
||||
from django.conf import settings
|
||||
|
||||
from config.celery import app
|
||||
|
||||
|
||||
logger = structlog.getLogger(__name__)
|
||||
|
||||
|
||||
@app.task
def sync_mailinglist_stats():
    """Run the ``sync_mailinglist_stats`` management command as a task.

    Logs a warning and does nothing else when
    ``settings.HYPERKITTY_DATABASE_URL`` is empty.
    """
    hyperkitty_url = settings.HYPERKITTY_DATABASE_URL
    if hyperkitty_url:
        call_command("sync_mailinglist_stats")
    else:
        logger.warning("HYPERKITTY_DATABASE_URL not set.")
|
||||
11
templates/admin/mailinglist_change_list.html
Normal file
11
templates/admin/mailinglist_change_list.html
Normal file
@@ -0,0 +1,11 @@
|
||||
{% extends "admin/change_list.html" %}
|
||||
{% load i18n admin_urls %}
|
||||
|
||||
{% block object-tools %}
|
||||
<ul class="object-tools">
|
||||
{% block object-tools-items %}
|
||||
{{ block.super }}
|
||||
<li><a href="{% url 'admin:sync_mailinglist_stats' %}" class="addlink">{% trans "Sync Mailing List Stats" %}</a></li>
|
||||
{% endblock %}
|
||||
</ul>
|
||||
{% endblock %}
|
||||
Reference in New Issue
Block a user