Sync EmailData from hyperkitty. (#1366)

- This work is a prerequisite for #1290 and #1289
- Create an `EmailData` model which holds email counts per
(CommitAuthor, Version).
- Add management command `sync_mailinglist_stats` to query data from the
hyperkitty database and build `EmailData` objects from it
- EmailData.count is calculated between x.x.0 release_dates
- Add button in EmailData admin to trigger task
- Update the `merge_author` method to update EmailData with the Summed
counts per version and delete EmailData which pointed to the
`CommitAuthor` being merged from.
This commit is contained in:
Brian Perrett
2024-10-21 09:39:09 -07:00
committed by GitHub
parent c0d453a58f
commit fdeb79b8e3
12 changed files with 384 additions and 2 deletions

View File

@@ -141,6 +141,12 @@ Then as a superuser log into the admin interface, go to "Versions" and click on
---
## Syncing EmailData Locally
To work with mailing list data locally, the Django application must be able to
query a copy of the HyperKitty database at the URL set in `HYPERKITTY_DATABASE_URL`.
Once that is configured, run the `sync_mailinglist_stats` management command.
## Deploying
TBD

View File

@@ -277,6 +277,9 @@ CACHES = {
# Default interval by which to clear the static content cache
CLEAR_STATIC_CONTENT_CACHE_DAYS = 7
# Hyperkitty mailing list archive database URL (syncing EmailData is skipped when empty)
HYPERKITTY_DATABASE_URL = env("HYPERKITTY_DATABASE_URL", default="")
# Mailman API credentials
MAILMAN_REST_API_URL = env("MAILMAN_REST_API_URL", default="http://localhost:8001")
MAILMAN_REST_API_USER = env("MAILMAN_REST_API_USER", default="restadmin")

View File

@@ -243,3 +243,20 @@ If both the `--release` and the `--library-name` are passed, the command will lo
|----------------------|--------|--------------------------------------------------------------|
| `--token` | string | Pass a GitHub API token. If not passed, will use the value in `settings.GITHUB_TOKEN`. |
| `--delete-versions` | bool | If passed, all existing beta Version records will be deleted before the new beta release is imported. |
## `sync_mailinglist_stats`
**Purpose**: Build EmailData objects from the hyperkitty email archive database.
**Example**
```bash
./manage.py sync_mailinglist_stats
```
**Options**
| Options | Format | Description |
|----------------------|--------|--------------------------------------------------------------|
| `--clean` | bool | If passed, all existing EmailData records will be deleted before running the sync. |

View File

@@ -27,6 +27,7 @@ STATIC_CONTENT_REGION="us-east-2"
STATIC_CONTENT_AWS_S3_ENDPOINT_URL="https://s3.us-east-2.amazonaws.com"
# Mailman database settings
HYPERKITTY_DATABASE_URL="postgresql://postgres:postgres@db:5432/hyperkitty"
DATABASE_URL="postgresql://postgres@db:5432/postgres"
DATABASE_TYPE="postgres"
DATABASE_CLASS="mailman.database.postgresql.PostgreSQLDatabase"

View File

@@ -4,12 +4,14 @@ from urllib.parse import urlparse
from django.core.cache import caches
from django.db import models, transaction
from django.db.models import Sum
from django.utils.functional import cached_property
from django.utils.text import slugify
from core.markdown import process_md
from core.models import RenderedContent
from core.asciidoc import convert_adoc_to_html
from mailing_list.models import EmailData
from .utils import generate_random_string, write_content_to_tempfile
@@ -53,8 +55,9 @@ class CommitAuthor(models.Model):
"""
if self.pk == other.pk:
return
Commit.objects.filter(author=other).update(author=self)
other.commitauthoremail_set.update(author=self)
other.commit_set.update(author=self)
self.merge_author_email_data(other)
if not self.avatar_url:
self.avatar_url = other.avatar_url
if not self.github_profile_url:
@@ -62,6 +65,29 @@ class CommitAuthor(models.Model):
self.save(update_fields=["avatar_url", "github_profile_url"])
other.delete()
@transaction.atomic
def merge_author_email_data(self, other: Self):
    """Fold the EmailData of `other` into the EmailData of `self`.

    For every version that either author has EmailData for, the counts of
    both authors are summed and stored on an EmailData row owned by `self`
    (created when it does not exist yet). Afterwards every EmailData row
    owned by `other` is deleted. The whole operation runs in one transaction.
    """
    combined = EmailData.objects.filter(author__in=[self, other])
    # One result per version, carrying the summed count of both authors.
    per_version = combined.values("version_id").annotate(total_count=Sum("count"))

    for totals in per_version:
        EmailData.objects.update_or_create(
            author=self,
            version_id=totals["version_id"],
            defaults={"count": totals["total_count"]},
        )

    EmailData.objects.filter(author=other).delete()
class CommitAuthorEmail(models.Model):
author = models.ForeignKey(CommitAuthor, on_delete=models.CASCADE)

View File

@@ -1,6 +1,10 @@
import datetime
from django.db.models import Sum
from model_bakery import baker
from libraries.models import CommitAuthor
from mailing_list.models import EmailData
def test_get_cpp_standard_minimum_display(library):
library.cpp_standard_minimum = "11"
@@ -121,3 +125,56 @@ def test_library_version_first_boost_version_property(library):
version_3.save()
del library.first_boost_version
assert library.first_boost_version == version_3
def test_merge_author_deletes_author():
    """Merging leaves only the surviving author, who owns both emails."""
    kept_email = baker.make("libraries.CommitAuthorEmail")
    merged_email = baker.make("libraries.CommitAuthorEmail")
    kept, merged = kept_email.author, merged_email.author
    assert CommitAuthor.objects.count() == 2

    kept.merge_author(merged)

    assert CommitAuthor.objects.all().get() == kept
    assert kept.commitauthoremail_set.count() == 2
def test_merge_author_reassigns_commits():
    """Commits of the merged-from author end up on the surviving author."""
    library_version = baker.make("libraries.LibraryVersion")
    keeper = baker.make("libraries.CommitAuthor")
    donor = baker.make("libraries.CommitAuthor")
    bystander = baker.make("libraries.CommitAuthor")
    # Ten commits each, including an author that is not part of the merge.
    for owner in (keeper, donor, bystander):
        baker.make(
            "libraries.Commit",
            author=owner,
            library_version=library_version,
            _quantity=10,
        )
    assert keeper.commit_set.count() == 10

    keeper.merge_author(donor)

    assert keeper.commit_set.count() == 20
def test_merge_author_reassigns_emaildata():
    """EmailData counts of the merged-from author move to the survivor."""
    versions = [
        baker.make(
            "versions.Version", name=f"0.{i}.0", release_date=f"{2000 + i}-01-01"
        )
        for i in range(10)
    ]
    authors = baker.make("libraries.CommitAuthor", _quantity=10)
    # 10 authors x 10 versions x count=10 -> 1000 emails overall.
    for author in authors:
        for version in versions:
            baker.make(
                "mailing_list.EmailData", author=author, version=version, count=10
            )

    def grand_total():
        return EmailData.objects.all().aggregate(total=Sum("count"))["total"]

    def author_total(author):
        return sum(author.emaildata_set.all().values_list("count", flat=True))

    keeper, donor = authors[0], authors[1]
    assert grand_total() == 1000
    assert author_total(keeper) == 100

    keeper.merge_author(donor)

    # all of the donor's emaildata counts should go to the keeper
    assert author_total(keeper) == 200
    # total should stay the same
    assert grand_total() == 1000

View File

@@ -1 +1,45 @@
# Register your models here.
from django.urls import path
from django.http import HttpResponseRedirect
from django.contrib import admin, messages
from django.conf import settings
from mailing_list.models import EmailData
from mailing_list.tasks import sync_mailinglist_stats
@admin.register(EmailData)
class EmailDataAdmin(admin.ModelAdmin):
    """Admin for EmailData with an extra view that queues a stats sync."""

    list_display = ["author", "version", "count"]
    search_fields = [
        "author__commitauthoremail__email",
        "author__name",
    ]
    raw_id_fields = ["author"]
    list_filter = ["version"]
    change_list_template = "admin/mailinglist_change_list.html"

    def get_urls(self):
        """Return the admin URLs with the sync view placed ahead of the defaults."""
        sync_url = path(
            "sync_mailinglist_stats/",
            self.admin_site.admin_view(self.sync_mailinglist_stats),
            name="sync_mailinglist_stats",
        )
        return [sync_url] + super().get_urls()

    def sync_mailinglist_stats(self, request):
        """Queue the sync task if hyperkitty is configured, then redirect back.

        Inside this method the bare name `sync_mailinglist_stats` resolves to
        the task imported at module level, not to this method.
        """
        if not settings.HYPERKITTY_DATABASE_URL:
            self.message_user(
                request,
                "HYPERKITTY_DATABASE_URL setting not configured.",
                level=messages.WARNING,
            )
        else:
            sync_mailinglist_stats.delay()
            self.message_user(request, "Syncing EmailData.")
        return HttpResponseRedirect("../")

    def has_add_permission(self, request):
        """Always deny adding EmailData through the admin."""
        return False

View File

@@ -0,0 +1,127 @@
from itertools import pairwise
import djclick as click
import psycopg2
from psycopg2._psycopg import connection as Connection
from django.db.models.functions import Lower
from django.conf import settings
from django.db import transaction
from libraries.models import CommitAuthor, CommitAuthorEmail
from mailing_list.models import EmailData
from versions.models import Version
@click.command()
@click.option(
    "--clean",
    is_flag=True,
    help="Delete all EmailData objects before importing.",
)
def command(clean):
    """Build EmailData objects from the hyperkitty email archive database.

    Does nothing (other than printing a notice) when
    `settings.HYPERKITTY_DATABASE_URL` is empty. Otherwise it connects to the
    hyperkitty database, optionally deletes all existing EmailData, creates
    missing CommitAuthor/CommitAuthorEmail rows and then rebuilds the
    per-version counts. All Django-side writes happen in one transaction.
    """
    if not settings.HYPERKITTY_DATABASE_URL:
        click.echo("HYPERKITTY_DATABASE_URL setting is empty. Not syncing.")
        return

    conn = psycopg2.connect(settings.HYPERKITTY_DATABASE_URL)
    try:
        with transaction.atomic():
            if clean:
                click.echo("Deleting all EmailData objects.")
                EmailData.objects.all().delete()

            click.echo("Creating CommitAuthors for emails.")
            create_commitauthors(conn)
            click.echo("Creating EmailData aggregated stats.")
            create_emaildata(conn)
    finally:
        # The hyperkitty connection is only read from; always release it,
        # including when the sync raises, so it is not leaked.
        conn.close()
def create_emaildata(conn: Connection):
    """Create or update EmailData counts from the hyperkitty database.

    Walks consecutive pairs of minor versions (ordered by `version_array`).
    For each pair it counts hyperkitty emails per lowercased sender dated from
    the earlier release date (inclusive) up to the later one (exclusive), and
    stores the totals against the later version.

    Raises:
        ValueError: if either version of a pair has no release date.
        KeyError: if a sender email has no matching CommitAuthorEmail.
    """

    def bulk_create(rows, version):
        wanted_emails = [row["email"] for row in rows]
        matches = (
            CommitAuthorEmail.objects.annotate(lower_email=Lower("email"))
            .filter(lower_email__in=wanted_emails)
            .select_related("author")
        )
        author_emails = {match.lower_email: match for match in matches}

        # group EmailData by CommitAuthor: several emails may share an author
        authors = {}
        for row in rows:
            author = author_emails[row["email"]].author
            entry = authors.get(author)
            if entry is None:
                entry = EmailData(version=version, author=author, count=0)
                authors[author] = entry
            entry.count += row["count"]

        # Upsert on (author, version): existing rows get their count replaced.
        EmailData.objects.bulk_create(
            authors.values(),
            update_conflicts=True,
            unique_fields=["author", "version"],
            update_fields=["count"],
        )

    versions = Version.objects.minor_versions().order_by("version_array")
    columns = ["email", "name", "count"]
    for previous, current in pairwise(versions):
        start, end = previous.release_date, current.release_date
        if not start or not end:
            raise ValueError("All x.x.0 versions must have a release date.")
        # A named cursor makes psycopg2 use a server-side cursor.
        with conn.cursor(name=f"emaildata_sync_{current.name}") as cursor:
            cursor.execute(
                """
                SELECT
                LOWER(sender_id) AS email
                , (ARRAY_AGG(distinct(sender_name)))[1] as name
                , count(*) AS count
                FROM hyperkitty_email
                WHERE date >= %(start)s AND date < %(end)s
                GROUP BY LOWER(sender_id);
                """,
                {"start": start, "end": end},
            )
            rows = [dict(zip(columns, data)) for data in cursor]
        bulk_create(rows, current)
def create_commitauthors(conn: Connection, batch_size: int = 2000):
    """Create CommitAuthor and CommitAuthorEmail objects for
    all emails in hyperkitty.

    Streams one row per lowercased sender email from `hyperkitty_email` and,
    in batches, creates a CommitAuthor plus CommitAuthorEmail for every email
    that does not already match an existing CommitAuthorEmail
    (case-insensitively).

    Args:
        conn: Open psycopg2 connection to the hyperkitty database.
        batch_size: Number of hyperkitty rows handled per batch.
    """

    def bulk_create(rows):
        emails = {x["email"]: x for x in rows}
        # Lowercased emails from this batch that already have a CommitAuthorEmail.
        known_emails = {
            x.lower_email
            for x in CommitAuthorEmail.objects.annotate(
                lower_email=Lower("email")
            ).filter(lower_email__in=emails)
        }
        authors_to_create = []
        author_emails_to_create = []
        for email_lower, row in emails.items():
            if email_lower in known_emails:
                continue
            # NOTE(review): row["name"] comes from hyperkitty's sender_name and
            # may be NULL; confirm CommitAuthor.name accepts that.
            new_author = CommitAuthor(name=row["name"])
            authors_to_create.append(new_author)
            author_emails_to_create.append(
                CommitAuthorEmail(email=row["email"], author=new_author)
            )
        # Authors first, so the email rows reference saved authors.
        CommitAuthor.objects.bulk_create(authors_to_create)
        CommitAuthorEmail.objects.bulk_create(author_emails_to_create)

    columns = ["email", "name"]
    # Uses a named cursor to use a serverside postgres cursor
    with conn.cursor(name="commitauthor_sync") as cursor:
        cursor.execute(
            """
            SELECT
            LOWER(sender_id) AS email
            , (ARRAY_AGG(distinct(sender_name)))[1] as name
            FROM hyperkitty_email
            GROUP BY LOWER(sender_id);
            """
        )
        rows = []
        for data in cursor:
            rows.append(dict(zip(columns, data)))
            # Flush on length so every batch, including the first, holds
            # exactly `batch_size` rows.
            if len(rows) >= batch_size:
                bulk_create(rows)
                rows = []
        if rows:
            bulk_create(rows)

View File

@@ -0,0 +1,54 @@
# Generated by Django 4.2.15 on 2024-10-16 21:34
from django.db import migrations, models
import django.db.models.deletion
class Migration(migrations.Migration):
    """Create the EmailData table and its (author, version) unique constraint."""

    initial = True
    # Depends on the latest migrations of the apps whose models are referenced.
    dependencies = [
        ("libraries", "0023_libraryversion_authors"),
        ("versions", "0011_version_full_release"),
        ("mailing_list", "0003_delete_mailinglistmessage"),
    ]
    operations = [
        migrations.CreateModel(
            name="EmailData",
            fields=[
                (
                    "id",
                    models.BigAutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                ("count", models.IntegerField()),
                (
                    "author",
                    # Rows are removed together with their CommitAuthor.
                    models.ForeignKey(
                        on_delete=django.db.models.deletion.CASCADE,
                        to="libraries.commitauthor",
                    ),
                ),
                (
                    "version",
                    # Rows are removed together with their Version.
                    models.ForeignKey(
                        on_delete=django.db.models.deletion.CASCADE,
                        to="versions.version",
                    ),
                ),
            ],
        ),
        # At most one EmailData row per (author, version) pair.
        migrations.AddConstraint(
            model_name="emaildata",
            constraint=models.UniqueConstraint(
                fields=("author", "version"),
                name="mailing_list_emaildata_author_version_unique",
            ),
        ),
    ]

View File

@@ -0,0 +1,18 @@
from django.db import models
class EmailData(models.Model):
    """Email count for one (CommitAuthor, Version) pair."""

    # Deleting the author or the version deletes the row as well.
    author = models.ForeignKey("libraries.CommitAuthor", on_delete=models.CASCADE)
    version = models.ForeignKey("versions.Version", on_delete=models.CASCADE)
    # Number of emails attributed to `author` for `version`.
    count = models.IntegerField()

    class Meta:
        # Only one row may exist per (author, version) pair.
        constraints = [
            models.UniqueConstraint(
                fields=["author", "version"],
                name="%(app_label)s_%(class)s_author_version_unique",
            ),
        ]

    def __str__(self):
        # Shown as the author's name.
        return self.author.name

18
mailing_list/tasks.py Normal file
View File

@@ -0,0 +1,18 @@
import structlog
from django.core.management import call_command
from django.conf import settings
from config.celery import app
logger = structlog.getLogger(__name__)
@app.task
def sync_mailinglist_stats():
    """Run the `sync_mailinglist_stats` management command as a task.

    Logs a warning and does nothing when `HYPERKITTY_DATABASE_URL` is unset.
    """
    if settings.HYPERKITTY_DATABASE_URL:
        call_command("sync_mailinglist_stats")
    else:
        logger.warning("HYPERKITTY_DATABASE_URL not set.")

View File

@@ -0,0 +1,11 @@
{% extends "admin/change_list.html" %}
{% load i18n admin_urls %}
{% block object-tools %}
<ul class="object-tools">
{% block object-tools-items %}
{{ block.super }}
<li><a href="{% url 'admin:sync_mailinglist_stats' %}" class="addlink">{% trans "Sync Mailing List Stats" %}</a></li>
{% endblock %}
</ul>
{% endblock %}