diff --git a/README.md b/README.md index 7cdf5be8..59bbefac 100644 --- a/README.md +++ b/README.md @@ -141,6 +141,12 @@ Then as a superuser log into the admin interface, go to "Versions" and click on --- +## Syncing EmailData Locally + +To work with mailing list data locally, the Django application expects to be +able to query a copy of the HyperKitty database at HYPERKITTY_DATABASE_URL. +Then run the `sync_mailinglist_stats` management command (`./manage.py sync_mailinglist_stats`). + ## Deploying TDB diff --git a/config/settings.py b/config/settings.py index 8a44d5da..037a6203 100755 --- a/config/settings.py +++ b/config/settings.py @@ -277,6 +277,9 @@ CACHES = { # Default interval by which to clear the static content cache CLEAR_STATIC_CONTENT_CACHE_DAYS = 7 +# HyperKitty +HYPERKITTY_DATABASE_URL = env("HYPERKITTY_DATABASE_URL", default="") + # Mailman API credentials MAILMAN_REST_API_URL = env("MAILMAN_REST_API_URL", default="http://localhost:8001") MAILMAN_REST_API_USER = env("MAILMAN_REST_API_USER", default="restadmin") diff --git a/docs/commands.md b/docs/commands.md index 9391a73b..11759938 100644 --- a/docs/commands.md +++ b/docs/commands.md @@ -243,3 +243,20 @@ If both the `--release` and the `--library-name` are passed, the command will lo |----------------------|--------|--------------------------------------------------------------| | `--token` | string | Pass a GitHub API token. If not passed, will use the value in `settings.GITHUB_TOKEN`. | | `--delete-versions` | bool | If passed, all existing beta Version records will be deleted before the new beta release is imported. | + + +## `sync_mailinglist_stats` + +**Purpose**: Build EmailData objects from the HyperKitty email archive database. 
+ +**Example** + +```bash +./manage.py sync_mailinglist_stats +``` + +**Options** + +| Options | Format | Description | +|----------------------|--------|--------------------------------------------------------------| +| `--clean` | bool | If passed, all existing EmailData records will be deleted before running the sync. | diff --git a/env.template b/env.template index 4d2db2d3..800cfaa6 100644 --- a/env.template +++ b/env.template @@ -27,6 +27,7 @@ STATIC_CONTENT_REGION="us-east-2" STATIC_CONTENT_AWS_S3_ENDPOINT_URL="https://s3.us-east-2.amazonaws.com" # Mailman database settings +HYPERKITTY_DATABASE_URL="postgresql://postgres:postgres@db:5432/hyperkitty" DATABASE_URL="postgresql://postgres@db:5432/postgres" DATABASE_TYPE="postgres" DATABASE_CLASS="mailman.database.postgresql.PostgreSQLDatabase" diff --git a/libraries/models.py b/libraries/models.py index a38281f0..35d92fad 100644 --- a/libraries/models.py +++ b/libraries/models.py @@ -4,12 +4,14 @@ from urllib.parse import urlparse from django.core.cache import caches from django.db import models, transaction +from django.db.models import Sum from django.utils.functional import cached_property from django.utils.text import slugify from core.markdown import process_md from core.models import RenderedContent from core.asciidoc import convert_adoc_to_html +from mailing_list.models import EmailData from .utils import generate_random_string, write_content_to_tempfile @@ -53,8 +55,9 @@ class CommitAuthor(models.Model): """ if self.pk == other.pk: return - Commit.objects.filter(author=other).update(author=self) other.commitauthoremail_set.update(author=self) + other.commit_set.update(author=self) + self.merge_author_email_data(other) if not self.avatar_url: self.avatar_url = other.avatar_url if not self.github_profile_url: @@ -62,6 +65,29 @@ class CommitAuthor(models.Model): self.save(update_fields=["avatar_url", "github_profile_url"]) other.delete() + @transaction.atomic + def merge_author_email_data(self, other: 
Self): + """Merge EmailData for the 2 authors. + + - Update or create EmailData with author=self with the total counts for + both `self` and `other` authors for each version. + - Delete all EmailData objects for the `other` author. + + """ + count_totals = ( + EmailData.objects.filter(author__in=[self, other]) + .values("version_id") + .annotate(total_count=Sum("count")) + ) + + for item in count_totals: + EmailData.objects.update_or_create( + author=self, + version_id=item["version_id"], + defaults={"count": item["total_count"]}, + ) + EmailData.objects.filter(author=other).delete() + class CommitAuthorEmail(models.Model): author = models.ForeignKey(CommitAuthor, on_delete=models.CASCADE) diff --git a/libraries/tests/test_models.py b/libraries/tests/test_models.py index ba181df6..245f3137 100644 --- a/libraries/tests/test_models.py +++ b/libraries/tests/test_models.py @@ -1,6 +1,10 @@ import datetime +from django.db.models import Sum from model_bakery import baker +from libraries.models import CommitAuthor +from mailing_list.models import EmailData + def test_get_cpp_standard_minimum_display(library): library.cpp_standard_minimum = "11" @@ -121,3 +125,56 @@ def test_library_version_first_boost_version_property(library): version_3.save() del library.first_boost_version assert library.first_boost_version == version_3 + + +def test_merge_author_deletes_author(): + author_1_email = baker.make("libraries.CommitAuthorEmail") + author_1 = author_1_email.author + author_2_email = baker.make("libraries.CommitAuthorEmail") + author_2 = author_2_email.author + + assert CommitAuthor.objects.count() == 2 + author_1.merge_author(author_2) + assert CommitAuthor.objects.all().get() == author_1 + assert author_1.commitauthoremail_set.count() == 2 + + +def test_merge_author_reassigns_commits(): + lv = baker.make("libraries.LibraryVersion") + author_1 = baker.make("libraries.CommitAuthor") + author_2 = baker.make("libraries.CommitAuthor") + author_3 = 
baker.make("libraries.CommitAuthor") + + for author in [author_1, author_2, author_3]: + baker.make("libraries.Commit", author=author, library_version=lv, _quantity=10) + + assert author_1.commit_set.count() == 10 + author_1.merge_author(author_2) + assert author_1.commit_set.count() == 20 + + +def test_merge_author_reassigns_emaildata(): + versions = [] + for i in range(10): + versions.append( + baker.make( + "versions.Version", name=f"0.{i}.0", release_date=f"{2000 + i}-01-01" + ) + ) + + authors = baker.make("libraries.CommitAuthor", _quantity=10) + for author in authors: + for version in versions: + baker.make( + "mailing_list.EmailData", author=author, version=version, count=10 + ) + + assert EmailData.objects.all().aggregate(total=Sum("count"))["total"] == 1000 + assert sum(authors[0].emaildata_set.all().values_list("count", flat=True)) == 100 + + authors[0].merge_author(authors[1]) + + # all of author[1]'s emaildata counts should go to author[0] + assert sum(authors[0].emaildata_set.all().values_list("count", flat=True)) == 200 + # total should stay the same + assert EmailData.objects.all().aggregate(total=Sum("count"))["total"] == 1000 diff --git a/mailing_list/admin.py b/mailing_list/admin.py index 846f6b40..551cd296 100644 --- a/mailing_list/admin.py +++ b/mailing_list/admin.py @@ -1 +1,45 @@ -# Register your models here. 
+from django.urls import path +from django.http import HttpResponseRedirect +from django.contrib import admin, messages +from django.conf import settings + +from mailing_list.models import EmailData +from mailing_list.tasks import sync_mailinglist_stats + + +@admin.register(EmailData) +class EmailDataAdmin(admin.ModelAdmin): + list_display = ["author", "version", "count"] + search_fields = [ + "author__commitauthoremail__email", + "author__name", + ] + raw_id_fields = ["author"] + list_filter = ["version"] + change_list_template = "admin/mailinglist_change_list.html" + + def get_urls(self): + urls = super().get_urls() + my_urls = [ + path( + "sync_mailinglist_stats/", + self.admin_site.admin_view(self.sync_mailinglist_stats), + name="sync_mailinglist_stats", + ), + ] + return my_urls + urls + + def sync_mailinglist_stats(self, request): + if settings.HYPERKITTY_DATABASE_URL: + sync_mailinglist_stats.delay() + self.message_user(request, "Syncing EmailData.") + else: + self.message_user( + request, + "HYPERKITTY_DATABASE_URL setting not configured.", + level=messages.WARNING, + ) + return HttpResponseRedirect("../") + + def has_add_permission(self, request): + return False diff --git a/mailing_list/management/commands/sync_mailinglist_stats.py b/mailing_list/management/commands/sync_mailinglist_stats.py new file mode 100644 index 00000000..812baa13 --- /dev/null +++ b/mailing_list/management/commands/sync_mailinglist_stats.py @@ -0,0 +1,127 @@ +from itertools import pairwise +import djclick as click +import psycopg2 +from psycopg2._psycopg import connection as Connection + +from django.db.models.functions import Lower +from django.conf import settings +from django.db import transaction + +from libraries.models import CommitAuthor, CommitAuthorEmail +from mailing_list.models import EmailData +from versions.models import Version + + +@click.command() +@click.option( + "--clean", + is_flag=True, + help="Delete all EmailData objects before importing.", +) +def 
command(clean): + if not settings.HYPERKITTY_DATABASE_URL: + click.echo("HYPERKITTY_DATABASE_URL setting is empty. Not syncing.") + return + conn = psycopg2.connect(settings.HYPERKITTY_DATABASE_URL) + with transaction.atomic(): + if clean: + click.echo("Deleting all EmailData objects.") + EmailData.objects.all().delete() + click.echo("Creating CommitAuthors for emails.") + create_commitauthors(conn) + click.echo("Creating EmailData aggregated stats.") + create_emaildata(conn) + + +def create_emaildata(conn: Connection): + def bulk_create(rows, version): + author_emails = { + x.lower_email: x + for x in CommitAuthorEmail.objects.annotate(lower_email=Lower("email")) + .filter(lower_email__in=[x["email"] for x in rows]) + .select_related("author") + } + # group EmailData by CommitAuthor + authors = {} + for row in rows: + author = author_emails[row["email"]].author + if author not in authors: + authors[author] = EmailData(version=version, author=author, count=0) + authors[author].count += row["count"] + EmailData.objects.bulk_create( + authors.values(), + update_conflicts=True, + unique_fields=["author", "version"], + update_fields=["count"], + ) + + versions = Version.objects.minor_versions().order_by("version_array") + columns = ["email", "name", "count"] + for a, b in pairwise(versions): + start = a.release_date + end = b.release_date + if not (start and end): + raise ValueError("All x.x.0 versions must have a release date.") + with conn.cursor(name=f"emaildata_sync_{b.name}") as cursor: + cursor.execute( + """ + SELECT + LOWER(sender_id) AS email + , (ARRAY_AGG(distinct(sender_name)))[1] as name + , count(*) AS count + FROM hyperkitty_email + WHERE date >= %(start)s AND date < %(end)s + GROUP BY LOWER(sender_id); + """, + {"start": start, "end": end}, + ) + rows = [{x: data[i] for i, x in enumerate(columns)} for data in cursor] + bulk_create(rows, b) + + +def create_commitauthors(conn: Connection): + """Create CommitAuthor and CommitAuthorEmail objects for + all 
emails in hyperkitty. + """ + + def bulk_create(rows): + emails = {x["email"]: x for x in rows} + commitauthoremails = { + x.lower_email: x.author_id + for x in CommitAuthorEmail.objects.annotate( + lower_email=Lower("email") + ).filter(lower_email__in=emails) + } + authors_to_create = [] + author_emails_to_create = [] + for email_lower, row in emails.items(): + if email_lower not in commitauthoremails: + new_author = CommitAuthor(name=row["name"]) + authors_to_create.append(new_author) + author_emails_to_create.append( + CommitAuthorEmail(email=row["email"], author=new_author) + ) + CommitAuthor.objects.bulk_create(authors_to_create) + CommitAuthorEmail.objects.bulk_create(author_emails_to_create) + + columns = ["email", "name"] + # Uses a named cursor to use a serverside postgres cursor + with conn.cursor(name="commitauthor_sync") as cursor: + cursor.execute( + """ + SELECT + LOWER(sender_id) AS email + , (ARRAY_AGG(distinct(sender_name)))[1] as name + FROM hyperkitty_email + GROUP BY LOWER(sender_id); + """ + ) + rows = [] + for i, data in enumerate(cursor): + row = {x: data[j] for j, x in enumerate(columns)} + rows.append(row) + if i % 2000 == 0 and i != 0: + bulk_create(rows) + rows = [] + if rows: + bulk_create(rows) diff --git a/mailing_list/migrations/0004_initial.py b/mailing_list/migrations/0004_initial.py new file mode 100644 index 00000000..e1d70ef4 --- /dev/null +++ b/mailing_list/migrations/0004_initial.py @@ -0,0 +1,54 @@ +# Generated by Django 4.2.15 on 2024-10-16 21:34 + +from django.db import migrations, models +import django.db.models.deletion + + +class Migration(migrations.Migration): + + initial = True + + dependencies = [ + ("libraries", "0023_libraryversion_authors"), + ("versions", "0011_version_full_release"), + ("mailing_list", "0003_delete_mailinglistmessage"), + ] + + operations = [ + migrations.CreateModel( + name="EmailData", + fields=[ + ( + "id", + models.BigAutoField( + auto_created=True, + primary_key=True, + serialize=False, + 
verbose_name="ID", + ), + ), + ("count", models.IntegerField()), + ( + "author", + models.ForeignKey( + on_delete=django.db.models.deletion.CASCADE, + to="libraries.commitauthor", + ), + ), + ( + "version", + models.ForeignKey( + on_delete=django.db.models.deletion.CASCADE, + to="versions.version", + ), + ), + ], + ), + migrations.AddConstraint( + model_name="emaildata", + constraint=models.UniqueConstraint( + fields=("author", "version"), + name="mailing_list_emaildata_author_version_unique", + ), + ), + ] diff --git a/mailing_list/models.py b/mailing_list/models.py index e69de29b..f4fa0445 100644 --- a/mailing_list/models.py +++ b/mailing_list/models.py @@ -0,0 +1,18 @@ +from django.db import models + + +class EmailData(models.Model): + author = models.ForeignKey("libraries.CommitAuthor", on_delete=models.CASCADE) + version = models.ForeignKey("versions.Version", on_delete=models.CASCADE) + count = models.IntegerField() + + class Meta: + constraints = [ + models.UniqueConstraint( + fields=["author", "version"], + name="%(app_label)s_%(class)s_author_version_unique", + ), + ] + + def __str__(self): + return self.author.name diff --git a/mailing_list/tasks.py b/mailing_list/tasks.py new file mode 100644 index 00000000..5f1c8dd2 --- /dev/null +++ b/mailing_list/tasks.py @@ -0,0 +1,18 @@ +import structlog + +from django.core.management import call_command +from django.conf import settings + +from config.celery import app + + +logger = structlog.getLogger(__name__) + + +@app.task +def sync_mailinglist_stats(): + """Task to create EmailData from hyperkitty database.""" + if not settings.HYPERKITTY_DATABASE_URL: + logger.warning("HYPERKITTY_DATABASE_URL not set.") + return + call_command("sync_mailinglist_stats") diff --git a/templates/admin/mailinglist_change_list.html b/templates/admin/mailinglist_change_list.html new file mode 100644 index 00000000..744589d3 --- /dev/null +++ b/templates/admin/mailinglist_change_list.html @@ -0,0 +1,11 @@ +{% extends 
"admin/change_list.html" %} +{% load i18n admin_urls %} + +{% block object-tools %} +