diff --git a/docs/commands.md b/docs/commands.md index fddff569..f1068737 100644 --- a/docs/commands.md +++ b/docs/commands.md @@ -320,3 +320,19 @@ For this to work `SLACK_BOT_API` must be set in the `.env` file. | Options | Format | Description | |----------------------|--------|--------------------------------------------------------------| | `--user_id` | int | If passed, the user with this ID will receive email notifications when this task is started and finished, or if the task raises and exception. | + + +## `import_ml_counts` + +**Purpose**: Import mailing list counts from the mailman archives. + +```bash +./manage.py import_ml_counts +``` + +**Options** + +| Options | Format | Description | +|----------------|--------|----------------------------------------------------------------------------------------------------------------------| +| `--start_date` | date | If passed, retrieves data from the start date supplied, d-m-y, default 20-11-1998 (the start of the data in mailman) | +| `--end_date` | date | If passed, retrieves data until the end date supplied, d-m-y, default today | diff --git a/docs/development_setup_notes.md b/docs/development_setup_notes.md index c0281a3b..308481cf 100644 --- a/docs/development_setup_notes.md +++ b/docs/development_setup_notes.md @@ -313,9 +313,9 @@ https://docs.allauth.org/en/latest/socialaccount/providers/google.html 1. `TF_VAR_google_cloud_email` (the email address of your Google Cloud account) 2. `TF_VAR_google_organization_domain` (usually the domain of your Google Cloud account, e.g. "boost.org" if you will be using an @boost.org email address) 3. `TF_VAR_google_cloud_project_name` (optional, default: localboostdev) - needs to change if destroyed and a setup is needed within 30 days -2. Run `make development-tofu-init` to initialize tofu. -3. Run `make development-tofu-plan` to confirm the planned changes. -4. Run `make development-tofu-apply` to apply the changes. +2. 
Run `just development-tofu-init` to initialize tofu. +3. Run `just development-tofu-plan` to confirm the planned changes. +4. Run `just development-tofu-apply` to apply the changes. 5. Go to https://console.developers.google.com/ 1. Search for the newly created project, named "Boost Development" (ID: localboostdev by default). 2. Type "credentials" in the search input at the top of the page. @@ -352,6 +352,7 @@ In your env: #### Set Up Pycharm You can set up your IDE with a new "Python Debug Server" configuration as: + PyCharm Debugger Settings #### Common Usage diff --git a/docs/first_time_data_import.md b/docs/first_time_data_import.md index 259908dd..f5cc8125 100644 --- a/docs/first_time_data_import.md +++ b/docs/first_time_data_import.md @@ -38,6 +38,7 @@ The `boost_setup` command will run all of the processes listed here: # Get the most recent beta release, and delete old beta releases ./manage.py import_beta_release --delete-versions +./manage.py import_ml_counts ``` Read more aboout these [management commands](./commands.md). diff --git a/docs/release_reports.md b/docs/release_reports.md index 0a29b7fd..cb089bb4 100644 --- a/docs/release_reports.md +++ b/docs/release_reports.md @@ -1,5 +1,17 @@ # Release Reports +## Prerequisites + +1. You should upload updated subscriber data. + 1. Ask Sam for a copy of the "subscribe" data. + 2. In the Django admin interface go to "Subscription datas" under "MAILING_LIST". + 3. At the top of the page click on the "IMPORT 'SUBSCRIBE' DATA" button. +2. To update the mailing list counts, if you haven't already run the "DO IT ALL" button: + 1. Go to "Versions" under "VERSIONS" in the admin interface + 2. At the top of the page click on the "DO IT ALL" button. + +## Report Creation + 1. Go to /admin 2. Go to the "Libraries" section 3. In the top menu click on "GET RELEASE REPORT". 
diff --git a/libraries/forms.py b/libraries/forms.py index 348109dd..ae189173 100644 --- a/libraries/forms.py +++ b/libraries/forms.py @@ -10,7 +10,11 @@ from django.db.models import F, Q, Count, OuterRef, Sum, When, Value, Case from django.forms import Form, ModelChoiceField, ModelForm, BooleanField from core.models import RenderedContent -from reports.generation import generate_wordcloud +from reports.generation import ( + generate_wordcloud, + get_mailing_list_post_stats, + get_new_subscribers_stats, +) from slack.models import Channel, SlackActivityBucket, SlackUser from versions.models import Version from .models import ( @@ -772,6 +776,12 @@ class CreateReportForm(CreateReportFullForm): Channel.objects.filter(name__istartswith="boost").order_by("name"), 10 ) committee_members = version.financial_committee_members.all() + mailinglist_post_stats = get_mailing_list_post_stats( + prior_version.release_date, version.release_date + ) + new_subscribers_stats = get_new_subscribers_stats( + prior_version.release_date, version.release_date + ) library_index_library_data = [] for library in self._get_libraries_by_quality(): library_index_library_data.append( @@ -804,6 +814,8 @@ class CreateReportForm(CreateReportFullForm): "mailinglist_total": total_mailinglist_count or 0, "mailinglist_contributor_release_count": mailinglist_contributor_release_count, # noqa: E501 "mailinglist_contributor_new_count": mailinglist_contributor_new_count, + "mailinglist_post_stats": mailinglist_post_stats, + "mailinglist_new_subscribers_stats": new_subscribers_stats, "commit_contributors_release_count": commit_contributors_release_count, "commit_contributors_new_count": commit_contributors_new_count, "global_contributors_new_count": len( diff --git a/libraries/management/commands/release_tasks.py b/libraries/management/commands/release_tasks.py index 986105d8..75504742 100644 --- a/libraries/management/commands/release_tasks.py +++ b/libraries/management/commands/release_tasks.py @@ -82,6 
+82,7 @@ class ReleaseTasksManager: ReleaseTask("Updating github issues", ["update_issues"]), ReleaseTask("Updating slack activity buckets", ["fetch_slack_activity"]), ReleaseTask("Updating website statistics", self.update_website_statistics), + ReleaseTask("Importing mailing list counts", ["import_ml_counts"]), ReleaseTask("Generating report", self.generate_report), ] diff --git a/mailing_list/admin.py b/mailing_list/admin.py index 5ac81a5e..778b9d4a 100644 --- a/mailing_list/admin.py +++ b/mailing_list/admin.py @@ -1,11 +1,21 @@ +import csv +import logging +import re +from datetime import datetime +from io import TextIOWrapper + +from django import forms +from django.shortcuts import redirect, render from django.urls import path from django.http import HttpResponseRedirect from django.contrib import admin, messages from django.conf import settings -from mailing_list.models import EmailData +from mailing_list.models import EmailData, SubscriptionData from mailing_list.tasks import sync_mailinglist_stats +logger = logging.getLogger(__name__) + @admin.register(EmailData) class EmailDataAdmin(admin.ModelAdmin): @@ -43,3 +53,62 @@ class EmailDataAdmin(admin.ModelAdmin): def has_add_permission(self, request): return False + + +class SubscribesCSVForm(forms.Form): + csv_file = forms.FileField() + + +@admin.register(SubscriptionData) +class SubscriptionDataAdmin(admin.ModelAdmin): + list_display = ["subscription_dt", "email"] + search_fields = ["email"] + change_list_template = "admin/mailinglist_change_list.html" + + email_regex = re.compile("([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})") + + def get_urls(self): + return [ + path("import-csv", self.import_csv, name="import_csv") + ] + super().get_urls() + + def parse_rows(self, reader): + for row in reader: + date_str = " ".join(row[0:4]) + try: + dt = datetime.strptime(date_str, "%b %d %H:%M:%S %Y") + except ValueError: + logger.error(f"Error parsing date {date_str} from {row=}") + dt = None + # re-merge, the email 
address isn't always in a consistent position + email_matches = re.search(self.email_regex, " ".join(row[6:])) + email = email_matches.group(0) if email_matches else None + entry_type = row[6] + # only save confirmed subscriber entries, it's all we need for now + if entry_type != "new": + continue + if not email: + logger.error( + f"Invalid email {row=} {email_matches=} {' '.join(row[6:])=}" + ) + continue + yield SubscriptionData( + email=email, + entry_type=entry_type, + list=row[5].rstrip(":-1"), + subscription_dt=dt, + ) + + def import_csv(self, request): + if request.method == "POST": + csv_file = request.FILES["csv_file"] + rows = TextIOWrapper(csv_file, encoding="ISO-8859-1", newline="") + reader = csv.reader(rows, delimiter=" ") + SubscriptionData.objects.bulk_create( + self.parse_rows(reader), batch_size=500, ignore_conflicts=True + ) + self.message_user(request, "Subscribe CSV file imported.") + return redirect("..") + + payload = {"form": SubscribesCSVForm()} + return render(request, "admin/mailinglist_subscribe_csv_form.html", payload) diff --git a/mailing_list/constants.py b/mailing_list/constants.py new file mode 100644 index 00000000..7ca5ad7a --- /dev/null +++ b/mailing_list/constants.py @@ -0,0 +1,41 @@ +# we only want boost devel for now, leaving the others in case that changes. +ML_STATS_URLS = [ + "https://lists.boost.org/Archives/boost/{:04}/{:02}/author.php", + # "https://lists.boost.org/boost-users/{:04}/{:02}/author.php", + # "https://lists.boost.org/boost-announce/{:04}/{:02}/author.php", +] +ARG_DATE_REGEX = r"^([0-9]+)(?:$|(?:-|/)([0-9]+)(?:$|(?:-|/)([0-9]+)$))" +AUTHOR_PATTERN_REGEX = r"
  • (.*)" +DATE_PATTERN_REGEX = r".*\((\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})\)" + +# used to map latin-1 characters to their utf-8 equivalents in the mailing list +# page html parser +LATIN_1_EQUIVS = { + 8364: 128, + 8218: 130, + 402: 131, + 8222: 132, + 8230: 133, + 8224: 134, + 8225: 135, + 710: 136, + 8240: 137, + 352: 138, + 8249: 139, + 338: 140, + 381: 142, + 8216: 145, + 8217: 146, + 8220: 147, + 8221: 148, + 8226: 149, + 8211: 150, + 8212: 151, + 732: 152, + 8482: 153, + 353: 154, + 8250: 155, + 339: 156, + 382: 158, + 376: 159, +} diff --git a/mailing_list/management/commands/import_ml_counts.py b/mailing_list/management/commands/import_ml_counts.py new file mode 100644 index 00000000..6afaaa8f --- /dev/null +++ b/mailing_list/management/commands/import_ml_counts.py @@ -0,0 +1,133 @@ +# Copyright 2024 Dave O'Connor +# Derived from code by Joaquin M Lopez Munoz. +# Distributed under the Boost Software License, Version 1.0. +# (See accompanying file LICENSE_1_0.txt or copy at +# http://www.boost.org/LICENSE_1_0.txt) +import djclick as click +import logging +import re +import warnings +from datetime import timedelta, datetime +import html + +from dateutil.relativedelta import relativedelta +from unidecode import unidecode + +import requests + +from mailing_list.constants import ( + ML_STATS_URLS, + LATIN_1_EQUIVS, + ARG_DATE_REGEX, + AUTHOR_PATTERN_REGEX, + DATE_PATTERN_REGEX, +) +from mailing_list.models import PostingData + +logger = logging.getLogger(__name__) + +arg_date_pattern = re.compile(ARG_DATE_REGEX) +author_pattern = re.compile(AUTHOR_PATTERN_REGEX) +date_pattern = re.compile(DATE_PATTERN_REGEX) + + +def decode_broken_html(str): + def latin_1_ord(char): + n = ord(char) + return LATIN_1_EQUIVS.get(n, n) + + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + return unidecode( + bytearray(map(latin_1_ord, html.unescape(str))).decode("utf-8", "ignore") + ) + + +def parse_start_datetime(date_str): + m = arg_date_pattern.match(date_str) 
+ if not m: + raise ValueError("wrong date format") + logger.info(f"{m=} {m.group(1)=} {m.group(2)=} {m.group(3)=}") + return datetime( + int(m.group(3)) if m.group(3) else 1, + int(m.group(2)) if m.group(2) else 1, + int(m.group(1)), + 0, + 0, + 0, + ) + + +def parse_end_datetime(date_str): + m = arg_date_pattern.match(date_str) + if not m: + raise ValueError("wrong date format") + logger.info(f"{m=} {m.group(1)=} {m.group(2)=} {m.group(3)=}") + if m.group(2): + if m.group(3): + return datetime( + int(m.group(3)), int(m.group(2)), int(m.group(1)), 23, 59, 59 + ) + else: + return ( + datetime(int(m.group(1)), int(m.group(2)), 1) + timedelta(days=31), + 23, + 59, + 59, + ).replace(day=1) - timedelta(days=1) + return datetime(int(m.group(1)), 12, 31, 23, 59, 59) + + +def retrieve_authors_from_ml(url, start_date, end_date): + posts = [] + logger.info(f"Retrieving data from {url=}.") + r = requests.get(url) + if r.status_code == 404: + return posts + + author = None + for line in r.text.splitlines(): + author_match = author_pattern.match(line) + if author_match: + # needs multiple passes to work + author = decode_broken_html(author_match.group(1)) + else: + date_pattern_match = date_pattern.match(line) + if author and date_pattern_match: + post_date = datetime.strptime( + date_pattern_match.group(1), "%Y-%m-%d %H:%M:%S" + ) + if start_date <= post_date and post_date <= end_date: + posts.append(PostingData(name=author, post_time=post_date)) + return posts + + +def retrieve_authors(start_date, end_date): + logger.info(f"retrieve_authors from {start_date=} to {end_date=}") + start_month = datetime(start_date.year, start_date.month, 1) + end_month = datetime(end_date.year, end_date.month, 1) + authors = [] + while start_month <= end_month: + for ml in ML_STATS_URLS: + authors += retrieve_authors_from_ml( + ml.format(start_month.year, start_month.month), start_date, end_date + ) + start_month = start_month + relativedelta(months=+1) + PostingData.objects.filter( + 
post_time__gte=start_date, post_time__lte=end_date + ).delete() + PostingData.objects.bulk_create(authors) + + +@click.command() +@click.option("--start_date", is_flag=False, help="Start Date", default=None) +@click.option("--end_date", is_flag=False, help="End Date", default=None) +def command(start_date, end_date): + logger.info(f"Starting import_ml_counts {start_date=} {end_date=}") + start_date = ( + parse_start_datetime(start_date) if start_date else datetime(1998, 11, 11) + ) + logger.info(f"{start_date=}") + end_date = parse_end_datetime(end_date) if end_date else datetime.now() + logger.info(f"{end_date=}") + retrieve_authors(start_date, end_date) diff --git a/mailing_list/migrations/0005_postingdata_subscriptiondata.py b/mailing_list/migrations/0005_postingdata_subscriptiondata.py new file mode 100644 index 00000000..8f8d55c1 --- /dev/null +++ b/mailing_list/migrations/0005_postingdata_subscriptiondata.py @@ -0,0 +1,52 @@ +# Generated by Django 4.2.16 on 2025-03-20 18:02 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ("mailing_list", "0004_initial"), + ] + + operations = [ + migrations.CreateModel( + name="PostingData", + fields=[ + ( + "id", + models.BigAutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), + ("name", models.CharField(max_length=255)), + ("post_time", models.DateTimeField()), + ("created", models.DateTimeField(auto_now_add=True)), + ], + ), + migrations.CreateModel( + name="SubscriptionData", + fields=[ + ( + "id", + models.BigAutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), + ("subscription_dt", models.DateTimeField()), + ("email", models.EmailField(max_length=255)), + ("entry_type", models.CharField(max_length=24)), + ("list", models.CharField(max_length=24)), + ("created", models.DateTimeField(auto_now_add=True)), + ], + options={ + "unique_together": {("subscription_dt", 
"email", "list")}, + }, + ), + ] diff --git a/mailing_list/models.py b/mailing_list/models.py index 5e666540..cd8a1f3a 100644 --- a/mailing_list/models.py +++ b/mailing_list/models.py @@ -35,3 +35,25 @@ class EmailData(models.Model): def __str__(self): return self.author.name + + +class PostingData(models.Model): + name = models.CharField(max_length=255) + post_time = models.DateTimeField() + + created = models.DateTimeField(auto_now_add=True) + + def __str__(self): + return f"{self.name} {self.post_time}" + + +class SubscriptionData(models.Model): + subscription_dt = models.DateTimeField() + email = models.EmailField(max_length=255) + entry_type = models.CharField(max_length=24) + list = models.CharField(max_length=24) + + created = models.DateTimeField(auto_now_add=True) + + class Meta: + unique_together = ["subscription_dt", "email", "list"] diff --git a/reports/generation.py b/reports/generation.py index 197cddd5..2b77c8c8 100644 --- a/reports/generation.py +++ b/reports/generation.py @@ -1,17 +1,24 @@ import base64 import io +import logging import random +from datetime import datetime import psycopg2 from django.conf import settings +from django.db.models import Count +from django.db.models.functions import ExtractWeek, ExtractIsoYear from matplotlib import pyplot as plt from wordcloud import WordCloud, STOPWORDS from core.models import SiteSettings from libraries.models import WordcloudMergeWord # TODO: move model to this app +from mailing_list.models import PostingData, SubscriptionData from reports.constants import WORDCLOUD_FONT from versions.models import Version +logger = logging.getLogger(__name__) + def generate_wordcloud(version: Version) -> tuple[str | None, list]: """Generates a wordcloud png and returns it as a base64 string and word frequencies. 
@@ -25,7 +32,7 @@ def generate_wordcloud(version: Version) -> tuple[str | None, list]: width=1400, height=700, stopwords=STOPWORDS | SiteSettings.load().wordcloud_ignore_set, - font_path=settings.STATIC_ROOT / "font" / WORDCLOUD_FONT, + font_path=f"{settings.STATIC_ROOT}/font/{WORDCLOUD_FONT}", ) word_frequencies = {} for content in get_mail_content(version): @@ -110,3 +117,40 @@ def get_mail_content(version: Version): ) for [content] in cursor: yield content + + +def get_mailing_list_post_stats(start_date: datetime, end_date: datetime): + logger.info(f"from {start_date} to {end_date}") + data = ( + PostingData.objects.filter(post_time__gt=start_date, post_time__lte=end_date) + .annotate(week=ExtractWeek("post_time"), iso_year=ExtractIsoYear("post_time")) + .values("week") + .annotate(count=Count("id")) + .order_by("iso_year", "week") + ) + return [{"y": s.get("count"), "x": s.get("week")} for s in data] + + +def get_new_subscribers_stats(start_date: datetime, end_date: datetime): + data = ( + SubscriptionData.objects.filter( + subscription_dt__gte=start_date, + subscription_dt__lte=end_date, + list="boost", + ) + .annotate( + week=ExtractWeek("subscription_dt"), + iso_year=ExtractIsoYear("subscription_dt"), + ) + .values("week", "list") + .annotate(count=Count("id")) + .order_by("iso_year", "week") + ) + + formatted_data = [{"x": s.get("week"), "y": s.get("count")} for s in data] + referenced_weeks = [x.get("week") for x in data] + # account for weeks that no data is retrieved + for w in range(start_date.isocalendar().week, end_date.isocalendar().week + 1): + if w not in referenced_weeks: + formatted_data.append({"x": w, "y": 0}) + return formatted_data diff --git a/requirements.in b/requirements.in index 0a9c964e..363a21e6 100644 --- a/requirements.in +++ b/requirements.in @@ -31,6 +31,7 @@ wheel cryptography boto3 jsoncomment +unidecode wordcloud # Logging diff --git a/requirements.txt b/requirements.txt index 54235d6e..acdb7513 100644 --- a/requirements.txt +++ 
b/requirements.txt @@ -371,6 +371,8 @@ tzdata==2024.2 # via # celery # kombu +unidecode==1.3.8 + # via -r ./requirements.in urllib3==1.26.20 # via # botocore diff --git a/templates/admin/mailinglist_change_list.html b/templates/admin/mailinglist_change_list.html index 744589d3..62623527 100644 --- a/templates/admin/mailinglist_change_list.html +++ b/templates/admin/mailinglist_change_list.html @@ -1,11 +1,13 @@ -{% extends "admin/change_list.html" %} +{% extends 'admin/change_list.html' %} {% load i18n admin_urls %} {% block object-tools %} - + + {% endblock %} diff --git a/templates/admin/mailinglist_subscribe_csv_form.html b/templates/admin/mailinglist_subscribe_csv_form.html new file mode 100644 index 00000000..c9e7d093 --- /dev/null +++ b/templates/admin/mailinglist_subscribe_csv_form.html @@ -0,0 +1,15 @@ +{% extends "admin/base_site.html" %} +{% load static %} +{% block content %} + {{ block.super }} +
    +

    Upload Subscribe File

    +
    +
    + {% csrf_token %} + {{ form.as_p }} + +
    +
    +
    +{% endblock content %} diff --git a/templates/admin/release_report_detail.html b/templates/admin/release_report_detail.html index 5929921c..20327ba9 100644 --- a/templates/admin/release_report_detail.html +++ b/templates/admin/release_report_detail.html @@ -30,6 +30,14 @@ body { .committee_members img { filter: grayscale(1); } +#top-committed-libraries-chart .apexcharts-xaxis-label:nth-child(odd) { + transform: translateY(-8px); +} + +#top-committed-libraries-chart .apexcharts-xaxis-label:nth-child(even) { + transform: translateY(8px); +} + {% endblock css %} {% block content %} @@ -219,7 +227,7 @@ body {
    -
    Top Contributors
    +
    Top Contributors
    {% for item in mailinglist_counts %}
    @@ -249,21 +257,22 @@ body { poster{{ mailinglist_contributor_release_count|pluralize }} in this version. ({{ mailinglist_contributor_new_count }} New)
    +
    Weekly mailing list posts from {{prior_version.release_date}} to {{version.release_date}} on the Boost Developers mailing list.
    +
    - {% if wordcloud_base64 %} -
    -
    -

    Mailing List Word Cloud

    -
    - Mailing List Word Cloud -
    -
    +
    +
    +

    Mailing List New Subscribers

    +
    Mailing list new subscribers from {{prior_version.release_date}} to {{version.release_date}} on the Boost Developers mailing list.
    +
    - {% endif %} +
    + +

    Mailing List Top 200 Most Frequently Used Words

    @@ -276,6 +285,16 @@ body {
    + {% if wordcloud_base64 %} +
    +
    +

    Mailing List Word Cloud

    +
    + Mailing List Word Cloud +
    +
    +
    + {% endif %} {% if slack %} {% for slack_channel_group in slack_channels %} @@ -441,11 +460,7 @@ body { {% endwith %} -