From d24e46ac12cda2b3a1e71184c1a7ea5cc2bccd7d Mon Sep 17 00:00:00 2001 From: daveoconnor Date: Tue, 6 Jan 2026 23:05:41 +0000 Subject: [PATCH] Updated import_ml_counts to pull data from hyperkitty db (#2054) --- .github/workflows/actions-gcp.yaml | 1 + .github/workflows/actions.yml | 1 + config/settings.py | 17 +++++- env.template | 1 + mailing_list/admin.py | 17 +++++- .../management/commands/import_ml_counts.py | 57 ++----------------- mailing_list/models.py | 17 ++++++ 7 files changed, 55 insertions(+), 56 deletions(-) diff --git a/.github/workflows/actions-gcp.yaml b/.github/workflows/actions-gcp.yaml index 57fba5ed..a0fddf89 100644 --- a/.github/workflows/actions-gcp.yaml +++ b/.github/workflows/actions-gcp.yaml @@ -68,6 +68,7 @@ jobs: - name: Test with pytest env: DATABASE_URL: "postgres://postgres:postgres@localhost:${{ job.services.postgres.ports[5432] }}/postgres" + HYPERKITTY_DATABASE_URL: "postgres://postgres:postgres@localhost:${{ job.services.postgres.ports[5432] }}/lists_production_web" SECRET_KEY: "for-testing-only" REDIS_HOST: "localhost" CI: "true" diff --git a/.github/workflows/actions.yml b/.github/workflows/actions.yml index 607b8cb6..f3534136 100644 --- a/.github/workflows/actions.yml +++ b/.github/workflows/actions.yml @@ -58,6 +58,7 @@ jobs: - name: Test with pytest env: DATABASE_URL: "postgres://postgres:postgres@localhost:${{ job.services.postgres.ports[5432] }}/postgres" + HYPERKITTY_DATABASE_URL: "postgres://postgres:postgres@localhost:${{ job.services.postgres.ports[5432] }}/lists_production_web" SECRET_KEY: "for-testing-only" REDIS_HOST: "localhost" CI: "true" diff --git a/config/settings.py b/config/settings.py index 39d6cc4d..1c2fdbcb 100755 --- a/config/settings.py +++ b/config/settings.py @@ -189,7 +189,10 @@ WSGI_APPLICATION = "config.wsgi.application" # https://docs.djangoproject.com/en/1.10/ref/settings/#databases try: - DATABASES = {"default": env.dj_db_url("DATABASE_URL")} + DATABASES = { + "default": env.dj_db_url("DATABASE_URL"), + "hyperkitty": env.dj_db_url("HYPERKITTY_DATABASE_URL"), + } except (ImproperlyConfigured, environs.EnvError): DATABASES = { "default": { @@ -201,7 +204,17 @@ except (ImproperlyConfigured, environs.EnvError): "USER": env("PGUSER"), "CONN_MAX_AGE": 0, "OPTIONS": {"MAX_CONNS": env("MAX_CONNECTIONS", default=20)}, - } + }, + "hyperkitty": { + "ENGINE": "django_db_geventpool.backends.postgresql_psycopg2", + "HOST": env("PGHOST"), + "NAME": env("HYPERKITTY_DATABASE_NAME", default=""), + "PASSWORD": env("PGPASSWORD"), + "PORT": env.int("PGPORT", default=5432), + "USER": env("PGUSER"), + "CONN_MAX_AGE": 0, + "OPTIONS": {"MAX_CONNS": env("MAX_CONNECTIONS", default=20)}, + }, } # Password validation diff --git a/env.template b/env.template index 1340d656..f392ce53 100644 --- a/env.template +++ b/env.template @@ -35,6 +35,7 @@ PROD_MEDIA_CONTENT_AWS_S3_ENDPOINT_URL=$STATIC_CONTENT_AWS_S3_ENDPOINT_URL # Mailman database settings HYPERKITTY_DATABASE_NAME="lists_production_web" DATABASE_URL="postgresql://postgres@db:5432/postgres" +HYPERKITTY_DATABASE_URL="postgresql://postgres@db:5432/lists_production_web" DATABASE_TYPE="postgres" DATABASE_CLASS="mailman.database.postgresql.PostgreSQLDatabase" HYPERKITTY_API_KEY="changeme!" diff --git a/mailing_list/admin.py b/mailing_list/admin.py index 778b9d4a..3df12967 100644 --- a/mailing_list/admin.py +++ b/mailing_list/admin.py @@ -11,7 +11,7 @@ from django.http import HttpResponseRedirect from django.contrib import admin, messages from django.conf import settings -from mailing_list.models import EmailData, SubscriptionData +from mailing_list.models import EmailData, SubscriptionData, ListPosting from mailing_list.tasks import sync_mailinglist_stats logger = logging.getLogger(__name__) @@ -112,3 +112,18 @@ class SubscriptionDataAdmin(admin.ModelAdmin): payload = {"form": SubscribesCSVForm()} return render(request, "admin/mailinglist_subscribe_csv_form.html", payload) + + +@admin.register(ListPosting) +class ListPostingAdmin(admin.ModelAdmin): + list_display = ["id", "date", "sender_id"] + search_fields = ["sender_id"] + + def has_add_permission(self, request): + return False + + def has_change_permission(self, request, obj=None): + return False + + def has_delete_permission(self, request, obj=None): + return False diff --git a/mailing_list/management/commands/import_ml_counts.py b/mailing_list/management/commands/import_ml_counts.py index df930f08..7636dac9 100644 --- a/mailing_list/management/commands/import_ml_counts.py +++ b/mailing_list/management/commands/import_ml_counts.py @@ -6,23 +6,15 @@ import djclick as click import logging import re -import warnings from datetime import timedelta, datetime -import html -from dateutil.relativedelta import relativedelta -from unidecode import unidecode - -import requests from mailing_list.constants import ( - ML_STATS_URLS, - LATIN_1_EQUIVS, ARG_DATE_REGEX, AUTHOR_PATTERN_REGEX, DATE_PATTERN_REGEX, ) -from mailing_list.models import PostingData +from mailing_list.models import PostingData, ListPosting logger = logging.getLogger(__name__) @@ -31,18 +23,6 @@ author_pattern = re.compile(AUTHOR_PATTERN_REGEX) date_pattern = re.compile(DATE_PATTERN_REGEX) -def decode_broken_html(str): - def latin_1_ord(char): - n = ord(char) - return LATIN_1_EQUIVS.get(n, n) - - with warnings.catch_warnings(): - warnings.simplefilter("ignore") - return unidecode( - bytearray(map(latin_1_ord, html.unescape(str))).decode("utf-8", "ignore") - ) - - def parse_datetime(date_str: str, is_start: bool) -> datetime: """ Parse a date string (YYYY, YYYY-MM, YYYY-MM-DD) into a datetime object. @@ -75,41 +55,12 @@ def parse_datetime(date_str: str, is_start: bool) -> datetime: return datetime(year, month, day, 23, 59, 59) -def retrieve_authors_from_ml(url, start_date, end_date): - posts = [] - logger.info(f"Retrieving data from {url=}.") - r = requests.get(url) - if r.status_code == 404: - return posts - - author = None - for line in r.text.splitlines(): - author_match = author_pattern.match(line) - if author_match: - # needs multiple passes to work - author = decode_broken_html(author_match.group(1)) - else: - date_pattern_match = date_pattern.match(line) - if author and date_pattern_match: - post_date = datetime.strptime( - date_pattern_match.group(1), "%Y-%m-%d %H:%M:%S" - ) - if start_date <= post_date and post_date <= end_date: - posts.append(PostingData(name=author, post_time=post_date)) - return posts - - def retrieve_authors(start_date, end_date): logger.info(f"Retrieve_authors from {start_date:%Y-%m-%d} to {end_date:%Y-%m-%d}") - start_month = datetime(start_date.year, start_date.month, 1) - end_month = datetime(end_date.year, end_date.month, 1) authors = [] - while start_month <= end_month: - for ml in ML_STATS_URLS: - authors += retrieve_authors_from_ml( - ml.format(start_month.year, start_month.month), start_date, end_date - ) - start_month = start_month + relativedelta(months=+1) + for p in ListPosting.objects.filter(date__gte=start_date, date__lte=end_date): + authors.append(PostingData(name=p.sender_id, post_time=p.date)) + PostingData.objects.filter( post_time__gte=start_date, post_time__lte=end_date ).delete() diff --git a/mailing_list/models.py b/mailing_list/models.py index cd8a1f3a..2fa45999 100644 --- a/mailing_list/models.py +++ b/mailing_list/models.py @@ -57,3 +57,20 @@ class SubscriptionData(models.Model): class Meta: unique_together = ["subscription_dt", "email", "list"] + + +class ListPostingManager(models.Manager): + def get_queryset(self): + return super().get_queryset().using("hyperkitty") + + +class ListPosting(models.Model): + id = models.IntegerField(primary_key=True, blank=False, null=False) + date = models.DateTimeField(blank=False, null=False) + sender_id = models.CharField(blank=False, null=False) + + objects = ListPostingManager() + + class Meta: + managed = False + db_table = "hyperkitty_email"