Updated import_ml_counts to pull data from hyperkitty db (#2054)

This commit is contained in:
daveoconnor
2026-01-06 23:05:41 +00:00
committed by GitHub
parent f34458524e
commit d24e46ac12
7 changed files with 55 additions and 56 deletions

View File

@@ -68,6 +68,7 @@ jobs:
- name: Test with pytest
env:
DATABASE_URL: "postgres://postgres:postgres@localhost:${{ job.services.postgres.ports[5432] }}/postgres"
HYPERKITTY_DATABASE_URL: "postgres://postgres:postgres@localhost:${{ job.services.postgres.ports[5432] }}/lists_production_web"
SECRET_KEY: "for-testing-only"
REDIS_HOST: "localhost"
CI: "true"

View File

@@ -58,6 +58,7 @@ jobs:
- name: Test with pytest
env:
DATABASE_URL: "postgres://postgres:postgres@localhost:${{ job.services.postgres.ports[5432] }}/postgres"
HYPERKITTY_DATABASE_URL: "postgres://postgres:postgres@localhost:${{ job.services.postgres.ports[5432] }}/lists_production_web"
SECRET_KEY: "for-testing-only"
REDIS_HOST: "localhost"
CI: "true"

View File

@@ -189,7 +189,10 @@ WSGI_APPLICATION = "config.wsgi.application"
# https://docs.djangoproject.com/en/1.10/ref/settings/#databases
try:
DATABASES = {"default": env.dj_db_url("DATABASE_URL")}
DATABASES = {
"default": env.dj_db_url("DATABASE_URL"),
"hyperkitty": env.dj_db_url("HYPERKITTY_DATABASE_URL"),
}
except (ImproperlyConfigured, environs.EnvError):
DATABASES = {
"default": {
@@ -201,7 +204,17 @@ except (ImproperlyConfigured, environs.EnvError):
"USER": env("PGUSER"),
"CONN_MAX_AGE": 0,
"OPTIONS": {"MAX_CONNS": env("MAX_CONNECTIONS", default=20)},
}
},
"hyperkitty": {
"ENGINE": "django_db_geventpool.backends.postgresql_psycopg2",
"HOST": env("PGHOST"),
"NAME": env("HYPERKITTY_DATABASE_NAME", default=""),
"PASSWORD": env("PGPASSWORD"),
"PORT": env.int("PGPORT", default=5432),
"USER": env("PGUSER"),
"CONN_MAX_AGE": 0,
"OPTIONS": {"MAX_CONNS": env("MAX_CONNECTIONS", default=20)},
},
}
# Password validation

View File

@@ -35,6 +35,7 @@ PROD_MEDIA_CONTENT_AWS_S3_ENDPOINT_URL=$STATIC_CONTENT_AWS_S3_ENDPOINT_URL
# Mailman database settings
HYPERKITTY_DATABASE_NAME="lists_production_web"
DATABASE_URL="postgresql://postgres@db:5432/postgres"
HYPERKITTY_DATABASE_URL="postgresql://postgres@db:5432/lists_production_web"
DATABASE_TYPE="postgres"
DATABASE_CLASS="mailman.database.postgresql.PostgreSQLDatabase"
HYPERKITTY_API_KEY="changeme!"

View File

@@ -11,7 +11,7 @@ from django.http import HttpResponseRedirect
from django.contrib import admin, messages
from django.conf import settings
from mailing_list.models import EmailData, SubscriptionData
from mailing_list.models import EmailData, SubscriptionData, ListPosting
from mailing_list.tasks import sync_mailinglist_stats
logger = logging.getLogger(__name__)
@@ -112,3 +112,18 @@ class SubscriptionDataAdmin(admin.ModelAdmin):
payload = {"form": SubscribesCSVForm()}
return render(request, "admin/mailinglist_subscribe_csv_form.html", payload)
@admin.register(ListPosting)
class ListPostingAdmin(admin.ModelAdmin):
    """Admin registration for ListPosting with all write actions disabled.

    Adding, changing and deleting are refused for every request; view
    access is left to the default ModelAdmin behaviour.
    """

    # Columns shown in the change list, and the field the search box queries.
    search_fields = ["sender_id"]
    list_display = ["id", "date", "sender_id"]

    def has_delete_permission(self, request, obj=None):
        """Refuse deletion regardless of the request or target object."""
        return False

    def has_change_permission(self, request, obj=None):
        """Refuse edits regardless of the request or target object."""
        return False

    def has_add_permission(self, request):
        """Refuse creation of new rows regardless of the request."""
        return False

View File

@@ -6,23 +6,15 @@
import djclick as click
import logging
import re
import warnings
from datetime import timedelta, datetime
import html
from dateutil.relativedelta import relativedelta
from unidecode import unidecode
import requests
from mailing_list.constants import (
ML_STATS_URLS,
LATIN_1_EQUIVS,
ARG_DATE_REGEX,
AUTHOR_PATTERN_REGEX,
DATE_PATTERN_REGEX,
)
from mailing_list.models import PostingData
from mailing_list.models import PostingData, ListPosting
logger = logging.getLogger(__name__)
@@ -31,18 +23,6 @@ author_pattern = re.compile(AUTHOR_PATTERN_REGEX)
date_pattern = re.compile(DATE_PATTERN_REGEX)
def decode_broken_html(str):
    """Clean up an HTML-escaped string and return the unidecode'd result.

    Steps, in order:
      1. HTML entities in the input are unescaped.
      2. Each character's code point is looked up in LATIN_1_EQUIVS;
         code points without an entry are used unchanged.
      3. The resulting integers are packed into a bytearray and decoded
         as UTF-8, dropping any byte sequences that fail to decode.
      4. The decoded text is passed through ``unidecode``.

    All warnings raised while this runs are suppressed.

    Note: the parameter name shadows the ``str`` builtin; it is kept
    as-is so existing callers are unaffected.
    """
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        unescaped = html.unescape(str)
        # Remap each code point through the lookup table, falling back to
        # the code point itself when the table has no entry for it.
        raw_bytes = bytearray(
            LATIN_1_EQUIVS.get(ord(ch), ord(ch)) for ch in unescaped
        )
        return unidecode(raw_bytes.decode("utf-8", "ignore"))
def parse_datetime(date_str: str, is_start: bool) -> datetime:
"""
Parse a date string (YYYY, YYYY-MM, YYYY-MM-DD) into a datetime object.
@@ -75,41 +55,12 @@ def parse_datetime(date_str: str, is_start: bool) -> datetime:
return datetime(year, month, day, 23, 59, 59)
def retrieve_authors_from_ml(url, start_date, end_date):
    """Fetch one mailing-list stats page and build PostingData for its posts.

    The response body is scanned line by line. A line matching
    ``author_pattern`` sets the current author; a later line matching
    ``date_pattern`` yields a post for that author when its timestamp
    falls within ``[start_date, end_date]`` (both ends inclusive).

    Args:
        url: Address of the page to download.
        start_date: Earliest post datetime to keep.
        end_date: Latest post datetime to keep.

    Returns:
        A list of unsaved PostingData instances; empty when the server
        answers 404.
    """
    logger.info(f"Retrieving data from {url=}.")
    response = requests.get(url)
    if response.status_code == 404:
        return []

    collected = []
    current_author = None
    for raw_line in response.text.splitlines():
        matched_author = author_pattern.match(raw_line)
        if matched_author:
            # needs multiple passes to work
            current_author = decode_broken_html(matched_author.group(1))
            continue

        # Date lines only count once a (non-empty) author has been seen.
        if not current_author:
            continue
        matched_date = date_pattern.match(raw_line)
        if not matched_date:
            continue

        posted_at = datetime.strptime(matched_date.group(1), "%Y-%m-%d %H:%M:%S")
        if start_date <= posted_at <= end_date:
            collected.append(PostingData(name=current_author, post_time=posted_at))
    return collected
def retrieve_authors(start_date, end_date):
logger.info(f"Retrieve_authors from {start_date:%Y-%m-%d} to {end_date:%Y-%m-%d}")
start_month = datetime(start_date.year, start_date.month, 1)
end_month = datetime(end_date.year, end_date.month, 1)
authors = []
while start_month <= end_month:
for ml in ML_STATS_URLS:
authors += retrieve_authors_from_ml(
ml.format(start_month.year, start_month.month), start_date, end_date
)
start_month = start_month + relativedelta(months=+1)
for p in ListPosting.objects.filter(date__gte=start_date, date__lte=end_date):
authors.append(PostingData(name=p.sender_id, post_time=p.date))
PostingData.objects.filter(
post_time__gte=start_date, post_time__lte=end_date
).delete()

View File

@@ -57,3 +57,20 @@ class SubscriptionData(models.Model):
class Meta:
unique_together = ["subscription_dt", "email", "list"]
class ListPostingManager(models.Manager):
    """Manager whose querysets are bound to the "hyperkitty" database alias."""

    def get_queryset(self):
        """Return the base queryset, pinned to the "hyperkitty" connection."""
        base_queryset = super().get_queryset()
        return base_queryset.using("hyperkitty")
class ListPosting(models.Model):
    """Unmanaged model mapped onto the existing ``hyperkitty_email`` table.

    Only the three columns declared here are exposed. Django does not
    create or migrate the table (``managed = False``), and the default
    manager reads through ListPostingManager.
    """

    # blank=False / null=False are Django's field defaults, so they are
    # left implicit here; the resulting field definitions are unchanged.
    id = models.IntegerField(primary_key=True)
    date = models.DateTimeField()
    sender_id = models.CharField()

    objects = ListPostingManager()

    class Meta:
        db_table = "hyperkitty_email"
        managed = False