Updated import_ml_counts to pull data from hyperkitty db (#2054)

This commit is contained in:
daveoconnor
2026-01-06 23:05:41 +00:00
committed by GitHub
parent f34458524e
commit d24e46ac12
7 changed files with 55 additions and 56 deletions

View File

@@ -68,6 +68,7 @@ jobs:
- name: Test with pytest - name: Test with pytest
env: env:
DATABASE_URL: "postgres://postgres:postgres@localhost:${{ job.services.postgres.ports[5432] }}/postgres" DATABASE_URL: "postgres://postgres:postgres@localhost:${{ job.services.postgres.ports[5432] }}/postgres"
HYPERKITTY_DATABASE_URL: "postgres://postgres:postgres@localhost:${{ job.services.postgres.ports[5432] }}/lists_production_web"
SECRET_KEY: "for-testing-only" SECRET_KEY: "for-testing-only"
REDIS_HOST: "localhost" REDIS_HOST: "localhost"
CI: "true" CI: "true"

View File

@@ -58,6 +58,7 @@ jobs:
- name: Test with pytest - name: Test with pytest
env: env:
DATABASE_URL: "postgres://postgres:postgres@localhost:${{ job.services.postgres.ports[5432] }}/postgres" DATABASE_URL: "postgres://postgres:postgres@localhost:${{ job.services.postgres.ports[5432] }}/postgres"
HYPERKITTY_DATABASE_URL: "postgres://postgres:postgres@localhost:${{ job.services.postgres.ports[5432] }}/lists_production_web"
SECRET_KEY: "for-testing-only" SECRET_KEY: "for-testing-only"
REDIS_HOST: "localhost" REDIS_HOST: "localhost"
CI: "true" CI: "true"

View File

@@ -189,7 +189,10 @@ WSGI_APPLICATION = "config.wsgi.application"
# https://docs.djangoproject.com/en/1.10/ref/settings/#databases # https://docs.djangoproject.com/en/1.10/ref/settings/#databases
try: try:
DATABASES = {"default": env.dj_db_url("DATABASE_URL")} DATABASES = {
"default": env.dj_db_url("DATABASE_URL"),
"hyperkitty": env.dj_db_url("HYPERKITTY_DATABASE_URL"),
}
except (ImproperlyConfigured, environs.EnvError): except (ImproperlyConfigured, environs.EnvError):
DATABASES = { DATABASES = {
"default": { "default": {
@@ -201,7 +204,17 @@ except (ImproperlyConfigured, environs.EnvError):
"USER": env("PGUSER"), "USER": env("PGUSER"),
"CONN_MAX_AGE": 0, "CONN_MAX_AGE": 0,
"OPTIONS": {"MAX_CONNS": env("MAX_CONNECTIONS", default=20)}, "OPTIONS": {"MAX_CONNS": env("MAX_CONNECTIONS", default=20)},
} },
"hyperkitty": {
"ENGINE": "django_db_geventpool.backends.postgresql_psycopg2",
"HOST": env("PGHOST"),
"NAME": env("HYPERKITTY_DATABASE_NAME", default=""),
"PASSWORD": env("PGPASSWORD"),
"PORT": env.int("PGPORT", default=5432),
"USER": env("PGUSER"),
"CONN_MAX_AGE": 0,
"OPTIONS": {"MAX_CONNS": env("MAX_CONNECTIONS", default=20)},
},
} }
# Password validation # Password validation

View File

@@ -35,6 +35,7 @@ PROD_MEDIA_CONTENT_AWS_S3_ENDPOINT_URL=$STATIC_CONTENT_AWS_S3_ENDPOINT_URL
# Mailman database settings # Mailman database settings
HYPERKITTY_DATABASE_NAME="lists_production_web" HYPERKITTY_DATABASE_NAME="lists_production_web"
DATABASE_URL="postgresql://postgres@db:5432/postgres" DATABASE_URL="postgresql://postgres@db:5432/postgres"
HYPERKITTY_DATABASE_URL="postgresql://postgres@db:5432/lists_production_web"
DATABASE_TYPE="postgres" DATABASE_TYPE="postgres"
DATABASE_CLASS="mailman.database.postgresql.PostgreSQLDatabase" DATABASE_CLASS="mailman.database.postgresql.PostgreSQLDatabase"
HYPERKITTY_API_KEY="changeme!" HYPERKITTY_API_KEY="changeme!"

View File

@@ -11,7 +11,7 @@ from django.http import HttpResponseRedirect
from django.contrib import admin, messages from django.contrib import admin, messages
from django.conf import settings from django.conf import settings
from mailing_list.models import EmailData, SubscriptionData from mailing_list.models import EmailData, SubscriptionData, ListPosting
from mailing_list.tasks import sync_mailinglist_stats from mailing_list.tasks import sync_mailinglist_stats
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@@ -112,3 +112,18 @@ class SubscriptionDataAdmin(admin.ModelAdmin):
payload = {"form": SubscribesCSVForm()} payload = {"form": SubscribesCSVForm()}
return render(request, "admin/mailinglist_subscribe_csv_form.html", payload) return render(request, "admin/mailinglist_subscribe_csv_form.html", payload)
@admin.register(ListPosting)
class ListPostingAdmin(admin.ModelAdmin):
    """Admin listing for ``ListPosting`` rows with add/change/delete disabled."""

    search_fields = ["sender_id"]
    list_display = ["id", "date", "sender_id"]

    # Each permission hook below answers False unconditionally, regardless of
    # the requesting user or the object passed in.

    def has_delete_permission(self, request, obj=None):
        """Refuse deletion of postings from the admin."""
        return False

    def has_change_permission(self, request, obj=None):
        """Refuse edits to postings from the admin."""
        return False

    def has_add_permission(self, request):
        """Refuse creation of postings from the admin."""
        return False

View File

@@ -6,23 +6,15 @@
import djclick as click import djclick as click
import logging import logging
import re import re
import warnings
from datetime import timedelta, datetime from datetime import timedelta, datetime
import html
from dateutil.relativedelta import relativedelta
from unidecode import unidecode
import requests
from mailing_list.constants import ( from mailing_list.constants import (
ML_STATS_URLS,
LATIN_1_EQUIVS,
ARG_DATE_REGEX, ARG_DATE_REGEX,
AUTHOR_PATTERN_REGEX, AUTHOR_PATTERN_REGEX,
DATE_PATTERN_REGEX, DATE_PATTERN_REGEX,
) )
from mailing_list.models import PostingData from mailing_list.models import PostingData, ListPosting
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@@ -31,18 +23,6 @@ author_pattern = re.compile(AUTHOR_PATTERN_REGEX)
date_pattern = re.compile(DATE_PATTERN_REGEX) date_pattern = re.compile(DATE_PATTERN_REGEX)
def decode_broken_html(str):
    """Undo HTML escaping and a bad byte-level decoding, then run ``unidecode``.

    The text is HTML-unescaped, each character is turned into a byte value
    (remapped through ``LATIN_1_EQUIVS`` where an entry exists), the bytes are
    decoded as UTF-8 with undecodable sequences dropped, and the result is
    passed to ``unidecode``. Warnings raised along the way are suppressed.

    NOTE(review): ``bytearray`` only accepts values in ``range(256)``, so a
    character whose (remapped) code point is larger raises ``ValueError``.
    """

    def latin_1_ord(char):
        # Prefer the table's replacement value; otherwise keep the code point.
        code_point = ord(char)
        return LATIN_1_EQUIVS.get(code_point, code_point)

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        unescaped = html.unescape(str)
        raw_bytes = bytearray(latin_1_ord(ch) for ch in unescaped)
        utf8_text = raw_bytes.decode("utf-8", "ignore")
        return unidecode(utf8_text)
def parse_datetime(date_str: str, is_start: bool) -> datetime: def parse_datetime(date_str: str, is_start: bool) -> datetime:
""" """
Parse a date string (YYYY, YYYY-MM, YYYY-MM-DD) into a datetime object. Parse a date string (YYYY, YYYY-MM, YYYY-MM-DD) into a datetime object.
@@ -75,41 +55,12 @@ def parse_datetime(date_str: str, is_start: bool) -> datetime:
return datetime(year, month, day, 23, 59, 59) return datetime(year, month, day, 23, 59, 59)
def retrieve_authors_from_ml(url, start_date, end_date, timeout=30):
    """Download one stats page and collect the posts that fall in a date window.

    The response body is scanned line by line. A line matching
    ``author_pattern`` sets the current author; a later line matching
    ``date_pattern`` produces a post for that author when its timestamp lies
    within ``[start_date, end_date]`` (both ends inclusive). Date lines seen
    before any author line are skipped.

    Args:
        url: Address of the page to download.
        start_date: Earliest ``datetime`` to keep (inclusive).
        end_date: Latest ``datetime`` to keep (inclusive).
        timeout: Seconds ``requests`` waits for the server. Without a timeout
            the request can block indefinitely on an unresponsive host.

    Returns:
        A list of unsaved ``PostingData`` objects; empty when the server
        answers 404.

    Raises:
        requests.exceptions.Timeout: The server did not respond in time.
    """
    posts = []
    logger.info(f"Retrieving data from {url=}.")
    r = requests.get(url, timeout=timeout)
    if r.status_code == 404:
        return posts

    author = None
    for line in r.text.splitlines():
        author_match = author_pattern.match(line)
        if author_match:
            # needs multiple passes to work
            author = decode_broken_html(author_match.group(1))
            continue

        date_pattern_match = date_pattern.match(line)
        if author and date_pattern_match:
            post_date = datetime.strptime(
                date_pattern_match.group(1), "%Y-%m-%d %H:%M:%S"
            )
            if start_date <= post_date <= end_date:
                posts.append(PostingData(name=author, post_time=post_date))
    return posts
def retrieve_authors(start_date, end_date): def retrieve_authors(start_date, end_date):
logger.info(f"Retrieve_authors from {start_date:%Y-%m-%d} to {end_date:%Y-%m-%d}") logger.info(f"Retrieve_authors from {start_date:%Y-%m-%d} to {end_date:%Y-%m-%d}")
start_month = datetime(start_date.year, start_date.month, 1)
end_month = datetime(end_date.year, end_date.month, 1)
authors = [] authors = []
while start_month <= end_month: for p in ListPosting.objects.filter(date__gte=start_date, date__lte=end_date):
for ml in ML_STATS_URLS: authors.append(PostingData(name=p.sender_id, post_time=p.date))
authors += retrieve_authors_from_ml(
ml.format(start_month.year, start_month.month), start_date, end_date
)
start_month = start_month + relativedelta(months=+1)
PostingData.objects.filter( PostingData.objects.filter(
post_time__gte=start_date, post_time__lte=end_date post_time__gte=start_date, post_time__lte=end_date
).delete() ).delete()

View File

@@ -57,3 +57,20 @@ class SubscriptionData(models.Model):
class Meta: class Meta:
unique_together = ["subscription_dt", "email", "list"] unique_together = ["subscription_dt", "email", "list"]
class ListPostingManager(models.Manager):
    """Manager whose base queryset is bound to the ``"hyperkitty"`` database alias."""

    def get_queryset(self):
        """Return the default queryset routed through ``using("hyperkitty")``."""
        base_queryset = super().get_queryset()
        return base_queryset.using("hyperkitty")
class ListPosting(models.Model):
    """Unmanaged model mapped onto the existing ``hyperkitty_email`` table.

    Only the three columns declared here are exposed. Django creates no
    migrations for this table because ``managed`` is ``False``.
    """

    # ``blank`` and ``null`` are left at Django's defaults (both False), so
    # every field is required and non-nullable.
    id = models.IntegerField(primary_key=True)
    date = models.DateTimeField()
    # NOTE(review): no ``max_length`` is given; confirm the Django version and
    # database backend in use accept an unbounded CharField.
    sender_id = models.CharField()

    # Queries through ``objects`` go to the "hyperkitty" database alias.
    objects = ListPostingManager()

    class Meta:
        db_table = "hyperkitty_email"
        managed = False