Improve mailing list fetching efficiency during release tasks (#1746)

This commit is contained in:
Greg Kaleka
2025-04-12 17:31:48 -04:00
committed by GitHub
parent d21352dbcf
commit 6b58dc3692
2 changed files with 48 additions and 36 deletions

View File

@@ -1,6 +1,7 @@
import traceback
from contextlib import suppress
from dataclasses import dataclass
from datetime import timedelta
from typing import Callable
import djclick as click
@@ -82,13 +83,16 @@ class ReleaseTasksManager:
ReleaseTask("Updating github issues", ["update_issues"]),
ReleaseTask("Updating slack activity buckets", ["fetch_slack_activity"]),
ReleaseTask("Updating website statistics", self.update_website_statistics),
ReleaseTask("Importing mailing list counts", ["import_ml_counts"]),
ReleaseTask("Importing mailing list counts", self.import_ml_counts),
ReleaseTask("Generating report", self.generate_report),
]
def update_release_data(self) -> dict[str:int]:
for task in self.tasks:
self.progress_messages.append(progress_message(f"{task.description}..."))
# "Release Task: " prefix for easy log parsing
self.progress_messages.append(
progress_message(f"Release Task: {task.description}...")
)
task.run()
self.progress_messages.append(
progress_message(f"Finished {task.description.lower()}")
@@ -110,6 +114,15 @@ class ReleaseTasksManager:
report, _ = WebsiteStatReport.objects.get_or_create(version=self.latest_version)
report.populate_from_api()
def import_ml_counts(self):
    """Import mailing list counts for roughly the last four months.

    Calls the ``import_ml_counts`` command with a ``start_date`` set to
    120 days before the current time, formatted as ``YYYY-MM-DD``. That
    window should be more than enough for a release, and it saves a lot of
    time compared with importing everything.
    """
    lookback = timedelta(days=120)  # approximately four months
    start_date = timezone.now() - lookback
    call_command("import_ml_counts", start_date=start_date.strftime("%Y-%m-%d"))
def generate_report(self):
if not self.should_generate_report:
self.progress_messages.append(

View File

@@ -43,39 +43,36 @@ def decode_broken_html(str):
)
def parse_start_datetime(date_str):
def parse_datetime(date_str: str, is_start: bool) -> datetime:
"""
Parse a date string (YYYY, YYYY-MM, YYYY-MM-DD) into a datetime object.
If is_start=True, returns the earliest time possible for the data given.
If is_start=False, returns the latest time possible for the data given.
"""
m = arg_date_pattern.match(date_str)
if not m:
raise ValueError("wrong date format")
logger.info(f"{m=} {m.group(1)=} {m.group(2)=} {m.group(3)=}")
return datetime(
int(m.group(3)) if m.group(3) else 1,
int(m.group(2)) if m.group(2) else 1,
int(m.group(1)),
0,
0,
0,
)
raise ValueError(f"Invalid date format: {date_str!r}")
year_text, month_text, day_text = m.groups()
year = int(year_text)
month = int(month_text) if month_text is not None else (1 if is_start else 12)
day = int(day_text) if day_text is not None else 1
def parse_end_datetime(date_str):
m = arg_date_pattern.match(date_str)
if not m:
raise ValueError("wrong date format")
logger.info(f"{m=} {m.group(1)=} {m.group(2)=} {m.group(3)=}")
if m.group(2):
if m.group(3):
return datetime(
int(m.group(3)), int(m.group(2)), int(m.group(1)), 23, 59, 59
)
else:
return (
datetime(int(m.group(1)), int(m.group(2)), 1) + timedelta(days=31),
23,
59,
59,
).replace(day=1) - timedelta(days=1)
return datetime(int(m.group(1)), 12, 31, 23, 59, 59)
if is_start:
# Start date - return start of day
return datetime(year, month, day, 0, 0, 0)
# End date - return latest datetime possible from given criteria
if day_text is None:
# No day provided: find the last day of the month
first_of_next_month = (datetime(year, month, 1) + timedelta(days=31)).replace(
day=1
)
last_day_of_month = first_of_next_month - timedelta(days=1)
day = last_day_of_month.day
return datetime(year, month, day, 23, 59, 59)
def retrieve_authors_from_ml(url, start_date, end_date):
@@ -103,7 +100,7 @@ def retrieve_authors_from_ml(url, start_date, end_date):
def retrieve_authors(start_date, end_date):
logger.info(f"retrieve_authors from {start_date=} to {end_date=}")
logger.info(f"Retrieve_authors from {start_date:%Y-%m-%d} to {end_date:%Y-%m-%d}")
start_month = datetime(start_date.year, start_date.month, 1)
end_month = datetime(end_date.year, end_date.month, 1)
authors = []
@@ -125,9 +122,11 @@ def retrieve_authors(start_date, end_date):
def command(start_date, end_date):
logger.info(f"Starting import_ml_counts {start_date=} {end_date=}")
start_date = (
parse_start_datetime(start_date) if start_date else datetime(1998, 11, 11)
parse_datetime(start_date, is_start=True)
if start_date
else datetime(1998, 11, 11)
)
logger.info(f"{start_date=}")
end_date = parse_end_datetime(end_date) if end_date else datetime.now()
logger.info(f"{end_date=}")
logger.info(f"{start_date = }")
end_date = parse_datetime(end_date, is_start=False) if end_date else datetime.now()
logger.info(f"{end_date = }")
retrieve_authors(start_date, end_date)