News article automated summaries generation (#1906)

This commit is contained in:
daveoconnor
2025-09-25 17:28:45 -07:00
committed by GitHub
parent ad6f0d0d49
commit c7571ae569
17 changed files with 327 additions and 35 deletions

View File

@@ -71,6 +71,7 @@ jobs:
SECRET_KEY: "for-testing-only"
REDIS_HOST: "localhost"
CI: "true"
OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
run: |
python -m pytest

View File

@@ -61,6 +61,7 @@ jobs:
SECRET_KEY: "for-testing-only"
REDIS_HOST: "localhost"
CI: "true"
OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
run: |
python -m pytest

View File

@@ -581,3 +581,5 @@ if DEBUG_TOOLBAR:
MIDDLEWARE.append("debug_toolbar.middleware.DebugToolbarMiddleware")
BOOST_BRANCHES = ["master", "develop"]
OPENROUTER_URL = "https://openrouter.ai/api/v1"
OPENROUTER_API_KEY = env("OPENROUTER_API_KEY")

View File

@@ -34,8 +34,6 @@ def multi_truncate_middle(value, arg):
else:
word = word_or_link
print(word_or_link)
if link_inner_match:
if len(word) > ln + 10:
start = word[: ((ln + 10) // 2)]

View File

@@ -78,3 +78,5 @@ PROD_LISTS_CORE_DB_DUMP_URL=gs://boostbackups/db1/daily/
PROD_LISTS_CORE_DB_DUMP_FILE_WILDCARD=lists_production_core.db1*
PROD_LISTS_WEB_DB_DUMP_URL=gs://boostbackups/db1/daily/
PROD_LISTS_WEB_DB_DUMP_FILE_WILDCARD=lists_production_web.db1*
OPENROUTER_API_KEY=

View File

@@ -1,2 +1,3 @@
NEWS_APPROVAL_SALT = "news-approval"
MAGIC_LINK_EXPIRATION = 3600 * 24 # 24h
CONTENT_SUMMARIZATION_THRESHOLD = 1000 # characters

View File

@@ -1,18 +1,13 @@
# import requests
#
# from django.conf import settings
#
#
# def get_link_preview_data(link):
# """gets the link preview json response from LinkPreview api"""
# api_url = "https://api.linkpreview.net"
# api_key = settings.LINK_PREVIEW_API_KEY
# target = link
#
# # TODO: Add additional field `image_size` to help validate image https://docs.linkpreview.net/#image-processing-and-validation
# response = requests.get(
# api_url,
# headers={'X-Linkpreview-Api-Key': api_key},
# params={'q': target},
# )
# return response.json()
from bs4 import BeautifulSoup
def extract_content(html: str) -> str:
    """Return the text of an HTML document as newline-separated, non-blank lines.

    Style, script, head and meta elements are removed from the parsed tree
    first; the remaining text is split into lines, each line is stripped of
    surrounding whitespace, and empty lines are dropped.
    """
    document = BeautifulSoup(html, "html.parser")
    # Calling find_all with a list matches any of the listed tag names.
    for hidden in document.find_all(["style", "script", "head", "meta", "[document]"]):
        hidden.decompose()
    raw_text = document.get_text(separator="\n")
    stripped_lines = map(str.strip, raw_text.splitlines())
    # filter(None, ...) discards the lines that were blank after stripping.
    return "\n".join(filter(None, stripped_lines))

View File

View File

View File

@@ -0,0 +1,36 @@
import djclick as click
from news.models import Entry
from news.tasks import summary_dispatcher
@click.command()
@click.option(
    "--dry-run",
    is_flag=True,
    help="Show which entries would be processed without actually dispatching tasks",
)
def command(dry_run):
    """Backpopulate summary field for news entries where summary is not set."""
    # Entries whose summary is the empty string are the ones still pending.
    pending = Entry.objects.filter(summary="")
    total = pending.count()
    if not total:
        click.echo("No entries found without summaries.")
        return

    if not dry_run:
        click.echo(f"Processing {total} entries without summaries...")
        for item in pending:
            click.echo(f"Dispatching summary task for entry {item.pk}: {item.title}")
            summary_dispatcher.delay(item.pk)
        click.echo(f"Dispatched summary tasks for {total} entries.")
        return

    # Dry run: list at most the first ten entries, then report how many remain.
    preview_size = 10
    click.echo(f"Would process {total} entries:")
    for item in pending[:preview_size]:
        click.echo(f" - {item.pk}: {item.title}")
    remaining = total - preview_size
    if remaining > 0:
        click.echo(f" ... and {remaining} more")

View File

@@ -0,0 +1,22 @@
# Generated by Django 4.2.16 on 2025-09-02 21:21
from django.db import migrations, models
class Migration(migrations.Migration):
    """Add the ``summary`` text field to the ``entry`` model of the news app."""

    # Must run after the migration that precedes it in the news app.
    dependencies = [("news", "0010_news_attachment")]

    operations = [
        migrations.AddField(
            model_name="entry",
            name="summary",
            field=models.TextField(
                help_text="AI generated summary. Delete to regenerate.",
                default="",
                blank=True,
            ),
        ),
    ]

View File

@@ -1,5 +1,6 @@
from pathlib import Path
from structlog import get_logger
from django.contrib.auth import get_user_model
from django.db import models
from django.db.models import Case, Value, When
@@ -17,8 +18,11 @@ from core.validators import (
)
from . import acl
from .constants import CONTENT_SUMMARIZATION_THRESHOLD
from .tasks import summary_dispatcher
User = get_user_model()
logger = get_logger(__name__)
class EntryManager(models.Manager):
@@ -86,6 +90,9 @@ class Entry(models.Model):
approved_at = models.DateTimeField(null=True, blank=True)
modified_at = models.DateTimeField(auto_now=True)
publish_at = models.DateTimeField(default=now)
summary = models.TextField(
blank=True, default="", help_text="AI generated summary. Delete to regenerate."
)
objects = EntryManager()
@@ -154,6 +161,21 @@ class Entry(models.Model):
result = False
return result
@cached_property
def determined_news_type(self):
    """Return the name of the first entry kind whose ``is_<kind>`` flag is truthy.

    Kinds are checked in a fixed order (blogpost, link, news, poll, video);
    later flags are not evaluated once one matches. Returns ``None`` when no
    flag is set.
    """
    for kind in ("blogpost", "link", "news", "poll", "video"):
        if getattr(self, f"is_{kind}"):
            return kind
    return None
def approve(self, user, commit=True):
"""Mark this entry as approved by the given `user`."""
if self.is_approved:
@@ -163,10 +185,28 @@ class Entry(models.Model):
if commit:
self.save(update_fields=["moderator", "approved_at", "modified_at"])
@cached_property
def use_summary(self):
    """Tell whether the summary should be shown in place of the content.

    Returns:
        ``True`` when a summary exists and the content is either empty or
        longer than ``CONTENT_SUMMARIZATION_THRESHOLD`` characters; ``False``
        otherwise. The result is always a ``bool`` so callers never receive
        the (empty) summary string itself.
    """
    if not self.summary:
        return False
    return not self.content or len(self.content) > CONTENT_SUMMARIZATION_THRESHOLD
@cached_property
def visible_content(self):
    """Return the summary when ``use_summary`` is truthy, otherwise the content."""
    return self.summary if self.use_summary else self.content
def save(self, *args, **kwargs):
    """Persist the entry and, if it has no summary yet, request one.

    A slug is derived from the title when none is set. After the row has
    been written, an entry without a summary is handed to
    ``summary_dispatcher``.

    Returns:
        Whatever the parent ``save`` returns.
    """
    # Local import keeps this method self-contained.
    from django.db import transaction

    if not self.slug:
        self.slug = slugify(self.title)
    result = super().save(*args, **kwargs)
    if not self.summary:
        pk = self.pk
        logger.info(f"Passing {self.pk=} to dispatcher")
        # Queue the task only once the surrounding transaction has committed,
        # so the worker cannot look the entry up before the row is visible.
        # Outside a transaction the callback runs immediately.
        transaction.on_commit(
            lambda: summary_dispatcher.delay(pk), using=kwargs.get("using")
        )
    return result
def get_absolute_url(self):
    """Resolve the ``news-detail`` route for this entry's slug."""
    route_args = [self.slug]
    return reverse("news-detail", args=route_args)

142
news/tasks.py Normal file
View File

@@ -0,0 +1,142 @@
from textwrap import dedent
from openai import OpenAI, OpenAIError
import requests
import structlog
from config.celery import app
from config.settings import OPENROUTER_API_KEY, OPENROUTER_URL
from news.constants import CONTENT_SUMMARIZATION_THRESHOLD
from news.helpers import extract_content
logger = structlog.get_logger(__name__)
@app.task(bind=True, max_retries=3, autoretry_for=(OpenAIError,))
def summarize_content(self, content: str, title: str, model: str) -> "str | None":
    """Summarize content using an LLM model.

    Args:
        content: Text to summarize; must be non-empty.
        title: Title sent along with the content so the model can avoid
            repeating it.
        model: Model identifier passed to the chat completions call.

    Returns:
        The summary text, or ``None`` when the response contained no usable
        message text.

    Raises:
        ValueError: If ``content`` is empty.
        OpenAIError: Raised by the client; listed in ``autoretry_for`` so the
            task is retried.
    """
    if not content:
        logger.warning("No content provided to summarize, skipping.")
        raise ValueError("No content provided to summarize.")
    logger.info(f"Summarizing {content[:100]=}... with {model=}")
    max_length = 256
    system_prompt = dedent(
        f"""
        You are an experienced technical writer tasked with summarizing content. Provide
        a brief description of what the content after the "----" is discussing.
        The title is also provided and may be in the content, repeating it in the
        summary would be redundant so should be avoided.
        Your summary should be concise, clear, and capture the main points of the
        content. It should be less than {max_length} characters, with a single paragraph
        of text, without going into detail. Before returning your response, check if
        it's less than {max_length} characters, if not, shorten it until it is.
        Write summaries in an impersonal, passive voice, never attributing actions to
        'the author' or similar.
        If no content is provided, do not return anything at all.
        Don't format with markdown, html, or any other markup, just plain text.
        Avoid adding any personal opinions or extraneous information.
        Do not allow any NSFW content such as profanity, sexual content, or violence to
        be returned in the summary, work around it.
        Do not allow any security vulnerabilities to be returned in the summary, work
        around them.
        """
    )
    user_prompt = dedent(
        f"""
        Please provide a summary of the following content:
        ----
        Title: {title}
        Content: {content}
        """
    )
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    logger.debug(f"{messages=}")
    content = None
    try:
        client = OpenAI(base_url=OPENROUTER_URL, api_key=OPENROUTER_API_KEY)
        response = client.chat.completions.create(model=model, messages=messages)
        content = response.choices[0].message.content
    except (AttributeError, IndexError, TypeError) as e:
        # TypeError covers a response whose ``choices`` is None; all three are
        # treated as "no usable summary" rather than a task failure.
        logger.error(f"Error getting summarized content: {e=}")
        return None
    if not content:
        # The message text may be None or empty; slicing it for the log line
        # below would otherwise raise.
        logger.error("Error getting summarized content: response had no message text")
        return None
    logger.info(
        f"Received summarized content for {content[:100]=}: {len(content)=}..."
    )
    return content
@app.task
def save_entry_summary_value(summary: str, pk: int):
    """Store a generated summary on the entry identified by ``pk``.

    Args:
        summary: Summary text; when this task is linked after another task,
            that task's return value arrives here.
        pk: Primary key of the entry to update.

    Nothing is written when ``summary`` is empty or ``None``, or when the
    entry no longer exists.
    """
    from news.models import Entry

    if not summary:
        logger.warning(f"No summary to save for {pk=}, skipping.")
        return
    try:
        entry = Entry.objects.get(pk=pk)
    except Entry.DoesNotExist:
        logger.warning(f"Entry {pk=} no longer exists, summary not saved.")
        return
    entry.summary = summary
    # Write only the summary (plus the auto-updated timestamp) so values that
    # changed since this task was queued are not overwritten with stale data.
    entry.save(update_fields=["summary", "modified_at"])
@app.task
def summary_dispatcher(pk: int):
    """Queue the summary task that matches the entry's determined news type.

    Args:
        pk: Primary key of the entry to summarize.

    Entries whose ``determined_news_type`` has no handler (including
    ``None``) are logged and skipped.
    """
    from news.models import Entry

    entry = Entry.objects.get(pk=pk)
    logger.info(f"Dispatching {pk=} with {entry.news_type=}")
    handlers = {
        "news": set_summary_for_event_entry,
        "blogpost": set_summary_for_event_entry,
        "link": set_summary_for_link_entry,
        "video": set_summary_for_video_entry,
        "poll": set_summary_for_poll_entry,
    }
    # ``determined_news_type`` may be None; .get avoids a KeyError in that case.
    handler = handlers.get(entry.determined_news_type)
    if handler is None:
        logger.warning(f"No summary handler for {pk=}, skipping.")
        return
    logger.info(f"Dispatching summary task for {pk=} to {handler.__name__=}")
    handler.delay(pk)
@app.task
def set_summary_for_event_entry(pk: int):
    """Queue summarization of an entry's own content.

    Args:
        pk: Primary key of the entry.

    Entries with no content, or with content shorter than
    ``CONTENT_SUMMARIZATION_THRESHOLD`` characters, are skipped. Otherwise
    ``summarize_content`` is queued with ``save_entry_summary_value`` linked
    to store its result.
    """
    from news.models import Entry

    entry = Entry.objects.get(pk=pk)
    logger.info(f"dispatching summarize task for {pk=} with {entry.content[:40]=}...")
    if not entry.content:
        # summarize_content raises on empty input, so stop here instead.
        logger.warning(f"No content to summarize for {pk=}, skipping.")
        return
    if len(entry.content) < CONTENT_SUMMARIZATION_THRESHOLD:
        logger.warning(f"Content too short to summarize for {pk=}, skipping.")
        return
    logger.info(f"handing off {pk=} to summarize_content task")
    summarize_content.apply_async(
        (entry.content, entry.title, "gpt-oss-120b"),
        link=save_entry_summary_value.s(pk),
    )
@app.task
def set_summary_for_link_entry(pk: int):
    """Fetch a link entry's external page and queue summarization of its text.

    Args:
        pk: Primary key of the entry.

    The task stops without queuing anything when the request fails or when
    no text can be extracted from the fetched markup.
    """
    logger.info(f"Setting summary for link entry {pk=}")
    from news.models import Entry

    entry = Entry.objects.get(pk=pk)
    # NOTE(review): this fetches whatever URL is stored on the entry; if that
    # value can be user-supplied, consider restricting schemes/hosts - confirm.
    try:
        logger.info(f"Fetching content from {entry.external_url=} for entry.{pk=}")
        response = requests.get(entry.external_url, timeout=10)
        response.raise_for_status()
    except requests.RequestException as e:
        logger.error(f"Error fetching content from {entry.external_url=}: {e=}")
        return
    markup = response.text
    logger.debug(f"Fetched {len(markup)=} for entry.{pk=}...")
    content = extract_content(markup)
    logger.info(f"extracted content from {entry.external_url=}, {markup[:100]=}")
    if not content:
        # summarize_content raises on empty input, so stop here instead.
        logger.warning(f"No text extracted from {entry.external_url=}, skipping.")
        return
    logger.info(f"dispatching summarize task for {pk=} with {content[:40]=}...")
    summarize_content.apply_async(
        (content, entry.title, "gpt-oss-120b"), link=save_entry_summary_value.s(pk)
    )
@app.task
def set_summary_for_video_entry(pk: int):
    """Placeholder task for video entries: logs a message and does nothing else."""
    message = "Summarization not implemented"
    logger.info(message)
@app.task
def set_summary_for_poll_entry(pk: int):
    """Placeholder task for poll entries: logs a message and does nothing else."""
    message = "Summarization not implemented"
    logger.info(message)

View File

@@ -34,6 +34,7 @@ jsoncomment
unidecode
wordcloud
lxml
openai
# Logging
django-tracer

View File

@@ -2,6 +2,12 @@
# uv pip compile ./requirements.in --no-strip-extras --output-file ./requirements.txt
amqp==5.3.1
# via kombu
annotated-types==0.7.0
# via pydantic
anyio==4.10.0
# via
# httpx
# openai
appdirs==1.4.4
# via fs
argon2-cffi==25.1.0
@@ -40,6 +46,8 @@ celery==5.5.3
certifi==2025.8.3
# via
# elasticsearch
# httpcore
# httpx
# minio
# requests
cffi==1.17.1
@@ -82,7 +90,9 @@ decorator==5.2.1
# via ipython
distlib==0.4.0
# via virtualenv
dj-database-url==3.0.1
distro==1.9.0
# via openai
dj-database-url==2.2.0
# via environs
dj-email-url==1.0.6
# via environs
@@ -179,11 +189,20 @@ greenlet==3.2.4
# gevent
gunicorn==23.0.0
# via -r ./requirements.in
identify==2.6.13
h11==0.16.0
# via httpcore
httpcore==1.0.9
# via httpx
httpx==0.28.1
# via openai
identify==2.6.1
# via pre-commit
idna==3.10
# via requests
iniconfig==2.1.0
# via
# anyio
# httpx
# requests
iniconfig==2.0.0
# via pytest
interrogate==1.7.0
# via -r ./requirements.in
@@ -195,6 +214,8 @@ itsdangerous==2.2.0
# via -r ./requirements.in
jedi==0.19.2
# via ipython
jiter==0.10.0
# via openai
jmespath==1.0.1
# via
# boto3
@@ -236,7 +257,9 @@ oauthlib==3.3.1
# via
# django-allauth
# django-oauth-toolkit
packaging==25.0
openai==1.102.0
# via -r ./requirements.in
packaging==24.1
# via
# black
# django-haystack
@@ -288,16 +311,20 @@ pycparser==2.22
# via cffi
pycryptodome==3.23.0
# via minio
pygments==2.19.2
pydantic==2.11.7
# via openai
pydantic-core==2.33.2
# via pydantic
pygments==2.18.0
# via
# ipython
# ipython-pygments-lexers
# pytest
pyjwt[crypto]==2.10.1
pyjwt[crypto]==2.9.0
# via
# django-allauth
# redis
pyparsing==3.2.3
pyparsing==3.2.0
# via matplotlib
pytest==8.4.2
# via
@@ -353,7 +380,11 @@ six==1.17.0
# python-dateutil
slack-sdk==3.36.0
# via -r ./requirements.in
soupsieve==2.8
sniffio==1.3.1
# via
# anyio
# openai
soupsieve==2.6
# via beautifulsoup4
sqlparse==0.5.3
# via django
@@ -363,16 +394,26 @@ structlog==25.4.0
# via -r ./requirements.in
tabulate==0.9.0
# via interrogate
tqdm==4.67.1
# via openai
traitlets==5.14.3
# via
# ipython
# matplotlib-inline
typing-extensions==4.15.0
# via
# anyio
# beautifulsoup4
# dj-database-url
# ipython
# jwcrypto
# minio
# openai
# pydantic
# pydantic-core
# typing-inspection
typing-inspection==0.4.1
# via pydantic
tzdata==2025.2
# via
# faker

View File

@@ -78,8 +78,11 @@
{% if entry.news.attachment %}
<a href="{{ entry.news.attachment.url }}" class="text-sky-600 dark:text-sky-300 hover:text-orange dark:hover:text-orange">{{ entry.news.attachment_filename }}</a>
{% endif %}
{{ entry.content|urlize|url_target_blank:'text-sky-600 dark:text-sky-300 hover:text-orange dark:hover:text-orange'|linebreaks }}
{% with content_to_display=entry.content|default:entry.visible_content %}
{% if content_to_display %}
{{ content_to_display|urlize|url_target_blank:'text-sky-600 dark:text-sky-300 hover:text-orange dark:hover:text-orange'|linebreaks }}
{% endif %}
{% endwith %}
</div>
{% if entry.image %}

View File

@@ -141,9 +141,16 @@
<span class="text-xs">{{ entry.publish_at|date:"M jS, Y" }}</span>
</div>
</div>
{% if entry.content %}
<div>
<span class="text-base text-gray-500 dark:text-white/70">{{ entry.content|urlize|url_target_blank:'text-sky-600 dark:text-sky-300 hover:text-orange dark:hover:text-orange'|linebreaksbr|multi_truncate_middle:30|truncatechars_html:500 }}</span>
{% if entry.visible_content %}
<div>
<div class="text-base text-gray-500 dark:text-white/70">{{ entry.visible_content|urlize|url_target_blank:'text-sky-600 dark:text-sky-300 hover:text-orange dark:hover:text-orange'|linebreaksbr|multi_truncate_middle:30 }}</div>
{% if entry.use_summary %}
{% if entry.determined_news_type == "link" %}
<a href="{{ entry.external_url }}" class="inline-block text-sky-600 dark:text-sky-300 hover:text-orange dark:hover:text-orange mt-2">Read more&#8230;</a>
{% else %}
<a href="{{ entry.get_absolute_url }}" class="inline-block text-sky-600 dark:text-sky-300 hover:text-orange dark:hover:text-orange mt-2">Read more&#8230;</a>
{% endif %}
{% endif %}
</div>
{% endif %}