mirror of
https://github.com/boostorg/website-v2.git
synced 2026-01-19 04:42:17 +00:00
News article automated summaries generation (#1906)
This commit is contained in:
1
.github/workflows/actions-gcp.yaml
vendored
1
.github/workflows/actions-gcp.yaml
vendored
@@ -71,6 +71,7 @@ jobs:
|
||||
SECRET_KEY: "for-testing-only"
|
||||
REDIS_HOST: "localhost"
|
||||
CI: "true"
|
||||
OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
|
||||
run: |
|
||||
python -m pytest
|
||||
|
||||
|
||||
1
.github/workflows/actions.yml
vendored
1
.github/workflows/actions.yml
vendored
@@ -61,6 +61,7 @@ jobs:
|
||||
SECRET_KEY: "for-testing-only"
|
||||
REDIS_HOST: "localhost"
|
||||
CI: "true"
|
||||
OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
|
||||
run: |
|
||||
python -m pytest
|
||||
|
||||
|
||||
@@ -581,3 +581,5 @@ if DEBUG_TOOLBAR:
|
||||
MIDDLEWARE.append("debug_toolbar.middleware.DebugToolbarMiddleware")
|
||||
|
||||
BOOST_BRANCHES = ["master", "develop"]
|
||||
OPENROUTER_URL = "https://openrouter.ai/api/v1"
|
||||
OPENROUTER_API_KEY = env("OPENROUTER_API_KEY")
|
||||
|
||||
@@ -34,8 +34,6 @@ def multi_truncate_middle(value, arg):
|
||||
else:
|
||||
word = word_or_link
|
||||
|
||||
print(word_or_link)
|
||||
|
||||
if link_inner_match:
|
||||
if len(word) > ln + 10:
|
||||
start = word[: ((ln + 10) // 2)]
|
||||
|
||||
@@ -78,3 +78,5 @@ PROD_LISTS_CORE_DB_DUMP_URL=gs://boostbackups/db1/daily/
|
||||
PROD_LISTS_CORE_DB_DUMP_FILE_WILDCARD=lists_production_core.db1*
|
||||
PROD_LISTS_WEB_DB_DUMP_URL=gs://boostbackups/db1/daily/
|
||||
PROD_LISTS_WEB_DB_DUMP_FILE_WILDCARD=lists_production_web.db1*
|
||||
|
||||
OPENROUTER_API_KEY=
|
||||
|
||||
@@ -1,2 +1,3 @@
|
||||
NEWS_APPROVAL_SALT = "news-approval"
|
||||
MAGIC_LINK_EXPIRATION = 3600 * 24 # 24h
|
||||
CONTENT_SUMMARIZATION_THRESHOLD = 1000 # characters
|
||||
|
||||
@@ -1,18 +1,13 @@
|
||||
# import requests
|
||||
#
|
||||
# from django.conf import settings
|
||||
#
|
||||
#
|
||||
# def get_link_preview_data(link):
|
||||
# """gets the link preview json response from LinkPreview api"""
|
||||
# api_url = "https://api.linkpreview.net"
|
||||
# api_key = settings.LINK_PREVIEW_API_KEY
|
||||
# target = link
|
||||
#
|
||||
# # TODO: Add additional field `image_size` to help validate image https://docs.linkpreview.net/#image-processing-and-validation
|
||||
# response = requests.get(
|
||||
# api_url,
|
||||
# headers={'X-Linkpreview-Api-Key': api_key},
|
||||
# params={'q': target},
|
||||
# )
|
||||
# return response.json()
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
|
||||
def extract_content(html: str) -> str:
    """Return the text of an HTML document with blank lines removed.

    Elements matching the non-visible tag list are removed from the parsed
    tree first. The remaining text is split into lines, each line is stripped
    of surrounding whitespace, and empty lines are dropped before the result
    is joined back together with newlines.
    """
    document = BeautifulSoup(html, "html.parser")
    for hidden in document(["style", "script", "head", "meta", "[document]"]):
        hidden.decompose()
    raw_text = document.get_text(separator="\n")
    stripped = map(str.strip, raw_text.splitlines())
    # filter(None, ...) discards the empty strings left by blank lines
    return "\n".join(filter(None, stripped))
|
||||
|
||||
0
news/management/__init__.py
Normal file
0
news/management/__init__.py
Normal file
0
news/management/commands/__init__.py
Normal file
0
news/management/commands/__init__.py
Normal file
36
news/management/commands/backpopulate_summaries.py
Normal file
36
news/management/commands/backpopulate_summaries.py
Normal file
@@ -0,0 +1,36 @@
|
||||
import djclick as click
|
||||
from news.models import Entry
|
||||
from news.tasks import summary_dispatcher
|
||||
|
||||
|
||||
@click.command()
@click.option(
    "--dry-run",
    is_flag=True,
    help="Show which entries would be processed without actually dispatching tasks",
)
def command(dry_run):
    """Backpopulate summary field for news entries where summary is not set."""
    # Entries whose summary is still the empty string are the ones to process.
    pending = Entry.objects.filter(summary="")
    total = pending.count()

    if not total:
        click.echo("No entries found without summaries.")
        return

    if dry_run:
        # Preview only: list at most the first ten entries, then a remainder count.
        preview_limit = 10
        click.echo(f"Would process {total} entries:")
        for item in pending[:preview_limit]:
            click.echo(f" - {item.pk}: {item.title}")
        if total > preview_limit:
            click.echo(f" ... and {total - preview_limit} more")
        return

    click.echo(f"Processing {total} entries without summaries...")

    for item in pending:
        click.echo(f"Dispatching summary task for entry {item.pk}: {item.title}")
        summary_dispatcher.delay(item.pk)

    click.echo(f"Dispatched summary tasks for {total} entries.")
|
||||
22
news/migrations/0011_entry_summary.py
Normal file
22
news/migrations/0011_entry_summary.py
Normal file
@@ -0,0 +1,22 @@
|
||||
# Generated by Django 4.2.16 on 2025-09-02 21:21
|
||||
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
# Schema migration for the "news" app: adds a "summary" text field to the
# "entry" model.
class Migration(migrations.Migration):
|
||||
|
||||
# Must be applied after news migration 0010_news_attachment.
dependencies = [
|
||||
("news", "0010_news_attachment"),
|
||||
]
|
||||
|
||||
# Single operation: add the field as optional (blank allowed) with an empty
# string default.
operations = [
|
||||
migrations.AddField(
|
||||
model_name="entry",
|
||||
name="summary",
|
||||
field=models.TextField(
|
||||
blank=True,
|
||||
default="",
|
||||
help_text="AI generated summary. Delete to regenerate.",
|
||||
),
|
||||
),
|
||||
]
|
||||
@@ -1,5 +1,6 @@
|
||||
from pathlib import Path
|
||||
|
||||
from structlog import get_logger
|
||||
from django.contrib.auth import get_user_model
|
||||
from django.db import models
|
||||
from django.db.models import Case, Value, When
|
||||
@@ -17,8 +18,11 @@ from core.validators import (
|
||||
)
|
||||
|
||||
from . import acl
|
||||
from .constants import CONTENT_SUMMARIZATION_THRESHOLD
|
||||
from .tasks import summary_dispatcher
|
||||
|
||||
User = get_user_model()
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
class EntryManager(models.Manager):
|
||||
@@ -86,6 +90,9 @@ class Entry(models.Model):
|
||||
approved_at = models.DateTimeField(null=True, blank=True)
|
||||
modified_at = models.DateTimeField(auto_now=True)
|
||||
publish_at = models.DateTimeField(default=now)
|
||||
summary = models.TextField(
|
||||
blank=True, default="", help_text="AI generated summary. Delete to regenerate."
|
||||
)
|
||||
|
||||
objects = EntryManager()
|
||||
|
||||
@@ -154,6 +161,21 @@ class Entry(models.Model):
|
||||
result = False
|
||||
return result
|
||||
|
||||
@cached_property
def determined_news_type(self):
    """Return the entry kind as a lowercase label, or ``None`` if no flag is set.

    The ``is_<label>`` attributes are checked in a fixed order (blogpost,
    link, news, poll, video); the first truthy one decides the result and
    later flags are not consulted.
    """
    for label in ("blogpost", "link", "news", "poll", "video"):
        if getattr(self, f"is_{label}"):
            return label
    return None
|
||||
|
||||
def approve(self, user, commit=True):
|
||||
"""Mark this entry as approved by the given `user`."""
|
||||
if self.is_approved:
|
||||
@@ -163,10 +185,28 @@ class Entry(models.Model):
|
||||
if commit:
|
||||
self.save(update_fields=["moderator", "approved_at", "modified_at"])
|
||||
|
||||
@cached_property
def use_summary(self):
    """Tell whether the summary should be shown in place of the content.

    Falsy when there is no summary (the empty summary value itself is
    returned). Otherwise true when the entry has no content, or when the
    content is longer than ``CONTENT_SUMMARIZATION_THRESHOLD`` characters.
    """
    if not self.summary:
        # Preserve the original short-circuit result: the falsy summary itself.
        return self.summary
    if not self.content:
        return True
    return len(self.content) > CONTENT_SUMMARIZATION_THRESHOLD
|
||||
|
||||
@cached_property
def visible_content(self):
    """Return the summary when ``use_summary`` is truthy, else the content."""
    return self.summary if self.use_summary else self.content
|
||||
|
||||
def save(self, *args, **kwargs):
    """Persist the entry and queue summary generation if it has no summary.

    A slug is derived from the title when none is set. After the row has been
    written, entries with an empty ``summary`` are handed to
    ``summary_dispatcher``.

    The Celery dispatch is deferred with ``transaction.on_commit`` so that a
    worker never looks up the entry before the surrounding transaction has
    committed. Outside a transaction the callback runs immediately.

    Returns whatever the parent ``save`` returns.
    """
    from django.db import transaction

    if not self.slug:
        self.slug = slugify(self.title)
    result = super().save(*args, **kwargs)

    if not self.summary:
        # Capture the pk now; the callback may run after this method returns.
        pk = self.pk
        logger.info(f"Passing {self.pk=} to dispatcher")
        transaction.on_commit(lambda: summary_dispatcher.delay(pk))

    return result
|
||||
|
||||
def get_absolute_url(self):
|
||||
return reverse("news-detail", args=[self.slug])
|
||||
|
||||
142
news/tasks.py
Normal file
142
news/tasks.py
Normal file
@@ -0,0 +1,142 @@
|
||||
from textwrap import dedent
|
||||
from openai import OpenAI, OpenAIError
|
||||
import requests
|
||||
import structlog
|
||||
|
||||
from config.celery import app
|
||||
from config.settings import OPENROUTER_API_KEY, OPENROUTER_URL
|
||||
from news.constants import CONTENT_SUMMARIZATION_THRESHOLD
|
||||
from news.helpers import extract_content
|
||||
|
||||
logger = structlog.get_logger(__name__)
|
||||
|
||||
|
||||
@app.task(bind=True, max_retries=3, autoretry_for=(OpenAIError,))
def summarize_content(self, content: str, title: str, model: str) -> str:
    """Summarize content using an LLM model.

    Args:
        content: Plain text to summarize; must be non-empty.
        title: Title of the entry, passed to the model so it can avoid
            repeating it in the summary.
        model: Model identifier forwarded to the chat completions endpoint.

    Returns:
        The summary text returned by the model, stripped of surrounding
        whitespace. The value is always a non-empty string.

    Raises:
        ValueError: If ``content`` is empty, or if the response carries no
            usable summary. Failing here keeps a ``None`` or empty value from
            reaching any task linked to this one.
        OpenAIError: Propagated from the client; the task is configured to
            retry automatically on it.
    """
    if not content:
        logger.warning("No content provided to summarize, skipping.")
        raise ValueError("No content provided to summarize.")
    logger.info(f"Summarizing {content[:100]=}... with {model=}")
    max_length = 256
    system_prompt = dedent(
        f"""
        You are an experienced technical writer tasked with summarizing content. Provide
        a brief description of what the content after the "----" is discussing.
        The title is also provided and may be in the content, repeating it in the
        summary would be redundant so should be avoided.
        Your summary should be concise, clear, and capture the main points of the
        content. It should be less than {max_length} characters, with a single paragraph
        of text, without going into detail. Before returning your response, check if
        it's less than {max_length} characters, if not, shorten it until it is.
        Write summaries in an impersonal, passive voice, never attributing actions to
        'the author' or similar.
        If no content is provided, do not return anything at all.
        Don't format with markdown, html, or any other markup, just plain text.
        Avoid adding any personal opinions or extraneous information.
        Do not allow any NSFW content such as profanity, sexual content, or violence to
        be returned in the summary, work around it.
        Do not allow any security vulnerabilities to be returned in the summary, work
        around them.
        """
    )
    # Dedent the template BEFORE interpolating: multi-line content has lines
    # with no leading whitespace, which would otherwise cancel the dedent.
    user_prompt = dedent(
        """
        Please provide a summary of the following content:
        ----
        Title: {title}
        Content: {content}
        """
    ).format(title=title, content=content)
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    logger.debug(f"{messages=}")

    # OpenAIError raised here propagates so that autoretry_for can act on it.
    client = OpenAI(base_url=OPENROUTER_URL, api_key=OPENROUTER_API_KEY)
    response = client.chat.completions.create(model=model, messages=messages)

    try:
        summary = response.choices[0].message.content
    except (AttributeError, IndexError, TypeError) as e:
        # Malformed response, e.g. missing, empty or None ``choices``.
        logger.error(f"Error getting summarized content: {e=}")
        raise ValueError("LLM response did not contain a summary.") from e

    summary = (summary or "").strip()
    if not summary:
        logger.error(f"Empty summary returned by {model=}")
        raise ValueError("LLM returned an empty summary.")

    logger.info(f"Received summarized content for {summary[:100]=}: {len(summary)=}...")
    return summary
|
||||
|
||||
|
||||
@app.task
def save_entry_summary_value(summary: str, pk: int):
    """Store a generated summary on the news entry with primary key ``pk``.

    Args:
        summary: Summary text, normally the result of the task this one is
            linked to.
        pk: Primary key of the ``Entry`` to update.

    Empty or missing summaries are ignored: saving an entry without a summary
    makes ``Entry.save`` dispatch summarization again, which would loop
    forever. If the entry no longer exists, the task logs and returns.
    """
    from news.models import Entry

    if not summary:
        logger.warning(f"No summary to save for entry {pk=}, skipping.")
        return

    try:
        entry = Entry.objects.get(pk=pk)
    except Entry.DoesNotExist:
        logger.warning(f"Entry {pk=} no longer exists, dropping summary.")
        return

    entry.summary = summary
    # Write only the summary (plus the auto-updated timestamp) so that edits
    # made to other fields while the summary was generated are not overwritten.
    entry.save(update_fields=["summary", "modified_at"])
|
||||
|
||||
|
||||
@app.task
def summary_dispatcher(pk: int):
    """Route a news entry to the summary task matching its type.

    Args:
        pk: Primary key of the ``Entry`` to summarize.

    The handler is chosen from ``entry.determined_news_type``. Entries whose
    type cannot be determined (``None`` or an unknown label) are logged and
    skipped instead of failing with ``KeyError``.
    """
    from news.models import Entry

    entry = Entry.objects.get(pk=pk)
    logger.info(f"Dispatching {pk=} with {entry.news_type=}")
    handlers = {
        "news": set_summary_for_event_entry,
        "blogpost": set_summary_for_event_entry,
        "link": set_summary_for_link_entry,
        "video": set_summary_for_video_entry,
        "poll": set_summary_for_poll_entry,
    }
    handler = handlers.get(entry.determined_news_type)
    if handler is None:
        logger.warning(
            f"No summary handler for {pk=} with {entry.determined_news_type=}, skipping."
        )
        return
    logger.info(f"Dispatching summary task for {pk=} to {handler.__name__=}")
    handler.delay(pk)
|
||||
|
||||
|
||||
@app.task
def set_summary_for_event_entry(pk: int):
    """Queue summarization of an entry's own content (news and blog posts).

    Args:
        pk: Primary key of the ``Entry`` to summarize.

    Nothing is queued when the entry has no content (``summarize_content``
    would only raise) or when the content does not exceed
    ``CONTENT_SUMMARIZATION_THRESHOLD`` characters. The second check mirrors
    ``Entry.use_summary``, which only displays a summary for content longer
    than the threshold. Otherwise ``summarize_content`` is queued with
    ``save_entry_summary_value`` linked to store its result.
    """
    from news.models import Entry

    entry = Entry.objects.get(pk=pk)
    logger.info(f"dispatching summarize task for {pk=} with {entry.content[:40]=}...")
    if not entry.content:
        logger.warning(f"No content to summarize for {pk=}, skipping.")
        return
    if len(entry.content) <= CONTENT_SUMMARIZATION_THRESHOLD:
        logger.warning(f"Content too short to summarize for {pk=}, skipping.")
        return
    logger.info(f"handing off {pk=} to summarize_content task")
    summarize_content.apply_async(
        (entry.content, entry.title, "gpt-oss-120b"),
        link=save_entry_summary_value.s(pk),
    )
|
||||
|
||||
|
||||
@app.task
def set_summary_for_link_entry(pk: int):
    """Fetch a link entry's external page and queue summarization of its text.

    Args:
        pk: Primary key of the ``Entry`` to summarize.

    The page at ``entry.external_url`` is downloaded with a 10 second timeout
    and reduced to text with ``extract_content``. Request failures are logged
    and end the task. Pages that yield no text are skipped, because
    ``summarize_content`` rejects empty content. Otherwise
    ``summarize_content`` is queued with ``save_entry_summary_value`` linked
    to store its result.
    """
    logger.info(f"Setting summary for link entry {pk=}")
    from news.models import Entry

    entry = Entry.objects.get(pk=pk)
    # NOTE(review): external_url is user-supplied and fetched server-side;
    # confirm that outbound requests from workers are suitably restricted.
    try:
        logger.info(f"Fetching content from {entry.external_url=} for entry.{pk=}")
        response = requests.get(entry.external_url, timeout=10)
        response.raise_for_status()
    except requests.RequestException as e:
        logger.error(f"Error fetching content from {entry.external_url=}: {e=}")
        return

    markup = response.text
    logger.debug(f"Fetched {len(markup)=} for entry.{pk=}...")
    content = extract_content(markup)
    logger.info(f"extracted content from {entry.external_url=}, {markup[:100]=}")

    if not content:
        logger.warning(f"No text extracted from {entry.external_url=} for {pk=}, skipping.")
        return

    logger.info(f"dispatching summarize task for {pk=} with {content[:40]=}...")
    summarize_content.apply_async(
        (content, entry.title, "gpt-oss-120b"), link=save_entry_summary_value.s(pk)
    )
|
||||
|
||||
|
||||
@app.task
def set_summary_for_video_entry(pk: int):
    """Placeholder for video entries: logs that summarization is not implemented.

    ``pk`` is accepted so the task has the same signature as the other
    summary handlers; it is not used.
    """
    logger.info("Summarization not implemented")
|
||||
|
||||
|
||||
@app.task
def set_summary_for_poll_entry(pk: int):
    """Placeholder for poll entries: logs that summarization is not implemented.

    ``pk`` is accepted so the task has the same signature as the other
    summary handlers; it is not used.
    """
    logger.info("Summarization not implemented")
|
||||
@@ -34,6 +34,7 @@ jsoncomment
|
||||
unidecode
|
||||
wordcloud
|
||||
lxml
|
||||
openai
|
||||
|
||||
# Logging
|
||||
django-tracer
|
||||
|
||||
@@ -2,6 +2,12 @@
|
||||
# uv pip compile ./requirements.in --no-strip-extras --output-file ./requirements.txt
|
||||
amqp==5.3.1
|
||||
# via kombu
|
||||
annotated-types==0.7.0
|
||||
# via pydantic
|
||||
anyio==4.10.0
|
||||
# via
|
||||
# httpx
|
||||
# openai
|
||||
appdirs==1.4.4
|
||||
# via fs
|
||||
argon2-cffi==25.1.0
|
||||
@@ -40,6 +46,8 @@ celery==5.5.3
|
||||
certifi==2025.8.3
|
||||
# via
|
||||
# elasticsearch
|
||||
# httpcore
|
||||
# httpx
|
||||
# minio
|
||||
# requests
|
||||
cffi==1.17.1
|
||||
@@ -82,7 +90,9 @@ decorator==5.2.1
|
||||
# via ipython
|
||||
distlib==0.4.0
|
||||
# via virtualenv
|
||||
dj-database-url==3.0.1
|
||||
distro==1.9.0
|
||||
# via openai
|
||||
dj-database-url==2.2.0
|
||||
# via environs
|
||||
dj-email-url==1.0.6
|
||||
# via environs
|
||||
@@ -179,11 +189,20 @@ greenlet==3.2.4
|
||||
# gevent
|
||||
gunicorn==23.0.0
|
||||
# via -r ./requirements.in
|
||||
identify==2.6.13
|
||||
h11==0.16.0
|
||||
# via httpcore
|
||||
httpcore==1.0.9
|
||||
# via httpx
|
||||
httpx==0.28.1
|
||||
# via openai
|
||||
identify==2.6.1
|
||||
# via pre-commit
|
||||
idna==3.10
|
||||
# via requests
|
||||
iniconfig==2.1.0
|
||||
# via
|
||||
# anyio
|
||||
# httpx
|
||||
# requests
|
||||
iniconfig==2.0.0
|
||||
# via pytest
|
||||
interrogate==1.7.0
|
||||
# via -r ./requirements.in
|
||||
@@ -195,6 +214,8 @@ itsdangerous==2.2.0
|
||||
# via -r ./requirements.in
|
||||
jedi==0.19.2
|
||||
# via ipython
|
||||
jiter==0.10.0
|
||||
# via openai
|
||||
jmespath==1.0.1
|
||||
# via
|
||||
# boto3
|
||||
@@ -236,7 +257,9 @@ oauthlib==3.3.1
|
||||
# via
|
||||
# django-allauth
|
||||
# django-oauth-toolkit
|
||||
packaging==25.0
|
||||
openai==1.102.0
|
||||
# via -r ./requirements.in
|
||||
packaging==24.1
|
||||
# via
|
||||
# black
|
||||
# django-haystack
|
||||
@@ -288,16 +311,20 @@ pycparser==2.22
|
||||
# via cffi
|
||||
pycryptodome==3.23.0
|
||||
# via minio
|
||||
pygments==2.19.2
|
||||
pydantic==2.11.7
|
||||
# via openai
|
||||
pydantic-core==2.33.2
|
||||
# via pydantic
|
||||
pygments==2.18.0
|
||||
# via
|
||||
# ipython
|
||||
# ipython-pygments-lexers
|
||||
# pytest
|
||||
pyjwt[crypto]==2.10.1
|
||||
pyjwt[crypto]==2.9.0
|
||||
# via
|
||||
# django-allauth
|
||||
# redis
|
||||
pyparsing==3.2.3
|
||||
pyparsing==3.2.0
|
||||
# via matplotlib
|
||||
pytest==8.4.2
|
||||
# via
|
||||
@@ -353,7 +380,11 @@ six==1.17.0
|
||||
# python-dateutil
|
||||
slack-sdk==3.36.0
|
||||
# via -r ./requirements.in
|
||||
soupsieve==2.8
|
||||
sniffio==1.3.1
|
||||
# via
|
||||
# anyio
|
||||
# openai
|
||||
soupsieve==2.6
|
||||
# via beautifulsoup4
|
||||
sqlparse==0.5.3
|
||||
# via django
|
||||
@@ -363,16 +394,26 @@ structlog==25.4.0
|
||||
# via -r ./requirements.in
|
||||
tabulate==0.9.0
|
||||
# via interrogate
|
||||
tqdm==4.67.1
|
||||
# via openai
|
||||
traitlets==5.14.3
|
||||
# via
|
||||
# ipython
|
||||
# matplotlib-inline
|
||||
typing-extensions==4.15.0
|
||||
# via
|
||||
# anyio
|
||||
# beautifulsoup4
|
||||
# dj-database-url
|
||||
# ipython
|
||||
# jwcrypto
|
||||
# minio
|
||||
# openai
|
||||
# pydantic
|
||||
# pydantic-core
|
||||
# typing-inspection
|
||||
typing-inspection==0.4.1
|
||||
# via pydantic
|
||||
tzdata==2025.2
|
||||
# via
|
||||
# faker
|
||||
|
||||
@@ -78,8 +78,11 @@
|
||||
{% if entry.news.attachment %}
|
||||
<a href="{{ entry.news.attachment.url }}" class="text-sky-600 dark:text-sky-300 hover:text-orange dark:hover:text-orange">{{ entry.news.attachment_filename }}</a>
|
||||
{% endif %}
|
||||
|
||||
{{ entry.content|urlize|url_target_blank:'text-sky-600 dark:text-sky-300 hover:text-orange dark:hover:text-orange'|linebreaks }}
|
||||
{% with content_to_display=entry.content|default:entry.visible_content %}
|
||||
{% if content_to_display %}
|
||||
{{ content_to_display|urlize|url_target_blank:'text-sky-600 dark:text-sky-300 hover:text-orange dark:hover:text-orange'|linebreaks }}
|
||||
{% endif %}
|
||||
{% endwith %}
|
||||
</div>
|
||||
|
||||
{% if entry.image %}
|
||||
|
||||
@@ -141,9 +141,16 @@
|
||||
<span class="text-xs">{{ entry.publish_at|date:"M jS, Y" }}</span>
|
||||
</div>
|
||||
</div>
|
||||
{% if entry.content %}
|
||||
<div>
|
||||
<span class="text-base text-gray-500 dark:text-white/70">{{ entry.content|urlize|url_target_blank:'text-sky-600 dark:text-sky-300 hover:text-orange dark:hover:text-orange'|linebreaksbr|multi_truncate_middle:30|truncatechars_html:500 }}</span>
|
||||
{% if entry.visible_content %}
|
||||
<div>
|
||||
<div class="text-base text-gray-500 dark:text-white/70">{{ entry.visible_content|urlize|url_target_blank:'text-sky-600 dark:text-sky-300 hover:text-orange dark:hover:text-orange'|linebreaksbr|multi_truncate_middle:30 }}</div>
|
||||
{% if entry.use_summary %}
|
||||
{% if entry.determined_news_type == "link" %}
|
||||
<a href="{{ entry.external_url }}" class="inline-block text-sky-600 dark:text-sky-300 hover:text-orange dark:hover:text-orange mt-2">Read more…</a>
|
||||
{% else %}
|
||||
<a href="{{ entry.get_absolute_url }}" class="inline-block text-sky-600 dark:text-sky-300 hover:text-orange dark:hover:text-orange mt-2">Read more…</a>
|
||||
{% endif %}
|
||||
{% endif %}
|
||||
</div>
|
||||
{% endif %}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user