Extract content of HTML body from rendered asciidoc content (Part of #394)

- Make web service depend on redis
- Add function to extract body from html content
- In view, extract body before returning converted asciidoc in response
This commit is contained in:
Lacey Williams Henschel
2023-06-06 13:53:02 -07:00
parent 96aabf4412
commit 4e26f55dd1
6 changed files with 42 additions and 5 deletions

View File

@@ -1,5 +1,6 @@
import boto3
from botocore.exceptions import ClientError
from bs4 import BeautifulSoup
import json
import os
import re
@@ -17,6 +18,27 @@ from pygments.formatters.html import HtmlFormatter
logger = structlog.get_logger()
def get_body_from_html(html_string: str) -> str:
    """Return the inner markup of the ``<body>`` element of an HTML document.

    The ``<body>`` wrapper itself is dropped because the result is placed
    inside the main Boost template, which includes its own ``<body>`` tag.

    Args:
        html_string (str): The HTML document as a string

    Returns:
        str: The serialized children of ``<body>`` joined together, or an
            empty string when no ``<body>`` element is found
    """
    document = BeautifulSoup(html_string, "html.parser")
    body_tag = document.find("body")
    if not body_tag:
        return ""
    # Serialize each child node in order; the <body> tag itself is left out.
    return "".join(map(str, body_tag.contents))
def get_content_from_s3(key=None, bucket_name=None):
"""
Get content from S3. Returns the decoded file contents if able

View File

@@ -1,4 +1,12 @@
from ..boostrenderer import get_content_type, get_s3_keys
from ..boostrenderer import get_body_from_html, get_content_type, get_s3_keys
def test_get_body_from_html():
    """Only the markup inside <body> is returned, without the tag itself."""
    document = (
        "<html><head><title>Test</title></head>"
        "<body><h1>Test</h1></body></html>"
    )
    assert get_body_from_html(document) == "<h1>Test</h1>"
def test_get_content_type():

View File

@@ -8,7 +8,7 @@ from django.http import Http404, HttpResponse, HttpResponseNotFound
from django.shortcuts import render
from django.views.generic import TemplateView, View
from .boostrenderer import get_content_from_s3
from .boostrenderer import get_body_from_html, get_content_from_s3
from .markdown import process_md
from .tasks import adoc_to_html
@@ -163,6 +163,7 @@ class StaticContentTemplateView(View):
# Content is a byte string, decode it using UTF-8 encoding
html_content = html_content.decode("utf-8")
context = {"content": html_content, "content_type": "text/html"}
# Extract only the contents of the body tag from the HTML
content = get_body_from_html(html_content)
context = {"content": content, "content_type": "text/html"}
return render(request, "adoc_content.html", context)

View File

@@ -28,6 +28,7 @@ services:
command: [ "/bin/bash", "/code/docker/compose-start.sh" ]
depends_on:
- db
- redis
environment:
- "LOCAL_DEVELOPMENT=true"
- "DOCKER_DIR=/code/docker"

View File

@@ -49,9 +49,10 @@ minio
# Packaging
pip-tools==6.13.0
# Markdown and Frontmatter
# Parsing content from external sources (like S3)
mistletoe
python-frontmatter
beautifulsoup4
#Forum
django-machina>=1.2

View File

@@ -16,6 +16,8 @@ async-timeout==4.0.2
# via redis
backcall==0.2.0
# via ipython
beautifulsoup4==4.12.2
# via -r ./requirements.in
billiard==3.6.4.0
# via celery
black==22.3
@@ -288,6 +290,8 @@ six==1.16.0
# django-rest-auth
# fs
# python-dateutil
soupsieve==2.4.1
# via beautifulsoup4
sqlparse==0.4.4
# via django
stack-data==0.6.2