Extract content of HTML body from rendered asciidoc content (Part of #394)

- Make web service depend on redis
- Add function to extract body from HTML content
- In view, extract body before returning converted asciidoc in response
2026-01-19 04:42:17 +00:00 · 2023-06-06 13:53:02 -07:00
parent 96aabf4412
commit 4e26f55dd1
6 changed files with 42 additions and 5 deletions
--- a/core/boostrenderer.py
+++ b/core/boostrenderer.py
@@ -1,5 +1,6 @@
 import boto3
 from botocore.exceptions import ClientError
+from bs4 import BeautifulSoup
 import json
 import os
 import re
@@ -17,6 +18,27 @@ from pygments.formatters.html import HtmlFormatter
 logger = structlog.get_logger()


+def get_body_from_html(html_string: str) -> str:
+    """Use BeautifulSoup to get the body content from an HTML document, without
+    the <body> tag.
+
+    We strip out the <body> tag because we want to use our main Boost template,
+    which includes its own <body> tag.
+
+    Args:
+        html_string (str): The HTML document as a string
+
+    Returns:
+        str: The body content as a string
+    """
+    soup = BeautifulSoup(html_string, "html.parser")
+    body = soup.find("body")
+    body_content = ""
+    if body:
+        body_content = "".join(str(tag) for tag in body.contents)
+    return body_content
+
+
 def get_content_from_s3(key=None, bucket_name=None):
    """
    Get content from S3. Returns the decoded file contents if able
--- a/core/tests/test_renderer.py
+++ b/core/tests/test_renderer.py
@@ -1,4 +1,12 @@
-from ..boostrenderer import get_content_type, get_s3_keys
+from ..boostrenderer import get_body_from_html, get_content_type, get_s3_keys
+
+
+def test_get_body_from_html():
+    html_string = (
+        "<html><head><title>Test</title></head><body><h1>Test</h1></body></html>"
+    )
+    body_content = get_body_from_html(html_string)
+    assert body_content == "<h1>Test</h1>"


 def test_get_content_type():
--- a/core/views.py
+++ b/core/views.py
@@ -8,7 +8,7 @@ from django.http import Http404, HttpResponse, HttpResponseNotFound
 from django.shortcuts import render
 from django.views.generic import TemplateView, View

-from .boostrenderer import get_content_from_s3
+from .boostrenderer import get_body_from_html, get_content_from_s3
 from .markdown import process_md
 from .tasks import adoc_to_html

@@ -163,6 +163,7 @@ class StaticContentTemplateView(View):
            # Content is a byte string, decode it using UTF-8 encoding
            html_content = html_content.decode("utf-8")

-        context = {"content": html_content, "content_type": "text/html"}
-
+        # Extract only the contents of the body tag from the HTML
+        content = get_body_from_html(html_content)
+        context = {"content": content, "content_type": "text/html"}
        return render(request, "adoc_content.html", context)
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -28,6 +28,7 @@ services:
    command: [ "/bin/bash", "/code/docker/compose-start.sh" ]
    depends_on:
      - db
+      - redis
    environment:
      - "LOCAL_DEVELOPMENT=true"
      - "DOCKER_DIR=/code/docker"
--- a/requirements.in
+++ b/requirements.in
@@ -49,9 +49,10 @@ minio
 # Packaging
 pip-tools==6.13.0

-# Markdown and Frontmatter
+# Parsing content from external sources (like S3) 
 mistletoe
 python-frontmatter
+beautifulsoup4

 #Forum
 django-machina>=1.2
--- a/requirements.txt
+++ b/requirements.txt
@@ -16,6 +16,8 @@ async-timeout==4.0.2
    # via redis
 backcall==0.2.0
    # via ipython
+beautifulsoup4==4.12.2
+    # via -r ./requirements.in
 billiard==3.6.4.0
    # via celery
 black==22.3
@@ -288,6 +290,8 @@ six==1.16.0
    #   django-rest-auth
    #   fs
    #   python-dateutil
+soupsieve==2.4.1
+    # via beautifulsoup4
 sqlparse==0.4.4
    # via django
 stack-data==0.6.2