mirror of
https://github.com/boostorg/website-v2.git
synced 2026-01-19 04:42:17 +00:00
233 lines
7.0 KiB
Python
233 lines
7.0 KiB
Python
from bs4 import BeautifulSoup
|
|
from unittest.mock import Mock, patch
|
|
import datetime
|
|
from io import BytesIO
|
|
import pytest
|
|
|
|
from ..boostrenderer import (
|
|
extract_file_data,
|
|
get_body_from_html,
|
|
get_content_type,
|
|
get_file_data,
|
|
get_s3_keys,
|
|
convert_img_paths,
|
|
get_meta_redirect_from_html,
|
|
)
|
|
|
|
|
|
@pytest.fixture
def mock_s3_client():
    """Provide a placeholder S3 client value for tests that only need a stand-in."""
    client = "mock_s3_client"
    return client
|
|
|
|
|
|
def test_extract_file_data():
    """extract_file_data pulls body bytes, key, content type, and mtime."""
    modified = datetime.datetime(2023, 6, 8, 12, 0, 0)
    key = "example_key.txt"
    s3_response = {
        "Body": BytesIO(b"file content"),
        "ContentType": "text/plain",
        "LastModified": modified,
    }

    data = extract_file_data(s3_response, key)

    assert data == {
        "content": b"file content",
        "content_key": key,
        "content_type": "text/plain",
        "last_modified": modified,
    }
|
|
|
|
|
|
def test_extract_file_data_utf8_not_double_encoded():
    """UTF-8 content with non-breaking spaces must not be double-encoded.

    Bytes like ``\\xc2\\xa0`` (non-breaking space) must not be misdetected
    as Windows-1252 and re-encoded, which would produce the double-encoded
    sequence ``\\xc3\\x82\\xc2\\xa0``.
    """
    # HTML containing a UTF-8 encoded non-breaking space (\xc2\xa0).
    nbsp_html = b"<html><title>Chapter\xc2\xa01.\xc2\xa0Boost.Beast</title></html>"

    result = extract_file_data(
        {
            "Body": BytesIO(nbsp_html),
            "ContentType": "text/html; charset=UTF-8",
            "LastModified": datetime.datetime(2023, 6, 8, 12, 0, 0),
        },
        "example.html",
    )

    # Bytes come back verbatim -- no re-encoding happened.
    assert result["content"] == nbsp_html
    # No double-encoded artifact present.
    assert b"\xc3\x82\xc2\xa0" not in result["content"]
    # The original UTF-8 non-breaking space survived.
    assert b"\xc2\xa0" in result["content"]
|
|
|
|
|
|
def test_extract_file_data_non_utf8_reencoded():
    """Genuinely non-UTF-8 content is detected and re-encoded to UTF-8."""
    # \xe9 is 'é' in Latin-1 and is not a valid standalone UTF-8 byte.
    latin1_content = b"<html><title>Test\xe9</title></html>"

    result = extract_file_data(
        {
            "Body": BytesIO(latin1_content),
            "ContentType": "text/html",
            "LastModified": datetime.datetime(2023, 6, 8, 12, 0, 0),
        },
        "example.html",
    )

    # 'é' re-encoded to UTF-8 is \xc3\xa9.
    assert b"\xc3\xa9" in result["content"]
    # The raw Latin-1 payload must have been transformed.
    assert result["content"] != latin1_content
|
|
|
|
|
|
def test_get_body_from_html():
    """get_body_from_html returns only the inner content of <body>."""
    markup = (
        "<html><head><title>Test</title></head><body><h1>Test</h1></body></html>"
    )
    assert get_body_from_html(markup) == "<h1>Test</h1>"
|
|
|
|
|
|
def test_get_body_from_html_strip_footer():
    """Elements whose id mentions 'footer' are stripped from the body."""
    html_string = """
    <html>
    <head><title>Test</title></head>
    <body>
    <h1>Test</h1>
    <div id='footer'>Some content</div>
    <span id='contains-footer'>More content</span>
    </body>
    </html>
    """
    # Only the heading should survive; both footer-ish elements are removed.
    assert get_body_from_html(html_string) == "<h1>Test</h1>"
|
|
|
|
|
|
def test_get_meta_redirect_from_html():
    """A meta refresh tag yields its target URL."""
    html_string = """
    <html>
    <meta http-equiv="refresh" content="0; url=http://example.com">
    <head><title>Test</title></head>
    <body>
    <h1>Test</h1>
    </body>
    </html>
    """
    redirect_url = get_meta_redirect_from_html(html_string)
    assert redirect_url == "http://example.com"
|
|
|
|
|
|
def test_get_meta_redirect_from_html_no_redirect():
    """Without a meta refresh tag, no redirect URL is found."""
    html_string = """
    <html>
    <head><title>Test</title></head>
    <body>
    <h1>Test</h1>
    </body>
    </html>
    """
    redirect_url = get_meta_redirect_from_html(html_string)
    assert redirect_url is None
|
|
|
|
|
|
def test_get_content_type():
    """get_content_type maps an S3 key plus upstream content type to a final type.

    Bug fix: the original assertions were written as ``assert expr, "expected"``,
    which only checks that ``expr`` is truthy and uses the expected value as the
    assertion *message* -- no equality was ever tested. They are now real
    ``==`` comparisons against the expected content type.
    """
    # HTML file content type is text/html.
    assert get_content_type("/marshmallow/index.html", "text/html") == "text/html"

    # CSS file content type is text/css.
    assert get_content_type("/rst.css", "text/css") == "text/css"

    # Asciidoc content, which comes from S3 with an .adoc extension but not a
    # useful content type, should be changed to text/asciidoc.
    assert (
        get_content_type("/site/develop/help.adoc", "text/html") == "text/asciidoc"
    )

    # JS file content type is always set to application/javascript.
    assert (
        get_content_type("/site/develop/doc/html/scripts.js", "text/html")
        == "application/javascript"
    )
|
|
|
|
|
|
def test_get_file_data():
    """get_file_data fetches the S3 object and delegates to extract_file_data."""
    fake_client = Mock()
    fake_response = Mock()
    fake_extract = Mock(return_value="mock_file_data")

    bucket_name = "my-bucket"
    s3_key = "/path/to/file.txt"

    # Patch the module-level helper and logger so only get_file_data runs.
    with patch("core.boostrenderer.extract_file_data", fake_extract), patch(
        "core.boostrenderer.logger"
    ) as fake_logger:
        fake_client.get_object.return_value = fake_response

        result = get_file_data(fake_client, bucket_name, s3_key)

        assert result == "mock_file_data"
        # The leading slash is stripped before the S3 lookup.
        fake_client.get_object.assert_called_once_with(
            Bucket=bucket_name, Key=s3_key.lstrip("/")
        )
        # The raw response and the original key are handed to the extractor.
        fake_extract.assert_called_once_with(fake_response, s3_key)
        # No error path was taken.
        assert not fake_logger.exception.called
|
|
|
|
|
|
def test_get_s3_keys():
    """get_s3_keys maps a site path to its candidate S3 keys.

    Documentation fix: the previous docstring listed ``/marshmallow/...`` and
    ``/rst.css`` mappings that none of the assertions below exercise. It now
    describes what is actually tested: each ``/doc/<section>/...`` path must
    include the corresponding ``/site-docs/develop/<section>/...`` key among
    its candidates.
    """
    assert "/site-docs/develop/user-guide/index.html" in get_s3_keys(
        "/doc/user-guide/index.html"
    )
    assert "/site-docs/develop/contributor-guide/index.html" in get_s3_keys(
        "/doc/contributor-guide/index.html"
    )
    assert "/site-docs/develop/release-process/index.html" in get_s3_keys(
        "/doc/release-process/index.html"
    )
|
|
|
|
|
|
def test_convert_img_paths():
    """convert_img_paths prefixes relative <img> src values with the S3 path."""
    source_html = """
    <html>
    <body>
    <img src="image1.png" alt="Image 1"/>
    </body>
    </html>
    """

    # Same markup with the src rewritten to the prefixed location.
    expected_html = """
    <html>
    <body>
    <img src="/images/site-pages/develop/image1.png" alt="Image 1"/>
    </body>
    </html>
    """  # noqa

    converted = convert_img_paths(source_html, "/images/site-pages/develop")

    # Compare parsed trees so attribute ordering/serialization is irrelevant.
    assert BeautifulSoup(converted, "html.parser") == BeautifulSoup(
        expected_html, "html.parser"
    )
|