Files
website-v2/core/tests/test_htmlhelper.py

796 lines
21 KiB
Python

from bs4 import BeautifulSoup
import pytest
from pytest_django.asserts import assertHTMLEqual
from core.htmlhelper import (
REMOVE_ALL,
REMOVE_CSS_CLASSES,
REMOVE_TAGS,
convert_h1_to_h2,
get_library_documentation_urls,
modernize_legacy_page,
remove_css,
remove_duplicate_tag,
remove_first_tag,
remove_ids,
remove_release_classes,
remove_tables,
remove_tags,
remove_unwanted,
style_links,
modernize_release_notes,
)
# Markup interpolated into the <head> of BASE_HTML below; tests also embed it
# in their expected output between the BEGIN/END "Manually appending items"
# comment markers.
BASE_HEAD = """
<link rel="stylesheet" href="mystyle.css" />
<meta charset="utf-8" />
<meta http-equiv="X-UA-Compatible" content="IE=edge" />
<style>
body {background-color: powderblue;}
h1 {color: red;}
p {color: blue;}
</style>
<meta name="description" content="Unit Test" />
<meta name="keywords" content="HTML, CSS, JavaScript" />
"""
# Placeholder comment embedded in BASE_BODY; _build_expected_body() substitutes
# it with the placeholder followed by the expected legacy content.
BASE_TOKEN = "<!-- Add your content here -->"
# Body of the base template: a header block, some sample content, and the
# "boost-legacy-docs-body" container holding BASE_TOKEN.
BASE_BODY = f"""
<div id="boost-legacy-docs-header">The Header, Log In/Log Out, Menu</div>
<h1>A very important heading</h1>
<p>My first paragraph.</p>
<div id="other-block-content-id">
<p>My second paragraph.</p>
</div>
<div id="boost-legacy-docs-body">
{BASE_TOKEN}
</div>
"""
# Complete base template handed to modernize_legacy_page() as ``base_html``.
BASE_HTML = f"""<!DOCTYPE html>
<head>{BASE_HEAD}</head>
<html>
<body>{BASE_BODY}</body>
</html>
"""
# <head> contents of the sample legacy page used as test input.
LEGACY_HEAD = """
<title>A Meaningful Page Title</title>
<link rel="stylesheet" href="old.css" />
"""
# <body> contents of the sample legacy page used as test input.
LEGACY_BODY = """
<h1>My Legacy Heading</h1>
<p>My legacy paragraph.</p>
"""
def _build_tag(tag_name, tag_attrs, inner="Something"):
tag = f"<{tag_name}"
if tag_attrs:
tag += " " + " ".join(f'{k}="{v}"' for k, v in tag_attrs.items())
if tag_name in ["img", "link"]:
tag += "/>"
elif tag_name == "hr":
tag += ">"
else:
tag += f">{inner}</{tag_name}>"
return tag
def _build_expected_body(expected_body):
    """Return ``BASE_BODY`` with ``expected_body`` inserted after the placeholder.

    The ``BASE_TOKEN`` placeholder is kept and ``expected_body`` is placed on
    the line after it, each surrounded by newlines.
    """
    token_with_content = f"\n{BASE_TOKEN}\n{expected_body}\n"
    return BASE_BODY.replace(BASE_TOKEN, token_with_content)
def test_modernize_legacy_page_unchanged_empty():
    """Plain text without any HTML structure is expected back unchanged."""
    plain_text = "Something else"
    parsed = BeautifulSoup(plain_text, "html.parser")
    modernized = modernize_legacy_page(parsed, base_html=BASE_HTML)
    assertHTMLEqual(modernized, plain_text)
def test_modernize_legacy_page_unchanged_simple():
    """An empty ``<html>`` page with an empty base template stays as it is."""
    bare_page = """<!DOCTYPE html>
<html>
</html>
"""
    parsed = BeautifulSoup(bare_page, "html.parser")
    modernized = modernize_legacy_page(parsed, base_html="")
    assertHTMLEqual(modernized, bare_page)
def test_modernize_legacy_page_unchanged_no_head():
    """A page with only a body and an empty base template stays as it is."""
    headless_page = f"""<!DOCTYPE html>
<html>
<body>
{LEGACY_BODY}
</body>
</html>
"""
    parsed = BeautifulSoup(headless_page, "html.parser")
    modernized = modernize_legacy_page(parsed, base_html="")
    assertHTMLEqual(modernized, headless_page)
def test_modernize_legacy_page_unchanged_no_body():
    """A page with only a head and an empty base template stays as it is."""
    bodiless_page = f"""<!DOCTYPE html>
<head>
{LEGACY_HEAD}
</head>
<html>
</html>
"""
    parsed = BeautifulSoup(bodiless_page, "html.parser")
    modernized = modernize_legacy_page(parsed, base_html="")
    assertHTMLEqual(modernized, bodiless_page)
def test_modernize_legacy_page_adds_head_if_missing():
    """A page lacking ``<head>`` is expected to gain one holding ``BASE_HEAD``."""
    # Expected page: a new <head> with the base head between the markers.
    wanted = f"""<!DOCTYPE html>
<html>
<head>
<!-- BEGIN Manually appending items -->
{BASE_HEAD}
<!-- END Manually appending items -->
</head>
</html>
"""
    legacy_page = """<!DOCTYPE html>
<html>
</html>
"""
    parsed = BeautifulSoup(legacy_page, "html.parser")
    modernized = modernize_legacy_page(parsed, base_html=BASE_HTML)
    assertHTMLEqual(modernized, wanted)
def test_modernize_legacy_page_appends_head_if_existing():
    """An existing ``<head>`` keeps its content and gets ``BASE_HEAD`` appended."""
    # Expected page: legacy head first, then the base head between the markers.
    wanted = f"""<!DOCTYPE html>
<html>
<head>
{LEGACY_HEAD}
<!-- BEGIN Manually appending items -->
{BASE_HEAD}
<!-- END Manually appending items -->
</head>
</html>
"""
    legacy_page = f"""<!DOCTYPE html>
<html>
<head>
{LEGACY_HEAD}
</head>
</html>
"""
    parsed = BeautifulSoup(legacy_page, "html.parser")
    modernized = modernize_legacy_page(parsed, base_html=BASE_HTML)
    assertHTMLEqual(modernized, wanted)
def test_modernize_legacy_page_mangles_body():
    """The legacy body is expected to end up inside the base template body."""
    legacy_page = f"""<!DOCTYPE html>
<html>
<body>
{LEGACY_BODY}
</body>
</html>
"""
    # Expected body: BASE_BODY with the legacy content after the placeholder.
    wrapped_body = _build_expected_body(LEGACY_BODY)
    wanted = f"""<!DOCTYPE html>
<html>
<head>
<!-- BEGIN Manually appending items -->
{BASE_HEAD}
<!-- END Manually appending items -->
</head>
<body>
{wrapped_body}
</body>
</html>
"""
    parsed = BeautifulSoup(legacy_page, "html.parser")
    modernized = modernize_legacy_page(parsed, base_html=BASE_HTML)
    assertHTMLEqual(modernized, wanted)
@pytest.mark.parametrize("tag_name, tag_attrs", REMOVE_TAGS)
def test_modernize_legacy_page_remove_first_tag_found(tag_name, tag_attrs):
    """For every ``REMOVE_TAGS`` entry only the first occurrence is dropped.

    The input holds the tag twice; the expected output retains a single copy
    after the legacy paragraph.
    """
    snippet = _build_tag(tag_name, tag_attrs)
    legacy_page = f"""<!DOCTYPE html>
<html>
<body>
<h1>My Legacy Heading</h1>
{snippet}
<p>My legacy paragraph.</p>
{snippet}
</body>
</html>
"""
    wanted = f"""<!DOCTYPE html>
<html>
<head>
<!-- BEGIN Manually appending items -->
{BASE_HEAD}
<!-- END Manually appending items -->
</head>
<body>
{_build_expected_body(LEGACY_BODY + snippet)}
</body>
</html>
"""
    cleaned = remove_unwanted(BeautifulSoup(legacy_page, "html.parser"))
    modernized = modernize_legacy_page(
        cleaned, base_html=BASE_HTML, skip_replace_boostlook=True
    )
    assertHTMLEqual(modernized, wanted)
@pytest.mark.parametrize("tag_name, tag_attrs", REMOVE_ALL)
def test_modernize_legacy_page_remove_all_tags_found(tag_name, tag_attrs):
    """For every ``REMOVE_ALL`` entry all occurrences are expected to vanish.

    The input holds the tag three times; the expected output contains only
    the plain legacy body.
    """
    snippet = _build_tag(tag_name, tag_attrs)
    legacy_page = f"""<!DOCTYPE html>
<html>
<body>
{snippet}
<h1>My Legacy Heading</h1>
{snippet}
<p>My legacy paragraph.</p>
{snippet}
</body>
</html>
"""
    wrapped_body = _build_expected_body(LEGACY_BODY)
    wanted = f"""<!DOCTYPE html>
<html>
<head>
<!-- BEGIN Manually appending items -->
{BASE_HEAD}
<!-- END Manually appending items -->
</head>
<body>
{wrapped_body}
</body>
</html>
"""
    cleaned = remove_unwanted(BeautifulSoup(legacy_page, "html.parser"))
    modernized = modernize_legacy_page(cleaned, base_html=BASE_HTML)
    assertHTMLEqual(modernized, wanted)
@pytest.mark.parametrize("tag_name, tag_attrs", REMOVE_CSS_CLASSES)
def test_modernize_legacy_page_remove_only_css_class(tag_name, tag_attrs):
    """For ``REMOVE_CSS_CLASSES`` entries the tag stays but loses its class.

    Unrelated elements carrying other classes are expected to keep them.
    """
    # Work on a copy so the shared parametrization data is never mutated.
    attrs_with_class = dict(tag_attrs)
    attrs_with_class.setdefault("class", "something-to-remove")
    attrs_sans_class = {
        key: value for key, value in attrs_with_class.items() if key != "class"
    }
    classed = _build_tag(tag_name, attrs_with_class)
    declassed = _build_tag(tag_name, attrs_sans_class)

    legacy_page = f"""<!DOCTYPE html>
<html>
<body>
{classed}
<h1 class="some-class">My Legacy Heading</h1>
{classed}
<p>My legacy paragraph.</p>
{classed}
<div class="not-a-match"><p>Hello World</p></div>
{classed}
</body>
</html>
"""
    cleaned = remove_unwanted(BeautifulSoup(legacy_page, "html.parser"))
    modernized = modernize_legacy_page(cleaned, base_html=BASE_HTML)

    # Expected output: identical structure, minus the class on the matched tags.
    wanted_content = f"""
{declassed}
<h1 class="some-class">My Legacy Heading</h1>
{declassed}
<p>My legacy paragraph.</p>
{declassed}
<div class="not-a-match"><p>Hello World</p></div>
{declassed}
"""
    wanted = f"""<!DOCTYPE html>
<html>
<head>
<!-- BEGIN Manually appending items -->
{BASE_HEAD}
<!-- END Manually appending items -->
</head>
<body>
{_build_expected_body(wanted_content)}
</body>
</html>
"""
    assertHTMLEqual(modernized, wanted)
def test_get_library_documentation_urls():
    """Links listed under the "Alphabetically" anchor come back as (name, url) pairs."""
    markup = """
<h2><a name="Alphabetically"></a></h2>
<ul>
<li><a href="/docs/path1">Library1</a></li>
<li><a href="/docs/path2">Library2</a></li>
</ul>
"""
    libraries = get_library_documentation_urls(markup)
    assert libraries == [("Library1", "/docs/path1"), ("Library2", "/docs/path2")]
def test_get_library_documentation_urls_no_library_section():
    """Without the expected section anchor an empty list is expected."""
    markup = """
<h2><a name="NotTheRightSection"></a></h2>
<ul>
<li><a href="/docs/path1">Library1</a></li>
<li><a href="/docs/path2">Library2</a></li>
</ul>
"""
    libraries = get_library_documentation_urls(markup)
    assert libraries == []
def test_get_library_documentation_urls_no_libraries():
    """A section whose list has no links is expected to yield an empty list."""
    markup = """
<h2><a name="Alphabetically"></a></h2>
<ul>
<!-- No libraries -->
</ul>
"""
    libraries = get_library_documentation_urls(markup)
    assert libraries == []
def test_get_library_documentation_urls_with_name_and_parent():
    """A custom anchor ``name`` and ``parent`` tag select the section to read."""
    markup = """
<div><a name="CustomSection"></a></div>
<ul>
<li><a href="/docs/path1">CustomLibrary1</a></li>
<li><a href="/docs/path2">CustomLibrary2</a></li>
</ul>
"""
    libraries = get_library_documentation_urls(
        markup, name="CustomSection", parent="div"
    )
    wanted = [
        ("CustomLibrary1", "/docs/path1"),
        ("CustomLibrary2", "/docs/path2"),
    ]
    assert libraries == wanted
def test_convert_h1_to_h2():
    """Every ``<h1>`` is expected to become an ``<h2>`` with the same text."""
    markup = """
<html>
<body>
<h1>Title 1</h1>
<h1>Title 2</h1>
<p>Some text here</p>
</body>
</html>
"""
    converted = convert_h1_to_h2(BeautifulSoup(markup, "html.parser"))
    # No top-level headings may survive the conversion.
    assert len(converted.find_all("h1")) == 0
    # Exactly the two former h1 headings, in document order.
    heading_texts = [heading.get_text() for heading in converted.find_all("h2")]
    assert heading_texts == ["Title 1", "Title 2"]
def test_convert_h1_to_h2_none():
    """With no ``<h1>`` in the input, no ``<h2>`` is expected in the output."""
    markup = """
<html>
<body>
<p>Some text here</p>
</body>
</html>
"""
    converted = convert_h1_to_h2(BeautifulSoup(markup, "html.parser"))
    assert len(converted.find_all("h2")) == 0
def test_remove_css():
    """Run ``remove_css`` with ``REMOVE_CSS_CLASSES`` over class-bearing markup."""
    markup = """
<html>
<body>
<div class="body-0">Content 0</div>
<div class="body-1">Content 1</div>
<div class="body-2">Content 2</div>
<p class="class3">Content 3</p>
</body>
</html>
"""
    cleaned = remove_css(BeautifulSoup(markup, "html.parser"), REMOVE_CSS_CLASSES)

    # NOTE(review): the lookup below still filters on the selector's own
    # attributes, so once a class has been stripped the tag presumably no
    # longer matches and only the "not found" message is printed — confirm
    # whether a stricter check is wanted.
    for name, attrs in REMOVE_CSS_CLASSES:
        match = cleaned.find(name, attrs)
        if not match:
            print(f"Tag not found: {name} with attributes {attrs}")
            continue
        assert "class" not in match.attrs

    # The paragraph's class is not expected to be stripped.
    paragraph = cleaned.find("p", {"class": "class3"})
    assert "class" in paragraph.attrs
def test_remove_css_none():
    """Test that remove_css still works if none of the CSS classes are present"""
    # Markup without a single class attribute.
    html_content = """
<html>
<body>
<div>Content 0</div>
<div>Content 1</div>
<div>Content 2</div>
<p>Content 3</p>
</body>
</html>
"""
    soup = BeautifulSoup(html_content, "html.parser")
    soup = remove_css(soup, REMOVE_CSS_CLASSES)

    # Any tag that still matches a selector must not carry a class.
    # find_all() only yields Tag objects, so no None check is needed.
    for tag_name, tag_attrs in REMOVE_CSS_CLASSES:
        for found_tag in soup.find_all(tag_name, **tag_attrs):
            assert "class" not in found_tag.attrs

    # The paragraph must survive. The previous version also asserted
    # '"class" in attrs' inside a branch that had just tested exactly that,
    # which could never fail; only the presence check carries information.
    assert soup.find("p") is not None
def test_remove_duplicate_tag():
    """A repeated ``<h2>`` is expected to be collapsed to a single heading."""
    # "Header 1" appears twice in a row; "Header 2" is unique.
    markup = """
<html>
<body>
<h2>Header 1</h2>
<h2>Header 1</h2> <!-- Duplicate -->
<h2>Header 2</h2>
</body>
</html>
"""
    deduplicated = remove_duplicate_tag(BeautifulSoup(markup, "html.parser"), "h2")
    heading_texts = [
        heading.get_text(strip=True) for heading in deduplicated.find_all("h2")
    ]
    assert heading_texts == ["Header 1", "Header 2"]
def test_remove_duplicate_tag_none():
    """With no duplicated ``<h2>`` both headings are expected to remain."""
    # Two distinct headings, nothing to de-duplicate.
    markup = """
<html>
<body>
<h2>Header 1</h2>
<h2>Header 2</h2>
</body>
</html>
"""
    deduplicated = remove_duplicate_tag(BeautifulSoup(markup, "html.parser"), "h2")
    heading_texts = [
        heading.get_text(strip=True) for heading in deduplicated.find_all("h2")
    ]
    assert heading_texts == ["Header 1", "Header 2"]
def test_remove_first_tag():
    """Only the first match of each selector is expected to be removed."""
    # "header1" occurs twice, "header2" once.
    markup = """
<html>
<body>
<div id="header1">Header 1</div>
<div id="header2">Header 2</div>
<div id="header1">Another Header 1</div>
</body>
</html>
"""
    selectors = [("div", {"id": "header1"}), ("div", {"id": "header2"})]
    trimmed = remove_first_tag(BeautifulSoup(markup, "html.parser"), selectors)

    # The second "header1" div is now the first one found.
    survivor = trimmed.find("div", {"id": "header1"})
    assert survivor.get_text(strip=True) == "Another Header 1"
    # The only "header2" div was the first match, so none remains.
    assert trimmed.find("div", {"id": "header2"}) is None
def test_remove_first_tag_none():
    """When each selector matches exactly once, nothing is expected to remain."""
    # One div per selector, no repeats.
    markup = """
<html>
<body>
<div id="header1">Header 1</div>
<div id="header2">Header 2</div>
</body>
</html>
"""
    selectors = [("div", {"id": "header1"}), ("div", {"id": "header2"})]
    trimmed = remove_first_tag(BeautifulSoup(markup, "html.parser"), selectors)

    assert trimmed.find("div", {"id": "header1"}) is None
    assert trimmed.find("div", {"id": "header2"}) is None
def test_remove_ids():
    """Elements with listed ids are expected to go; others must stay intact."""
    markup = """
<html>
<body>
<div id="remove1">Remove me</div>
<div id="unwrap1"><p>Unwrap me</p></div>
<div id="keep1">Keep me</div>
</body>
</html>
"""
    target_ids = ["remove1", "unwrap1"]
    cleaned = remove_ids(BeautifulSoup(markup, "html.parser"), target_ids)

    # Neither listed id may be present any more.
    assert cleaned.find(id="remove1") is None
    assert cleaned.find(id="unwrap1") is None
    # The paragraph that lived inside "unwrap1" is expected to survive.
    assert cleaned.find("p").get_text(strip=True) == "Unwrap me"
    # An id that was not listed is left alone.
    kept = cleaned.find(id="keep1")
    assert kept is not None
    assert kept.get_text(strip=True) == "Keep me"
def test_remove_release_classes():
    """Elements with listed classes are expected to go; others must stay intact."""
    markup = """
<html>
<body>
<div class="class-to-remove">Remove me</div>
<div class="class-to-unwrap"><p>Unwrap me</p></div>
<div class="class-to-keep">Keep me</div>
</body>
</html>
"""
    target_classes = ["class-to-remove", "class-to-unwrap"]
    cleaned = remove_release_classes(
        BeautifulSoup(markup, "html.parser"), target_classes
    )

    # Neither listed class may be present any more.
    assert cleaned.find(class_="class-to-remove") is None
    assert cleaned.find(class_="class-to-unwrap") is None
    # The paragraph that lived inside the unwrapped div is expected to survive.
    assert cleaned.find("p").get_text(strip=True) == "Unwrap me"
    # A class that was not listed is left alone.
    kept = cleaned.find(class_="class-to-keep")
    assert kept is not None
    assert kept.get_text(strip=True) == "Keep me"
def test_remove_tables():
    """Only tables carrying the given class are expected to be removed."""
    markup = """
<html>
<body>
<table class="table-to-remove">Content 1</table>
<table class="table-to-remove">Content 2</table>
<table class="other-class">Content 3</table>
<table>Content 4</table>
</body>
</html>
"""
    doomed_class = "table-to-remove"
    cleaned = remove_tables(BeautifulSoup(markup, "html.parser"), doomed_class)

    # Both tables with the targeted class are gone.
    assert cleaned.find_all("table", {"class": doomed_class}) == []
    # A table with a different class is untouched.
    other_table = cleaned.find("table", {"class": "other-class"})
    assert other_table.get_text(strip=True) == "Content 3"
    # So is the table that has no class at all.
    classless_table = cleaned.find("table", class_=False)
    assert classless_table.get_text(strip=True) == "Content 4"
def test_remove_tags():
    """Tags matching any (name, attrs) selector are expected to be removed."""
    markup = """
<html>
<body>
<div class="class-to-remove">Content 1</div>
<p id="id-to-remove">Content 2</p>
<span custom-attr="value-to-remove">Content 3</span>
<div>Content 4</div>
</body>
</html>
"""
    selectors = [
        ("div", {"class": "class-to-remove"}),
        ("p", {"id": "id-to-remove"}),
        ("span", {"custom-attr": "value-to-remove"}),
    ]
    cleaned = remove_tags(BeautifulSoup(markup, "html.parser"), selectors)

    assert cleaned.find("div", {"class": "class-to-remove"}) is None
    assert cleaned.find("p", {"id": "id-to-remove"}) is None
    assert cleaned.find("span", {"custom-attr": "value-to-remove"}) is None
    # The attribute-less div matches no selector and is now the first div.
    first_div = cleaned.find("div")
    assert first_div.get_text(strip=True) == "Content 4"
def test_style_links():
    """Every ``<a>`` is expected to carry the requested class afterwards."""
    markup = """
<html>
<body>
<a href="link1.html">Link 1</a>
<a href="link2.html">Link 2</a>
<a href="link3.html">Link 3</a>
</body>
</html>
"""
    link_class = "styled-link"
    styled = style_links(BeautifulSoup(markup, "html.parser"), link_class)

    assert all(
        link_class in anchor.get("class", []) for anchor in styled.find_all("a")
    ), "Class not added to link"
def test_style_links_no_links():
    """``style_links`` must cope with markup that contains no links at all.

    The previous version only looped over ``<a>`` tags, of which there are
    none, so it asserted nothing. The checks below make the expectation
    explicit: no link appears and the existing content is left intact.
    """
    # Markup without a single <a> element.
    html_content = """
<html>
<body>
<p>Sample</p>
</body>
</html>
"""
    soup = BeautifulSoup(html_content, "html.parser")
    new_class_name = "styled-link"

    soup = style_links(soup, new_class_name)

    # No link existed before, so none may exist afterwards.
    assert soup.find_all("a") == []
    # The paragraph is still present with its original text.
    paragraph = soup.find("p")
    assert paragraph is not None
    assert paragraph.get_text(strip=True) == "Sample"
def test_modernize_release_notes():
    """Compare ``modernize_release_notes`` output with the stored fixture.

    The sample release notes are read as bytes and passed to the function;
    the result must equal the stripped text of the filtered-output fixture.
    Both files are opened with context managers so the handles are closed
    deterministically instead of being left to the garbage collector.
    """
    sample_path = "core/tests/content/boost_release_notes_sample.html"
    expected_path = "core/tests/content/boost_release_notes_filtered_output.html"

    with open(sample_path, "rb") as sample_file:
        content = sample_file.read()
    with open(expected_path) as expected_file:
        expected_output = expected_file.read().strip()

    output = modernize_release_notes(content)
    assert output == expected_output