import json
import re
from dataclasses import dataclass, asdict
from pathlib import Path
from typing import Any

import djclick as click
import requests
from bs4 import BeautifulSoup

from versions.utils.common import load_json_list, has_index_files


FILE_FILTER_EXTENSIONS = (
    ".html",
    ".htm",
    ".js",
    ".xml",
    ".css",
    ".txt",
    ".c",
    ".mso",
    ".cpp",
    ".hpp",
    ".ipp",
    ".php",
    ".py",
    ".md",
    ".rst",
    ".pdf",
    ".qbk",
    ".docx",
    ".xlsx",
    ".csv",
    ".json",
    ".yaml",
    ".yml",
    ".txt.gz",
    ".txt.bz2",
    ".txt.xz",
    ".txt.zst",
    ".txt.lz4.in",
    ".v2",
    ".dat",
    ".dat.gz",
    ".dat.bz2",
    ".dat.xz",
    ".dat.zst",
    ".dat.lz4",
    ".dot",
    ".ico",
    ".toyxml",
    ".svg",
    ".png",
    ".jpg",
    ".jpeg",
)


def href_pass(url: str) -> bool:
    """Return True if the URL is a local path that should be verified on disk.

    External URLs, fragments, template-style URLs, and links to filtered file
    types are rejected, as are paths that already have index files.
    """
    url = url.strip()
    if not url:
        return False

    # stage 1: quick checks that don't require filesystem access
    if any(
        [
            url.startswith(("http://", "https://", "javascript:", "mailto:")),
            url.startswith("{{") and url.endswith("}}"),  # Jinja2 style
            "#" in url,
            "://" in url,
            Path(url).suffix in FILE_FILTER_EXTENSIONS,
            re.match(r"^[./]+$", url),  # bare relative paths: "./", "../", "../../"
        ]
    ):
        return False

    # stage 2: filesystem check only if all quick checks passed; keeping it out of
    # the list above prevents has_index_files() from raising on non-path URLs
    return not has_index_files(Path(url))


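# Illustrative behaviour of href_pass (hypothetical URLs; assumes none of the local
# paths shown have index files in the current working directory):
#   href_pass("https://www.boost.org/")  -> False  (external URL)
#   href_pass("intro.html#section")      -> False  (contains a fragment)
#   href_pass("style.css")               -> False  (filtered extension)
#   href_pass("../")                     -> False  (bare relative path)
#   href_pass("libs/assert")             -> True   (local path worth verifying)

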
def extract_href_urls_from_content(content: str) -> list[str]:
    """Extract and filter href URLs from HTML content using BeautifulSoup."""
    try:
        soup = BeautifulSoup(content, "html.parser")
        return [
            a_tag.get("href")
            for a_tag in soup.find_all("a", href=True)
            if a_tag.get("href") and href_pass(a_tag.get("href"))
        ]
    except (AttributeError, TypeError, ValueError):
        return []


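# Illustrative call (hypothetical markup): given
#   '<a href="libs/assert">Assert</a> <a href="#top">Top</a> <a href="https://boost.org">Home</a>'
# only "libs/assert" survives, since the fragment and external links fail href_pass.

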
def process_single_file(file_path: Path, relative_path: str) -> dict[str, list[str]]:
    """Process a single HTML file and return dict of URLs -> [files that reference them]."""
    content = file_path.read_text(encoding="utf-8", errors="ignore")
    filtered_urls = extract_href_urls_from_content(content)
    return {url: [relative_path] for url in filtered_urls}


def process_version_files(
    version_dir: Path, doc_files: list[str]
) -> tuple[dict[str, list[str]], int]:
    """Process all doc files for a version and return dict of URLs -> referencing files."""
    url_references = {}
    files_processed = 0

    for doc_file in doc_files:
        file_path = version_dir / doc_file
        # Skip missing files; read_text() would otherwise raise FileNotFoundError
        if not file_path.exists():
            continue

        file_url_dict = process_single_file(file_path, doc_file)

        # Merge URLs into main dict, combining referencing file lists
        for url, referencing_files in file_url_dict.items():
            if url in url_references:
                url_references[url].extend(referencing_files)
            else:
                url_references[url] = referencing_files[:]

        files_processed += 1

    return url_references, files_processed


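# Illustrative return value (hypothetical files and URLs): each candidate URL maps to
# the doc files that reference it, alongside the count of files actually processed:
#   ({"../../libs/assert": ["doc/html/intro.html", "doc/html/faq.html"]}, 2)

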
def check_path_exists(base_dir: Path, path: str) -> tuple[bool, bool]:
    """Check if path exists and return (is_file, is_directory)."""
    try:
        full_path = base_dir / path
        if not full_path.exists():
            return False, False
        return full_path.is_file(), full_path.is_dir()
    except ValueError:
        return False, False


def resolve_target_path(ref_file: str, url: str, version_dir: Path) -> Path:
    """Resolve a URL relative to a referencing file's directory."""
    ref_file_path = version_dir / ref_file
    ref_file_dir = ref_file_path.parent
    target_path = ref_file_dir / url
    return target_path.resolve()


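# Illustrative resolution (hypothetical inputs): with ref_file "doc/html/intro.html"
# and url "../../libs/assert", the join happens against <version_dir>/doc/html/ and
# resolve() collapses the ".." segments, yielding <version_dir>/libs/assert.

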
def check_directory_contents(target_dir: Path) -> tuple[bool, bool]:
    """Check if directory has index files and other files."""
    has_index = False
    has_files = False

    if target_dir.exists() and target_dir.is_dir():
        has_index = has_index_files(target_dir)
        files_in_dir = [f for f in target_dir.iterdir() if f.is_file()]
        has_files = len(files_in_dir) > 0

    return has_index, has_files


@dataclass
class PathData:
    """Standardized path data with consistent structure."""

    references: list[dict[str, str]]
    is_file: bool = False
    is_directory: bool = False
    is_server_url: bool = False
    has_index: bool = False
    has_files: bool = False


def create_path_data(relative_target: Path, version_dir: Path) -> dict[str, Any]:
    """Create path data with existence flags and directory metadata."""
    is_file, is_directory = check_path_exists(version_dir, str(relative_target))

    has_index = has_files = False
    if is_directory:
        target_dir = version_dir / relative_target
        has_index, has_files = check_directory_contents(target_dir)

    path_data = PathData(
        references=[],
        is_server_url=False,
        is_file=is_file,
        is_directory=is_directory,
        has_index=has_index,
        has_files=has_files,
    )
    result = asdict(path_data)
    del result["references"]  # Will be created from reference_set
    result["reference_set"] = set()
    return result


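# Illustrative return value for a directory target containing an index file
# (hypothetical flags; "references" is dropped here and rebuilt later from reference_set):
#   {"is_file": False, "is_directory": True, "is_server_url": False,
#    "has_index": True, "has_files": True, "reference_set": set()}

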
def add_reference_to_path(
    existing_path_data: dict[str, Any], ref_file: str, url: str
) -> None:
    """Add a reference to path data in place."""
    if "reference_set" not in existing_path_data:
        existing_path_data["reference_set"] = set()

    existing_path_data["reference_set"].add((ref_file, url))


def check_filesystem(
    url: str,
    referencing_files: list[str],
    version_dir: Path,
    existing_paths: dict[str, Any],
) -> dict[str, Any]:
    """Check filesystem for URL references and return updated paths."""
    updated_paths = existing_paths.copy()

    for ref_file in referencing_files:
        try:
            normalized_target = resolve_target_path(ref_file, url, version_dir)
            relative_target = normalized_target.relative_to(version_dir.resolve())
            relative_target_str = str(relative_target)

            if relative_target_str not in updated_paths:
                updated_paths[relative_target_str] = create_path_data(
                    relative_target, version_dir
                )

            add_reference_to_path(updated_paths[relative_target_str], ref_file, url)

        except ValueError as e:
            print(f"Error resolving path: {e}")
            continue

    return updated_paths


def check_url_status(url: str) -> bool:
    """Return True if the URL does not respond with a 404 (single attempt, no retries)."""
    try:
        response = requests.head(url, timeout=10, allow_redirects=True)
        return response.status_code != 404
    except requests.RequestException:
        return False


def check_server(
    url: str,
    referencing_files: list[str],
    version_dir: Path,
    existing_paths: dict[str, Any],
    version_slug: str = "",
) -> dict[str, Any]:
    """Check server for URL references by fetching rendered HTML from the local site."""
    updated_paths = existing_paths.copy()

    for ref_file in referencing_files:
        try:
            # Extract version number from slug (boost-1-79-0 -> 1_79_0)
            version_number = version_slug.replace("boost-", "").replace("-", "_")
            # Fetch the rendered page from the locally running site ("web" on port 8000)
            response = requests.get(
                f"http://web:8000/doc/libs/{version_number}/{ref_file}", timeout=15
            )
            if response.status_code != 200:
                continue

            all_hrefs = extract_href_urls_from_content(response.text)
            if url in all_hrefs:
                url_exists = check_url_status(url)
                if url not in updated_paths:
                    path_data = PathData(
                        references=[],
                        is_server_url=True,
                        is_file=url_exists,
                        is_directory=False,
                        has_index=False,
                        has_files=False,
                    )
                    result = asdict(path_data)
                    del result["references"]  # Will be created from reference_set
                    result["reference_set"] = set()
                    updated_paths[url] = result

                add_reference_to_path(updated_paths[url], ref_file, url)

        except (requests.RequestException, ValueError, KeyError):
            continue

    return updated_paths


def is_django_template_url(url: str) -> bool:
    """Check if URL looks like a Django template (contains template syntax)."""
    return "{%" in url or "{{" in url


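# Illustrative checks (hypothetical URLs): "{% url 'docs' %}" and "{{ version.slug }}/libs"
# both count as template URLs; plain paths such as "libs/assert" do not.

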
def process_url_reference(
    url: str,
    referencing_files: list[str],
    version_dir: Path,
    existing_paths: dict[str, Any],
    version_slug: str = "",
) -> dict[str, Any]:
    """Process a single URL and its referencing files, returning updated paths."""
    if is_django_template_url(url):
        return check_server(
            url, referencing_files, version_dir, existing_paths, version_slug
        )
    else:
        return check_filesystem(url, referencing_files, version_dir, existing_paths)


def analyze_version_urls(version_data: dict[str, Any], base_dir: str) -> dict[str, Any]:
    """Analyze all documentation files for a version, extract URLs, and verify paths."""
    version_name = version_data.get("version")
    slug = version_data.get("slug")
    doc_files = version_data.get("doc_files", [])
    directory_exists = version_data.get("directory_exists", False)

    if not version_name or not slug:
        raise ValueError(
            f"Missing required fields: version_name={version_name}, slug={slug}"
        )

    if not directory_exists:
        return {"version": version_name, "directory_exists": False, "paths": {}}

    version_dir = Path(base_dir) / slug.replace("-", "_")
    url_references, files_processed = process_version_files(version_dir, doc_files)

    # Process each URL and verify paths
    paths_result = {}
    for url, referencing_files in url_references.items():
        paths_result = process_url_reference(
            url, referencing_files, version_dir, paths_result, slug
        )

    # Convert reference sets to lists for JSON serialization
    for path_data in paths_result.values():
        path_data["references"] = [
            {"referencing_file": ref_file, "original_url": url}
            for ref_file, url in path_data.pop("reference_set", set())
        ]

    return {
        "version": version_name,
        "directory_exists": directory_exists,
        "version_directory": slug.replace("-", "_"),
        "total_doc_files": len(doc_files),
        "files_processed": files_processed,
        "paths": paths_result,
    }


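# Illustrative shape of the returned summary (hypothetical values), which the command
# below serializes to <output_dir>/<version>_paths.json:
#   {
#       "version": "boost-1.79.0",
#       "directory_exists": True,
#       "version_directory": "boost_1_79_0",
#       "total_doc_files": 2,
#       "files_processed": 2,
#       "paths": {
#           "libs/assert": {
#               "is_file": False, "is_directory": True, "is_server_url": False,
#               "has_index": True, "has_files": True,
#               "references": [
#                   {"referencing_file": "doc/html/intro.html",
#                    "original_url": "../../libs/assert"}
#               ],
#           }
#       },
#   }

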
@click.command()
@click.option(
    "--json-file", required=True, help="JSON file containing documentation information"
)
@click.option(
    "--base-dir",
    default="tarballs",
    help="Base directory containing extracted tarballs",
)
@click.option(
    "--output-dir",
    required=True,
    help="Directory to write individual version JSON files",
)
def command(json_file: str, base_dir: str, output_dir: str):
    """Analyze local documentation URLs and verify that referenced paths exist.

    Takes a JSON file with documentation file information, scans each HTML file
    to extract local href URLs, then verifies that all referenced files and
    directories actually exist in the extracted tarballs. Writes individual
    JSON files for each version.

    Examples:
        python manage.py analyze_docs_urls --json-file=tarballs/docs_files.json --output-dir=nginx_redirects_data
    """
    docs_data = load_json_list(json_file)
    if not docs_data:
        return

    if not Path(base_dir).exists():
        click.echo(f"Warning: Base directory '{base_dir}' does not exist")

    Path(output_dir).mkdir(parents=True, exist_ok=True)

    for version_data in docs_data:
        version_name = version_data.get("version", "unknown")
        version_slug = version_name.replace("boost-", "")
        output_file = Path(output_dir) / f"{version_slug}_paths.json"

        if output_file.exists():
            click.echo(f"Skipping {version_name} - {output_file} already exists")
            continue

        result = analyze_version_urls(version_data, base_dir)

        output_file.write_text(json.dumps([result], indent=2))
        output_file.chmod(0o666)

        click.echo(f"Written {output_file}")