# website-v2/versions/management/commands/analyze_docs_urls.py
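"""Analyze local documentation HTML for href targets and verify that the
referenced files and directories exist in the extracted tarballs, writing one
JSON result file per version."""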
import json
import re
from dataclasses import dataclass, asdict
from pathlib import Path
from typing import Any

import djclick as click
import requests
from bs4 import BeautifulSoup

from versions.utils.common import load_json_list, has_index_files
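
# Hrefs ending in one of these extensions are treated as direct file links and
# filtered out; the analysis only follows extension-less, directory-style URLs.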
FILE_FILTER_EXTENSIONS = (
    ".html",
    ".htm",
    ".js",
    ".xml",
    ".css",
    ".txt",
    ".c",
    ".mso",
    ".cpp",
    ".hpp",
    ".ipp",
    ".php",
    ".py",
    ".md",
    ".rst",
    ".pdf",
    ".qbk",
    ".docx",
    ".xlsx",
    ".csv",
    ".json",
    ".yaml",
    ".yml",
    ".txt.gz",
    ".txt.bz2",
    ".txt.xz",
    ".txt.zst",
    ".txt.lz4.in",
    ".v2",
    ".dat",
    ".dat.gz",
    ".dat.bz2",
    ".dat.xz",
    ".dat.zst",
    ".dat.lz4",
    ".dot",
    ".ico",
    ".toyxml",
    ".svg",
    ".png",
    ".jpg",
    ".jpeg",
)


def href_pass(url: str) -> bool:
"""Check if URL is local (relative or absolute local path)."""
url = url.strip()
if not url:
return False
# stage 1: quick checks, don't require filesystem access
if any(
[
url.startswith(("http://", "https://", "javascript:", "mailto:")),
url.startswith("{{") and url.endswith("}}"), # Jinja2 style
"#" in url,
"://" in url,
            url.endswith(FILE_FILTER_EXTENSIONS),  # also matches multi-part suffixes such as ".txt.gz"
re.match(r"^[./]+$", url), # catch relative paths, "./", "../", "../../"
]
):
return False
    # stage 2: only touch the filesystem when none of the quick checks above
    # rejected the URL; has_index_files() is kept out of the any() list so it is
    # never called (and cannot raise) unless all of the cheap string checks pass
    return not has_index_files(Path(url))


def extract_href_urls_from_content(content: str) -> list[str]:
"""Extract and filter href URLs from HTML content using BeautifulSoup."""
try:
soup = BeautifulSoup(content, "html.parser")
return [
a_tag.get("href")
for a_tag in soup.find_all("a", href=True)
if a_tag.get("href") and href_pass(a_tag.get("href"))
]
except (AttributeError, TypeError, ValueError):
return []


def process_single_file(file_path: Path, relative_path: str) -> dict[str, list[str]]:
"""Process a single HTML file and return dict of URLs -> [files that reference them]."""
content = file_path.read_text(encoding="utf-8", errors="ignore")
filtered_urls = extract_href_urls_from_content(content)
return {url: [relative_path] for url in filtered_urls}


def process_version_files(
version_dir: Path, doc_files: list[str]
) -> tuple[dict[str, list[str]], int]:
"""Process all doc files for a version and return dict of URLs -> referencing files."""
url_references = {}
files_processed = 0
    for doc_file in doc_files:
        file_path = version_dir / doc_file
        # read_text() would raise FileNotFoundError for listed files that are
        # missing on disk, so skip them instead of counting them as processed.
        if not file_path.exists():
            continue
        file_url_dict = process_single_file(file_path, doc_file)
        # Merge URLs into main dict, combining referencing file lists
        for url, referencing_files in file_url_dict.items():
            url_references.setdefault(url, []).extend(referencing_files)
        files_processed += 1
return url_references, files_processed


def check_path_exists(base_dir: Path, path: str) -> tuple[bool, bool]:
"""Check if path exists and return (is_file, is_directory)."""
try:
full_path = base_dir / path
if not full_path.exists():
return False, False
return full_path.is_file(), full_path.is_dir()
except ValueError:
return False, False


def resolve_target_path(ref_file: str, url: str, version_dir: Path) -> Path:
"""Resolve a URL relative to a referencing file's directory."""
ref_file_path = version_dir / ref_file
ref_file_dir = ref_file_path.parent
target_path = ref_file_dir / url
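    # resolve() collapses "." and ".." segments so the result can be compared
    # against the resolved version directory with relative_to() later on.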
return target_path.resolve()


def check_directory_contents(target_dir: Path) -> tuple[bool, bool]:
"""Check if directory has index files and other files."""
has_index = False
has_files = False
if target_dir.exists() and target_dir.is_dir():
has_index = has_index_files(target_dir)
files_in_dir = [f for f in target_dir.iterdir() if f.is_file()]
has_files = len(files_in_dir) > 0
return has_index, has_files


@dataclass
class PathData:
"""Standardized path data with consistent structure."""
references: list[dict[str, str]]
is_file: bool = False
is_directory: bool = False
is_server_url: bool = False
has_index: bool = False
has_files: bool = False
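

# While URLs are being processed, references are collected in a "reference_set"
# of (referencing_file, original_url) tuples to de-duplicate them; the set is
# converted back into the "references" list before serializing to JSON.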
def create_path_data(relative_target: Path, version_dir: Path) -> dict[str, Any]:
"""Create path data with existence flags and directory metadata."""
is_file, is_directory = check_path_exists(version_dir, str(relative_target))
has_index = has_files = False
if is_directory:
target_dir = version_dir / relative_target
has_index, has_files = check_directory_contents(target_dir)
path_data = PathData(
references=[],
is_server_url=False,
is_file=is_file,
is_directory=is_directory,
has_index=has_index,
has_files=has_files,
)
result = asdict(path_data)
del result["references"] # Will be created from reference_set
result["reference_set"] = set()
return result


def add_reference_to_path(
existing_path_data: dict[str, Any], ref_file: str, url: str
) -> None:
"""Add a reference to path data in place."""
if "reference_set" not in existing_path_data:
existing_path_data["reference_set"] = set()
existing_path_data["reference_set"].add((ref_file, url))


def check_filesystem(
url: str,
referencing_files: list[str],
version_dir: Path,
existing_paths: dict[str, Any],
) -> dict[str, Any]:
"""Check filesystem for URL references and return updated paths."""
updated_paths = existing_paths.copy()
for ref_file in referencing_files:
try:
normalized_target = resolve_target_path(ref_file, url, version_dir)
relative_target = normalized_target.relative_to(version_dir.resolve())
relative_target_str = str(relative_target)
if relative_target_str not in updated_paths:
updated_paths[relative_target_str] = create_path_data(
relative_target, version_dir
)
add_reference_to_path(updated_paths[relative_target_str], ref_file, url)
        except ValueError as e:
            # relative_to() raises ValueError when the resolved target falls
            # outside the version directory (e.g. too many "../" segments).
            click.echo(f"Error resolving path: {e}")
            continue
return updated_paths


def check_url_status(url: str) -> bool:
    """Return True unless the URL responds with a 404 (single HEAD request, no
    retries); request failures are treated as the URL not existing."""
    try:
        response = requests.head(url, timeout=10, allow_redirects=True)
        return response.status_code != 404
    except requests.RequestException:
        return False


def check_server(
url: str,
referencing_files: list[str],
version_dir: Path,
existing_paths: dict[str, Any],
version_slug: str = "",
) -> dict[str, Any]:
"""Check server for URL references by fetching HTML from server and checking URLs."""
updated_paths = existing_paths.copy()
for ref_file in referencing_files:
try:
# Extract version number from slug (boost-1-79-0 -> 1_79_0)
version_number = version_slug.replace("boost-", "").replace("-", "_")
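            # Fetch the referencing page as served by the application (the
            # "web" host is assumed to be the local Django service) and
            # re-extract its hrefs to see whether this link is still present.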
response = requests.get(
f"http://web:8000/doc/libs/{version_number}/{ref_file}", timeout=15
)
if response.status_code != 200:
continue
all_hrefs = extract_href_urls_from_content(response.text)
if url in all_hrefs:
url_exists = check_url_status(url)
if url not in updated_paths:
path_data = PathData(
references=[],
is_server_url=True,
is_file=url_exists,
is_directory=False,
has_index=False,
has_files=False,
)
result = asdict(path_data)
del result["references"] # Will be created from reference_set
result["reference_set"] = set()
updated_paths[url] = result
add_reference_to_path(updated_paths[url], ref_file, url)
except (requests.RequestException, ValueError, KeyError):
continue
return updated_paths


def is_django_template_url(url: str) -> bool:
"""Check if URL looks like a Django template (contains template syntax)."""
return "{%" in url or "{{" in url


def process_url_reference(
url: str,
referencing_files: list[str],
version_dir: Path,
existing_paths: dict[str, Any],
version_slug: str = "",
) -> dict[str, Any]:
"""Process a single URL and its referencing files, returning updated paths."""
if is_django_template_url(url):
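        # Template-style URLs cannot be resolved against the extracted tarball
        # on disk, so they are checked against the running server instead.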
return check_server(
url, referencing_files, version_dir, existing_paths, version_slug
)
else:
return check_filesystem(url, referencing_files, version_dir, existing_paths)


def analyze_version_urls(version_data: dict[str, Any], base_dir: str) -> dict[str, Any]:
"""Analyze all documentation files for a version, extract URLs, and verify paths."""
version_name = version_data.get("version")
slug = version_data.get("slug")
doc_files = version_data.get("doc_files", [])
directory_exists = version_data.get("directory_exists", False)
if not version_name or not slug:
raise ValueError(
f"Missing required fields: version_name={version_name}, slug={slug}"
)
if not directory_exists:
return {"version": version_name, "directory_exists": False, "paths": {}}
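    # Extracted tarball directories use the slug with dashes replaced by
    # underscores (e.g. "boost-1-79-0" -> "boost_1_79_0").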
version_dir = Path(base_dir) / slug.replace("-", "_")
url_references, files_processed = process_version_files(version_dir, doc_files)
# Process each URL and verify paths
paths_result = {}
for url, referencing_files in url_references.items():
paths_result = process_url_reference(
url, referencing_files, version_dir, paths_result, slug
)
    # Convert reference sets to sorted lists for deterministic JSON serialization
    for path_data in paths_result.values():
        path_data["references"] = [
            {"referencing_file": ref_file, "original_url": url}
            for ref_file, url in sorted(path_data.pop("reference_set", set()))
        ]
return {
"version": version_name,
"directory_exists": directory_exists,
"version_directory": slug.replace("-", "_"),
"total_doc_files": len(doc_files),
"files_processed": files_processed,
"paths": paths_result,
}


@click.command()
@click.option(
"--json-file", required=True, help="JSON file containing documentation information"
)
@click.option(
"--base-dir",
default="tarballs",
help="Base directory containing extracted tarballs",
)
@click.option(
"--output-dir",
required=True,
help="Directory to write individual version JSON files",
)
def command(json_file: str, base_dir: str, output_dir: str):
    """Analyze local documentation URLs and verify that referenced paths exist.

    Takes a JSON file with documentation file information, scans each HTML file
    to extract local href URLs, then verifies that all referenced files and
    directories actually exist in the extracted tarballs. Writes individual
    JSON files for each version.

    Examples:
        python manage.py analyze_docs_urls --json-file=tarballs/docs_files.json --output-dir=nginx_redirects_data
    """
docs_data = load_json_list(json_file)
if not docs_data:
return
if not Path(base_dir).exists():
click.echo(f"Warning: Base directory '{base_dir}' does not exist")
Path(output_dir).mkdir(parents=True, exist_ok=True)
for version_data in docs_data:
version_name = version_data.get("version", "unknown")
version_slug = version_name.replace("boost-", "")
output_file = Path(output_dir) / f"{version_slug}_paths.json"
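        # An existing output file is treated as already analyzed, so reruns of
        # the command are incremental.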
if output_file.exists():
click.echo(f"Skipping {version_name} - {output_file} already exists")
continue
result = analyze_version_urls(version_data, base_dir)
output_file.write_text(json.dumps([result], indent=2))
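        # World-writable so the file remains editable outside the container
        # when the command runs as a different user (assumed Docker setup).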
output_file.chmod(0o666)
click.echo(f"Written {output_file}")