def _load_words() -> set[str]:
    """Return the lowercase word set used for ligature repair, cached.

    Merges the built-in ``_EXTRA_WORDS`` with the first system wordlist
    found in ``_WORDLIST_PATHS``; warns once if no system wordlist exists.

    NOTE(review): reconstructed from a diff — the lines between the
    function header and the cache check were not visible. The visible
    ``_WORDS_CACHE = words`` store requires the ``global`` declaration
    below; confirm against the original file.
    """
    global _WORDS_CACHE
    if _WORDS_CACHE is not None:
        return _WORDS_CACHE
    words: set[str] = set(_EXTRA_WORDS)
    found_system = False
    for p in _WORDLIST_PATHS:
        path = Path(p)
        if path.exists():
            with path.open() as f:
                # Normalize to lowercase; skip blank lines.
                words |= {w.strip().lower() for w in f if w.strip()}
            found_system = True
            break
    if not found_system:
        import warnings
        warnings.warn(
            "No system wordlist found; ligature repair will rely on "
            "the built-in word list only. Install a words file at "
            f"{_WORDLIST_PATHS[0]} for full coverage.",
            stacklevel=2,
        )
    _WORDS_CACHE = words
    return words
@@ -80,7 +90,7 @@ def _load_words() -> set[str]:
8090# prevents mis-substitution on legitimate CamelCase identifiers
8191# containing `A` or `C`.
8292_SUSPECT_GLYPHS = set ("0123456789CA^%" )
83- _SUSPECT_TOKEN_RE = re .compile (r"[A-Za-z0-9CA \^%]*[0-9CA\^%][A-Za-z0-9CA\^%]* " )
93+ _SUSPECT_TOKEN_RE = re .compile (r"[A-Za-z0-9 \^%]+ " )
8494
8595
def repair_ligatures(text: str) -> str:
    """Repair tokens where PDF extraction replaced a ligature with a
    suspect glyph, using a dictionary lookup to pick the substitution.

    NOTE(review): reconstructed from a diff — roughly a dozen lines between
    the function header and the body below (likely a docstring, the
    ``words = _load_words()`` binding, and the start of ``fix``) were not
    visible in the hunk context; confirm against the original file.
    """
    words = _load_words()

    def fix(tok: str) -> str:
        # Tokens with no letters at all are left alone even
        # though they start or end with a suspect glyph.
        if not any(c.isalpha() for c in tok):
            return tok
        # Skip tokens that contain no suspect glyphs at all.
        if not any(c in _SUSPECT_GLYPHS for c in tok):
            return tok
        low = tok.lower()
        # Already a known word: nothing to repair.
        if low.isalpha() and low in words:
            return tok
        out = tok
        for i, ch in enumerate(out):
            if ch not in _SUSPECT_GLYPHS:
                continue
            # The substitution is acceptable if at least one side of
            # the suspect glyph is a letter (or the token edge).
            left_ok = (i == 0) or out[i - 1].isalpha()
            right_ok = (i == len(out) - 1) or out[i + 1].isalpha()
            if not (left_ok and right_ok):
                continue
            # Accept a ligature only if it yields exactly one dictionary word.
            hits = []
            for lig in _LIGS:
                cand = (out[:i] + lig + out[i + 1:]).lower()
                if cand in words:
                    hits.append(lig)
            if len(hits) == 1:
                out = out[:i] + hits[0] + out[i + 1:]
                break  # indices shifted; one repair per token suffices
        return out

    return _SUSPECT_TOKEN_RE.sub(lambda m: fix(m.group(0)), text)
# ----------------------------------------------------------------------------
# CLI
# ----------------------------------------------------------------------------

# Repository root: this file lives three directories below it.
# NOTE(review): reconstructed from a diff — confirm parents[3] matches the
# file's actual location in the repo.
_REPO_ROOT = Path(__file__).resolve().parents[3]

if __name__ == "__main__":
    import argparse
    ap = argparse.ArgumentParser()
    ap.add_argument("pdf")
    ap.add_argument("--standard", required=True, choices=list(STD_DISPLAY))
    ap.add_argument("--cache-dir",
                    default=str(_REPO_ROOT / "scripts" / "generate_rules"
                                / "misra_help" / "cache"))
    ap.add_argument("--rule", action="append", help="only emit these rule IDs")
    ap.add_argument("--json", default=None,
                    help="write extracted rules to this JSON file")
    args = ap.parse_args()
    rules = extract_rules(Path(args.pdf), args.standard, Path(args.cache_dir))
    # --rule may be given multiple times; no --rule means "all rules".
    selected = [r for r in rules if not args.rule or r.rule_id in args.rule]
    print(f"Extracted {len(rules)} rules from {args.pdf}"
          f" ({len(selected)} selected)")
    if args.json:
        out = Path(args.json)
        out.parent.mkdir(parents=True, exist_ok=True)
        out.write_text(
            json.dumps([to_dict(r) for r in selected], indent=2),
            encoding="utf-8",
        )
        print(f"Wrote {out}")