Skip to content

Commit b9d62e6

Browse files
committed
YAGNI cleanup for misra_help generation
This commit addresses YAGNI cleanup and other review feedback for PR #1114.
1 parent 40a7294 commit b9d62e6

9 files changed

Lines changed: 135 additions & 423 deletions

File tree

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,9 @@
66
# query compilation caches
77
.cache
88

9+
# MISRA help generator docling cache
10+
scripts/generate_rules/misra_help/cache/
11+
912
# qltest projects and artifacts
1013
**/test/**/*.testproj
1114
**/test/**/*.actual

scripts/generate_rules/misra_help/README.md

Lines changed: 34 additions & 318 deletions
Large diffs are not rendered by default.
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
"""Shared helpers for locating and reading the MISRA rule cache."""
2+
from __future__ import annotations
3+
import json
4+
from pathlib import Path
5+
from typing import Any
6+
7+
SCRIPT_DIR = Path(__file__).resolve().parent
8+
DEFAULT_CACHE_DIR = SCRIPT_DIR / "cache"
9+
DEFAULT_HELP_REPO = SCRIPT_DIR.parents[2].parent / "codeql-coding-standards-help"
10+
11+
12+
def cache_path_for(help_repo: Path, standard: str) -> Path:
13+
"""Return the path to the JSON cache file for a standard."""
14+
return help_repo / ".misra-rule-cache" / f"{standard}.json"
15+
16+
17+
def load_cache(help_repo: Path, standard: str) -> dict[str, Any]:
18+
"""Load and return the JSON cache for a standard."""
19+
path = cache_path_for(help_repo, standard)
20+
if not path.exists():
21+
raise FileNotFoundError(
22+
f"Cache not found: {path}. Run dump_rules_json.py first."
23+
)
24+
return json.loads(path.read_text(encoding="utf-8"))
25+
26+
27+
def save_cache(help_repo: Path, standard: str, data: dict[str, Any]) -> Path:
28+
"""Write the JSON cache for a standard and return the path."""
29+
path = cache_path_for(help_repo, standard)
30+
path.parent.mkdir(parents=True, exist_ok=True)
31+
path.write_text(
32+
json.dumps(data, indent=2, ensure_ascii=False), encoding="utf-8"
33+
)
34+
return path

scripts/generate_rules/misra_help/dump_rules_json.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@
6565

6666
sys.path.insert(0, str(Path(__file__).parent))
6767
from extract_rules import extract_rules, Rule # noqa: E402
68+
from cache import cache_path_for, save_cache # noqa: E402
6869
from populate_help import ( # noqa: E402
6970
STANDARD_INFO,
7071
SUPPORTED_STANDARDS,
@@ -152,7 +153,7 @@ def main() -> int:
152153
ap.add_argument("--help-repo", type=Path, default=DEFAULT_HELP_REPO)
153154
ap.add_argument("--pdf", type=Path, default=None)
154155
ap.add_argument("--cache-dir", type=Path,
155-
default=Path("/tmp/misra-pdf-probe/repo-cache"),
156+
default=Path(__file__).resolve().parent / "cache",
156157
help="docling JSON cache dir")
157158
ap.add_argument("--output", type=Path, default=None,
158159
help="output path (default: "
@@ -188,8 +189,7 @@ def main() -> int:
188189
"queries": queries_json,
189190
}
190191

191-
out_path = args.output or (args.help_repo / ".misra-rule-cache"
192-
/ f"{args.standard}.json")
192+
out_path = args.output or cache_path_for(args.help_repo, args.standard)
193193
out_path.parent.mkdir(parents=True, exist_ok=True)
194194
out_path.write_text(json.dumps(payload, indent=2, ensure_ascii=False),
195195
encoding="utf-8")

scripts/generate_rules/misra_help/extract_rules.py

Lines changed: 47 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -62,12 +62,22 @@ def _load_words() -> set[str]:
6262
if _WORDS_CACHE is not None:
6363
return _WORDS_CACHE
6464
words: set[str] = set(_EXTRA_WORDS)
65+
found_system = False
6566
for p in _WORDLIST_PATHS:
6667
path = Path(p)
6768
if path.exists():
6869
with path.open() as f:
6970
words |= {w.strip().lower() for w in f if w.strip()}
71+
found_system = True
7072
break
73+
if not found_system:
74+
import warnings
75+
warnings.warn(
76+
"No system wordlist found; ligature repair will rely on "
77+
"the built-in word list only. Install a words file at "
78+
f"{_WORDLIST_PATHS[0]} for full coverage.",
79+
stacklevel=2,
80+
)
7181
_WORDS_CACHE = words
7282
return words
7383

@@ -80,7 +90,7 @@ def _load_words() -> set[str]:
8090
# prevents mis-substitution on legitimate CamelCase identifiers
8191
# containing `A` or `C`.
8292
_SUSPECT_GLYPHS = set("0123456789CA^%")
83-
_SUSPECT_TOKEN_RE = re.compile(r"[A-Za-z0-9CA\^%]*[0-9CA\^%][A-Za-z0-9CA\^%]*")
93+
_SUSPECT_TOKEN_RE = re.compile(r"[A-Za-z0-9\^%]+")
8494

8595

8696
def repair_ligatures(text: str) -> str:
@@ -100,32 +110,28 @@ def fix(tok: str) -> str:
100110
# though they start or end with a suspect glyph.
101111
if not any(c.isalpha() for c in tok):
102112
return tok
113+
# Skip tokens that contain no suspect glyphs at all.
114+
if not any(c in _SUSPECT_GLYPHS for c in tok):
115+
return tok
103116
low = tok.lower()
104117
if low.isalpha() and low in words:
105118
return tok
106119
out = tok
107-
for _ in range(4): # at most a few rounds for ffl etc.
108-
changed = False
109-
for i, ch in enumerate(out):
110-
if ch not in _SUSPECT_GLYPHS:
111-
continue
112-
# The substitution is acceptable if at least one side of
113-
# the suspect glyph is a letter (or the token edge).
114-
left_ok = (i == 0) or out[i - 1].isalpha()
115-
right_ok = (i == len(out) - 1) or out[i + 1].isalpha()
116-
if not (left_ok and right_ok):
117-
continue
118-
hits = []
119-
for lig in _LIGS:
120-
cand = (out[:i] + lig + out[i + 1 :]).lower()
121-
if cand in words:
122-
hits.append(lig)
123-
if len(hits) == 1:
124-
out = out[:i] + hits[0] + out[i + 1 :]
125-
changed = True
126-
break
127-
if not changed:
128-
break
120+
for i, ch in enumerate(out):
121+
if ch not in _SUSPECT_GLYPHS:
122+
continue
123+
left_ok = (i == 0) or out[i - 1].isalpha()
124+
right_ok = (i == len(out) - 1) or out[i + 1].isalpha()
125+
if not (left_ok and right_ok):
126+
continue
127+
hits = []
128+
for lig in _LIGS:
129+
cand = (out[:i] + lig + out[i + 1 :]).lower()
130+
if cand in words:
131+
hits.append(lig)
132+
if len(hits) == 1:
133+
out = out[:i] + hits[0] + out[i + 1 :]
134+
break # indices shifted; one repair per token suffices
129135
return out
130136

131137
return _SUSPECT_TOKEN_RE.sub(lambda m: fix(m.group(0)), text)
@@ -654,38 +660,29 @@ def to_dict(rule: Rule) -> dict:
654660
# CLI
655661
# ----------------------------------------------------------------------------
656662

663+
_REPO_ROOT = Path(__file__).resolve().parents[3]
664+
657665
if __name__ == "__main__":
658-
import argparse, csv
666+
import argparse
659667
ap = argparse.ArgumentParser()
660668
ap.add_argument("pdf")
661669
ap.add_argument("--standard", required=True, choices=list(STD_DISPLAY))
662-
ap.add_argument("--lang", default=None,
663-
help="override the language used to render code fences "
664-
"(default: derived from --standard)")
665-
ap.add_argument("--cache-dir", default="/tmp/misra-pdf-probe")
666-
ap.add_argument("--out-dir", default="/tmp/misra-pdf-probe/extracted")
670+
ap.add_argument("--cache-dir",
671+
default=str(_REPO_ROOT / "scripts" / "generate_rules"
672+
/ "misra_help" / "cache"))
667673
ap.add_argument("--rule", action="append", help="only emit these rule IDs")
668-
ap.add_argument("--check-csv", default="/Users/data-douser/Git/github/codeql-coding-standards/rules.csv",
669-
help="cross-check coverage against this rules.csv")
674+
ap.add_argument("--json", default=None,
675+
help="write extracted rules to this JSON file")
670676
args = ap.parse_args()
671677
rules = extract_rules(Path(args.pdf), args.standard, Path(args.cache_dir))
672-
print(f"Extracted {len(rules)} rules from {args.pdf}")
673-
out = Path(args.out_dir)
674-
out.mkdir(parents=True, exist_ok=True)
675-
(out / f"{args.standard}.json").write_text(
676-
json.dumps([to_dict(r) for r in rules], indent=2),
677-
encoding="utf-8",
678-
)
679-
if args.check_csv:
680-
csv_std = "MISRA-C-2012" if args.standard == "MISRA-C-2023" else args.standard
681-
expected = {row["ID"] for row in csv.DictReader(open(args.check_csv))
682-
if row["Standard"] == csv_std}
683-
got = {r.rule_id for r in rules}
684-
print(f" csv-coverage: {len(got & expected)}/{len(expected)} matched, "
685-
f"missing={sorted(expected - got)[:10]}, extra={sorted(got - expected)[:10]}")
686678
selected = [r for r in rules if not args.rule or r.rule_id in args.rule]
687-
for r in selected:
688-
d = out / args.standard / r.rule_id
689-
d.mkdir(parents=True, exist_ok=True)
690-
(d / "extracted.md").write_text(render_help(r, args.lang or ("cpp" if "C++" in args.standard else "c")), encoding="utf-8")
691-
print(f"Wrote {len(selected)} help files under {out / args.standard}/")
679+
print(f"Extracted {len(rules)} rules from {args.pdf}"
680+
f" ({len(selected)} selected)")
681+
if args.json:
682+
out = Path(args.json)
683+
out.parent.mkdir(parents=True, exist_ok=True)
684+
out.write_text(
685+
json.dumps([to_dict(r) for r in selected], indent=2),
686+
encoding="utf-8",
687+
)
688+
print(f"Wrote {out}")

scripts/generate_rules/misra_help/harness.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,8 @@ def main() -> int:
7272
ap.add_argument("--pdf", required=True)
7373
ap.add_argument("--standard", required=True, choices=list(STD_DISPLAY))
7474
ap.add_argument("-n", "--iterations", type=int, default=3)
75-
ap.add_argument("--cache-dir", default="/tmp/misra-pdf-probe/det-cache")
75+
ap.add_argument("--cache-dir",
76+
default=str(Path(__file__).resolve().parent / "cache"))
7677
ap.add_argument("--keep-cache", action="store_true",
7778
help="do NOT clear docling cache between runs (tests just the post-docling stages)")
7879
ap.add_argument("--report", default="/tmp/misra-pdf-probe/determinism-report.json")

scripts/generate_rules/misra_help/populate_help.py

Lines changed: 3 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929

3030
DEFAULT_HELP_REPO = Path(__file__).resolve().parents[3].parent / "codeql-coding-standards-help"
3131
DEFAULT_QUERY_REPO = Path(__file__).resolve().parents[3]
32+
DEFAULT_CACHE_DIR = Path(__file__).resolve().parent / "cache"
3233

3334
# standard → (lang, relative source dir under the queries repo).
3435
# A MISRA standard implies its language; users do not pass --lang.
@@ -218,41 +219,7 @@ def write_help(rule: Rule, ql_path: Path, lang: str, help_repo: Path,
218219
query_repo: Path, lang_src: Path,
219220
no_overwrite: bool, dry_run: bool,
220221
rule_trusted: bool) -> str:
221-
"""Write one help .md. Returns a status string.
222-
223-
By default, regenerates every file from the rule description,
224-
overwriting any existing content (this is what makes the tool a
225-
single source of truth for query documentation). Pass
226-
`no_overwrite=True` to leave existing files untouched.
227-
228-
In the default (overwriting) mode, files whose render matches the
229-
existing bytes are reported as `unchanged` and are not touched on
230-
disk — so re-runs yield `wrote-changed: 0`, which is the
231-
idempotency signal.
232-
233-
If the rule's identity could not be verified against any of the
234-
queries in its directory (`rule_trusted=False`), this query's
235-
`.md` is not written and `title-mismatch` is reported. The caller
236-
computes `rule_trusted` by comparing the `@name` title of every
237-
query for the rule to the PDF-extracted rule title; if at least
238-
one matches (exactly, by prefix, or by sufficiently long substring)
239-
the rule is trusted for all queries in that directory. This guards
240-
against two real failure modes:
241-
- MISRA C rule-numbering drift between 2012 and 2023 (queries
242-
are tagged for 2012 but the only available PDF is the 2023
243-
edition), and
244-
- docling rule-anchor detection failures that leave a rule with
245-
an empty or garbled title.
246-
247-
Status is one of:
248-
wrote-new file did not exist, was written
249-
wrote-changed file existed, content differs, was rewritten
250-
unchanged file existed and render matches byte-for-byte
251-
skip-existing file existed and --no-overwrite was passed
252-
title-mismatch rule title was not verifiable from any query;
253-
skipped to preserve existing content
254-
would-* dry-run variants of the above
255-
"""
222+
"""Write one help .md and return a short status string."""
256223
rel_dir = ql_path.parent.relative_to(query_repo / lang_src)
257224
target_dir = help_repo / lang_src / rel_dir
258225
target = target_dir / (ql_path.stem + ".md")
@@ -293,7 +260,7 @@ def main() -> int:
293260
help="path to the licensed MISRA PDF (overrides env var "
294261
"and help-repo glob)")
295262
ap.add_argument("--cache-dir", type=Path,
296-
default=Path("/tmp/misra-pdf-probe/repo-cache"),
263+
default=DEFAULT_CACHE_DIR,
297264
help="docling JSON cache dir (deterministic across runs)")
298265
ap.add_argument("--rule", action="append", default=[],
299266
help="restrict to specific RULE-X-Y[-Z] (repeatable)")

scripts/generate_rules/misra_help/refresh_help.py

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727

2828
sys.path.insert(0, str(Path(__file__).parent))
2929
from extract_rules import Rule, render_help, _format_code_lines # noqa: E402
30+
from cache import load_cache as _load_cache, save_cache # noqa: E402
3031

3132
SCRIPT_DIR = Path(__file__).resolve().parent
3233
QUERY_REPO = SCRIPT_DIR.parents[2]
@@ -152,12 +153,11 @@ def main() -> int:
152153
args = p.parse_args()
153154

154155
help_repo = args.help_repo.resolve()
155-
cache_path = help_repo / ".misra-rule-cache" / f"{args.standard}.json"
156-
if not cache_path.exists():
157-
print(f"Cache not found: {cache_path}", file=sys.stderr)
156+
try:
157+
cache = _load_cache(help_repo, args.standard)
158+
except FileNotFoundError as e:
159+
print(str(e), file=sys.stderr)
158160
return 2
159-
160-
cache = json.loads(cache_path.read_text(encoding="utf-8"))
161161
total_queries = sum(len(v) for v in cache["queries"].values())
162162
print(f"Loaded cache: {len(cache['rules'])} rules, {total_queries} queries")
163163

@@ -169,8 +169,7 @@ def main() -> int:
169169
# Patch cache with fresh existing_md + implementation_scope.
170170
print("\n=== Patching cache ===")
171171
cache = patch_cache(cache, help_repo, args.query_repo, args.standard)
172-
cache_path.write_text(
173-
json.dumps(cache, indent=2, ensure_ascii=False), encoding="utf-8")
172+
save_cache(help_repo, args.standard, cache)
174173
impl_count = sum(
175174
1 for qs in cache["queries"].values()
176175
for q in qs if q.get("implementation_scope")

scripts/generate_rules/misra_help/rewrite_help.py

Lines changed: 3 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,8 @@
3939

4040
import requests
4141

42+
sys.path.insert(0, str(Path(__file__).resolve().parent))
43+
4244

4345
SUPPORTED_STANDARDS = ("MISRA-C-2012", "MISRA-C-2023", "MISRA-C++-2023")
4446
STD_DISPLAY = {
@@ -400,14 +402,7 @@ def unwrap_fence(text: str) -> str:
400402
# Main rewrite loop
401403
# ---------------------------------------------------------------------------
402404

403-
404-
def load_cache(help_repo: Path, standard: str) -> dict[str, Any]:
405-
cache_path = help_repo / ".misra-rule-cache" / f"{standard}.json"
406-
if not cache_path.exists():
407-
raise FileNotFoundError(
408-
f"Cache not found: {cache_path}. Run dump_rules_json.py first."
409-
)
410-
return json.loads(cache_path.read_text(encoding="utf-8"))
405+
from cache import load_cache # noqa: E402
411406

412407

413408
def iter_work(

0 commit comments

Comments
 (0)