def _load_words() -> set[str]:
    """Return the lowercase word set used for ligature repair, cached.

    Merges the built-in ``_EXTRA_WORDS`` with the first system wordlist
    found in ``_WORDLIST_PATHS``; warns once if no system wordlist exists.

    NOTE(review): reconstructed from a diff — the lines between the
    function header and the cache check were not visible. The visible
    ``_WORDS_CACHE = words`` store requires the ``global`` declaration
    below; confirm against the original file.
    """
    global _WORDS_CACHE
    if _WORDS_CACHE is not None:
        return _WORDS_CACHE
    words: set[str] = set(_EXTRA_WORDS)
    found_system = False
    for p in _WORDLIST_PATHS:
        path = Path(p)
        if path.exists():
            with path.open() as f:
                # Normalize to lowercase; skip blank lines.
                words |= {w.strip().lower() for w in f if w.strip()}
            found_system = True
            break
    if not found_system:
        import warnings
        warnings.warn(
            "No system wordlist found; ligature repair will rely on "
            "the built-in word list only. Install a words file at "
            f"{_WORDLIST_PATHS[0]} for full coverage.",
            stacklevel=2,
        )
    _WORDS_CACHE = words
    return words
@@ -80,7 +90,7 @@ def _load_words() -> set[str]:
8090# prevents mis-substitution on legitimate CamelCase identifiers
8191# containing `A` or `C`.
8292_SUSPECT_GLYPHS = set ("0123456789CA^%" )
83- _SUSPECT_TOKEN_RE = re .compile (r"[A-Za-z0-9CA \^%]*[0-9CA\^%][A-Za-z0-9CA\^%]* " )
93+ _SUSPECT_TOKEN_RE = re .compile (r"[A-Za-z0-9 \^%]+ " )
8494
8595
def repair_ligatures(text: str) -> str:
    """Repair tokens where PDF extraction replaced a ligature with a
    suspect glyph, using a dictionary lookup to pick the substitution.

    NOTE(review): reconstructed from a diff — roughly a dozen lines between
    the function header and the body below (likely a docstring, the
    ``words = _load_words()`` binding, and the start of ``fix``) were not
    visible in the hunk context; confirm against the original file.
    """
    words = _load_words()

    def fix(tok: str) -> str:
        # Tokens with no letters at all are left alone even
        # though they start or end with a suspect glyph.
        if not any(c.isalpha() for c in tok):
            return tok
        # Skip tokens that contain no suspect glyphs at all.
        if not any(c in _SUSPECT_GLYPHS for c in tok):
            return tok
        low = tok.lower()
        # Already a known word: nothing to repair.
        if low.isalpha() and low in words:
            return tok
        out = tok
        for i, ch in enumerate(out):
            if ch not in _SUSPECT_GLYPHS:
                continue
            # The substitution is acceptable if at least one side of
            # the suspect glyph is a letter (or the token edge).
            left_ok = (i == 0) or out[i - 1].isalpha()
            right_ok = (i == len(out) - 1) or out[i + 1].isalpha()
            if not (left_ok and right_ok):
                continue
            # Accept a ligature only if it yields exactly one dictionary word.
            hits = []
            for lig in _LIGS:
                cand = (out[:i] + lig + out[i + 1:]).lower()
                if cand in words:
                    hits.append(lig)
            if len(hits) == 1:
                out = out[:i] + hits[0] + out[i + 1:]
                break  # indices shifted; one repair per token suffices
        return out

    return _SUSPECT_TOKEN_RE.sub(lambda m: fix(m.group(0)), text)
# ----------------------------------------------------------------------------
# CLI
# ----------------------------------------------------------------------------

# Repository root: this file lives three directories below it.
# NOTE(review): reconstructed from a diff — confirm parents[3] matches the
# file's actual location in the repo.
_REPO_ROOT = Path(__file__).resolve().parents[3]

if __name__ == "__main__":
    import argparse
    ap = argparse.ArgumentParser()
    ap.add_argument("pdf")
    ap.add_argument("--standard", required=True, choices=list(STD_DISPLAY))
    ap.add_argument("--cache-dir",
                    default=str(_REPO_ROOT / "scripts" / "generate_rules"
                                / "misra_help" / "cache"))
    ap.add_argument("--rule", action="append", help="only emit these rule IDs")
    ap.add_argument("--json", default=None,
                    help="write extracted rules to this JSON file")
    args = ap.parse_args()
    rules = extract_rules(Path(args.pdf), args.standard, Path(args.cache_dir))
    # --rule may be given multiple times; no --rule means "all rules".
    selected = [r for r in rules if not args.rule or r.rule_id in args.rule]
    print(f"Extracted {len(rules)} rules from {args.pdf}"
          f" ({len(selected)} selected)")
    if args.json:
        out = Path(args.json)
        out.parent.mkdir(parents=True, exist_ok=True)
        out.write_text(
            json.dumps([to_dict(r) for r in selected], indent=2),
            encoding="utf-8",
        )
        print(f"Wrote {out}")