Skip to content

Commit 5291662

Browse files
authored
fix: strip Unicode Tag Characters (U+E0000–U+E007F) in hardenUnicodeText (#28059)
1 parent ea3a159 commit 5291662

2 files changed

Lines changed: 58 additions & 0 deletions

File tree

actions/setup/js/sanitize_content.test.cjs

Lines changed: 50 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1788,6 +1788,56 @@ describe("sanitize_content.cjs", () => {
17881788
});
17891789
});
17901790

1791+
describe("Unicode Tag Characters removal (U+E0000–U+E007F, Plane 14)", () => {
1792+
it("should strip a single Tag Characters codepoint (U+E0041 = TAG LATIN CAPITAL LETTER A)", () => {
1793+
// \uDB40\uDC41 is the surrogate pair for U+E0041
1794+
const input = "Hello\uDB40\uDC41World";
1795+
expect(sanitizeContent(input)).toBe("HelloWorld");
1796+
});
1797+
1798+
it("should strip LANGUAGE TAG (U+E0001) at the boundary of the Tag block", () => {
1799+
// \uDB40\uDC01 is the surrogate pair for U+E0001
1800+
const input = "test\uDB40\uDC01";
1801+
expect(sanitizeContent(input)).toBe("test");
1802+
});
1803+
1804+
it("should strip CANCEL TAG (U+E007F) at the upper boundary of the Tag block", () => {
1805+
// \uDB40\uDC7F is the surrogate pair for U+E007F
1806+
const input = "\uDB40\uDC7Ftest";
1807+
expect(sanitizeContent(input)).toBe("test");
1808+
});
1809+
1810+
it("should strip a full ASCII string encoded in Tag Characters — invisible payload attack", () => {
1811+
// Encode "SECRET" using Tag Characters: each ASCII char C -> U+E0000+C
1812+
// S=0x53, E=0x45, C=0x43, R=0x52, E=0x45, T=0x54
1813+
const tagS = "\uDB40\uDC53";
1814+
const tagE = "\uDB40\uDC45";
1815+
const tagC = "\uDB40\uDC43";
1816+
const tagR = "\uDB40\uDC52";
1817+
const tagT = "\uDB40\uDC54";
1818+
const encoded = tagS + tagE + tagC + tagR + tagE + tagT;
1819+
expect(sanitizeContent(encoded)).toBe("");
1820+
});
1821+
1822+
it("should strip Tag Characters mixed with normal ASCII text", () => {
1823+
// Tag-encoded 'A' (U+E0041) and 'B' (U+E0042) interspersed with normal letters
1824+
const input = "a\uDB40\uDC41b\uDB40\uDC42c";
1825+
expect(sanitizeContent(input)).toBe("abc");
1826+
});
1827+
1828+
it("should strip multiple adjacent Tag Characters", () => {
1829+
// TAG LATIN CAPITAL LETTER A through D (U+E0041–U+E0044)
1830+
const input = "\uDB40\uDC41\uDB40\uDC42\uDB40\uDC43\uDB40\uDC44";
1831+
expect(sanitizeContent(input)).toBe("");
1832+
});
1833+
1834+
it("should neutralize @mention bypass using Tag Characters between @ and username", () => {
1835+
// Inserting a Tag Character between @ and username to bypass mention detection
1836+
const input = "@\uDB40\uDC41admin please review";
1837+
expect(sanitizeContent(input)).toBe("`@admin` please review");
1838+
});
1839+
});
1840+
17911841
describe("@mention bypass prevention via invisible characters", () => {
17921842
it("should neutralize @mention with U+200F (RTL mark) inserted between @ and username", () => {
17931843
const input = "@\u200Fadmin please review";

actions/setup/js/sanitize_content_core.cjs

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1091,6 +1091,14 @@ function hardenUnicodeText(text) {
10911091
// and byte order mark
10921092
result = result.replace(/[\u00AD\u034F\u200B-\u200F\u2060-\u2064\uFEFF]/g, "");
10931093

1094+
// Step 3b: Strip Unicode Tag Characters block (U+E0000–U+E007F, Plane 14).
1095+
// Its assigned codepoints are Cf-category; U+E0020–U+E007E map 1:1 to ASCII
1096+
// (e.g. U+E0041 = TAG LATIN CAPITAL LETTER A) and are completely invisible
1097+
// in all standard renderers including GitHub Markdown, enabling fully
1098+
// invisible prompt-injection payloads that decode 1:1 to ASCII content.
1099+
// Represented as surrogate pairs \uDB40\uDC00–\uDB40\uDC7F in JavaScript.
1100+
result = result.replace(/\uDB40[\uDC00-\uDC7F]/g, "");
1101+
10941102
// Step 4: Remove bidirectional text override controls
10951103
// These can be used to reverse text direction and create visual spoofs
10961104
result = result.replace(/[\u202A\u202B\u202C\u202D\u202E\u2066\u2067\u2068\u2069]/g, "");

0 commit comments

Comments
 (0)