@@ -1788,6 +1788,56 @@ describe("sanitize_content.cjs", () => {
17881788 } ) ;
17891789 } ) ;
17901790
1791+ describe ( "Unicode Tag Characters removal (U+E0000–U+E007F, Plane 14)" , ( ) => { // Each case passes sanitizeContent a string containing tag codepoints written as UTF-16 surrogate pairs and expects them to be absent from the result.
1792+ it ( "should strip a single Tag Characters codepoint (U+E0041 = TAG LATIN CAPITAL LETTER A)" , ( ) => {
1793+ // \uDB40\uDC41 is the UTF-16 surrogate pair for U+E0041; the text on either side is expected to be joined once it is removed
1794+ const input = "Hello\uDB40\uDC41World" ;
1795+ expect ( sanitizeContent ( input ) ) . toBe ( "HelloWorld" ) ;
1796+ } ) ;
1797+
1798+ it ( "should strip LANGUAGE TAG (U+E0001) at the boundary of the Tag block" , ( ) => {
1799+ // \uDB40\uDC01 is the UTF-16 surrogate pair for U+E0001 (one above the block's stated lower bound U+E0000), placed at the end of the string
1800+ const input = "test\uDB40\uDC01" ;
1801+ expect ( sanitizeContent ( input ) ) . toBe ( "test" ) ;
1802+ } ) ;
1803+
1804+ it ( "should strip CANCEL TAG (U+E007F) at the upper boundary of the Tag block" , ( ) => {
1805+ // \uDB40\uDC7F is the UTF-16 surrogate pair for U+E007F (the block's stated upper bound), placed at the start of the string
1806+ const input = "\uDB40\uDC7Ftest" ;
1807+ expect ( sanitizeContent ( input ) ) . toBe ( "test" ) ;
1808+ } ) ;
1809+
1810+ it ( "should strip a full ASCII string encoded in Tag Characters — invisible payload attack" , ( ) => {
1811+ // Encode "SECRET" using Tag Characters: each ASCII char C -> U+E0000+C
1812+ // S=0x53, E=0x45, C=0x43, R=0x52, E=0x45, T=0x54 (tagE is reused for both E's)
1813+ const tagS = "\uDB40\uDC53" ;
1814+ const tagE = "\uDB40\uDC45" ;
1815+ const tagC = "\uDB40\uDC43" ;
1816+ const tagR = "\uDB40\uDC52" ;
1817+ const tagT = "\uDB40\uDC54" ;
1818+ const encoded = tagS + tagE + tagC + tagR + tagE + tagT ;
1819+ expect ( sanitizeContent ( encoded ) ) . toBe ( "" ) ;
1820+ } ) ;
1821+
1822+ it ( "should strip Tag Characters mixed with normal ASCII text" , ( ) => {
1823+ // Tag-encoded 'A' (U+E0041) and 'B' (U+E0042) interspersed with the normal letters a, b, c
1824+ const input = "a\uDB40\uDC41b\uDB40\uDC42c" ;
1825+ expect ( sanitizeContent ( input ) ) . toBe ( "abc" ) ;
1826+ } ) ;
1827+
1828+ it ( "should strip multiple adjacent Tag Characters" , ( ) => {
1829+ // TAG LATIN CAPITAL LETTER A through D (U+E0041–U+E0044) back to back with no visible text, so the expected result is empty
1830+ const input = "\uDB40\uDC41\uDB40\uDC42\uDB40\uDC43\uDB40\uDC44" ;
1831+ expect ( sanitizeContent ( input ) ) . toBe ( "" ) ;
1832+ } ) ;
1833+
1834+ it ( "should neutralize @mention bypass using Tag Characters between @ and username" , ( ) => {
1835+ // A Tag Character (U+E0041) is inserted between @ and the username to try to bypass mention detection; the expected output has the mention wrapped in backticks
1836+ const input = "@\uDB40\uDC41admin please review" ;
1837+ expect ( sanitizeContent ( input ) ) . toBe ( "`@admin` please review" ) ;
1838+ } ) ;
1839+ } ) ;
1840+
17911841 describe ( "@mention bypass prevention via invisible characters" , ( ) => {
17921842 it ( "should neutralize @mention with U+200F (RTL mark) inserted between @ and username" , ( ) => {
17931843 const input = "@\u200Fadmin please review" ;
0 commit comments