|
| 1 | +package sanitize |
| 2 | + |
| 3 | +import ( |
| 4 | +"testing" |
| 5 | + |
| 6 | +"github.com/stretchr/testify/assert" |
| 7 | +) |
| 8 | + |
| 9 | +funcTestFilterInvisibleCharacters(t*testing.T) { |
| 10 | +tests:= []struct { |
| 11 | +namestring |
| 12 | +inputstring |
| 13 | +expectedstring |
| 14 | +}{ |
| 15 | +{ |
| 16 | +name:"empty string", |
| 17 | +input:"", |
| 18 | +expected:"", |
| 19 | +}, |
| 20 | +{ |
| 21 | +name:"normal text without invisible characters", |
| 22 | +input:"Hello World", |
| 23 | +expected:"Hello World", |
| 24 | +}, |
| 25 | +{ |
| 26 | +name:"text with zero width space", |
| 27 | +input:"Hello\u200BWorld", |
| 28 | +expected:"HelloWorld", |
| 29 | +}, |
| 30 | +{ |
| 31 | +name:"text with zero width non-joiner", |
| 32 | +input:"Hello\u200CWorld", |
| 33 | +expected:"HelloWorld", |
| 34 | +}, |
| 35 | +{ |
| 36 | +name:"text with left-to-right mark", |
| 37 | +input:"Hello\u200EWorld", |
| 38 | +expected:"HelloWorld", |
| 39 | +}, |
| 40 | +{ |
| 41 | +name:"text with right-to-left mark", |
| 42 | +input:"Hello\u200FWorld", |
| 43 | +expected:"HelloWorld", |
| 44 | +}, |
| 45 | +{ |
| 46 | +name:"text with soft hyphen", |
| 47 | +input:"Hello\u00ADWorld", |
| 48 | +expected:"HelloWorld", |
| 49 | +}, |
| 50 | +{ |
| 51 | +name:"text with zero width no-break space (BOM)", |
| 52 | +input:"Hello\uFEFFWorld", |
| 53 | +expected:"HelloWorld", |
| 54 | +}, |
| 55 | +{ |
| 56 | +name:"text with mongolian vowel separator", |
| 57 | +input:"Hello\u180EWorld", |
| 58 | +expected:"HelloWorld", |
| 59 | +}, |
| 60 | +{ |
| 61 | +name:"text with unicode tag character", |
| 62 | +input:"Hello\U000E0001World", |
| 63 | +expected:"HelloWorld", |
| 64 | +}, |
| 65 | +{ |
| 66 | +name:"text with unicode tag range characters", |
| 67 | +input:"Hello\U000E0020World\U000E007FTest", |
| 68 | +expected:"HelloWorldTest", |
| 69 | +}, |
| 70 | +{ |
| 71 | +name:"text with bidi control characters", |
| 72 | +input:"Hello\u202AWorld\u202BTest\u202CEnd\u202DMore\u202EFinal", |
| 73 | +expected:"HelloWorldTestEndMoreFinal", |
| 74 | +}, |
| 75 | +{ |
| 76 | +name:"text with bidi isolate characters", |
| 77 | +input:"Hello\u2066World\u2067Test\u2068End\u2069Final", |
| 78 | +expected:"HelloWorldTestEndFinal", |
| 79 | +}, |
| 80 | +{ |
| 81 | +name:"text with hidden modifier characters", |
| 82 | +input:"Hello\u2060World\u2061Test\u2062End\u2063More\u2064Final", |
| 83 | +expected:"HelloWorldTestEndMoreFinal", |
| 84 | +}, |
| 85 | +{ |
| 86 | +name:"multiple invisible characters mixed", |
| 87 | +input:"Hello\u200B\u200C\u200E\u200F\u00AD\uFEFF\u180E\U000E0001World", |
| 88 | +expected:"HelloWorld", |
| 89 | +}, |
| 90 | +{ |
| 91 | +name:"text with normal unicode characters (should be preserved)", |
| 92 | +input:"Hello 世界 🌍 αβγ", |
| 93 | +expected:"Hello 世界 🌍 αβγ", |
| 94 | +}, |
| 95 | +{ |
| 96 | +name:"invisible characters at start and end", |
| 97 | +input:"\u200BHello World\u200C", |
| 98 | +expected:"Hello World", |
| 99 | +}, |
| 100 | +{ |
| 101 | +name:"only invisible characters", |
| 102 | +input:"\u200B\u200C\u200E\u200F", |
| 103 | +expected:"", |
| 104 | +}, |
| 105 | +{ |
| 106 | +name:"real-world example with title", |
| 107 | +input:"Fix\u200B bug\u00AD in\u202A authentication\u202C", |
| 108 | +expected:"Fix bug in authentication", |
| 109 | +}, |
| 110 | +{ |
| 111 | +name:"issue body with mixed content", |
| 112 | +input:"This is a\u200B bug report.\n\nSteps to reproduce:\u200C\n1. Do this\u200E\n2. Do that\u200F", |
| 113 | +expected:"This is a bug report.\n\nSteps to reproduce:\n1. Do this\n2. Do that", |
| 114 | +}, |
| 115 | +} |
| 116 | + |
| 117 | +for_,tt:=rangetests { |
| 118 | +t.Run(tt.name,func(t*testing.T) { |
| 119 | +result:=FilterInvisibleCharacters(tt.input) |
| 120 | +assert.Equal(t,tt.expected,result) |
| 121 | +}) |
| 122 | +} |
| 123 | +} |
| 124 | + |
| 125 | +funcTestShouldRemoveRune(t*testing.T) { |
| 126 | +tests:= []struct { |
| 127 | +namestring |
| 128 | +runerune |
| 129 | +expectedbool |
| 130 | +}{ |
| 131 | +// Individual characters that should be removed |
| 132 | +{name:"zero width space",rune:0x200B,expected:true}, |
| 133 | +{name:"zero width non-joiner",rune:0x200C,expected:true}, |
| 134 | +{name:"left-to-right mark",rune:0x200E,expected:true}, |
| 135 | +{name:"right-to-left mark",rune:0x200F,expected:true}, |
| 136 | +{name:"soft hyphen",rune:0x00AD,expected:true}, |
| 137 | +{name:"zero width no-break space",rune:0xFEFF,expected:true}, |
| 138 | +{name:"mongolian vowel separator",rune:0x180E,expected:true}, |
| 139 | +{name:"unicode tag",rune:0xE0001,expected:true}, |
| 140 | + |
| 141 | +// Range tests - Unicode tags: U+E0020–U+E007F |
| 142 | +{name:"unicode tag range start",rune:0xE0020,expected:true}, |
| 143 | +{name:"unicode tag range middle",rune:0xE0050,expected:true}, |
| 144 | +{name:"unicode tag range end",rune:0xE007F,expected:true}, |
| 145 | +{name:"before unicode tag range",rune:0xE001F,expected:false}, |
| 146 | +{name:"after unicode tag range",rune:0xE0080,expected:false}, |
| 147 | + |
| 148 | +// Range tests - BiDi controls: U+202A–U+202E |
| 149 | +{name:"bidi control range start",rune:0x202A,expected:true}, |
| 150 | +{name:"bidi control range middle",rune:0x202C,expected:true}, |
| 151 | +{name:"bidi control range end",rune:0x202E,expected:true}, |
| 152 | +{name:"before bidi control range",rune:0x2029,expected:false}, |
| 153 | +{name:"after bidi control range",rune:0x202F,expected:false}, |
| 154 | + |
| 155 | +// Range tests - BiDi isolates: U+2066–U+2069 |
| 156 | +{name:"bidi isolate range start",rune:0x2066,expected:true}, |
| 157 | +{name:"bidi isolate range middle",rune:0x2067,expected:true}, |
| 158 | +{name:"bidi isolate range end",rune:0x2069,expected:true}, |
| 159 | +{name:"before bidi isolate range",rune:0x2065,expected:false}, |
| 160 | +{name:"after bidi isolate range",rune:0x206A,expected:false}, |
| 161 | + |
| 162 | +// Range tests - Hidden modifiers: U+2060–U+2064 |
| 163 | +{name:"hidden modifier range start",rune:0x2060,expected:true}, |
| 164 | +{name:"hidden modifier range middle",rune:0x2062,expected:true}, |
| 165 | +{name:"hidden modifier range end",rune:0x2064,expected:true}, |
| 166 | +{name:"before hidden modifier range",rune:0x205F,expected:false}, |
| 167 | +{name:"after hidden modifier range",rune:0x2065,expected:false}, |
| 168 | + |
| 169 | +// Characters that should NOT be removed |
| 170 | +{name:"regular ascii letter",rune:'A',expected:false}, |
| 171 | +{name:"regular ascii digit",rune:'1',expected:false}, |
| 172 | +{name:"regular ascii space",rune:' ',expected:false}, |
| 173 | +{name:"newline",rune:'\n',expected:false}, |
| 174 | +{name:"tab",rune:'\t',expected:false}, |
| 175 | +{name:"unicode letter",rune:'世',expected:false}, |
| 176 | +{name:"emoji",rune:'🌍',expected:false}, |
| 177 | +{name:"greek letter",rune:'α',expected:false}, |
| 178 | +{name:"punctuation",rune:'.',expected:false}, |
| 179 | +{name:"hyphen (normal)",rune:'-',expected:false}, |
| 180 | +} |
| 181 | + |
| 182 | +for_,tt:=rangetests { |
| 183 | +t.Run(tt.name,func(t*testing.T) { |
| 184 | +result:=shouldRemoveRune(tt.rune) |
| 185 | +assert.Equal(t,tt.expected,result,"rune: U+%04X (%c)",tt.rune,tt.rune) |
| 186 | +}) |
| 187 | +} |
| 188 | +} |